Skip to content

Commit 18b4579

Browse files
authored
Add CPU usage test for the optimization of port-buffer-drop counterpoll (#5024)
There is a performance optimization for the port buffer drop counter. Adjust the port-buffer-drop counter default interval from 60000 ms to 10000 ms. The test is skipped on 201811, 201911 and 202012; it should work on 202106 and later images. The test checks that DUT memory usage and process CPU usage stay within their thresholds for 60 s: - Disable all counterpoll types except for the tested one - Collect memory and CPU usage for 60 secs - Compare the memory usage with the memory threshold - Compare the average CPU usage with the CPU threshold for the specified process (for NVIDIA, the process is sx_sdk) - Restore counterpoll interval and status to the state before the test
1 parent 44df4c5 commit 18b4579

7 files changed

+295
-21
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
def add_counterpoll_cpu_usage_args(parser):
    """Register the command-line option used by the counterpoll CPU usage test.

    Args:
        parser: Option parser passed in by ``pytest_addoption``; only its
            ``addoption`` method is used.
    """
    option_settings = {
        "action": "store",
        "type": int,
        "default": 10,
        "help": "Port buffer drop cpu usage threshold",
    }
    parser.addoption("--port_buffer_drop_cpu_usage_threshold", **option_settings)

tests/platform_tests/conftest.py

+5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from tests.common.broadcom_data import is_broadcom_device
1414
from tests.common.plugins.loganalyzer.loganalyzer import LogAnalyzer
1515
from tests.common.plugins.sanity_check.recover import neighbor_vm_restore
16+
from .args.counterpoll_cpu_usage_args import add_counterpoll_cpu_usage_args
1617

1718

1819
TEMPLATES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
@@ -592,3 +593,7 @@ def pytest_generate_tests(metafunc):
592593
metafunc.parametrize('power_off_delay', delay_list)
593594
except ValueError:
594595
metafunc.parametrize('power_off_delay', default_delay_list)
596+
597+
598+
def pytest_addoption(parser):
    """Pytest hook that registers the extra command-line options of these tests.

    Args:
        parser: Option parser supplied by pytest.
    """
    # The option definitions live in the ``args`` package; this hook only delegates.
    add_counterpoll_cpu_usage_args(parser)

tests/platform_tests/counterpoll/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
class CounterpollConstants:
    """String constants shared by the counterpoll helpers and tests."""

    # ---- ``counterpoll`` CLI command lines ('{}' slots are filled via str.format) ----
    COUNTERPOLL_SHOW = 'counterpoll show'
    COUNTERPOLL_QUEST = 'counterpoll --help'
    COUNTERPOLL_ENABLE = 'counterpoll {} enable'
    COUNTERPOLL_DISABLE = 'counterpoll {} disable'
    COUNTERPOLL_RESTORE = 'counterpoll {} {}'
    COUNTERPOLL_INTERVAL_STR = 'counterpoll {} interval {}'

    # Sub-commands listed by ``counterpoll --help`` that are not treated as counter types.
    EXCLUDE_COUNTER_SUB_COMMAND = ['show', 'config-db', "flowcnt-trap", "tunnel"]

    # ---- Keys read from each parsed ``counterpoll show`` row ----
    TYPE = 'type'
    INTERVAL = 'interval (in ms)'
    STATUS = 'status'

    # Key of the command result that holds the captured standard output.
    STDOUT = 'stdout'

    # ---- CLI sub-command names ----
    PG_DROP = 'pg-drop'
    QUEUE = 'queue'
    PORT = 'port'
    PORT_BUFFER_DROP = 'port-buffer-drop'
    RIF = 'rif'
    WATERMARK = 'watermark'
    ACL = 'acl'

    # ---- Counter type names ----
    PG_DROP_STAT_TYPE = 'PG_DROP_STAT'
    QUEUE_STAT_TYPE = 'QUEUE_STAT'
    PORT_STAT_TYPE = 'PORT_STAT'
    PORT_BUFFER_DROP_TYPE = 'PORT_BUFFER_DROP'
    RIF_STAT_TYPE = 'RIF_STAT'
    QUEUE_WATERMARK_STAT_TYPE = 'QUEUE_WATERMARK_STAT'
    PG_WATERMARK_STAT_TYPE = 'PG_WATERMARK_STAT'
    BUFFER_POOL_WATERMARK_STAT_TYPE = 'BUFFER_POOL_WATERMARK_STAT'
    ACL_TYPE = "ACL"

    # Counter type name -> CLI sub-command that controls it.
    # All three watermark types are driven by the single 'watermark' sub-command.
    COUNTERPOLL_MAPPING = {
        PG_DROP_STAT_TYPE: PG_DROP,
        QUEUE_STAT_TYPE: QUEUE,
        PORT_STAT_TYPE: PORT,
        PORT_BUFFER_DROP_TYPE: PORT_BUFFER_DROP,
        RIF_STAT_TYPE: RIF,
        BUFFER_POOL_WATERMARK_STAT_TYPE: WATERMARK,
        QUEUE_WATERMARK_STAT_TYPE: WATERMARK,
        PG_WATERMARK_STAT_TYPE: WATERMARK,
        ACL_TYPE: ACL,
    }

    # Polling interval of the port-buffer-drop counter in milliseconds,
    # once as a string and once per sub-command as an integer.
    PORT_BUFFER_DROP_INTERVAL = '10000'
    COUNTERPOLL_INTERVAL = {PORT_BUFFER_DROP: 10000}

    # Name of the process whose CPU usage is checked on Mellanox devices.
    SX_SDK = 'sx_sdk'
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from tests.platform_tests.counterpoll.counterpoll_constants import CounterpollConstants
2+
3+
4+
class ConterpollHelper:
    """Static helpers that drive the ``counterpoll`` CLI on a DUT host.

    NOTE(review): the class name is misspelled ("Conterpoll"); it is kept
    unchanged because other modules import it under this name.
    """

    @staticmethod
    def get_counterpoll_show_output(duthost):
        """Return the ``counterpoll show`` output as parsed by ``duthost.show_and_parse``."""
        return duthost.show_and_parse(CounterpollConstants.COUNTERPOLL_SHOW)

    @staticmethod
    def get_available_counterpoll_types(duthost):
        """Return the counter sub-commands listed by ``counterpoll --help``.

        The first word of every non-blank line after the ``Commands:`` header
        is taken as a sub-command name; names in
        ``CounterpollConstants.EXCLUDE_COUNTER_SUB_COMMAND`` are dropped.

        Args:
            duthost: Host object whose ``command`` method runs the CLI.

        Returns:
            list: Sub-command names in the order they are printed, or an
            empty list when the help text has no ``Commands:`` header.
        """
        commands_header = 'Commands:'
        help_text = duthost.command(CounterpollConstants.COUNTERPOLL_QUEST)[CounterpollConstants.STDOUT]
        _, header, command_section = help_text.partition(commands_header)
        if not header:
            # No header means there is no command list to read; slicing from a
            # made-up offset would only return unrelated words of the help text.
            return []

        available_option_list = []
        for line in command_section.splitlines():
            words = line.split()
            if not words:
                # Blank lines (e.g. the rest of the header line) carry no command.
                continue
            if words[0] not in CounterpollConstants.EXCLUDE_COUNTER_SUB_COMMAND:
                available_option_list.append(words[0])
        return available_option_list

    @staticmethod
    def get_parsed_counterpoll_show(counterpoll_show):
        """Index the parsed ``counterpoll show`` rows by counter type.

        Args:
            counterpoll_show: Iterable of row dicts holding the ``type``,
                ``interval (in ms)`` and ``status`` keys.

        Returns:
            dict: ``{type: {'interval (in ms)': ..., 'status': ...}}``.
        """
        return {
            row[CounterpollConstants.TYPE]: {
                CounterpollConstants.INTERVAL: row[CounterpollConstants.INTERVAL],
                CounterpollConstants.STATUS: row[CounterpollConstants.STATUS],
            }
            for row in counterpoll_show
        }

    @staticmethod
    def restore_counterpoll_status(duthost, counterpoll_before, counterpoll_after):
        """Put back the status of every counter whose status changed.

        Only the status is restored; intervals are left as they are. Counter
        types that are missing from ``counterpoll_before`` are ignored.

        Args:
            duthost: Host object whose ``command`` method runs the CLI.
            counterpoll_before: Result of ``get_parsed_counterpoll_show``
                taken before the change.
            counterpoll_after: Result of ``get_parsed_counterpoll_show``
                taken after the change.
        """
        import logging  # local import: this module does not import logging at file level

        issued_commands = set()
        for counter_type, state_after in counterpoll_after.items():
            state_before = counterpoll_before.get(counter_type)
            if state_before is None:
                continue
            status_before = state_before[CounterpollConstants.STATUS]
            if state_after[CounterpollConstants.STATUS] == status_before:
                continue

            sub_command = CounterpollConstants.COUNTERPOLL_MAPPING.get(counter_type)
            if sub_command is None:
                # An unmapped type must not abort the restore of the remaining counters.
                logging.warning("No counterpoll sub-command known for type %s, status not restored",
                                counter_type)
                continue

            restore_command = CounterpollConstants.COUNTERPOLL_RESTORE.format(sub_command, status_before)
            if restore_command in issued_commands:
                # Several types share one sub-command (e.g. the watermark types);
                # sending the identical command again would change nothing.
                continue
            issued_commands.add(restore_command)
            duthost.command(restore_command)

    @staticmethod
    def disable_counterpoll(duthost, counter_type_list):
        """Run ``counterpoll <type> disable`` for every entry of ``counter_type_list``."""
        for counterpoll_type in counter_type_list:
            duthost.command(CounterpollConstants.COUNTERPOLL_DISABLE.format(counterpoll_type))

    @staticmethod
    def enable_counterpoll(duthost, counter_type_list):
        """Run ``counterpoll <type> enable`` for every entry of ``counter_type_list``."""
        for counterpoll_type in counter_type_list:
            duthost.command(CounterpollConstants.COUNTERPOLL_ENABLE.format(counterpoll_type))

    @staticmethod
    def set_counterpoll_interval(duthost, counterpoll_type, interval):
        """Run ``counterpoll <type> interval <interval>`` on the host."""
        duthost.command(CounterpollConstants.COUNTERPOLL_INTERVAL_STR.format(counterpoll_type, interval))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import pytest
2+
3+
from tests.platform_tests.counterpoll.counterpoll_constants import CounterpollConstants
4+
from tests.platform_tests.counterpoll.counterpoll_helper import ConterpollHelper
5+
from tests.common.utilities import skip_release
6+
7+
8+
@pytest.fixture(params=[CounterpollConstants.PORT_BUFFER_DROP])
def counterpoll_type(request):
    """Parametrized fixture that hands each counterpoll type under test to the test.

    Currently the only parameter is the port-buffer-drop sub-command name.
    """
    tested_type = request.param
    return tested_type
11+
12+
13+
@pytest.fixture()
def restore_counter_poll(duthosts, enum_rand_one_per_hwsku_hostname):
    """Record the counterpoll status before the test and restore it afterwards.

    ``skip_release`` is called first for the 201811, 201911 and 202012
    releases. The state shown by ``counterpoll show`` is captured before and
    after the test, and both snapshots are passed to
    ``ConterpollHelper.restore_counterpoll_status``.
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]
    skip_release(duthost, ["201811", "201911", "202012"])

    def take_snapshot():
        # One CLI round trip, indexed by counter type.
        show_output = ConterpollHelper.get_counterpoll_show_output(duthost)
        return ConterpollHelper.get_parsed_counterpoll_show(show_output)

    state_before_test = take_snapshot()
    yield
    state_after_test = take_snapshot()
    ConterpollHelper.restore_counterpoll_status(duthost, state_before_test, state_after_test)

tests/platform_tests/test_cpu_memory_usage.py

+163-21
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
import pytest
33

44
from collections import namedtuple, Counter
5+
from tests.platform_tests.counterpoll.cpu_memory_helper import restore_counter_poll # lgtm [py/unused-import]
6+
from tests.platform_tests.counterpoll.cpu_memory_helper import counterpoll_type # lgtm [py/unused-import]
7+
from tests.platform_tests.counterpoll.counterpoll_helper import ConterpollHelper
8+
from tests.platform_tests.counterpoll.counterpoll_constants import CounterpollConstants
9+
from tests.common.mellanox_data import is_mellanox_device
510

611

712
pytestmark = [
@@ -22,6 +27,7 @@ def setup_thresholds(duthosts, enum_rand_one_per_hwsku_hostname):
2227
high_cpu_consume_procs['syncd'] = 80
2328
return memory_threshold, cpu_threshold, high_cpu_consume_procs
2429

30+
2531
def test_cpu_memory_usage(duthosts, enum_rand_one_per_hwsku_hostname, setup_thresholds):
2632
"""Check DUT memory usage and process cpu usage are within threshold."""
2733
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
@@ -35,36 +41,172 @@ def test_cpu_memory_usage(duthosts, enum_rand_one_per_hwsku_hostname, setup_thre
3541
outstanding_procs_counter = Counter()
3642
for i, monit_result in enumerate(MonitResult(*_) for _ in monit_results):
3743
logging.debug("------ Iteration %d ------", i)
38-
if monit_result.memory['used_percent'] > memory_threshold:
39-
logging.debug("system memory usage exceeds %d%%: %s",
40-
memory_threshold, monit_result.memory)
41-
outstanding_mem_polls[i] = monit_result.memory
44+
check_memory(i, memory_threshold, monit_result, outstanding_mem_polls)
4245
for proc in monit_result.processes:
4346
cpu_threshold = normal_cpu_threshold
44-
if high_cpu_consume_procs.has_key(proc['name']):
45-
cpu_threshold = high_cpu_consume_procs[proc['name']]
46-
if proc['cpu_percent'] >= cpu_threshold:
47-
logging.debug("process %s(%d) cpu usage exceeds %d%%.",
48-
proc['name'], proc['pid'], cpu_threshold)
49-
outstanding_procs[proc['pid']] = proc['name']
50-
outstanding_procs_counter[proc['pid']] += 1
47+
if proc['name'] in high_cpu_consume_procs:
48+
cpu_threshold = high_cpu_consume_procs[proc['name']]
49+
check_cpu_usage(cpu_threshold, outstanding_procs, outstanding_procs_counter, proc)
50+
51+
analyse_monitoring_results(cpu_threshold, memory_threshold, outstanding_mem_polls, outstanding_procs,
52+
outstanding_procs_counter, persist_threshold)
53+
5154

55+
def analyse_monitoring_results(cpu_threshold, memory_threshold, outstanding_mem_polls, outstanding_procs,
                               outstanding_procs_counter, persist_threshold):
    """Fail the test if memory or persistent CPU usage exceeded its threshold.

    Args:
        cpu_threshold: CPU threshold in percent; only used in the log message.
        memory_threshold: Memory threshold in percent; only used in the log message.
        outstanding_mem_polls: Mapping of the polls that exceeded the memory
            threshold; any entry makes the check fail.
        outstanding_procs: Mapping of pid to process name for the offenders.
        outstanding_procs_counter: ``Counter`` of how often each pid exceeded
            its CPU threshold.
        persist_threshold: A pid is reported only when its count is strictly
            greater than this value.
    """
    # most_common() is ordered by descending count, so filtering on the count
    # keeps exactly the leading entries that lie above the persistence threshold.
    persistent_pids = [pid for pid, hits in outstanding_procs_counter.most_common()
                       if hits > persist_threshold]

    if not outstanding_mem_polls and not persistent_pids:
        return

    if outstanding_mem_polls:
        logging.error("system memory usage exceeds %d%%", memory_threshold)
    if persistent_pids:
        offender_names = [outstanding_procs[pid] for pid in persistent_pids]
        logging.error("processes that persistently exceeds cpu usage %d%%: %s", cpu_threshold, offender_names)
    pytest.fail("system cpu and memory usage check fails")
72+
73+
74+
@pytest.fixture(scope='module')
def counterpoll_cpu_threshold(duthosts, request):
    """Return the CPU usage threshold per counterpoll type.

    The value for ``port-buffer-drop`` is read from the
    ``--port_buffer_drop_cpu_usage_threshold`` command-line option.
    """
    port_buffer_drop_threshold = request.config.getoption("--port_buffer_drop_cpu_usage_threshold")
    return {"port-buffer-drop": port_buffer_drop_threshold}
78+
79+
80+
def test_cpu_memory_usage_counterpoll(duthosts, enum_rand_one_per_hwsku_hostname,
                                      setup_thresholds, restore_counter_poll, counterpoll_type,
                                      counterpoll_cpu_threshold):
    """Check memory and CPU usage of the DUT while a single counterpoll type is active.

    Steps:
        1. Disable every counterpoll type except the tested one.
        2. Collect memory and CPU usage in 60 iterations with a delay interval of 1.
        3. Compare the memory usage with the memory threshold.
        4. Compare the average CPU usage of the checked process with the
           CPU threshold of the tested counterpoll type.

    The test is skipped when no process to check is known for the device.
    The ``restore_counter_poll`` fixture restores the counterpoll status afterwards.
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]
    program_to_check = get_manufacturer_program_to_check(duthost)
    if program_to_check is None:
        pytest.skip("Skip no program is offered to check")

    memory_threshold, _unused_cpu_threshold, _unused_high_cpu_procs = setup_thresholds
    counterpoll_cpu_usage_threshold = counterpoll_cpu_threshold[counterpoll_type]

    MonitResult = namedtuple('MonitResult', ['processes', 'memory'])
    disable_all_counterpoll_type_except_tested(duthost, counterpoll_type)
    monit_output = duthost.monit_process(iterations=60, delay_interval=1)
    monit_results = monit_output['monit_results']
    # The configured interval is in milliseconds; integer division by 1000
    # turns it into the window length used to group the collected samples.
    poll_interval = CounterpollConstants.COUNTERPOLL_INTERVAL[counterpoll_type] // 1000

    outstanding_mem_polls, outstanding_procs = {}, {}
    outstanding_procs_counter = Counter()
    cpu_usage_program_to_check = []

    prepare_ram_cpu_usage_results(
        MonitResult, counterpoll_cpu_usage_threshold, memory_threshold, monit_results,
        outstanding_mem_polls, outstanding_procs, outstanding_procs_counter,
        program_to_check, cpu_usage_program_to_check)

    log_cpu_usage_by_vendor(cpu_usage_program_to_check, counterpoll_type)

    valid_center_indexes = extract_valid_cpu_usage_data(cpu_usage_program_to_check, poll_interval)
    cpu_usage_average = caculate_cpu_usge_average_value(valid_center_indexes, cpu_usage_program_to_check)
    logging.info("Average cpu_usage is {}".format(cpu_usage_average))

    assert cpu_usage_average < counterpoll_cpu_usage_threshold, \
        "cpu_usage_average of {} exceeds the cpu threshold:{}".format(
            program_to_check, counterpoll_cpu_usage_threshold)
    assert not outstanding_mem_polls, \
        " Memory {} exceeds the memory threshold {} ".format(outstanding_mem_polls, memory_threshold)
118+
119+
120+
def log_cpu_usage_by_vendor(cpu_usage_program_to_check, counterpoll_type):
    """Log the collected CPU usage samples; nothing is logged when there are none.

    Args:
        cpu_usage_program_to_check: Collected CPU usage samples.
        counterpoll_type: Counterpoll type named in the log message.
    """
    if not cpu_usage_program_to_check:
        return
    message = 'CPU usage for counterpoll type {} : {}'.format(counterpoll_type, cpu_usage_program_to_check)
    logging.info(message)
123+
124+
125+
def get_manufacturer_program_to_check(duthost):
    """Return the name of the process whose CPU usage is checked on this DUT.

    Returns:
        ``CounterpollConstants.SX_SDK`` for Mellanox devices, otherwise ``None``.
    """
    return CounterpollConstants.SX_SDK if is_mellanox_device(duthost) else None
128+
129+
130+
def prepare_ram_cpu_usage_results(MonitResult, cpu_threshold, memory_threshold, monit_results, outstanding_mem_polls,
                                  outstanding_procs, outstanding_procs_counter, program_to_check,
                                  program_to_check_cpu_usage):
    """Walk the monitoring results and fill the memory and CPU usage collections.

    For every iteration the memory usage is checked against
    ``memory_threshold`` (offenders go into ``outstanding_mem_polls``) and the
    CPU usage of ``program_to_check`` is appended to
    ``program_to_check_cpu_usage``.

    ``cpu_threshold``, ``outstanding_procs`` and ``outstanding_procs_counter``
    are accepted but not used here.
    """
    for iteration, raw_result in enumerate(monit_results):
        monit_result = MonitResult(*raw_result)
        logging.debug("------ Iteration %d ------", iteration)
        check_memory(iteration, memory_threshold, monit_result, outstanding_mem_polls)
        for process_info in monit_result.processes:
            update_cpu_usage_desired_program(process_info, program_to_check, program_to_check_cpu_usage)
138+
139+
140+
def extract_valid_cpu_usage_data(program_to_check_cpu_usage, poll_interval):
    """Return the indexes of the usable CPU usage peaks, one per poll interval at most.

    The samples are cut into consecutive windows of ``poll_interval`` entries
    (a trailing partial window is ignored). For every window:

    1. Find the index of the maximum (the first one if several are equal).
    2. Discard it when it lies on the edge of the data, i.e. it is index 0 or
       the last index of ``program_to_check_cpu_usage``.
    3. Discard it when it directly follows the previously kept index, so that
       only the former of two neighbouring peaks is kept.

    Example with ``poll_interval = 10``:
        7, 1, 0, 1, 0, 1, 5, 1, 1, 2,  0, 1, 0, 1, 0, 6, 1, 1, 1, 2  ->  [15]
        0, 1, 0, 1, 0, 1, 0, 1, 0, 8,  7, 1, 0, 1, 0, 6, 1, 1, 1, 2  ->  [9]

    Args:
        program_to_check_cpu_usage: Sequence of CPU usage samples.
        poll_interval: Number of samples per window.

    Returns:
        list: Indexes into ``program_to_check_cpu_usage`` of the kept peaks.
    """
    sample_count = len(program_to_check_cpu_usage)
    peak_indexes = []

    for window_no in range(sample_count // poll_interval):
        window_start = window_no * poll_interval
        window = program_to_check_cpu_usage[window_start:window_start + poll_interval]
        # max() returns the first of several equal maxima, which matches a
        # left-to-right scan that only replaces the maximum on a strictly larger value.
        peak_index = window_start + max(range(len(window)), key=window.__getitem__)

        if peak_index in (0, sample_count - 1):
            logging.info("The data is on the edge:{}, discard it ".format(peak_index))
        elif not peak_indexes or peak_indexes[-1] + 1 != peak_index:
            peak_indexes.append(peak_index)

    return peak_indexes
177+
178+
179+
def caculate_cpu_usge_average_value(valid_cpu_usage_center_index_list, program_to_check_cpu_usage):
    """Average the CPU usage over the three-sample windows around the given indexes.

    For every center index ``i`` the samples ``[i - 1, i, i + 1]`` are summed;
    the total is divided by the number of center indexes and by 3.

    Args:
        valid_cpu_usage_center_index_list: Center indexes of the windows.
        program_to_check_cpu_usage: Sequence of CPU usage samples.

    Returns:
        The average as a float, or ``0`` when no center index is given.
    """
    window_count = len(valid_cpu_usage_center_index_list)
    usage_total = 0.0
    for center in valid_cpu_usage_center_index_list:
        window = program_to_check_cpu_usage[center - 1:center + 2]
        usage_total += sum(window)
        logging.info("cpu usage center index:{}: cpu usage:{}".format(center, window))

    if window_count == 0:
        return 0
    return usage_total / window_count / 3.0
186+
187+
188+
def check_cpu_usage(cpu_threshold, outstanding_procs, outstanding_procs_counter, proc):
    """Record the process when its CPU usage reaches or exceeds the threshold.

    Args:
        cpu_threshold: CPU threshold in percent.
        outstanding_procs: Mapping updated with ``pid -> name`` of the offender.
        outstanding_procs_counter: Counter whose entry for the pid is incremented.
        proc: Mapping with the ``name``, ``pid`` and ``cpu_percent`` keys.
    """
    threshold_reached = proc['cpu_percent'] >= cpu_threshold
    if not threshold_reached:
        return

    pid, name = proc['pid'], proc['name']
    logging.debug("process %s(%d) cpu usage exceeds %d%%.", name, pid, cpu_threshold)
    outstanding_procs[pid] = name
    outstanding_procs_counter[pid] += 1
194+
195+
196+
def update_cpu_usage_desired_program(proc, program_to_check, program_to_check_cpu_usage):
    """Append the CPU usage of ``proc`` when it is the program being checked.

    Args:
        proc: Mapping with the ``name`` and ``cpu_percent`` keys.
        program_to_check: Name of the program of interest; when it is falsy
            nothing is collected and ``proc`` is not inspected.
        program_to_check_cpu_usage: List that receives the sample.
    """
    is_wanted = bool(program_to_check) and proc['name'] == program_to_check
    if is_wanted:
        program_to_check_cpu_usage.append(proc['cpu_percent'])
200+
201+
202+
def check_memory(i, memory_threshold, monit_result, outstanding_mem_polls):
    """Record the memory info of poll ``i`` when its usage exceeds the threshold.

    Args:
        i: Index of the poll, used as key in ``outstanding_mem_polls``.
        memory_threshold: Memory threshold in percent.
        monit_result: Object whose ``memory`` attribute is a mapping holding
            the ``used_percent`` key.
        outstanding_mem_polls: Mapping updated with ``i -> memory info``.
    """
    memory_info = monit_result.memory
    threshold_exceeded = memory_info['used_percent'] > memory_threshold
    if not threshold_exceeded:
        return

    logging.debug("system memory usage exceeds %d%%: %s", memory_threshold, memory_info)
    outstanding_mem_polls[i] = memory_info
207+
69208

70-
pytest.fail(failure_message)
209+
def disable_all_counterpoll_type_except_tested(duthost, counterpoll_type):
    """Disable every available counterpoll type except ``counterpoll_type``.

    Raises:
        ValueError: When ``counterpoll_type`` is not among the available
            types (raised by ``list.remove``).
    """
    types_to_disable = ConterpollHelper.get_available_counterpoll_types(duthost)
    # Drop the tested type from the list so that it keeps running.
    types_to_disable.remove(counterpoll_type)
    ConterpollHelper.disable_counterpoll(duthost, types_to_disable)

0 commit comments

Comments
 (0)