Skip to content

Commit 8fd6e48

Browse files
authored
[pfcwd] Add vs test infrastructure (sonic-net#2077)
Currently, vs platform support is missing in pfcwd, which is preventing us from adding unit tests in this area. This PR adds vs platform support for pfcwd and unit tests for basic functionality. What I did: defined pfcwd port and queue attributes for the vs platform and a vs platform detection lua script; added unit tests for basic functionality. How I verified it: the new unit test cases added have passed. nejo@nejo-linux:~/SONiC/sonic-swss/tests$ sudo pytest --dvsname=vs test_pfcwd.py ================================================= test session starts ================================================== platform linux -- Python 3.6.9, pytest-6.0.2, py-1.9.0, pluggy-0.13.1 rootdir: /home/nejo/SONiC/sonic-swss/tests plugins: flaky-3.7.0 collected 4 items test_pfcwd.py .... [100%] ============================================= 4 passed in 90.12s (0:01:30) ============================================= Signed-off-by: Neetha John <[email protected]>
1 parent b96ee54 commit 8fd6e48

File tree

4 files changed

+326
-1
lines changed

4 files changed

+326
-1
lines changed

orchagent/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dist_swss_DATA = \
1818
pfc_detect_barefoot.lua \
1919
pfc_detect_nephos.lua \
2020
pfc_detect_cisco-8000.lua \
21+
pfc_detect_vs.lua \
2122
pfc_restore.lua \
2223
pfc_restore_cisco-8000.lua \
2324
port_rates.lua \

orchagent/orchdaemon.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ bool OrchDaemon::init()
448448
CFG_PFC_WD_TABLE_NAME
449449
};
450450

451-
if (platform == MLNX_PLATFORM_SUBSTRING)
451+
if ((platform == MLNX_PLATFORM_SUBSTRING) || (platform == VS_PLATFORM_SUBSTRING))
452452
{
453453

454454
static const vector<sai_port_stat_t> portStatIds =

orchagent/pfc_detect_vs.lua

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
-- pfc_detect_vs.lua - PFC watchdog storm detection for the vs (virtual switch) platform.
--
-- KEYS    - queue IDs (FLEX COUNTER DB queue OIDs)
-- ARGV[1] - counters db index
-- ARGV[2] - counters table name
-- ARGV[3] - poll time interval (milliseconds)
--
-- Returns an (always empty) table; detection results are delivered via
-- PUBLISH on the 'PFC_WD_ACTION' channel, not via the return value.

local counters_db = ARGV[1]
local counters_table_name = ARGV[2]
-- Poll interval arrives in milliseconds; the PFC pause-duration counter below
-- is in microseconds, so scale once up front.
local poll_time = tonumber(ARGV[3]) * 1000

local rets = {}

redis.call('SELECT', counters_db)

-- Iterate through each queue (reverse order, matching the other platform scripts).
for i = #KEYS, 1, -1 do
    -- Hoist the two hash-key prefixes used by every HGET/HSET below.
    local queue_key = counters_table_name .. ':' .. KEYS[i]
    local is_deadlock = false
    local pfc_wd_status = redis.call('HGET', queue_key, 'PFC_WD_STATUS')
    local pfc_wd_action = redis.call('HGET', queue_key, 'PFC_WD_ACTION')

    local big_red_switch_mode = redis.call('HGET', queue_key, 'BIG_RED_SWITCH_MODE')
    if not big_red_switch_mode and (pfc_wd_status == 'operational' or pfc_wd_action == 'alert') then
        local detection_time = redis.call('HGET', queue_key, 'PFC_WD_DETECTION_TIME')
        if detection_time then
            detection_time = tonumber(detection_time)
            local time_left = redis.call('HGET', queue_key, 'PFC_WD_DETECTION_TIME_LEFT')
            if not time_left then
                -- First run for this queue: start a fresh detection window.
                time_left = detection_time
            else
                time_left = tonumber(time_left)
            end

            local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i])
            local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i])
            -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then
            -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding
            -- maps haven't been updated yet; skip it this cycle.
            if queue_index and port_id then
                local port_key = counters_table_name .. ':' .. port_id
                local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS'
                local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US'

                -- Get all counters.
                local occupancy_bytes = redis.call('HGET', queue_key, 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES')
                local packets = redis.call('HGET', queue_key, 'SAI_QUEUE_STAT_PACKETS')
                local pfc_rx_packets = redis.call('HGET', port_key, pfc_rx_pkt_key)
                local pfc_duration = redis.call('HGET', port_key, pfc_duration_key)

                if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then
                    occupancy_bytes = tonumber(occupancy_bytes)
                    packets = tonumber(packets)
                    pfc_rx_packets = tonumber(pfc_rx_packets)
                    pfc_duration = tonumber(pfc_duration)

                    local packets_last = redis.call('HGET', queue_key, 'SAI_QUEUE_STAT_PACKETS_last')
                    local pfc_rx_packets_last = redis.call('HGET', port_key, pfc_rx_pkt_key .. '_last')
                    local pfc_duration_last = redis.call('HGET', port_key, pfc_duration_key .. '_last')
                    -- DEBUG_STORM is intentionally honored on the vs platform so
                    -- unit tests can simulate a storm by setting it to "enabled".
                    local debug_storm = redis.call('HGET', queue_key, 'DEBUG_STORM')

                    -- If this is not a first run, then we have last values available.
                    if packets_last and pfc_rx_packets_last and pfc_duration_last then
                        packets_last = tonumber(packets_last)
                        pfc_rx_packets_last = tonumber(pfc_rx_packets_last)
                        pfc_duration_last = tonumber(pfc_duration_last)
                        -- Storm heuristic: pause asserted for more than 80% of the poll window.
                        local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8)

                        -- Check actual condition of queue being in PFC storm:
                        -- either data is stuck in a non-empty queue while PFC frames
                        -- keep arriving, or an empty queue passed no packets while
                        -- pause was asserted for most of the window (or the test
                        -- hook DEBUG_STORM forces it).
                        if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or
                            (debug_storm == "enabled") or
                            (occupancy_bytes == 0 and packets - packets_last == 0 and storm_condition) then
                            if time_left <= poll_time then
                                -- Detection window expired while storming: report it.
                                redis.call('HDEL', port_key, pfc_rx_pkt_key .. '_last')
                                redis.call('HDEL', port_key, pfc_duration_key .. '_last')
                                redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]')
                                is_deadlock = true
                                time_left = detection_time
                            else
                                time_left = time_left - poll_time
                            end
                        else
                            -- Storm cleared: if the queue was previously put in
                            -- alert state, ask for restoration.
                            if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then
                                redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]')
                            end
                            time_left = detection_time
                        end
                    end

                    -- Save values for next run.
                    redis.call('HSET', queue_key, 'SAI_QUEUE_STAT_PACKETS_last', packets)
                    redis.call('HSET', queue_key, 'PFC_WD_DETECTION_TIME_LEFT', time_left)
                    if is_deadlock == false then
                        -- On deadlock the port-level '_last' keys were just deleted;
                        -- only refresh them when no storm was reported.
                        redis.call('HSET', port_key, pfc_rx_pkt_key .. '_last', pfc_rx_packets)
                        redis.call('HSET', port_key, pfc_duration_key .. '_last', pfc_duration)
                    end
                end
            end
        end
    end
end

return rets

tests/test_pfcwd.py

+216
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,222 @@ def test_PfcWdAclCreationDeletion(self, dvs, dvs_acl, testlog):
7777

7878
finally:
7979
dvs_acl.remove_acl_table(PFCWD_TABLE_NAME)
80+
81+
82+
class TestPfcwdFunc(object):
    """Functional tests for PFC watchdog storm detection and restoration
    on the vs platform, driven via the DEBUG_STORM hook in COUNTERS DB."""

    @pytest.fixture
    def setup_teardown_test(self, dvs):
        """Per-test fixture: bring up test ports, enable PFCWD/QUEUE flex
        counters, cache port/queue OIDs, and undo everything afterwards."""
        self.get_db_handle(dvs)

        self.test_ports = ["Ethernet0"]

        self.setup_test(dvs)
        self.get_port_oids()
        self.get_queue_oids()

        yield

        self.teardown_test(dvs)

    def setup_test(self, dvs):
        # get original cable len for test ports so teardown can restore it
        fvs = self.config_db.get_entry("CABLE_LENGTH", "AZURE")
        self.orig_cable_len = dict()
        for port in self.test_ports:
            self.orig_cable_len[port] = fvs[port]
            # set cable len to non zero value. if port is down, default cable len is 0
            self.set_cable_len(port, "5m")
            # startup port
            dvs.runcmd("config interface startup {}".format(port))

        # enable pfcwd
        self.set_flex_counter_status("PFCWD", "enable")
        # enable queue so that queue oids are generated
        self.set_flex_counter_status("QUEUE", "enable")

    def teardown_test(self, dvs):
        # disable pfcwd
        self.set_flex_counter_status("PFCWD", "disable")
        # disable queue
        self.set_flex_counter_status("QUEUE", "disable")

        for port in self.test_ports:
            if self.orig_cable_len:
                self.set_cable_len(port, self.orig_cable_len[port])
            # shutdown port
            dvs.runcmd("config interface shutdown {}".format(port))

    def get_db_handle(self, dvs):
        """Cache handles to the four DBs the tests touch."""
        self.app_db = dvs.get_app_db()
        self.asic_db = dvs.get_asic_db()
        self.config_db = dvs.get_config_db()
        self.counters_db = dvs.get_counters_db()

    def set_flex_counter_status(self, key, state):
        """Set FLEX_COUNTER_TABLE|<key> FLEX_COUNTER_STATUS to 'enable'/'disable'."""
        fvs = {'FLEX_COUNTER_STATUS': state}
        self.config_db.update_entry("FLEX_COUNTER_TABLE", key, fvs)
        # give orchagent a moment to react to the flex counter change
        time.sleep(1)

    def get_queue_oids(self):
        # maps "<port>:<queue>" -> queue OID
        self.queue_oids = self.counters_db.get_entry("COUNTERS_QUEUE_NAME_MAP", "")

    def get_port_oids(self):
        # maps "<port>" -> port OID
        self.port_oids = self.counters_db.get_entry("COUNTERS_PORT_NAME_MAP", "")

    def _get_bitmask(self, queues):
        """Return the PFC enable bitmask for the given queue indices as a
        decimal string (empty/None input yields "0")."""
        mask = 0
        if queues is not None:
            for queue in queues:
                mask = mask | 1 << queue

        return str(mask)

    def set_ports_pfc(self, status='enable', pfc_queues=(3, 4)):
        """Enable (create PORT_QOS_MAP with pfc_enable) or disable (delete the
        entry) PFC on the configured queues of every test port.

        Note: default is a tuple, not a list, to avoid the shared mutable
        default-argument pitfall."""
        for port in self.test_ports:
            if 'enable' in status:
                fvs = {'pfc_enable': ",".join([str(q) for q in pfc_queues])}
                self.config_db.create_entry("PORT_QOS_MAP", port, fvs)
            else:
                self.config_db.delete_entry("PORT_QOS_MAP", port)

    def set_cable_len(self, port_name, cable_len):
        # Must write the same table setup_test reads ("CABLE_LENGTH");
        # writing to "CABLE_LEN" here would make setup/restore a no-op.
        fvs = {port_name: cable_len}
        self.config_db.update_entry("CABLE_LENGTH", "AZURE", fvs)

    def start_pfcwd_on_ports(self, poll_interval="200", detection_time="200", restoration_time="200", action="drop"):
        """Configure the global PFCWD poll interval and per-port watchdog
        parameters (times in milliseconds)."""
        pfcwd_info = {"POLL_INTERVAL": poll_interval}
        self.config_db.update_entry("PFC_WD", "GLOBAL", pfcwd_info)

        pfcwd_info = {"action": action,
                      "detection_time": detection_time,
                      "restoration_time": restoration_time
                     }
        for port in self.test_ports:
            self.config_db.update_entry("PFC_WD", port, pfcwd_info)

    def stop_pfcwd_on_ports(self):
        """Remove the per-port PFCWD configuration."""
        for port in self.test_ports:
            self.config_db.delete_entry("PFC_WD", port)

    def verify_ports_pfc(self, queues=None):
        """Assert ASIC DB shows the PFC priority bitmask for the given queues
        (None -> mask "0", i.e. PFC fully disabled)."""
        mask = self._get_bitmask(queues)
        fvs = {"SAI_PORT_ATTR_PRIORITY_FLOW_CONTROL": mask}
        for port in self.test_ports:
            self.asic_db.wait_for_field_match("ASIC_STATE:SAI_OBJECT_TYPE_PORT", self.port_oids[port], fvs)

    def verify_pfcwd_state(self, queues, state="stormed"):
        """Assert PFC_WD_STATUS for each port/queue reaches the given state."""
        fvs = {"PFC_WD_STATUS": state}
        for port in self.test_ports:
            for queue in queues:
                queue_name = port + ":" + str(queue)
                self.counters_db.wait_for_field_match("COUNTERS", self.queue_oids[queue_name], fvs)

    def verify_pfcwd_counters(self, queues, restore="0"):
        """Assert one storm was detected and `restore` restorations recorded."""
        fvs = {"PFC_WD_QUEUE_STATS_DEADLOCK_DETECTED": "1",
               "PFC_WD_QUEUE_STATS_DEADLOCK_RESTORED": restore
              }
        for port in self.test_ports:
            for queue in queues:
                queue_name = port + ":" + str(queue)
                self.counters_db.wait_for_field_match("COUNTERS", self.queue_oids[queue_name], fvs)

    def reset_pfcwd_counters(self, queues):
        """Zero the detect/restore stats so tests don't leak state into each other."""
        fvs = {"PFC_WD_QUEUE_STATS_DEADLOCK_DETECTED": "0",
               "PFC_WD_QUEUE_STATS_DEADLOCK_RESTORED": "0"
              }
        for port in self.test_ports:
            for queue in queues:
                queue_name = port + ":" + str(queue)
                self.counters_db.update_entry("COUNTERS", self.queue_oids[queue_name], fvs)

    def set_storm_state(self, queues, state="enabled"):
        """Toggle the DEBUG_STORM hook that pfc_detect_vs.lua reads to
        simulate a PFC storm on the given queues."""
        fvs = {"DEBUG_STORM": state}
        for port in self.test_ports:
            for queue in queues:
                queue_name = port + ":" + str(queue)
                self.counters_db.update_entry("COUNTERS", self.queue_oids[queue_name], fvs)

    def test_pfcwd_single_queue(self, dvs, setup_teardown_test):
        """Storm one of two PFC-enabled queues; watchdog must disable only that
        queue, record the event, and restore it once the storm clears."""
        try:
            # enable PFC on queues
            test_queues = [3, 4]
            self.set_ports_pfc(pfc_queues=test_queues)

            # verify in asic db
            self.verify_ports_pfc(test_queues)

            # start pfcwd
            self.start_pfcwd_on_ports()

            # start pfc storm
            storm_queue = [3]
            self.set_storm_state(storm_queue)

            # verify pfcwd is triggered
            self.verify_pfcwd_state(storm_queue)

            # verify pfcwd counters
            self.verify_pfcwd_counters(storm_queue)

            # verify if queue is disabled (only queue 4 remains in the mask)
            self.verify_ports_pfc(queues=[4])

            # stop storm
            self.set_storm_state(storm_queue, state="disabled")

            # verify pfcwd state is restored
            self.verify_pfcwd_state(storm_queue, state="operational")

            # verify pfcwd counters
            self.verify_pfcwd_counters(storm_queue, restore="1")

            # verify if queue is enabled
            self.verify_ports_pfc(test_queues)

        finally:
            self.reset_pfcwd_counters(storm_queue)
            self.stop_pfcwd_on_ports()

    def test_pfcwd_multi_queue(self, dvs, setup_teardown_test):
        """Storm both PFC-enabled queues; watchdog must disable PFC entirely
        (mask 0), record the events, and restore both once the storm clears."""
        try:
            # enable PFC on queues
            test_queues = [3, 4]
            self.set_ports_pfc(pfc_queues=test_queues)

            # verify in asic db
            self.verify_ports_pfc(test_queues)

            # start pfcwd
            self.start_pfcwd_on_ports()

            # start pfc storm
            self.set_storm_state(test_queues)

            # verify pfcwd is triggered
            self.verify_pfcwd_state(test_queues)

            # verify pfcwd counters
            self.verify_pfcwd_counters(test_queues)

            # verify if queue is disabled. Expected mask is 0
            self.verify_ports_pfc()

            # stop storm
            self.set_storm_state(test_queues, state="disabled")

            # verify pfcwd state is restored
            self.verify_pfcwd_state(test_queues, state="operational")

            # verify pfcwd counters
            self.verify_pfcwd_counters(test_queues, restore="1")

            # verify if queue is enabled
            self.verify_ports_pfc(test_queues)

        finally:
            self.reset_pfcwd_counters(test_queues)
            self.stop_pfcwd_on_ports()
295+
80296
#
81297
# Add Dummy always-pass test at end as workaround
82298
# for issue when Flaky fail on final test it invokes module tear-down before retrying

0 commit comments

Comments
 (0)