Skip to content

Commit 1aac5e2

Browse files
authored
[VoQ chassis] : Script to debug packet drops (#3536)
* Script to debug packet loss on VoQ chassis * Updated setup.py with new script * Fixed the order of the script * Fixed error in pre-check
1 parent ad5b0c0 commit 1aac5e2

File tree

2 files changed

+372
-0
lines changed

2 files changed

+372
-0
lines changed
+371
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
#!/usr/bin/bash
# Tunables: a value already present in the environment wins; otherwise the
# default given here is assigned.
: "${sleep_period:=0}"
: "${maxiter:=25}" # all but 4 iterations will be polling Egress drops
: "${log:=/dev/stdout}"
time_format="%D %T.%6N"
delim="END"

# Optional checks; the script tests these against 0/1 before running the
# corresponding dump.
: "${ing_check_mc:=1}"
: "${ing_check_macsec:=1}"
: "${egr_check_mc:=1}"
: "${egr_check_pmf_hit_bits:=1}"
: "${egr_diag_counter_g:=1}"

# Cores and asics that are polled, plus per-(asic, core) state slots.
cores=("0" "1")
asics=("0" "1")
queue_pair_mask_a=(0 0 0 0)
dsp_map_a=(0 0 0 0)
19+
20+
# Append "<current time> <logmsg>" as one line to the log.
# Globals:
#   time_format (read)    - date(1) format string
#   logmsg      (read)    - message to record
#   log         (read)    - destination file
#   curr_time   (written) - formatted time of this call
timestamp() {
  curr_time=$(date +"$time_format")
  # Quote the destination so a log path containing whitespace is not
  # word-split into an ambiguous redirect.
  printf '%s\n' "$curr_time $logmsg" >> "$log"
}
24+
25+
# Decode a PQP discard-reason bitmap: append one line per set bit (bits 0-15)
# to the log, or "none" when no bit is set.
# Globals:
#   disc_reasons (read/written) - hex digits without a 0x prefix on entry;
#                                 holds the decimal value on return
#   log          (read)         - destination file
# Returns: 0 after decoding, 1 when disc_reasons is not a hex number.
print_pqp_reasons() {
  # Index in this table == bit position in the bitmap.
  local -a reasons=(
    "0- Total PDs threshold violated"
    "1- Total PDs UC pool size threshold violated"
    "2- Per port UC PDs threshold"
    "3- Per queue UC PDs thresholds"
    "4- Per port UC DBs threshold"
    "5- Per queue UC DBs threshold"
    "6- Per queue disable bit"
    "7- Undefined"
    "8- Total PDs MC pool size threshold"
    "9- Per interface PDs threshold"
    "10- MC SP threshold"
    "11- per MC-TC threshold"
    "12- MC PDs per port threshold"
    "13- MC PDs per queue threshold"
    "14- MC per port size (bytes) threshold"
    "15- MC per queue size(bytes) thresholds"
  )
  local bit

  # Reject anything that is not plain hex before it reaches base-16
  # arithmetic, where it would raise an expansion error.
  if [[ ! "$disc_reasons" =~ ^[0-9A-Fa-f]+$ ]]; then
    printf 'print_pqp_reasons: invalid discard reasons "%s"\n' "$disc_reasons" >&2
    return 1
  fi

  disc_reasons=$((16#${disc_reasons}))
  if ((disc_reasons == 0)); then
    echo "none" >> "$log"
  fi
  for bit in "${!reasons[@]}"; do
    if ((disc_reasons & (1 << bit))); then
      printf '%s\n' "${reasons[bit]}" >> "$log"
    fi
  done
  return 0
}
45+
# Decode an RQP discard-reason bitmap: append one line per set bit (bits 0-10)
# to the log, or "none" when no bit is set.
# Globals:
#   disc_reasons (read/written) - hex digits without a 0x prefix on entry;
#                                 holds the decimal value on return
#   log          (read)         - destination file
# Returns: 0 after decoding, 1 when disc_reasons is not a hex number.
print_rqp_reasons() {
  # Index in this table == bit position in the bitmap.
  local -a reasons=(
    "0- Total DBs threshold violated"
    "1- Total UC DBs pool size threshold violated"
    "2- UC packet discarded in EMR because UC FIFO is full"
    "3- MC HP packet discarded in EMR because MC FIFO is full"
    "4- MC LP packet discarded in EMR because MC FIFO is full"
    "5- Total MC DBs pool size threshold violated"
    "6- Packet-DP is not eligible to take from shared DBs resources"
    "7- USP DBs threshold violated"
    "8- Discrete-Partitioning method: MC-TC DBs threshold violated"
    "9- Strict-priority method: MC-TC mapped to SP0 DBs threshold violated"
    "10- Strict-Priority method: MC-TC mapped to SP1 DBs threshold violated"
  )
  local bit

  # Reject anything that is not plain hex before it reaches base-16
  # arithmetic, where it would raise an expansion error.
  if [[ ! "$disc_reasons" =~ ^[0-9A-Fa-f]+$ ]]; then
    printf 'print_rqp_reasons: invalid discard reasons "%s"\n' "$disc_reasons" >&2
    return 1
  fi

  disc_reasons=$((16#${disc_reasons}))
  if ((disc_reasons == 0)); then
    echo "none" >> "$log"
  fi
  for bit in "${!reasons[@]}"; do
    if ((disc_reasons & (1 << bit))); then
      printf '%s\n' "${reasons[bit]}" >> "$log"
    fi
  done
  return 0
}
60+
61+
# Whenever the port_disabled mask changes, print the up/down state of the ports
62+
# (according to the queue-pair mask and DSP port mapping, which is what matters )
63+
64+
# Report the UP/DOWN state of the DSP ports of the current asic/core whenever
# the egress queue-pair disabled mask differs from the last one recorded.
# Globals:
#   asic, core, index (read)         - unit being polled and its state slot
#   queue_pair_mask_a (read/written) - last mask seen, per index
#   dsp_map_a         (written)      - raw SCH_DSP_2_PORT_MAP_DSPP dump, per index
#   queue_pair_mask, logmsg (written)
#   log               (read)         - destination file
# Returns: 0
check_new_port_state() {
  local last_mask=${queue_pair_mask_a[$index]}
  local amt map_lines pos byte hr_num hr_field entry dsp_port state

  # The mask is taken from the second line of the command output.
  queue_pair_mask=$(bcmcmd -n "$asic" "g hex ECGM_CGM_QUEUE_PAIR_DISABLED.ECGM${core}" \
    | head -n 2 | tail -n 1)
  if [[ "$queue_pair_mask" == "$last_mask" ]]; then
    return 0
  fi
  queue_pair_mask_a[$index]=$queue_pair_mask
  logmsg="EGRESS_QPAIR asic $asic core $core new disabled mask: $queue_pair_mask"
  timestamp

  amt=$((255 - core))
  dsp_map_a[$index]=$(bcmcmd -n "$asic" "d SCH_DSP_2_PORT_MAP_DSPP.SCH${core} $core $amt")

  # Flatten the dump and re-split it on carriage returns once, instead of on
  # every loop iteration. Word splitting of the unquoted expansion is intended.
  # shellcheck disable=SC2086
  map_lines=$(echo ${dsp_map_a[$index]} | sed -e "s/\r/\r\n/g")

  hr_num=0
  # Walk the mask two hex digits at a time from its tail; every step covers
  # eight HR numbers. Offset -3 skips the final character of the line.
  for pos in {-3..-129..-2}; do # todo
    byte=${queue_pair_mask:pos:2}
    if ((hr_num <= 8)); then
      hr_field="HR_NUM=${hr_num}"
    else
      printf -v hr_field 'HR_NUM=0x%x' "$hr_num"
    fi
    hr_num=$((hr_num + 8))

    # Require a non-hex character (or end of line) after the value so that
    # HR_NUM=0 cannot match HR_NUM=0x10, nor HR_NUM=0x10 match HR_NUM=0x100.
    entry=$(grep -E -m 1 -- "${hr_field}"'([^0-9A-Fa-fx]|$)' <<< "$map_lines") || continue
    dsp_port=$(grep -o '\[.*\]' <<< "$entry" | tr -dc '[:alnum:]')

    if [[ "$byte" == "ff" ]]; then
      state="DOWN"
    else
      state="UP"
    fi
    # The dump text is data, never part of the printf format string.
    printf '%s %3d %s\n' "$state" "$dsp_port" "$entry" >> "$log"
  done
  echo >> "$log"
}
101+
102+
# Read ECGM_RQP_DISCARD_REASONS for the current asic/core and, when the read
# yields a 0x-prefixed value, log it and decode the reason bits.
# Globals:
#   asic, core, index (read)
#   rqp_disc, logmsg, disc_reasons (written)
# Returns: 0 when the read is skipped, otherwise the status of
#          print_rqp_reasons.
decode_last_rqp_drop() {
  # The value is taken from the second-to-last line of the command output.
  rqp_disc=$(bcmcmd -n "$asic" "g hex ECGM_RQP_DISCARD_REASONS.ECGM${core}" \
    | head -n -1 | tail -n 1)
  if [[ "$rqp_disc" != 0x* ]]; then
    return 0 # empty (0) or a failed read
  fi
  logmsg="EGRESS_DROP RQP_DISCARD_REASONS asic $asic core $core index $index: $rqp_disc"
  timestamp
  # Three hex digits that precede the final character of the line.
  disc_reasons=${rqp_disc: -4:3}
  print_rqp_reasons
}
113+
114+
# Read ECGM_PQP_DISCARD_REASONS for the current asic/core and, when the read
# yields a 0x-prefixed value, log the dropping queue, the DSP port mapped to
# it and both discard-reason bitmaps carried in the register.
# Globals:
#   asic, core, index (read)
#   dsp_map_a         (read) - raw DSP map dump, per index
#   log               (read) - destination file
#   pqp_disc, logmsg, disc_reasons (written)
# Returns: 0
decode_last_pqp_drop() {
  local last_reason drop_cmd queue hr_num_hex hr_field entry dsp_port

  # The value is taken from the second-to-last line of the command output.
  pqp_disc=$(bcmcmd -n "$asic" "g hex ECGM_PQP_DISCARD_REASONS.ECGM${core}" \
    | head -n -1 | tail -n 1)
  if [[ "$pqp_disc" != 0x* ]]; then
    return 0 # empty (0) or a failed read
  fi
  logmsg="EGRESS_DROP PQP_DISCARD_REASONS asic $asic core $core: $pqp_disc"
  timestamp
  check_new_port_state # in case the DSP map has changed

  # Fixed-width slices counted from the end of the line; the final character
  # of the line is skipped.
  disc_reasons=${pqp_disc: -5:4}
  last_reason=${pqp_disc: -9:4}
  drop_cmd=${pqp_disc: -19:10}
  queue=$((16#${drop_cmd: -8:3}))
  queue=$(((queue / 4) & 248))
  printf -v hr_num_hex '%02x' "$queue"

  # Search for the HR_NUM field itself, formatted the way
  # check_new_port_state formats it, and require a non-hex character after
  # the value. A bare "10" would also match table indices or 0x110.
  if ((queue <= 8)); then
    hr_field="HR_NUM=${queue}"
  else
    printf -v hr_field 'HR_NUM=0x%x' "$queue"
  fi
  # Word splitting of the unquoted expansion is intended: it flattens the dump
  # before it is re-split on carriage returns.
  # shellcheck disable=SC2086
  if entry=$(echo ${dsp_map_a[$index]} | sed -e "s/\r/\r\n/g" \
    | grep -E -m 1 -- "${hr_field}"'([^0-9A-Fa-fx]|$)'); then
    dsp_port=$(grep -o '\[.*\]' <<< "$entry" | tr -dc '[:alnum:]')
    echo "drop_reason 0x${disc_reasons} queue 0x${hr_num_hex} dsp port $dsp_port" >> "$log"
  else
    echo "drop_reason 0x${disc_reasons} queue 0x${hr_num_hex} dsp_port not_found" >> "$log"
  fi

  echo "pqp discard reasons (cumulative since last read):" >> "$log"
  print_pqp_reasons
  echo "pqp last packet discard reasons:" >> "$log"
  disc_reasons=$last_reason
  print_pqp_reasons
  echo >> "$log"
}
147+
148+
149+
# Have the bcm shell write a one-line cint script to /tmp/hitbits and run it;
# the script calls bcm_field_entry_hit_flush with
# BCM_FIELD_ENTRY_HIT_FLUSH_ALL. All command output is discarded.
# Globals:
#   asic (read)
clear_tcam_hit_bits() {
  local -r cint_filename="/tmp/hitbits"
  local -r cint=';print bcm_field_entry_hit_flush(0, BCM_FIELD_ENTRY_HIT_FLUSH_ALL, 0); exit;'
  # "$asic" is quoted so the command string always stays the third argument.
  bcmcmd -n "$asic" "log off; rm $cint_filename;log file=$cint_filename quiet=yes; echo '$cint';log off;cint $cint_filename" > /dev/null
}
154+
155+
# Append selected columns of the SAI_FG_TRAP and EPMF_Cascade dbal table dumps
# to the log (rows containing "CORE" only), then clear the TCAM hit bits.
# Globals:
#   asic (read)
#   log  (read) - destination file
dump_tcam_drop_action_hits() {
  echo "SAI_FG_TRAP hits:" >> "$log"
  bcmcmd -n "$asic" "dbal table dump Table=SAI_FG_TRAP" \
    | awk -F'|' '/CORE/ {print $2,$34}' >> "$log"
  echo "EPMF_Cascade hits:" >> "$log"
  # entries 51,52,53,54,55,56 have drop action
  bcmcmd -n "$asic" "dbal table dump Table=EPMF_Cascade" \
    | awk -F'|' '/CORE/ {print $2,$10}' >> "$log"
  clear_tcam_hit_bits
}
163+
164+
# Pass stdin through without its first and last line. NOTE(review): these are
# presumably the command echo and the shell prompt of bcmcmd - confirm.
_egr_payload() {
  head -n -1 | tail -n +2
}

# Print, in decimal, the value of hex counter register $1 on the current asic.
_egr_hex_counter() {
  local raw
  raw=$(bcmcmd -n "$asic" "g hex $1" | _egr_payload | tr -dc '[:alnum:]')
  printf '%d' "${raw:-0}"
}

# When counter value $1 is non-zero, log message $2 with a timestamp and
# record the hit.
_egr_report() {
  if [ "$1" -ne 0 ]; then
    logmsg=$2
    timestamp
    hit=1
  fi
}

# Append the payload of bcm command $1 to the log.
_egr_log_cmd() {
  bcmcmd -n "$asic" "$1" | _egr_payload >> "$log"
}

# Issue "s <register> -1" for register $1 and discard the command output.
_egr_reset_reg() {
  bcmcmd -n "$asic" "s $1 -1" > /dev/null
}

# Poll the egress discard counters of the current asic/core. When any of them
# is non-zero, log it and append the drop-reason decode, the interrupt
# registers (re-armed after being read) and the optional diagnostics to the
# log, terminated by the delimiter line.
# Globals:
#   asic, core (read)
#   egr_check_mc, egr_check_pmf_hit_bits, egr_diag_counter_g (read)
#   log, delim (read)
#   hit (written) - 1 when at least one counter was non-zero
#   pqp_uc_discard, erpp_discard, rqp_debug_counters, sop_discard_uc,
#   prp_discard_uc, dbf_err_cnt, sop_discard_mc, prp_discard_mc,
#   pqp_mc_discard, logmsg (written)
# Returns: 0
check_egress_drops() {
  local -a fields

  hit=0
  pqp_uc_discard=$(_egr_hex_counter "PQP_PQP_DISCARD_UNICAST_PACKET_COUNTER.PQP${core}")
  erpp_discard=$(_egr_hex_counter "PQP_ERPP_DISCARDED_PACKET_COUNTER.PQP${core}")
  # Flatten the "name=value," list into whitespace-separated tokens.
  rqp_debug_counters=$(bcmcmd -n "$asic" "g RQP_PRP_DEBUG_COUNTERS.RQP${core}" \
    | _egr_payload | sed -e 's/[=,]/ /g' | tr -dc "[:alnum:] =_")
  # fields[n-1] holds what the awk '{print $n}' calls used to extract.
  read -r -a fields <<< "$rqp_debug_counters"

  _egr_report "$pqp_uc_discard" \
    "EGRESS_DROP UC_DROP on ASIC $asic CORE $core : PQP_DISCARD_UNICAST_PACKET_COUNTER = $pqp_uc_discard"
  _egr_report "$erpp_discard" \
    "EGRESS_DROP ERPP_DROP on ASIC $asic CORE $core : PQP_ERPP_DISCARDED_PACKET_COUNTER = $erpp_discard"

  sop_discard_uc=$(printf '%d' "${fields[3]:-0}")
  prp_discard_uc=$(printf '%d' "${fields[13]:-0}")
  dbf_err_cnt=$(printf '%d' "${fields[17]:-0}")

  _egr_report "$sop_discard_uc" \
    "EGRESS_DROP RQP_SOP_UC_DISCARD on ASIC $asic CORE $core : $sop_discard_uc"
  _egr_report "$prp_discard_uc" \
    "EGRESS_DROP RQP_PRP_UC_DISCARD on ASIC $asic CORE $core : $prp_discard_uc"
  _egr_report "$dbf_err_cnt" \
    "EGRESS_DROP RQP_DBF_ERR on ASIC $asic CORE $core : $dbf_err_cnt"

  if [ "$egr_check_mc" -ne 0 ]; then
    sop_discard_mc=$(printf '%d' "${fields[5]:-0}")
    prp_discard_mc=$(printf '%d' "${fields[15]:-0}")
    pqp_mc_discard=$(_egr_hex_counter "PQP_PQP_DISCARD_MULTICAST_PACKET_COUNTER.PQP${core}")

    _egr_report "$pqp_mc_discard" \
      "EGRESS_DROP MC_DROP ASIC $asic CORE $core : PQP_DISCARD_MULTICAST_PACKET_COUNTER = $pqp_mc_discard"
    _egr_report "$sop_discard_mc" \
      "EGRESS_DROP RQP_SOP_MC_DISCARD on ASIC $asic CORE $core : $sop_discard_mc"
    _egr_report "$prp_discard_mc" \
      "EGRESS_DROP RQP_PRP_MC_DISCARD on ASIC $asic CORE $core : $prp_discard_mc"
  fi

  if [ "$hit" -eq 0 ]; then
    return 0
  fi

  decode_last_pqp_drop
  # bcmcmd -n $asic "g chg ECGM_RQP_DISCARD_REASONS.ECGM${core}" | grep "=" >> $log
  decode_last_rqp_drop

  # Each group below logs its interrupt registers first and only then writes
  # -1 to them; the command order is unchanged from the original sequence.
  _egr_log_cmd "g chg PQP_INTERRUPT_REGISTER.PQP${core}"
  _egr_log_cmd "g chg RQP_INTERRUPT_REGISTER.RQP${core}"
  _egr_reset_reg "PQP_INTERRUPT_REGISTER.PQP${core}"
  _egr_reset_reg "RQP_INTERRUPT_REGISTER.RQP${core}"

  # Only the second-to-last output line is kept for this register.
  bcmcmd -n "$asic" "g chg RQP_PACKET_REASSEMBLY_INTERRUPT_REGISTER.RQP${core}" \
    | tail -n 2 | head -n -1 >> "$log"
  _egr_reset_reg "RQP_PACKET_REASSEMBLY_INTERRUPT_REGISTER.RQP${core}"

  _egr_log_cmd "g chg FDR_INTERRUPT_REGISTER.FDR${core}"
  # FDA0 block is shared by both cores
  _egr_log_cmd "g chg FDA_INTERRUPT_REGISTER.FDA0"
  _egr_reset_reg "FDR_INTERRUPT_REGISTER.FDR${core}"
  _egr_reset_reg "FDA_INTERRUPT_REGISTER.FDA0"

  _egr_log_cmd "g chg ERPP_INTERRUPT_REGISTER.ERPP${core}"
  _egr_log_cmd "g chg ERPP_ERPP_DISCARD_INTERRUPT_REGISTER.ERPP${core}"
  _egr_log_cmd "g chg ERPP_ERPP_DISCARD_INTERRUPT_REGISTER_2.ERPP${core}"
  _egr_reset_reg "ERPP_ERPP_DISCARD_INTERRUPT_REGISTER.ERPP${core}"
  _egr_reset_reg "ERPP_ERPP_DISCARD_INTERRUPT_REGISTER_2.ERPP${core}"
  _egr_reset_reg "ERPP_INTERRUPT_REGISTER.ERPP${core}"

  _egr_log_cmd "g chg ERPP_ERPP_DISCARDS_INTERRUPT_REGISTER_MASK.ERPP${core}"
  _egr_log_cmd "g chg ERPP_ERPP_DISCARDS_INTERRUPT_REGISTER_MASK_2.ERPP${core}"

  #bcmcmd -n $asic "g chg IPT_FLOW_CONTROL_DEBUG.IPT${core}" >> $log
  _egr_log_cmd "tm egr con"

  if [ "$egr_check_pmf_hit_bits" -eq 1 ]; then
    dump_tcam_drop_action_hits
  fi
  if [ "$egr_diag_counter_g" -eq 1 ]; then
    _egr_log_cmd "diag counter g nz core=${core}"
  fi
  echo "$delim" >> "$log"
  echo >> "$log"
}
274+
275+
# Append the IPPB FLP data-path trap and IPPE LLR trap registers of the
# current core to the log; each read is followed by "s <register> -1", whose
# output is logged as well. The first and last line of every command output
# are dropped.
# Globals:
#   asic, core (read)
#   log        (read) - destination file
dump_ingress_traps() {
  local cmd
  for cmd in \
    "g IPPB_DBG_FLP_DATA_PATH_TRAP.IPPB${core}" \
    "s IPPB_DBG_FLP_DATA_PATH_TRAP.IPPB${core} -1" \
    "g IPPE_DBG_LLR_TRAP_0.IPPE${core}" \
    "s IPPE_DBG_LLR_TRAP_0.IPPE${core} -1"; do
    bcmcmd -n "$asic" "$cmd" | head -n -1 | tail -n +2 >> "$log"
  done
}
281+
# Append the output of "sec stat show; sec stat clear" for the current asic
# to the log.
# Globals:
#   asic (read)
#   log  (read) - destination file
dump_macsec() {
  bcmcmd -n "$asic" "sec stat show; sec stat clear" >> "$log"
}
284+
285+
# Scratch file that receives the latest CGM reject-status read. It defaults
# to rjct_status.txt in the working directory and can be preset through the
# environment, like the other tunables.
rjct_filename=${rjct_filename:-rjct_status.txt}

# Read the changed fields of CGM_REJECT_STATUS_BITMAP for the current
# asic/core. When any field is reported, log it with a timestamp and append
# the ingress diagnostics to the log, terminated by the delimiter line.
# Globals:
#   asic, core (read)
#   ing_check_mc, ing_check_macsec (read)
#   rjct_filename (read) - scratch file, overwritten on every call
#   log, delim    (read)
#   hit, logmsg   (written)
# Returns: 0
check_ingress_drops() {
  local line

  hit=0
  # Keep only what follows the first ':' of every output line.
  bcmcmd -n "$asic" "getreg chg CGM_REJECT_STATUS_BITMAP.CGM${core}" \
    | awk '{split($0,a,":"); print a[2]}' > "$rjct_filename"
  while read -r line; do
    if [[ -z "$line" ]]; then
      continue
    fi
    # Only an empty field list "<>" without any comma counts as "no reject".
    if [[ "$line" == *,* || "$line" != *"<>"* ]]; then
      hit=1
    fi
  done < "$rjct_filename"

  if [[ "$hit" != 1 ]]; then
    return 0
  fi

  logmsg="INGRESS_DROP asic $asic core $core"
  timestamp
  cat "$rjct_filename" >> "$log"
  bcmcmd -n "$asic" "g CGM_MAX_VOQ_WORDS_QSIZE_TRACK.CGM${core}" | head -n -1 | tail -n +2 >> "$log"
  #bcmcmd -n $asic "g chg IPT_FLOW_CONTROL_DEBUG.IPT${core}"| head -n -1 | tail -n +2 >> $log
  bcmcmd -n "$asic" "tm ing cong core=$core" >> "$log"
  bcmcmd -n "$asic" "trap last info core=$core" >> "$log"
  bcmcmd -n "$asic" "pp vis ppi core=$core" >> "$log"
  bcmcmd -n "$asic" "pp vis fdt core=$core" >> "$log"
  bcmcmd -n "$asic" "pp vis ikleap core=$core" >> "$log"
  #bcmcmd -n $asic "pp vis last" >> $log
  if [ "$ing_check_mc" -eq 1 ]; then
    bcmcmd -n "$asic" "dbal table dump table=mcdb" >> "$log"
    bcmcmd -n "$asic" "g MTM_ING_MCDB_OFFSET" | head -n -1 | tail -n +2 >> "$log"
    bcmcmd -n "$asic" "g MTM_EGR_MCDB_OFFSET" | head -n -1 | tail -n +2 >> "$log"
  fi
  bcmcmd -n "$asic" "diag counter g nz core=${core}" >> "$log"
  echo "" >> "$log"
  dump_ingress_traps
  echo "" >> "$log"
  if [ "$ing_check_macsec" -eq 1 ]; then
    dump_macsec
  fi
  echo "$delim" >> "$log"
}
325+
326+
# clear stats
for asic in "${asics[@]}"; do
  bcmcmd -n "$asic" "sec stat clear; clear counter; clear interrupt all" > /dev/null
done

# Poll forever. Every (asic, core) pair owns one slot of iter_a holding its
# position inside a cycle of maxiter steps.
iter_a=(0 0 0 0)
while true; do
  for asic in "${asics[@]}"; do
    for core in "${cores[@]}"; do
      index=$((asic * 2 + core))
      iter=$((iter_a[index] + 1))

      # End of the cycle: rewind this slot, pause, and move on.
      if [ "$iter" -eq "$maxiter" ]; then
        iter_a[$index]=0
        sleep "$sleep_period"
        continue
      fi
      iter_a[$index]=$iter

      case "$iter" in
        1)
          # check for any change in pqp disabled port mask
          check_new_port_state
          ;;
        2)
          check_egress_drops
          ;;
        3)
          check_ingress_drops
          ;;
        4)
          decode_last_rqp_drop
          ;;
        *)
          # for majority of polling cycles, check the PQP drop reason and queue
          decode_last_pqp_drop
          ;;
      esac
    done
  done
done
371+

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@
124124
'scripts/dropstat',
125125
'scripts/dualtor_neighbor_check.py',
126126
'scripts/dump_nat_entries.py',
127+
'scripts/debug_voq_chassis_packet_drops.sh',
127128
'scripts/ecnconfig',
128129
'scripts/fabricstat',
129130
'scripts/fanshow',

0 commit comments

Comments
 (0)