Skip to content

Commit e2f0f2c

Browse files
vaibhavhdyxieca
authored and committed
LAG keepalive script to reduce lacp session wait during warm-reboot (#2806)
A new mechanism is added here to reduce the LAG flap issue during hitless upgrades. Problem being solved: During warm upgrades the T0 goes down, and with that the wait time for the LACP session starts. If the wait time to refresh the LACP session is > 90s, the T1 initiates LAG teardown, and as a result dataplane impact is seen. This script makes sure that LACPDUs are sent continuously in the going-down path. How time is saved with this mechanism: Previously, the LACP session wait period started when the teamd container went down. Now, the LACP session wait period starts when kexec is issued in the current kernel and the new kernel boots up. Implementation: When warm-reboot starts, capture the LACPDUs sent from all LAG member ports. Allow 60s of prep + collection time for this. Then start sending LACPDUs at ~1s intervals. The last LACPDU is sent after all containers are down and kexec is issued. Results: Tested this on different platforms and images. Some results for time saved: BRCM: 201811 -> 202012 --- 18s; BRCM: 202012 -> 202012 --- 20s; MLNX: 201911 -> 202205 --- 10s; MLNX: 202205 -> 202205 --- 10s.
1 parent 3053371 commit e2f0f2c

File tree

3 files changed

+111
-0
lines changed

3 files changed

+111
-0
lines changed

scripts/fast-reboot

+8
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ STRICT=no
1717
REBOOT_METHOD="/sbin/kexec -e"
1818
ASSISTANT_IP_LIST=""
1919
ASSISTANT_SCRIPT="/usr/local/bin/neighbor_advertiser"
20+
LAG_KEEPALIVE_SCRIPT="/usr/local/bin/lag_keepalive.py"
2021
WATCHDOG_UTIL="/usr/local/bin/watchdogutil"
2122
DEVPATH="/usr/share/sonic/device"
2223
PLATFORM=$(sonic-cfggen -H -v DEVICE_METADATA.localhost.platform)
@@ -645,6 +646,13 @@ fi
645646
# disable trap-handlers which were set before
646647
trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM
647648
649+
# start sending LACPDUs to keep the LAGs refreshed
650+
# this is a non-blocking call, and the process will die in 300s
651+
debug "Starting lag_keepalive to send LACPDUs ..."
652+
timeout 300 python ${LAG_KEEPALIVE_SCRIPT} &
653+
# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s)
654+
sleep 60
655+
648656
if [ -x ${LOG_SSD_HEALTH} ]; then
649657
debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..."
650658
${LOG_SSD_HEALTH}

scripts/lag_keepalive.py

+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
#!/usr/bin/env python3
2+
3+
from scapy.config import conf
4+
conf.ipv6_enabled = False
5+
from scapy.all import sendp, sniff
6+
from swsscommon.swsscommon import ConfigDBConnector
7+
import time, threading, traceback
8+
import syslog
9+
10+
SYSLOG_ID = 'lag_keepalive'
11+
12+
13+
def log_info(msg):
    """Write *msg* to syslog at INFO priority.

    The syslog connection is opened with the ``SYSLOG_ID`` ident before the
    message is written and closed again right after it.
    """
    priority = syslog.LOG_INFO
    syslog.openlog(SYSLOG_ID)
    syslog.syslog(priority, msg)
    syslog.closelog()
17+
18+
19+
def log_error(msg):
    """Write *msg* to syslog at ERR priority.

    The syslog connection is opened with the ``SYSLOG_ID`` ident before the
    message is written and closed again right after it.
    """
    priority = syslog.LOG_ERR
    syslog.openlog(SYSLOG_ID)
    syslog.syslog(priority, msg)
    syslog.closelog()
23+
24+
25+
def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet):
    """Capture one LACPDU sent by this device on a LAG member port.

    Sniffs ``lag_member`` for at most 30 seconds, waiting for a single frame
    that matches ``ether proto 0x8809`` with ``device_mac`` as its source
    address. The result is reported through ``lag_member_to_packet`` rather
    than a return value, so the function can be used as a thread target.

    Args:
        device_mac: source MAC address used in the capture filter.
        lag_member: name of the interface to sniff on.
        lag_member_to_packet: dict updated in place. On success it maps
            ``lag_member`` to the captured packet list; it is left untouched
            when nothing was captured before the timeout.
    """
    capture_filter = "ether proto 0x8809 and ether src {}".format(device_mac)
    sniffed_packet = sniff(iface=lag_member, filter=capture_filter,
                           count=1, timeout=30)

    # sniff() returns an empty packet list when the timeout expires. Recording
    # that would make the member look "captured" to the caller and leave the
    # keepalive loop sending nothing for it, so only store real captures.
    if len(sniffed_packet) == 0:
        return
    lag_member_to_packet[lag_member] = sniffed_packet
30+
31+
32+
def get_lacpdu_per_lag_member():
    """Start LACPDU capture on every member of an operationally-up LAG.

    Reads the ``LAG_MEMBER_TABLE`` keys from APPL_DB and the device MAC from
    CONFIG_DB (``DEVICE_METADATA|localhost``). For each member whose LAG
    reports ``oper_status == "up"`` a ``sniff_lacpdu`` thread is started, so
    that all members are sniffed at the same time.

    Returns:
        tuple: ``(active_lag_members, lag_member_to_packet)`` where
        ``active_lag_members`` is the list of member port names a sniffer was
        started for, and ``lag_member_to_packet`` is the dict the sniffer
        threads fill in, keyed by member port name.
    """
    appl_db = ConfigDBConnector()
    appl_db.db_connect('APPL_DB')
    lag_member_keys = appl_db.get_keys('LAG_MEMBER_TABLE')

    config_db = ConfigDBConnector()
    config_db.db_connect('CONFIG_DB')
    device_mac = config_db.get(config_db.CONFIG_DB, "DEVICE_METADATA|localhost", "mac")

    active_lag_members = []
    lag_member_to_packet = {}
    sniffer_threads = []
    for lag_entry in lag_member_keys:
        # presumably each key is a (LAG name, member port) pair -- this matches
        # how the entry is indexed below; confirm against ConfigDBConnector
        lag_name = str(lag_entry[0])
        oper_status = appl_db.get(appl_db.APPL_DB, "LAG_TABLE:{}".format(lag_name), "oper_status")
        if oper_status != "up":
            # only apply the workaround for active lags
            continue

        lag_member = str(lag_entry[1])
        active_lag_members.append(lag_member)
        # use threading to capture lacpdus from several lag members simultaneously
        sniffer_thread = threading.Thread(target=sniff_lacpdu,
                                          args=(device_mac, lag_member, lag_member_to_packet))
        sniffer_thread.start()
        sniffer_threads.append(sniffer_thread)

    # sniff for lacpdu should finish in <= 30s. sniff timeout is also set to 30s
    for sniffer_thread in sniffer_threads:
        sniffer_thread.join(timeout=30)

    return active_lag_members, lag_member_to_packet
61+
62+
63+
def lag_keepalive(lag_member_to_packet):
    """Resend the captured LACPDUs on their member ports, forever.

    Each pass sends every stored packet out of the interface it is keyed by,
    logs the pass, and then sleeps for one second. This function never
    returns; the caller is expected to kill the process.

    Args:
        lag_member_to_packet: dict mapping a member interface name to the
            packet(s) to send on it.
    """
    while True:
        for port, lacpdu in lag_member_to_packet.items():
            try:
                sendp(lacpdu, iface=port, verbose=False)
            except Exception:
                # a failure on one port must not stop the remaining ports;
                # log it and carry on with the next one
                log_error("Failed to send LACPDU packet from interface {} with error: {}".format(
                    port, traceback.format_exc()))
        log_info("sent LACPDU packets via {}".format(lag_member_to_packet.keys()))
        time.sleep(1)
76+
77+
78+
def main():
    """Capture LACPDUs for the active LAG members, then keep resending them.

    The capture step is retried until it completes without raising. If any
    packets were captured, control passes to ``lag_keepalive``, which loops
    forever; otherwise the function returns.
    """
    # pause between capture retries so that a persistent failure does not
    # spin the CPU and flood syslog
    retry_delay_secs = 1

    while True:
        try:
            active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member()
            if len(active_lag_members) != len(lag_member_to_packet):
                log_error("Failed to capture LACPDU packets for some lag members. " +
                          "Active lag members: {}. LACPDUs captured for: {}".format(
                              active_lag_members, lag_member_to_packet.keys()))

            log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys()))
        except Exception:
            log_error("Failed to get LAG members and LACPDUs with error: {}".format(
                traceback.format_exc()))
            # keep attempting until sniffed packets are ready
            time.sleep(retry_delay_secs)
            continue
        # if no exceptions are thrown, break from loop as LACPDUs are ready to be sent
        break

    if lag_member_to_packet:
        # start an infinite loop to keep sending lacpdus from lag member ports
        lag_keepalive(lag_member_to_packet)
100+
101+
# Run the keepalive only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
'scripts/intfutil',
114114
'scripts/intfstat',
115115
'scripts/ipintutil',
116+
'scripts/lag_keepalive.py',
116117
'scripts/lldpshow',
117118
'scripts/log_ssd_health',
118119
'scripts/mellanox_buffer_migrator.py',

0 commit comments

Comments
 (0)