Skip to content

Commit 9a18155

Browse files
authored
Optimize lag_keepalive by crafting the LACPDU packet ourselves (sonic-net#3170)
* Optimize lag_keepalive by crafting the LACPDU packet ourselves Instead of waiting for a LACPDU packet to be sent and capturing that (which involves waiting roughly 30 seconds), get the necessary information from teamd and craft it ourselves. This means that the 60-second wait in making sure that a LACPDU packet is captured and the keepalive script is ready can be largely eliminated (this has been reduced to 10 seconds to make sure the script has a chance to craft the packets and send some LACPDUs). Signed-off-by: Saikrishna Arcot <[email protected]> * Fix pre-commit errors Signed-off-by: Saikrishna Arcot <[email protected]> * Keep a socket open, and reuse that for sending LACPDUs Signed-off-by: Saikrishna Arcot <[email protected]> * Add logic to fork into background after collecting information Signed-off-by: Saikrishna Arcot <[email protected]> * Start lag_keepalive before OA pause, and fork after building packets Start lag_keepalive before pausing orchagent, so that there's less of a delay between when orchagent is paused and when kexec happens, and so that fewer events/changes go unhandled by orchagent. Additionally, add an option into the lag_keepalive script to fork into the background after generating the LACPDUs and opening sockets, but before sending the actual packets. This serves as a sort-of error check to make sure that it is at least able to send LACPDU packets, and didn't bail out early. Signed-off-by: Saikrishna Arcot <[email protected]> --------- Signed-off-by: Saikrishna Arcot <[email protected]>
1 parent a7deb8c commit 9a18155

File tree

2 files changed

+79
-39
lines changed

2 files changed

+79
-39
lines changed

scripts/fast-reboot

+7-7
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,13 @@ fi
744744
745745
init_warm_reboot_states
746746
747+
# start sending LACPDUs to keep the LAGs refreshed
748+
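# timeout only bounds the foreground setup phase (30s); once the script forks into the background it stops by itself after 300 one-second send iterations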
749+
debug "Starting lag_keepalive to send LACPDUs ..."
750+
timeout 30 python3 ${LAG_KEEPALIVE_SCRIPT} --fork-into-background
751+
# give the lag_keepalive script a chance to send some LACPDUs
752+
sleep 5
753+
747754
setup_control_plane_assistant
748755
749756
TEAMD_INCREASE_RETRY_COUNT=0
@@ -813,13 +820,6 @@ fi
813820
# disable trap-handlers which were set before
814821
trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM
815822
816-
# start sending LACPDUs to keep the LAGs refreshed
817-
# this is a non-blocking call, and the process will die in 300s
818-
debug "Starting lag_keepalive to send LACPDUs ..."
819-
timeout 300 python ${LAG_KEEPALIVE_SCRIPT} &
820-
# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s)
821-
sleep 60
822-
823823
if [ -x ${LOG_SSD_HEALTH} ]; then
824824
debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..."
825825
${LOG_SSD_HEALTH}

scripts/lag_keepalive.py

+72-32
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,22 @@
11
#!/usr/bin/env python3
22

3+
import argparse
4+
import json
5+
import os
36
from scapy.config import conf
47
conf.ipv6_enabled = False
5-
from scapy.all import sendp, sniff
6-
from swsscommon.swsscommon import ConfigDBConnector
7-
import time, threading, traceback
8-
import syslog
8+
from scapy.layers.l2 import Ether # noqa: E402
9+
from scapy.sendrecv import sendp # noqa: E402
10+
import scapy.contrib.lacp # noqa: E402
11+
import subprocess # noqa: E402
12+
import syslog # noqa: E402
13+
import time # noqa: E402
14+
import traceback # noqa: E402
15+
from swsscommon.swsscommon import ConfigDBConnector # noqa: E402
916

1017
SYSLOG_ID = 'lag_keepalive'
18+
SLOW_PROTOCOL_MAC_ADDRESS = "01:80:c2:00:00:02"
19+
LACP_ETHERTYPE = 0x8809
1120

1221

1322
def log_info(msg):
@@ -22,67 +31,92 @@ def log_error(msg):
2231
syslog.closelog()
2332

2433

25-
def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet):
26-
sniffed_packet = sniff(iface=lag_member,
27-
filter="ether proto 0x8809 and ether src {}".format(device_mac),
28-
count=1, timeout=30)
29-
lag_member_to_packet[lag_member] = sniffed_packet
34+
def getCmdOutput(cmd):
    """Run a command to completion and capture its standard output.

    Args:
        cmd: The command to execute, in the form accepted by the
            ``subprocess`` module (callers in this file pass an argument
            list).

    Returns:
        A ``(stdout, returncode)`` tuple: the raw bytes the command wrote to
        its standard output and its exit status.
    """
    # subprocess.run() manages the child process for us: its pipes are
    # closed and the child is waited for even when the call is interrupted,
    # which a bare Popen()/communicate() pair does not guarantee.
    completed = subprocess.run(cmd, stdout=subprocess.PIPE)
    return completed.stdout, completed.returncode
37+
38+
39+
def get_port_channel_config(portChannelName):
    """Return the teamd state of a port channel as a parsed JSON document.

    Runs ``teamdctl <portChannelName> state dump`` and decodes its output.

    Args:
        portChannelName: Name of the port channel to query.

    Returns:
        The decoded JSON document printed by teamdctl.

    Raises:
        RuntimeError: If teamdctl exits with a non-zero status.
        json.JSONDecodeError: If the command output is not valid JSON.
    """
    cmd = ["teamdctl", portChannelName, "state", "dump"]
    processStdout, returnCode = getCmdOutput(cmd)
    if returnCode != 0:
        # Report the real cause instead of letting json.loads() fail with an
        # opaque parsing error on the output of a failed command.
        raise RuntimeError("'{}' failed with exit code {}".format(" ".join(cmd), returnCode))
    return json.loads(processStdout)
42+
43+
44+
def craft_lacp_packet(portChannelConfig, portName):
    """Build the LACPDU frame for one member port of a port channel.

    Args:
        portChannelConfig: Parsed port channel state (as returned by
            get_port_channel_config); must contain a ``ports`` entry for
            ``portName`` with ``runner`` and ``ifinfo`` sections.
        portName: Name of the member port the frame is built for.

    Returns:
        A scapy packet: Ethernet / SlowProtocol / LACP, addressed to the
        slow-protocols multicast MAC and sourced from the port's own address.

    Raises:
        KeyError: If an expected key is missing from ``portChannelConfig``.
    """
    member = portChannelConfig["ports"][portName]
    runner = member["runner"]
    actor = runner["actor_lacpdu_info"]
    partner = runner["partner_lacpdu_info"]

    ethernet = Ether(
        dst=SLOW_PROTOCOL_MAC_ADDRESS,
        src=member["ifinfo"]["dev_addr"],
        type=LACP_ETHERTYPE,
    )
    slow_protocol = scapy.contrib.lacp.SlowProtocol(subtype=0x01)
    # Copy the actor/partner values reported by teamd into the LACPDU fields.
    lacpdu = scapy.contrib.lacp.LACP(
        version=0x1,
        actor_system_priority=actor["system_priority"],
        actor_system=actor["system"],
        actor_key=actor["key"],
        actor_port_priority=actor["port_priority"],
        actor_port_number=actor["port"],
        actor_state=actor["state"],
        partner_system_priority=partner["system_priority"],
        partner_system=partner["system"],
        partner_key=partner["key"],
        partner_port_priority=partner["port_priority"],
        partner_port_number=partner["port"],
        partner_state=partner["state"],
    )
    return ethernet / slow_protocol / lacpdu
3066

3167

3268
def get_lacpdu_per_lag_member():
    """Craft a LACPDU and open a sending socket for each active LAG member.

    Reads the LAG_MEMBER_TABLE keys from APPL_DB and, for every member whose
    LAG reports ``oper_status`` "up", crafts a LACPDU from the LAG's teamd
    state and opens an L2 socket on the member interface.

    Returns:
        A ``(active_lag_members, lag_member_to_packet)`` tuple:
        ``active_lag_members`` lists the member names of all LAGs that are
        up, and ``lag_member_to_packet`` maps each member name to a
        ``(socket, packet)`` tuple.

    Raises:
        Exception: Any error from the database, teamdctl, packet crafting or
            socket creation is propagated. Sockets opened before the failure
            are closed first.
    """
    appDB = ConfigDBConnector()
    appDB.db_connect('APPL_DB')
    appDB_lag_info = appDB.get_keys('LAG_MEMBER_TABLE')
    active_lag_members = []
    lag_member_to_packet = {}
    # teamd state per LAG, so the dump is fetched once per LAG rather than
    # once per member
    port_channel_configs = {}
    try:
        for lag_entry in appDB_lag_info:
            lag_name = str(lag_entry[0])
            oper_status = appDB.get(appDB.APPL_DB, "LAG_TABLE:{}".format(lag_name), "oper_status")
            if oper_status != "up":
                # only apply the workaround for active lags
                continue
            lag_member = str(lag_entry[1])
            active_lag_members.append(lag_member)
            # craft lacpdu packets for each lag member based on config
            if lag_name not in port_channel_configs:
                port_channel_configs[lag_name] = get_port_channel_config(lag_name)
            packet = craft_lacp_packet(port_channel_configs[lag_name], lag_member)
            member_socket = conf.L2socket(iface=lag_member)
            lag_member_to_packet[lag_member] = (member_socket, packet)
    except Exception:
        # main() calls this function again after a failure; release the
        # sockets opened so far so that retries do not accumulate them.
        for opened_socket, _ in lag_member_to_packet.values():
            try:
                opened_socket.close()
            except Exception:
                # best-effort cleanup; the original error is what matters
                pass
        raise

    return active_lag_members, lag_member_to_packet
6188

6289

6390
def lag_keepalive(lag_member_to_packet):
    """Send the prepared LACPDUs once per second for a fixed number of rounds.

    Args:
        lag_member_to_packet: Mapping of LAG member interface name to a
            ``(socket, packet)`` tuple; each packet is sent through its
            already-open socket.
    """
    num_iterations = 300
    for _ in range(num_iterations):
        for lag_member, socket_and_packet in lag_member_to_packet.items():
            member_socket, lacpdu = socket_and_packet
            try:
                sendp(lacpdu, socket=member_socket, verbose=False)
            except Exception:
                # log failure and move on to the next member so the other
                # lacpdus are still sent
                log_error("Failed to send LACPDU packet from interface {} with error: {}".format(
                    lag_member, traceback.format_exc()))
        log_info("sent LACPDU packets via {}".format(lag_member_to_packet.keys()))
        time.sleep(1)
76106

77107

78108
def main():
109+
parser = argparse.ArgumentParser()
110+
parser.add_argument('--fork-into-background', action='store_true')
111+
args = parser.parse_args()
112+
79113
while True:
80114
try:
81115
active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member()
82116
if len(active_lag_members) != len(lag_member_to_packet.keys()):
83-
log_error("Failed to capture LACPDU packets for some lag members. " +\
84-
"Active lag members: {}. LACPDUs captured for: {}".format(
85-
active_lag_members, lag_member_to_packet.keys()))
117+
log_error("Failed to craft LACPDU packets for some lag members. " +
118+
"Active lag members: {}. LACPDUs crafted for: {}".format(
119+
active_lag_members, lag_member_to_packet.keys()))
86120

87121
log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys()))
88122
except Exception:
@@ -94,9 +128,15 @@ def main():
94128
# if no exceptions are thrown, break from loop as LACPDUs are ready to be sent
95129
break
96130

131+
if args.fork_into_background:
132+
pid = os.fork()
133+
if pid != 0: # The parent process
134+
os._exit(0) # Exit parent of the child process
135+
97136
if lag_member_to_packet:
98-
# start an infinite loop to keep sending lacpdus from lag member ports
137+
# start a loop to keep sending lacpdus from lag member ports
99138
lag_keepalive(lag_member_to_packet)
100139

140+
101141
if __name__ == "__main__":
102142
main()

0 commit comments

Comments
 (0)