Skip to content

Commit 200ef36

Browse files
Speed up route_check script (#3678)
This PR fixes #18773 and #20648. Note: this resubmits the old PR #3544 (with minor cosmetic changes). The original PR was reverted due to a PR check failure; that failure has been root-caused and fixed via sonic-net/sonic-mgmt#16013. How I did it: execute route_check on each ASIC in parallel, and fetch the IPv4 and IPv6 routes in parallel.
1 parent 7dc40ac commit 200ef36

File tree

1 file changed

+125
-90
lines changed

1 file changed

+125
-90
lines changed

scripts/route_check.py

+125-90
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import signal
4747
import traceback
4848
import subprocess
49+
import concurrent.futures
4950

5051
from ipaddress import ip_network
5152
from swsscommon import swsscommon
@@ -338,10 +339,18 @@ def is_suppress_fib_pending_enabled(namespace):
338339
return state == 'enabled'
339340

340341

341-
def get_frr_routes(namespace):
342+
def fetch_routes(cmd):
    """
    Run the given command and decode what it prints as JSON.

    :param cmd: command in argument-list form, passed to subprocess.check_output
    :return the object parsed from the command's JSON text output
    """
    # text=True makes check_output return str, which json.loads parses directly.
    return json.loads(subprocess.check_output(cmd, text=True))
348+
349+
350+
def get_frr_routes_parallel(namespace):
351+
"""
352+
Read routes from zebra through CLI command for IPv4 and IPv6 in parallel
353+
:return combined IPv4 and IPv6 routes dictionary.
345354
"""
346355
if namespace == multi_asic.DEFAULT_NAMESPACE:
347356
v4_route_cmd = ['show', 'ip', 'route', 'json']
@@ -350,12 +359,18 @@ def get_frr_routes(namespace):
350359
v4_route_cmd = ['show', 'ip', 'route', '-n', namespace, 'json']
351360
v6_route_cmd = ['show', 'ipv6', 'route', '-n', namespace, 'json']
352361

353-
output = subprocess.check_output(v4_route_cmd, text=True)
354-
routes = json.loads(output)
355-
output = subprocess.check_output(v6_route_cmd, text=True)
356-
routes.update(json.loads(output))
357-
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, routes))
358-
return routes
362+
with concurrent.futures.ThreadPoolExecutor() as executor:
363+
future_v4 = executor.submit(fetch_routes, v4_route_cmd)
364+
future_v6 = executor.submit(fetch_routes, v6_route_cmd)
365+
366+
# Wait for both results to complete
367+
v4_routes = future_v4.result()
368+
v6_routes = future_v6.result()
369+
370+
# Combine both IPv4 and IPv6 routes
371+
v4_routes.update(v6_routes)
372+
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, v4_routes))
373+
return v4_routes
359374

360375

361376
def get_interfaces(namespace):
@@ -556,7 +571,7 @@ def check_frr_pending_routes(namespace):
556571
retries = FRR_CHECK_RETRIES
557572
for i in range(retries):
558573
missed_rt = []
559-
frr_routes = get_frr_routes(namespace)
574+
frr_routes = get_frr_routes_parallel(namespace)
560575

561576
for _, entries in frr_routes.items():
562577
for entry in entries:
@@ -689,8 +704,9 @@ def _filter_out_neigh_route(routes, neighs):
689704
return rt_appl_miss, rt_asic_miss
690705

691706

692-
def check_routes(namespace):
707+
def check_routes_for_namespace(namespace):
693708
"""
709+
Process a Single Namespace:
694710
The heart of this script which runs the checks.
695711
Read APPL-DB & ASIC-DB, the relevant tables for route checking.
696712
Checkout routes in ASIC-DB to match APPL-DB, discounting local &
@@ -708,6 +724,82 @@ def check_routes(namespace):
708724
:return (0, None) on sucess, else (-1, results) where results holds
709725
the unjustifiable entries.
710726
"""
727+
728+
results = {}
729+
adds = []
730+
deletes = []
731+
intf_appl_miss = []
732+
rt_appl_miss = []
733+
rt_asic_miss = []
734+
rt_frr_miss = []
735+
736+
selector, subs, rt_asic = get_asicdb_routes(namespace)
737+
738+
rt_appl = get_appdb_routes(namespace)
739+
intf_appl = get_interfaces(namespace)
740+
741+
# Diff APPL-DB routes & ASIC-DB routes
742+
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)
743+
744+
# Check missed ASIC routes against APPL-DB INTF_TABLE
745+
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
746+
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
747+
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
748+
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
749+
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)
750+
751+
# Check APPL-DB INTF_TABLE with ASIC table route entries
752+
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)
753+
754+
if rt_appl_miss:
755+
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)
756+
757+
if rt_appl_miss:
758+
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)
759+
760+
# NOTE: On dualtor environment, ignore any route miss for the
761+
# neighbors learned from the vlan subnet.
762+
if rt_appl_miss or rt_asic_miss:
763+
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)
764+
765+
if rt_appl_miss or rt_asic_miss:
766+
# Look for subscribe updates for a second
767+
adds, deletes = get_subscribe_updates(selector, subs)
768+
769+
# Drop all those for which SET received
770+
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds)
771+
772+
# Drop all those for which DEL received
773+
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes)
774+
775+
if rt_appl_miss:
776+
results["missed_ROUTE_TABLE_routes"] = rt_appl_miss
777+
778+
if intf_appl_miss:
779+
results["missed_INTF_TABLE_entries"] = intf_appl_miss
780+
781+
if rt_asic_miss:
782+
results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss
783+
784+
rt_frr_miss = check_frr_pending_routes(namespace)
785+
786+
if rt_frr_miss:
787+
results["missed_FRR_routes"] = rt_frr_miss
788+
789+
if results:
790+
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
791+
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} \
792+
but all routes in APPL_DB and ASIC_DB are in sync".format(namespace))
793+
if is_suppress_fib_pending_enabled(namespace):
794+
mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)
795+
796+
return results, adds, deletes
797+
798+
799+
def check_routes(namespace):
800+
"""
801+
Main function to parallelize route checks across all namespaces.
802+
"""
711803
namespace_list = []
712804
if namespace is not multi_asic.DEFAULT_NAMESPACE and namespace in multi_asic.get_namespace_list():
713805
namespace_list.append(namespace)
@@ -716,89 +808,32 @@ def check_routes(namespace):
716808
print_message(syslog.LOG_INFO, "Checking routes for namespaces: ", namespace_list)
717809

718810
results = {}
719-
adds = {}
720-
deletes = {}
721-
for namespace in namespace_list:
722-
intf_appl_miss = []
723-
rt_appl_miss = []
724-
rt_asic_miss = []
725-
rt_frr_miss = []
726-
adds[namespace] = []
727-
deletes[namespace] = []
728-
729-
selector, subs, rt_asic = get_asicdb_routes(namespace)
730-
731-
rt_appl = get_appdb_routes(namespace)
732-
intf_appl = get_interfaces(namespace)
733-
734-
# Diff APPL-DB routes & ASIC-DB routes
735-
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)
736-
737-
# Check missed ASIC routes against APPL-DB INTF_TABLE
738-
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
739-
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
740-
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
741-
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
742-
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)
743-
744-
# Check APPL-DB INTF_TABLE with ASIC table route entries
745-
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)
746-
747-
if rt_appl_miss:
748-
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)
749-
750-
if rt_appl_miss:
751-
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)
752-
753-
# NOTE: On dualtor environment, ignore any route miss for the
754-
# neighbors learned from the vlan subnet.
755-
if rt_appl_miss or rt_asic_miss:
756-
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)
757-
758-
if rt_appl_miss or rt_asic_miss:
759-
# Look for subscribe updates for a second
760-
adds[namespace], deletes[namespace] = get_subscribe_updates(selector, subs)
761-
762-
# Drop all those for which SET received
763-
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds[namespace])
764-
765-
# Drop all those for which DEL received
766-
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes[namespace])
767-
768-
if rt_appl_miss:
769-
if namespace not in results:
770-
results[namespace] = {}
771-
results[namespace]["missed_ROUTE_TABLE_routes"] = rt_appl_miss
772-
773-
if intf_appl_miss:
774-
if namespace not in results:
775-
results[namespace] = {}
776-
results[namespace]["missed_INTF_TABLE_entries"] = intf_appl_miss
777-
778-
if rt_asic_miss:
779-
if namespace not in results:
780-
results[namespace] = {}
781-
results[namespace]["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss
782-
783-
rt_frr_miss = check_frr_pending_routes(namespace)
784-
785-
if rt_frr_miss:
786-
if namespace not in results:
787-
results[namespace] = {}
788-
results[namespace]["missed_FRR_routes"] = rt_frr_miss
789-
790-
if results:
791-
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
792-
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} \
793-
but all routes in APPL_DB and ASIC_DB are in sync".format(namespace))
794-
if is_suppress_fib_pending_enabled(namespace):
795-
mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)
811+
all_adds = {}
812+
all_deletes = {}
813+
814+
# Use ThreadPoolExecutor to parallelize the check for each namespace
815+
with concurrent.futures.ThreadPoolExecutor() as executor:
816+
futures = {executor.submit(check_routes_for_namespace, ns): ns for ns in namespace_list}
817+
818+
for future in concurrent.futures.as_completed(futures):
819+
ns = futures[future]
820+
all_adds[ns] = []
821+
all_deletes[ns] = []
822+
try:
823+
result, adds, deletes = future.result()
824+
if result:
825+
results[ns] = result
826+
all_adds[ns] = adds
827+
all_deletes[ns] = deletes
828+
except Exception as e:
829+
print_message(syslog.LOG_ERR, "Error processing namespace {}: {}".format(ns, e))
830+
return -1, results
796831

797832
if results:
798833
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
799834
print_message(syslog.LOG_WARNING, "Failed. Look at reported mismatches above")
800-
print_message(syslog.LOG_WARNING, "add: ", json.dumps(adds, indent=4))
801-
print_message(syslog.LOG_WARNING, "del: ", json.dumps(deletes, indent=4))
835+
print_message(syslog.LOG_WARNING, "add: ", json.dumps(all_adds, indent=4))
836+
print_message(syslog.LOG_WARNING, "del: ", json.dumps(all_deletes, indent=4))
802837
return -1, results
803838
else:
804839
print_message(syslog.LOG_INFO, "All good!")

0 commit comments

Comments
 (0)