Skip to content

[202405]: Speed up route_check.py #3604

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 121 additions & 91 deletions scripts/route_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import signal
import traceback
import subprocess
import concurrent.futures

from ipaddress import ip_network
from swsscommon import swsscommon
Expand Down Expand Up @@ -348,10 +349,18 @@ def is_suppress_fib_pending_enabled(namespace):
return state == 'enabled'


def get_frr_routes(namespace):
def fetch_routes(cmd):
"""
Read routes from zebra through CLI command
:return frr routes dictionary
Fetch routes using the given command.
"""
output = subprocess.check_output(cmd, text=True)
return json.loads(output)


def get_frr_routes_parallel(namespace):
"""
Read routes from zebra through CLI command for IPv4 and IPv6 in parallel
:return combined IPv4 and IPv6 routes dictionary.
"""
if namespace == multi_asic.DEFAULT_NAMESPACE:
v4_route_cmd = ['show', 'ip', 'route', 'json']
Expand All @@ -360,12 +369,18 @@ def get_frr_routes(namespace):
v4_route_cmd = ['show', 'ip', 'route', '-n', namespace, 'json']
v6_route_cmd = ['show', 'ipv6', 'route', '-n', namespace, 'json']

output = subprocess.check_output(v4_route_cmd, text=True)
routes = json.loads(output)
output = subprocess.check_output(v6_route_cmd, text=True)
routes.update(json.loads(output))
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, routes))
return routes
with concurrent.futures.ThreadPoolExecutor() as executor:
future_v4 = executor.submit(fetch_routes, v4_route_cmd)
future_v6 = executor.submit(fetch_routes, v6_route_cmd)

# Wait for both results to complete
v4_routes = future_v4.result()
v6_routes = future_v6.result()

# Combine both IPv4 and IPv6 routes
v4_routes.update(v6_routes)
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, v4_routes))
return v4_routes


def get_interfaces(namespace):
Expand Down Expand Up @@ -566,7 +581,7 @@ def check_frr_pending_routes(namespace):
retries = FRR_CHECK_RETRIES
for i in range(retries):
missed_rt = []
frr_routes = get_frr_routes(namespace)
frr_routes = get_frr_routes_parallel(namespace)

for _, entries in frr_routes.items():
for entry in entries:
Expand Down Expand Up @@ -699,8 +714,9 @@ def _filter_out_neigh_route(routes, neighs):
return rt_appl_miss, rt_asic_miss


def check_routes(namespace):
def check_routes_for_namespace(namespace):
"""
Process a Single Namespace:
The heart of this script which runs the checks.
Read APPL-DB & ASIC-DB, the relevant tables for route checking.
Checkout routes in ASIC-DB to match APPL-DB, discounting local &
Expand All @@ -718,6 +734,83 @@ def check_routes(namespace):
:return (0, None) on success, else (-1, results) where results holds
the unjustifiable entries.
"""

results = {}
adds = []
deletes = []
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
rt_frr_miss = []

selector, subs, rt_asic = get_asicdb_routes(namespace)

rt_appl = get_appdb_routes(namespace)
intf_appl = get_interfaces(namespace)

# Diff APPL-DB routes & ASIC-DB routes
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)

# Check missed ASIC routes against APPL-DB INTF_TABLE
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)

# Check APPL-DB INTF_TABLE with ASIC table route entries
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)

if rt_appl_miss:
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)

if rt_appl_miss:
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)

# NOTE: On dualtor environment, ignore any route miss for the
# neighbors learned from the vlan subnet.
if rt_appl_miss or rt_asic_miss:
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)

if rt_appl_miss or rt_asic_miss:
# Look for subscribe updates for a second
adds, deletes = get_subscribe_updates(selector, subs)

# Drop all those for which SET received
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds)

# Drop all those for which DEL received
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes)

if rt_appl_miss:
results["missed_ROUTE_TABLE_routes"] = rt_appl_miss

if intf_appl_miss:
results["missed_INTF_TABLE_entries"] = intf_appl_miss

if rt_asic_miss:
results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

if is_bgp_suppress_fib_pending_enabled(namespace):
rt_frr_miss = check_frr_pending_routes(namespace)

if rt_frr_miss:
results["missed_FRR_routes"] = rt_frr_miss

if results:
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} \
but all routes in APPL_DB and ASIC_DB are in sync".format(namespace))
if is_suppress_fib_pending_enabled(namespace):
mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)

return results, adds, deletes


def check_routes(namespace):
"""
Main function to parallelize route checks across all namespaces.
"""
namespace_list = []
if namespace is not multi_asic.DEFAULT_NAMESPACE and namespace in multi_asic.get_namespace_list():
namespace_list.append(namespace)
Expand All @@ -726,85 +819,23 @@ def check_routes(namespace):
print_message(syslog.LOG_INFO, "Checking routes for namespaces: ", namespace_list)

results = {}
adds = {}
deletes = {}
for namespace in namespace_list:
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
rt_frr_miss = []
adds[namespace] = []
deletes[namespace] = []

selector, subs, rt_asic = get_asicdb_routes(namespace)

rt_appl = get_appdb_routes(namespace)
intf_appl = get_interfaces(namespace)

# Diff APPL-DB routes & ASIC-DB routes
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)

# Check missed ASIC routes against APPL-DB INTF_TABLE
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)


# Check APPL-DB INTF_TABLE with ASIC table route entries
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)

if rt_appl_miss:
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)

if rt_appl_miss:
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)

# NOTE: On dualtor environment, ignore any route miss for the
# neighbors learned from the vlan subnet.
if rt_appl_miss or rt_asic_miss:
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)

if rt_appl_miss or rt_asic_miss:
# Look for subscribe updates for a second
adds[namespace], deletes[namespace] = get_subscribe_updates(selector, subs)

# Drop all those for which SET received
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds[namespace])

# Drop all those for which DEL received
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes[namespace])

if rt_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_ROUTE_TABLE_routes"] = rt_appl_miss

if intf_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_INTF_TABLE_entries"] = intf_appl_miss

if rt_asic_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

if is_bgp_suppress_fib_pending_enabled(namespace):
rt_frr_miss = check_frr_pending_routes(namespace)

if rt_frr_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_FRR_routes"] = rt_frr_miss

if results:
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} but all "
"routes in APPL_DB and ASIC_DB are in sync".format(namespace))
if is_suppress_fib_pending_enabled(namespace):
mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)
all_adds = {}
all_deletes = {}

# Use ThreadPoolExecutor to parallelize the check for each namespace
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(check_routes_for_namespace, ns): ns for ns in namespace_list}

for future in concurrent.futures.as_completed(futures):
ns = futures[future]
try:
result, adds, deletes = future.result()
if result:
results[ns] = result
all_adds[ns] = adds
all_deletes[ns] = deletes
except Exception as e:
print_message(syslog.LOG_ERR, "Error processing namespace {}: {}".format(ns, e))

if results:
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
Expand Down Expand Up @@ -873,6 +904,5 @@ def main():
return ret, res



if __name__ == "__main__":
sys.exit(main()[0])
Loading