Skip to content

Commit a7a68c1

Browse files
zhenggen-xuqiluo-msft
authored and committed
Optimize neigh restore process during warm-reboot (sonic-net#722)
* Optimize neigh restore process * set stale timer bigger to avoid testbed difference related timing issues
1 parent 3d60b3e commit a7a68c1

File tree

2 files changed

+78
-37
lines changed

2 files changed

+78
-37
lines changed

neighsyncd/restore_neighbors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac):
129129
dst=dst_ip,
130130
lladdr=dmac,
131131
ifindex=intf_idx,
132-
state=ndmsg.states['reachable'])
132+
state=ndmsg.states['stale'])
133133
# If neigh exists, log it but no exception raise, other exceptions, raise
134134
except NetlinkError as e:
135135
if e[0] == errno.EEXIST:

tests/test_warm_reboot.py

+77-36
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,22 @@ def check_kernel_reachable_v6_neigh_num(dvs, number):
327327
neigh_num = int(output.strip())
328328
assert neigh_num == number
329329

330+
def _count_kernel_stale_neighs(dvs, family_opt=""):
    """Return the number of kernel neighbor entries reported in the stale state.

    dvs        -- object exposing runcmd(cmd) that returns (exitcode, output)
    family_opt -- text spliced into the `ip` command line right after `ip `:
                  "" for every address family, "-4 " or "-6 " to restrict it.

    Lines containing 'dev lo' are filtered out before counting.
    """
    cmd = "ip {}neigh show nud stale | grep -v 'dev lo' | wc -l".format(family_opt)
    # The exit status is that of `wc`, the last stage of the pipeline; only the
    # count printed on stdout is used.
    _, output = dvs.runcmd(['sh', '-c', cmd])
    return int(output.strip())


def check_kernel_stale_neigh_num(dvs, number):
    """Assert that the kernel holds exactly `number` stale neighbors (all families)."""
    neigh_num = _count_kernel_stale_neighs(dvs)
    assert neigh_num == number, \
        "expected {} stale neighbors in kernel, found {}".format(number, neigh_num)


def check_kernel_stale_v4_neigh_num(dvs, number):
    """Assert that the kernel holds exactly `number` stale IPv4 neighbors."""
    neigh_num = _count_kernel_stale_neighs(dvs, "-4 ")
    assert neigh_num == number, \
        "expected {} stale ipv4 neighbors in kernel, found {}".format(number, neigh_num)


def check_kernel_stale_v6_neigh_num(dvs, number):
    """Assert that the kernel holds exactly `number` stale IPv6 neighbors."""
    neigh_num = _count_kernel_stale_neighs(dvs, "-6 ")
    assert neigh_num == number, \
        "expected {} stale ipv6 neighbors in kernel, found {}".format(number, neigh_num)
344+
345+
330346
def kernel_restore_neighs_done(restoretbl):
    """Return True once `restoretbl` reports at least one key, False otherwise."""
    # Any key present in the table is treated as the "restore finished" signal.
    return len(restoretbl.getKeys()) > 0
@@ -1510,13 +1526,52 @@ def test_routing_WarmRestart(dvs, testlog):
15101526
assert rt_key['dest'] == "192.168.100.0/24"
15111527

15121528

1529+
# macros for number of interfaces and number of neighbors
1530+
# TBD: NUM_NEIGH_PER_INTF >= 128 ips will cause the test framework to hang with the default kernel settings
1531+
# TBD: Need to tune gc_thresh1/2/3 on the host side of the vs docker to support this.
1532+
NUM_INTF = 8
1533+
NUM_NEIGH_PER_INTF = 16 #128
1534+
NUM_OF_NEIGHS = (NUM_INTF*NUM_NEIGH_PER_INTF)
1535+
15131536
# 'ip neigh flush all' won't remove failed entries if number of neighs less than gc_threshold1
15141537
# Also it takes time to remove them completely.
15151538
# We use arp off/on to do it
15161539
def flush_neigh_entries(dvs):
    """Clear kernel neighbor entries by toggling ARP off and back on.

    Runs the two `ip link set group default arp ...` commands in order
    through dvs.runcmd; nothing is returned.
    """
    for arp_state in ("off", "on"):
        dvs.runcmd("ip link set group default arp " + arp_state)
15191542

1543+
# Add neighbor entries on servers connecting to SONiC ports
1544+
# ping them to get the neighbor entries
1545+
def setup_initial_neighbors(dvs):
    """Create the initial neighbor set and ping it from the switch side.

    For each server index 8 .. 8+NUM_INTF-1, NUM_NEIGH_PER_INTF IPv4 and IPv6
    addresses (host part 2 .. NUM_NEIGH_PER_INTF+1) are added on the server's
    eth0. After a one second pause, every address is pinged once via
    dvs.runcmd so the kernel learns the neighbors.
    """
    server_ids = list(range(8, 8 + NUM_INTF))
    host_ids = list(range(2, NUM_NEIGH_PER_INTF + 2))

    # Phase 1: configure the addresses on the servers.
    for srv in server_ids:
        net = srv * 4
        server = dvs.servers[srv]
        for host in host_ids:
            server.runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(net, host))
            server.runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(net, host))

    time.sleep(1)

    # Phase 2: one ping per address; output and failures are discarded by the shell.
    for srv in server_ids:
        net = srv * 4
        for host in host_ids:
            v4_ping = "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(net, host)
            v6_ping = "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(net, host)
            dvs.runcmd(['sh', '-c', v4_ping])
            dvs.runcmd(['sh', '-c', v6_ping])
1557+
1558+
# Delete half of the ips and add a new half of the ips
1559+
# note: the first ipv4 can not be deleted only
1560+
def del_and_add_neighbors(dvs):
    """Replace the upper half of each server's addresses with a fresh half.

    For each server index 8 .. 8+NUM_INTF-1, the IPv4/IPv6 addresses whose host
    part lies in the upper half of the initial range are deleted from eth0, and
    the same number of new addresses just above the initial range are added.
    The lower half, including the first address, is left untouched.
    """
    # Floor division keeps the count an int on Python 3 as well; with "/" the
    # value becomes a float there, which range() rejects and which would also
    # format as e.g. "10.0" inside the addresses.
    half = NUM_NEIGH_PER_INTF // 2
    for i in range(8, 8 + NUM_INTF):
        for j in range(half):
            dvs.servers[i].runcmd("ip addr del {}.0.0.{}/24 dev eth0".format(i*4, j+half+2))
            dvs.servers[i].runcmd("ip -6 addr del {}00::{}/64 dev eth0".format(i*4, j+half+2))
            dvs.servers[i].runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF+2))
            dvs.servers[i].runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF+2))
1567+
1568+
#ping new IPs
1569+
def ping_new_ips(dvs):
    """Ping, once each, the addresses located just above the initial range.

    For each server index 8 .. 8+NUM_INTF-1, NUM_NEIGH_PER_INTF // 2 IPv4 and
    IPv6 addresses with host part starting at NUM_NEIGH_PER_INTF+2 are pinged
    through dvs.runcmd; ping output and failures are discarded by the shell.
    """
    # Floor division: "/" yields a float on Python 3 and range() would raise.
    half = NUM_NEIGH_PER_INTF // 2
    for i in range(8, 8 + NUM_INTF):
        for j in range(half):
            dvs.runcmd(['sh', '-c', "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(i*4, j+NUM_NEIGH_PER_INTF+2)])
            dvs.runcmd(['sh', '-c', "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(i*4, j+NUM_NEIGH_PER_INTF+2)])
1574+
15201575
def test_system_warmreboot_neighbor_syncup(dvs, testlog):
15211576

15221577
appl_db = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0)
@@ -1536,18 +1591,16 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
15361591
# Ethernet32/36/.../60, with ip: 32.0.0.1/24... 60.0.0.1/24
15371592
# ipv6: 3200::1/64...6000::1/64
15381593
# bring up the servers' interfaces and assign NUM_NEIGH_PER_INTF (e.g. 128) ips per interface
1539-
# TBD: NUM_NEIGH_PER_INTF >= 128 ips will cause test framework to hang by default settings
1540-
# TBD: Need tune gc_thresh1/2/3 at host side of vs docker to support this.
1541-
NUM_INTF = 8
1542-
NUM_NEIGH_PER_INTF = 64 #128
1543-
NUM_OF_NEIGHS = (NUM_INTF*NUM_NEIGH_PER_INTF)
15441594
macs = []
15451595
for i in range(8, 8+NUM_INTF):
15461596
# set timeout to be the same as real HW
1597+
# use a larger stale timer to avoid timing issues caused by testbed differences.
15471598
# set ip on server facing interfaces
15481599
# bring servers' interface up, save the macs
15491600
dvs.runcmd("sysctl -w net.ipv4.neigh.Ethernet{}.base_reachable_time_ms=1800000".format(i*4))
15501601
dvs.runcmd("sysctl -w net.ipv6.neigh.Ethernet{}.base_reachable_time_ms=1800000".format(i*4))
1602+
dvs.runcmd("sysctl -w net.ipv4.neigh.Ethernet{}.gc_stale_time=180".format(i*4))
1603+
dvs.runcmd("sysctl -w net.ipv6.neigh.Ethernet{}.gc_stale_time=180".format(i*4))
15511604
dvs.runcmd("ip addr flush dev Ethernet{}".format(i*4))
15521605
dvs.runcmd("ifconfig Ethernet{} {}.0.0.1/24 up".format(i*4, i*4))
15531606
dvs.runcmd("ip -6 addr add {}00::1/64 dev Ethernet{}".format(i*4,i*4))
@@ -1559,21 +1612,8 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
15591612

15601613
#
15611614
# Testcase 1:
1562-
# Add neighbor entries on servers connecting to SONiC ports
1563-
# 128 ipv4 and 128 ipv6 on each server
1564-
# total: 1024 ipv4 and 1024 ipv6
1565-
# ping them to get the neighbor entries
1566-
for i in range(8, 8+NUM_INTF):
1567-
for j in range(NUM_NEIGH_PER_INTF):
1568-
dvs.servers[i].runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(i*4, j+2))
1569-
dvs.servers[i].runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(i*4,j+2))
1570-
1571-
time.sleep(1)
1572-
1573-
for i in range(8, 8+NUM_INTF):
1574-
for j in range(NUM_NEIGH_PER_INTF):
1575-
dvs.runcmd(['sh', '-c', "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(i*4,j+2)])
1576-
dvs.runcmd(['sh', '-c', "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(i*4,j+2)])
1615+
# Set up initial neighbors
1616+
setup_initial_neighbors(dvs)
15771617

15781618
# Check the neighbor entries are inserted correctly
15791619
db = swsscommon.DBConnector(0, dvs.redis_sock, 0)
@@ -1660,15 +1700,9 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
16601700
# stop neighsyncd and sairedis.rec
16611701
stop_neighsyncd(dvs)
16621702
del_entry_tbl(state_db, "NEIGH_RESTORE_TABLE", "Flags")
1703+
time.sleep(3)
16631704

1664-
# Del half of the ips and a new half of the ips
1665-
# note: the first ipv4 can not be deleted only
1666-
for i in range(8, 8+NUM_INTF):
1667-
for j in range(NUM_NEIGH_PER_INTF/2):
1668-
dvs.servers[i].runcmd("ip addr del {}.0.0.{}/24 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF/2+2))
1669-
dvs.servers[i].runcmd("ip -6 addr del {}00::{}/64 dev eth0".format(i*4,j+NUM_NEIGH_PER_INTF/2+2))
1670-
dvs.servers[i].runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF+2))
1671-
dvs.servers[i].runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(i*4,j+NUM_NEIGH_PER_INTF+2))
1705+
del_and_add_neighbors(dvs)
16721706

16731707
flush_neigh_entries(dvs)
16741708
time.sleep(3)
@@ -1684,8 +1718,11 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
16841718
# should finish the store within 10 seconds
16851719
time.sleep(10)
16861720

1687-
check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS)
1688-
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS)
1721+
check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS/2)
1722+
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS/2)
1723+
1724+
check_kernel_stale_v4_neigh_num(dvs, NUM_OF_NEIGHS/2)
1725+
check_kernel_stale_v6_neigh_num(dvs, NUM_OF_NEIGHS/2)
16891726

16901727
# check syslog and sairedis.rec file for activities
16911728
check_syslog_for_neighbor_entry(dvs, marker, 0, 0, "ipv4")
@@ -1701,14 +1738,14 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
17011738
# ping the new ips, should get it into appDB
17021739
marker = dvs.add_log_marker()
17031740

1704-
for i in range(8, 8+NUM_INTF):
1705-
for j in range(NUM_NEIGH_PER_INTF/2):
1706-
dvs.runcmd(['sh', '-c', "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(i*4,j+NUM_NEIGH_PER_INTF+2)])
1707-
dvs.runcmd(['sh', '-c', "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(i*4,j+NUM_NEIGH_PER_INTF+2)])
1741+
ping_new_ips(dvs)
1742+
1743+
check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS)
1744+
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS)
17081745

1746+
check_kernel_stale_v4_neigh_num(dvs, NUM_OF_NEIGHS/2)
1747+
check_kernel_stale_v6_neigh_num(dvs, NUM_OF_NEIGHS/2)
17091748

1710-
check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS+NUM_OF_NEIGHS/2)
1711-
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS+NUM_OF_NEIGHS/2)
17121749
check_redis_neigh_entries(dvs, tbl, 2*(NUM_OF_NEIGHS+NUM_OF_NEIGHS/2))
17131750

17141751
(nadd, ndel) = dvs.CountSubscribedObjects(pubsub)
@@ -1725,6 +1762,10 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
17251762

17261763
check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS)
17271764
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS)
1765+
1766+
check_kernel_stale_v4_neigh_num(dvs, 0)
1767+
check_kernel_stale_v6_neigh_num(dvs, 0)
1768+
17281769
check_redis_neigh_entries(dvs, tbl, 2*NUM_OF_NEIGHS)
17291770

17301771
(nadd, ndel) = dvs.CountSubscribedObjects(pubsub)

0 commit comments

Comments
 (0)