Skip to content

Commit afdcf34

Browse files
zhenggen-xu and lguohan
authored and committed
Support neighsyncd system warmreboot. (#661)
* Support neighsyncd system warmreboot. neighsyncd will wait for the kernel restore process to be done before reconciliation. Add vs testcases to cover the kernel neighbor table restore process and the neighsyncd process upon system warm reboot. Signed-off-by: Zhenggen Xu <[email protected]> * Add the neigh_restore table to swss-schema.md. Make the state check function more accurate. Signed-off-by: Zhenggen Xu <[email protected]> * Add restore_neighbors.py to be part of the swss deb pkg: In case system warm reboot is enabled, it will try to restore the neighbor table from appDB into the kernel through netlink API calls and update the neighbor table by sending arp/ns requests to all neighbor entries, then it sets the stateDB flag for neighsyncd to continue the reconciliation process. Added a timeout in neighsyncd when waiting for restore_neighbors to finish. Updated vs testcases. Signed-off-by: Zhenggen Xu <[email protected]> * Use chrono::steady_clock in neighsyncd for the time check. Use the monotonic lib for the python time check. Update the warmrestart python binding lib and re-enable the restore cnt check in vs tests. Signed-off-by: Zhenggen Xu <[email protected]> * Use table hget to simplify the code. Time-out value changes. vs test case changes to support default host side neigh table settings. Signed-off-by: Zhenggen Xu <[email protected]> * Fix vs test cases after merge. Signed-off-by: Zhenggen Xu <[email protected]>
1 parent f380685 commit afdcf34

File tree

8 files changed

+677
-58
lines changed

8 files changed

+677
-58
lines changed

debian/swss.install

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ swssconfig/sample/th.64ports.buffers.json etc/swss/config.d
66
swssconfig/sample/th.64ports.qos.json etc/swss/config.d
77
swssconfig/sample/th2.118ports.buffers.json etc/swss/config.d
88
swssconfig/sample/th2.118ports.qos.json etc/swss/config.d
9+
neighsyncd/restore_neighbors.py usr/bin

doc/swss-schema.md

+5
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,11 @@ Stores information for physical switch ports managed by the switch chip. Ports t
745745
; dynamic data like port state, neighbor, routes
746746
; and so on.
747747

748+
### NEIGH_RESTORE_TABLE
749+
;State for neighbor table restoring process during warm reboot
750+
key = NEIGH_RESTORE_TABLE|Flags
751+
restored = "true" / "false" ; restored state
752+
748753
## Configuration files
749754
What configuration files should we have? Do apps, orch agent each need separate files?
750755

neighsyncd/neighsync.cpp

+16-1
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,27 @@
1616
using namespace std;
1717
using namespace swss;
1818

19-
NeighSync::NeighSync(RedisPipeline *pipelineAppDB) :
19+
// Constructs the neighbor sync handler.
//   pipelineAppDB - pipeline used for the APP_NEIGH_TABLE_NAME producer table
//                   and handed to the warm-restart assistant
//   stateDb       - connection used for the STATE_NEIGH_RESTORE_TABLE_NAME table
// The initializers are listed in the same order in which the members are
// declared in neighsync.h (m_stateNeighRestoreTable, m_neighTable,
// m_AppRestartAssist), because that declaration order is the order in which
// C++ actually runs them.
NeighSync::NeighSync(RedisPipeline *pipelineAppDB, DBConnector *stateDb) :
    m_stateNeighRestoreTable(stateDb, STATE_NEIGH_RESTORE_TABLE_NAME),
    m_neighTable(pipelineAppDB, APP_NEIGH_TABLE_NAME),
    m_AppRestartAssist(pipelineAppDB, "neighsyncd", "swss", &m_neighTable, DEFAULT_NEIGHSYNC_WARMSTART_TIMER)
{
}
2425

26+
// Check if neighbor table is restored in kernel
27+
bool NeighSync::isNeighRestoreDone()
28+
{
29+
string value;
30+
31+
m_stateNeighRestoreTable.hget("Flags", "restored", value);
32+
if (value == "true")
33+
{
34+
SWSS_LOG_NOTICE("neighbor table restore to kernel is done");
35+
return true;
36+
}
37+
return false;
38+
}
39+
2540
void NeighSync::onMsg(int nlmsg_type, struct nl_object *obj)
2641
{
2742
char ipStr[MAX_ADDR_SIZE + 1] = {0};

neighsyncd/neighsync.h

+9-1
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,31 @@
88

99
#define DEFAULT_NEIGHSYNC_WARMSTART_TIMER 5
1010

11+
//This is the time (in seconds) that neighsyncd waits for the restore_neighbors
12+
//service to finish; it should be longer than the restore_neighbors timeout value (60).
13+
//This should not happen; if it does, the system is in an unknown state and we should exit.
14+
#define RESTORE_NEIGH_WAIT_TIME_OUT 70
15+
1116
namespace swss {
1217

1318
class NeighSync : public NetMsg
1419
{
1520
public:
1621
enum { MAX_ADDR_SIZE = 64 };
1722

18-
NeighSync(RedisPipeline *pipelineAppDB);
23+
NeighSync(RedisPipeline *pipelineAppDB, DBConnector *stateDb);
1924

2025
virtual void onMsg(int nlmsg_type, struct nl_object *obj);
2126

27+
bool isNeighRestoreDone();
28+
2229
AppRestartAssist *getRestartAssist()
2330
{
2431
return &m_AppRestartAssist;
2532
}
2633

2734
private:
35+
Table m_stateNeighRestoreTable;
2836
ProducerStateTable m_neighTable;
2937
AppRestartAssist m_AppRestartAssist;
3038
};

neighsyncd/neighsyncd.cpp

+29-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#include <iostream>
2+
#include <stdlib.h>
3+
#include <unistd.h>
4+
#include <chrono>
25
#include "logger.h"
36
#include "select.h"
47
#include "netdispatcher.h"
@@ -14,8 +17,9 @@ int main(int argc, char **argv)
1417

1518
DBConnector appDb(APPL_DB, DBConnector::DEFAULT_UNIXSOCKET, 0);
1619
RedisPipeline pipelineAppDB(&appDb);
20+
DBConnector stateDb(STATE_DB, DBConnector::DEFAULT_UNIXSOCKET, 0);
1721

18-
NeighSync sync(&pipelineAppDB);
22+
NeighSync sync(&pipelineAppDB, &stateDb);
1923

2024
NetDispatcher::getInstance().registerMessageHandler(RTM_NEWNEIGH, &sync);
2125
NetDispatcher::getInstance().registerMessageHandler(RTM_DELNEIGH, &sync);
@@ -27,16 +31,36 @@ int main(int argc, char **argv)
2731
NetLink netlink;
2832
Select s;
2933

30-
netlink.registerGroup(RTNLGRP_NEIGH);
31-
cout << "Listens to neigh messages..." << endl;
32-
netlink.dumpRequest(RTM_GETNEIGH);
34+
using namespace std::chrono;
3335

34-
s.addSelectable(&netlink);
3536
if (sync.getRestartAssist()->isWarmStartInProgress())
3637
{
3738
sync.getRestartAssist()->readTableToMap();
39+
40+
steady_clock::time_point starttime = steady_clock::now();
41+
while (!sync.isNeighRestoreDone())
42+
{
43+
duration<double> time_span =
44+
duration_cast<duration<double>>(steady_clock::now() - starttime);
45+
int pasttime = int(time_span.count());
46+
SWSS_LOG_INFO("waited neighbor table to be restored to kernel"
47+
" for %d seconds", pasttime);
48+
if (pasttime > RESTORE_NEIGH_WAIT_TIME_OUT)
49+
{
50+
SWSS_LOG_ERROR("neighbor table restore is not finished"
51+
" after timed-out, exit!!!");
52+
exit(EXIT_FAILURE);
53+
}
54+
sleep(1);
55+
}
3856
sync.getRestartAssist()->startReconcileTimer(s);
3957
}
58+
59+
netlink.registerGroup(RTNLGRP_NEIGH);
60+
cout << "Listens to neigh messages..." << endl;
61+
netlink.dumpRequest(RTM_GETNEIGH);
62+
63+
s.addSelectable(&netlink);
4064
while (true)
4165
{
4266
Selectable *temps;

neighsyncd/restore_neighbors.py

+245
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
#!/usr/bin/env python
2+
3+
""""
4+
Description: restore_neighbors.py -- restoring neighbor table into kernel during system warm reboot.
5+
The script is started by supervisord in swss docker when the docker is started.
6+
It does not do anything in case warm restart is not enabled.
7+
In case system warm reboot is enabled, it will try to restore the neighbor table into kernel
8+
through netlink API calls and update the neigh table by sending arp/ns requests to all neighbor
9+
entries, then it sets the stateDB flag for neighsyncd to continue the reconciliation process.
10+
In case docker restart enabled only, it sets the stateDB flag so neighsyncd can follow
11+
the same logic.
12+
"""
13+
14+
import sys
15+
import swsssdk
16+
import netifaces
17+
import time
18+
import monotonic
19+
from pyroute2 import IPRoute, NetlinkError
20+
from pyroute2.netlink.rtnl import ndmsg
21+
from socket import AF_INET,AF_INET6
22+
import logging
23+
logging.getLogger("scapy.runtime").setLevel(logging.ERROR)
24+
from scapy.all import conf, in6_getnsma, inet_pton, inet_ntop, in6_getnsmac, get_if_hwaddr, Ether, ARP, IPv6, ICMPv6ND_NS, ICMPv6NDOptSrcLLAddr
25+
from swsscommon import swsscommon
26+
import errno
27+
28+
logger = logging.getLogger(__name__)
29+
logger.setLevel(logging.WARNING)
30+
logger.addHandler(logging.NullHandler())
31+
32+
# timeout the restore process in 1 min if not finished
33+
# This is mostly to wait for interfaces to be created and up after warm-reboot
34+
# It would be good to keep that below routing reconciliation time-out.
35+
TIME_OUT = 60
36+
37+
# every 5 seconds to check interfaces state
38+
CHECK_INTERVAL = 5
39+
40+
ip_family = {"IPv4": AF_INET, "IPv6": AF_INET6}
41+
42+
# return the first ipv4/ipv6 address assigned on intf
43+
def first_ip_on_intf(intf, family):
    """Return the first address of the given family configured on intf.

    intf is an interface name and family is a key of ip_family
    ('IPv4' or 'IPv6'). Anything from a '%' onwards in the address
    (as seen on link-local addresses) is dropped. Returns None when
    netifaces does not list the interface or the interface has no
    address of that family.
    """
    if intf not in netifaces.interfaces():
        return None

    af = ip_family[family]
    addrs_by_af = netifaces.ifaddresses(intf)
    if af not in addrs_by_af:
        return None

    # cover link local address as well
    first_addr = addrs_by_af[af][0]['addr']
    return first_addr.split("%")[0]
50+
51+
# check if the intf is operational up
52+
def is_intf_oper_state_up(intf):
53+
oper_file = '/sys/class/net/{0}/carrier'
54+
try:
55+
state_file = open(oper_file.format(intf), 'r')
56+
state = state_file.readline().rstrip()
57+
except Exception as e:
58+
logger.info('Error: {}'.format(str(e)))
59+
return False
60+
if state == '1':
61+
return True
62+
return False
63+
64+
# read the neigh table from AppDB to memory, format as below
65+
# build map as below, this can efficiently access intf and family groups later
66+
# { intf1 -> { { family1 -> [[ip1, mac1], [ip2, mac2] ...] }
67+
# { family2 -> [[ipM, macM], [ipN, macN] ...] } },
68+
# ...
69+
# intfA -> { { family1 -> [[ipW, macW], [ipX, macX] ...] }
70+
# { family2 -> [[ipY, macY], [ipZ, macZ] ...] } }
71+
# }
72+
#
73+
# Alternatively:
74+
# 1, we can build:
75+
# { intf1 -> [[family1, ip1, mac1], [family2, ip2, mac2] ...]},
76+
# ...
77+
# { intfA -> [[family1, ipX, macX], [family2, ipY, macY] ...]}
78+
#
79+
# 2, Or simply build two maps based on families
80+
# These alternative solutions would have worse performance because:
81+
# 1, need to iterate the whole list if only one family is up.
82+
# 2, need to check interface state twice due to the split map
83+
84+
def read_neigh_table_to_maps():
85+
db = swsssdk.SonicV2Connector(host='127.0.0.1')
86+
db.connect(db.APPL_DB, False)
87+
88+
intf_neigh_map = {}
89+
90+
keys = db.keys(db.APPL_DB, 'NEIGH_TABLE:*')
91+
keys = [] if keys is None else keys
92+
for key in keys:
93+
key_split = key.split(':', 2)
94+
intf_name = key_split[1]
95+
if intf_name == 'lo':
96+
continue
97+
dst_ip = key_split[2]
98+
value = db.get_all(db.APPL_DB, key)
99+
if 'neigh' in value and 'family' in value:
100+
dmac = value['neigh']
101+
family = value['family']
102+
else:
103+
raise RuntimeError('Neigh table format is incorrect')
104+
105+
if family not in ip_family:
106+
raise RuntimeError('Neigh table format is incorrect')
107+
108+
ip_mac_pair = []
109+
ip_mac_pair.append(dst_ip)
110+
ip_mac_pair.append(dmac)
111+
112+
intf_neigh_map.setdefault(intf_name, {}).setdefault(family, []).append(ip_mac_pair)
113+
db.close(db.APPL_DB)
114+
return intf_neigh_map
115+
116+
117+
# Use netlink to set neigh entries into the kernel without overwriting existing ones
118+
def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac):
    """Add one neighbor entry to the kernel through netlink.

    ipclass  -- object whose neigh() method issues the netlink request
                (an IPRoute instance in this script)
    family   -- 'IPv4' or 'IPv6'; any other value makes this a no-op
    intf_idx -- kernel interface index the neighbor is attached to
    dst_ip   -- neighbor IP address
    dmac     -- neighbor MAC address

    The entry is added in 'reachable' state. If the kernel reports that
    the neighbor already exists, that is only logged; every other
    NetlinkError is re-raised.
    """
    logging.info('Add neighbor entries: family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
        family, intf_idx, dst_ip, dmac))

    if family not in ip_family:
        return

    family_af_inet = ip_family[family]
    try:
        ipclass.neigh('add',
                      family=family_af_inet,
                      dst=dst_ip,
                      lladdr=dmac,
                      ifindex=intf_idx,
                      state=ndmsg.states['reachable'])
    # If neigh exists, log it but no exception raise, other exceptions, raise
    except NetlinkError as e:
        # read the errno from the exception's code attribute; indexing the
        # exception object (e[0]) is only possible on Python 2
        if e.code == errno.EEXIST:
            logger.warning('Neigh exists in kernel with family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
                family, intf_idx, dst_ip, dmac))
        else:
            raise
140+
141+
# build ARP or NS packets depending on family
142+
def build_arp_ns_pkt(family, smac, src_ip, dst_ip):
143+
if family == 'IPv4':
144+
eth = Ether(src=smac, dst='ff:ff:ff:ff:ff:ff')
145+
pkt = eth/ARP(op=ARP.who_has, pdst=dst_ip)
146+
elif family == 'IPv6':
147+
nsma = in6_getnsma(inet_pton(AF_INET6, dst_ip))
148+
mcast_dst_ip = inet_ntop(AF_INET6, nsma)
149+
dmac = in6_getnsmac(nsma)
150+
eth = Ether(src=smac,dst=dmac)
151+
ipv6 = IPv6(src=src_ip, dst=mcast_dst_ip)
152+
ns = ICMPv6ND_NS(tgt=dst_ip)
153+
ns_opt = ICMPv6NDOptSrcLLAddr(lladdr=smac)
154+
pkt = eth/ipv6/ns/ns_opt
155+
return pkt
156+
157+
# Set the statedb "NEIGH_RESTORE_TABLE|Flags", so neighsyncd can start reconciliation
158+
def set_statedb_neigh_restore_done():
159+
db = swsssdk.SonicV2Connector(host='127.0.0.1')
160+
db.connect(db.STATE_DB, False)
161+
db.set(db.STATE_DB, 'NEIGH_RESTORE_TABLE|Flags', 'restored', 'true')
162+
db.close(db.STATE_DB)
163+
return
164+
165+
def restore_update_kernel_neighbors(intf_neigh_map):
166+
# create object for netlink calls to kernel
167+
ipclass = IPRoute()
168+
mtime = monotonic.time.time
169+
start_time = mtime()
170+
while (mtime() - start_time) < TIME_OUT:
171+
for intf, family_neigh_map in intf_neigh_map.items():
172+
# only try to restore to kernel when link is up
173+
if is_intf_oper_state_up(intf):
174+
src_mac = get_if_hwaddr(intf)
175+
intf_idx = ipclass.link_lookup(ifname=intf)[0]
176+
# create socket per intf to send packets
177+
s = conf.L2socket(iface=intf)
178+
179+
# Only two families: 'IPv4' and 'IPv6'
180+
for family in ip_family.keys():
181+
# if ip address assigned and if we have neighs in this family, restore them
182+
src_ip = first_ip_on_intf(intf, family)
183+
if src_ip and (family in family_neigh_map):
184+
neigh_list = family_neigh_map[family]
185+
for dst_ip, dmac in neigh_list:
186+
# use netlink to set neighbor entries
187+
set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac)
188+
189+
# best effort to update kernel neigh info
190+
# this will be updated by arp_update later too
191+
s.send(build_arp_ns_pkt(family, src_mac, src_ip, dst_ip))
192+
# delete this family on the intf
193+
del intf_neigh_map[intf][family]
194+
# close the pkt socket
195+
s.close()
196+
197+
# if all families are deleted, remove the key
198+
if len(intf_neigh_map[intf]) == 0:
199+
del intf_neigh_map[intf]
200+
# map is empty, all neigh entries are restored
201+
if not intf_neigh_map:
202+
break
203+
time.sleep(CHECK_INTERVAL)
204+
205+
206+
def main():
207+
208+
print "restore_neighbors service is started"
209+
210+
# Use warmstart python binding
211+
warmstart = swsscommon.WarmStart()
212+
warmstart.initialize("neighsyncd", "swss")
213+
warmstart.checkWarmStart("neighsyncd", "swss", False)
214+
215+
# if swss or system warm reboot not enabled, don't run
216+
if not warmstart.isWarmStart():
217+
print "restore_neighbors service is skipped as warm restart not enabled"
218+
return
219+
220+
# swss restart not system warm reboot
221+
if not warmstart.isSystemWarmRebootEnabled():
222+
set_statedb_neigh_restore_done()
223+
print "restore_neighbors service is done as system warm reboot not enabled"
224+
return
225+
226+
# read the neigh table from appDB to internal map
227+
try:
228+
intf_neigh_map = read_neigh_table_to_maps()
229+
except RuntimeError as e:
230+
logger.exception(str(e))
231+
sys.exit(1)
232+
233+
try:
234+
restore_update_kernel_neighbors(intf_neigh_map)
235+
except Exception as e:
236+
logger.exception(str(e))
237+
sys.exit(1)
238+
239+
# set statedb to signal other processes like neighsyncd
240+
set_statedb_neigh_restore_done()
241+
print "restore_neighbor service is done for system warmreboot"
242+
return
243+
244+
if __name__ == '__main__':
245+
main()

tests/conftest.py

+3
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ def runcmd(self, cmd):
135135
def runcmd_async(self, cmd):
    """Start cmd through `ip netns exec <self.nsname>` and return the Popen object without waiting for it."""
    netns_cmd = "ip netns exec %s %s" % (self.nsname, cmd)
    return subprocess.Popen(netns_cmd, shell=True)
137137

138+
def runcmd_output(self, cmd):
    """Run cmd through `ip netns exec <self.nsname>` and return what subprocess.check_output returns for it."""
    netns_cmd = "ip netns exec %s %s" % (self.nsname, cmd)
    return subprocess.check_output(netns_cmd, shell=True)
140+
138141
class DockerVirtualSwitch(object):
139142
def __init__(self, name=None, keeptb=False):
140143
self.basicd = ['redis-server',

0 commit comments

Comments
 (0)