Skip to content

Commit c63a0cd

Browse files
nkelapur and raphaelt-nvidia
authored and committed
Added changes to handle dependency check in FdbSyncd and FpmSyncd for warm-boot (sonic-net#1556)
Added changes to handle the dependency check in FpmSyncd and FdbSyncd for warm-reboot. This ensures that, for EVPN warm-reboot, the order in which data is replayed to the kernel is maintained across the various submodules, so that kernel programming succeeds.
1 parent 6600093 commit c63a0cd

File tree

7 files changed

+132
-10
lines changed

7 files changed

+132
-10
lines changed

fdbsyncd/fdbsync.cpp

+37-2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,36 @@ FdbSync::~FdbSync()
4343
}
4444
}
4545

46+
47+
// Check if interface entries are restored in kernel
48+
bool FdbSync::isIntfRestoreDone()
49+
{
50+
vector<string> required_modules = {
51+
"vxlanmgrd",
52+
"intfmgrd",
53+
"vlanmgrd",
54+
"vrfmgrd"
55+
};
56+
57+
for (string& module : required_modules)
58+
{
59+
WarmStart::WarmStartState state;
60+
61+
WarmStart::getWarmStartState(module, state);
62+
if (state == WarmStart::REPLAYED || state == WarmStart::RECONCILED)
63+
{
64+
SWSS_LOG_INFO("Module %s Replayed or Reconciled %d",module.c_str(), (int) state);
65+
}
66+
else
67+
{
68+
SWSS_LOG_INFO("Module %s NOT Replayed or Reconciled %d",module.c_str(), (int) state);
69+
return false;
70+
}
71+
}
72+
73+
return true;
74+
}
75+
4676
void FdbSync::processCfgEvpnNvo()
4777
{
4878
std::deque<KeyOpFieldsValuesTuple> entries;
@@ -447,14 +477,17 @@ void FdbSync::macDelVxlanDB(string key)
447477
fvVector.push_back(t);
448478
fvVector.push_back(v);
449479

480+
SWSS_LOG_NOTICE("%sVXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s",
481+
m_AppRestartAssist->isWarmStartInProgress() ? "WARM-RESTART:" : "" ,
482+
key.c_str(), vtep.c_str(), type.c_str());
483+
450484
// If warmstart is in progress, we take all netlink changes into the cache map
451485
if (m_AppRestartAssist->isWarmStartInProgress())
452486
{
453487
m_AppRestartAssist->insertToMap(APP_VXLAN_FDB_TABLE_NAME, key, fvVector, true);
454488
return;
455489
}
456490

457-
SWSS_LOG_INFO("VXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s", key.c_str(), vtep.c_str(), type.c_str());
458491
m_fdbTable.del(key);
459492
return;
460493

@@ -476,14 +509,16 @@ void FdbSync::macAddVxlan(string key, struct in_addr vtep, string type, uint32_t
476509
fvVector.push_back(t);
477510
fvVector.push_back(v);
478511

512+
SWSS_LOG_INFO("%sVXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s",
513+
m_AppRestartAssist->isWarmStartInProgress() ? "WARM-RESTART:" : "" ,
514+
key.c_str(), svtep.c_str(), type.c_str());
479515
// If warmstart is in progress, we take all netlink changes into the cache map
480516
if (m_AppRestartAssist->isWarmStartInProgress())
481517
{
482518
m_AppRestartAssist->insertToMap(APP_VXLAN_FDB_TABLE_NAME, key, fvVector, false);
483519
return;
484520
}
485521

486-
SWSS_LOG_INFO("VXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s", key.c_str(), svtep.c_str(), type.c_str());
487522
m_fdbTable.set(key, fvVector);
488523

489524
return;

fdbsyncd/fdbsync.h

+12-3
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,17 @@
99
#include "netmsg.h"
1010
#include "warmRestartAssist.h"
1111

12-
// The timeout value (in seconds) for fdbsyncd reconcilation logic
13-
#define DEFAULT_FDBSYNC_WARMSTART_TIMER 30
12+
/*
13+
* Default timer interval for fdbsyncd reconciliation
14+
*/
15+
#define DEFAULT_FDBSYNC_WARMSTART_TIMER 120
16+
17+
/*
18+
* This is the MAX time in seconds, fdbsyncd will wait after warm-reboot
19+
* for the interface entries to be recreated in kernel before attempting to
20+
* write the FDB data to kernel
21+
*/
22+
#define INTF_RESTORE_MAX_WAIT_TIME 180
1423

1524
namespace swss {
1625

@@ -43,7 +52,7 @@ class FdbSync : public NetMsg
4352

4453
virtual void onMsg(int nlmsg_type, struct nl_object *obj);
4554

46-
bool isFdbRestoreDone();
55+
bool isIntfRestoreDone();
4756

4857
AppRestartAssist *getRestartAssist()
4958
{

fdbsyncd/fdbsyncd.cpp

+54-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "netdispatcher.h"
88
#include "netlink.h"
99
#include "fdbsyncd/fdbsync.h"
10+
#include "warm_restart.h"
1011

1112
using namespace std;
1213
using namespace swss;
@@ -35,6 +36,7 @@ int main(int argc, char **argv)
3536
Selectable *temps;
3637
int ret;
3738
Select s;
39+
SelectableTimer replayCheckTimer(timespec{0, 0});
3840

3941
using namespace std::chrono;
4042

@@ -45,7 +47,29 @@ int main(int argc, char **argv)
4547
if (sync.getRestartAssist()->isWarmStartInProgress())
4648
{
4749
sync.getRestartAssist()->readTablesToMap();
48-
SWSS_LOG_NOTICE("Starting ReconcileTimer");
50+
51+
steady_clock::time_point starttime = steady_clock::now();
52+
while (!sync.isIntfRestoreDone())
53+
{
54+
duration<double> time_span =
55+
duration_cast<duration<double>>(steady_clock::now() - starttime);
56+
int pasttime = int(time_span.count());
57+
58+
if (pasttime > INTF_RESTORE_MAX_WAIT_TIME)
59+
{
60+
SWSS_LOG_INFO("timed-out before all interface data was replayed to kernel!!!");
61+
throw runtime_error("fdbsyncd: timedout on interface data replay");
62+
}
63+
sleep(1);
64+
}
65+
replayCheckTimer.setInterval(timespec{1, 0});
66+
replayCheckTimer.start();
67+
s.addSelectable(&replayCheckTimer);
68+
}
69+
else
70+
{
71+
sync.getRestartAssist()->warmStartDisabled();
72+
sync.m_reconcileDone = true;
4973
}
5074

5175
netlink.registerGroup(RTNLGRP_LINK);
@@ -67,14 +91,41 @@ int main(int argc, char **argv)
6791
{
6892
s.select(&temps);
6993

70-
if(temps == (Selectable *)sync.getFdbStateTable())
94+
if (temps == (Selectable *)sync.getFdbStateTable())
7195
{
7296
sync.processStateFdb();
7397
}
7498
else if (temps == (Selectable *)sync.getCfgEvpnNvoTable())
7599
{
76100
sync.processCfgEvpnNvo();
77101
}
102+
else if (temps == &replayCheckTimer)
103+
{
104+
if (sync.getFdbStateTable()->empty() && sync.getCfgEvpnNvoTable()->empty())
105+
{
106+
sync.getRestartAssist()->appDataReplayed();
107+
SWSS_LOG_NOTICE("FDB Replay Complete");
108+
s.removeSelectable(&replayCheckTimer);
109+
110+
/* Obtain warm-restart timer defined for routing application */
111+
uint32_t warmRestartIval = WarmStart::getWarmStartTimer("bgp","bgp");
112+
if (warmRestartIval)
113+
{
114+
sync.getRestartAssist()->setReconcileInterval(warmRestartIval);
115+
}
116+
//Else the interval is already set to default value
117+
118+
//TODO: Optimise the reconciliation time using eoiu - issue#1657
119+
SWSS_LOG_NOTICE("Starting ReconcileTimer");
120+
sync.getRestartAssist()->startReconcileTimer(s);
121+
}
122+
else
123+
{
124+
replayCheckTimer.setInterval(timespec{1, 0});
125+
// re-start replay check timer
126+
replayCheckTimer.start();
127+
}
128+
}
78129
else
79130
{
80131
/*
@@ -88,7 +139,7 @@ int main(int argc, char **argv)
88139
sync.m_reconcileDone = true;
89140
sync.getRestartAssist()->stopReconcileTimer(s);
90141
sync.getRestartAssist()->reconcile();
91-
SWSS_LOG_NOTICE("VXLAN FDB VNI Reconcillation Complete (Timer)");
142+
SWSS_LOG_NOTICE("VXLAN FDB VNI Reconcillation Complete");
92143
}
93144
}
94145
}

fpmsyncd/fpmsyncd.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ using namespace swss;
1818
*/
1919
const uint32_t DEFAULT_ROUTING_RESTART_INTERVAL = 120;
2020

21+
2122
// Wait 3 seconds after detecting EOIU reached state
2223
// TODO: support eoiu hold interval config
2324
const uint32_t DEFAULT_EOIU_HOLD_INTERVAL = 3;
@@ -67,6 +68,7 @@ int main(int argc, char **argv)
6768
SelectableTimer eoiuCheckTimer(timespec{0, 0});
6869
// After eoiu flags are detected, start a hold timer before starting reconciliation.
6970
SelectableTimer eoiuHoldTimer(timespec{0, 0});
71+
7072
/*
7173
* Pipeline should be flushed right away to deal with state pending
7274
* from previous try/catch iterations.
@@ -108,6 +110,10 @@ int main(int argc, char **argv)
108110
s.addSelectable(&eoiuCheckTimer);
109111
SWSS_LOG_NOTICE("Warm-Restart eoiuCheckTimer timer started.");
110112
}
113+
else
114+
{
115+
sync.m_warmStartHelper.setState(WarmStart::WSDISABLED);
116+
}
111117

112118
while (true)
113119
{
@@ -132,6 +138,7 @@ int main(int argc, char **argv)
132138
{
133139
SWSS_LOG_NOTICE("Warm-Restart EOIU hold timer expired.");
134140
}
141+
135142
if (sync.m_warmStartHelper.inProgress())
136143
{
137144
sync.m_warmStartHelper.reconcile();

tests/test_warm_reboot.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def swss_app_check_RestoreCount_single(state_db, restore_count, name):
7676
if fv[0] == "restore_count":
7777
assert int(fv[1]) == restore_count[key] + 1
7878
elif fv[0] == "state":
79-
assert fv[1] == "reconciled" or fv[1] == "disabled"
79+
assert fv[1] == "reconciled" or fv[1] == "disabled"
8080

8181
def swss_app_check_warmstart_state(state_db, name, state):
8282
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
@@ -1150,7 +1150,7 @@ def test_routing_WarmRestart(self, dvs, testlog):
11501150
time.sleep(5)
11511151

11521152
# Verify FSM
1153-
swss_app_check_warmstart_state(state_db, "bgp", "")
1153+
swss_app_check_warmstart_state(state_db, "bgp", "disabled")
11541154

11551155
# Verify that multiple changes are seen in swss and sairedis logs as there's
11561156
# no warm-reboot logic in place.

warmrestart/warmRestartAssist.cpp

+17
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,16 @@ AppRestartAssist::cache_state_t AppRestartAssist::getCacheEntryState(const std::
117117
throw std::logic_error("cache entry state is invalid");
118118
}
119119

120+
void AppRestartAssist::appDataReplayed()
121+
{
122+
WarmStart::setWarmStartState(m_appName, WarmStart::REPLAYED);
123+
}
124+
125+
void AppRestartAssist::warmStartDisabled()
126+
{
127+
WarmStart::setWarmStartState(m_appName, WarmStart::WSDISABLED);
128+
}
129+
120130
// Read table(s) from APPDB and append stale flag then insert to cachemap
121131
void AppRestartAssist::readTablesToMap()
122132
{
@@ -274,6 +284,13 @@ void AppRestartAssist::reconcile()
274284
return;
275285
}
276286

287+
// set the reconcile interval
288+
void AppRestartAssist::setReconcileInterval(uint32_t time)
289+
{
290+
m_reconcileTimer = time;
291+
m_warmStartTimer.setInterval(timespec{m_reconcileTimer, 0});
292+
}
293+
277294
// start the timer, take Select class "s" to add the timer.
278295
void AppRestartAssist::startReconcileTimer(Select &s)
279296
{

warmrestart/warmRestartAssist.h

+3
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,13 @@ class AppRestartAssist
7575
DELETE = 3
7676
};
7777
// These functions were used as described in the class description
78+
void setReconcileInterval(uint32_t time);
7879
void startReconcileTimer(Select &s);
7980
void stopReconcileTimer(Select &s);
8081
bool checkReconcileTimer(Selectable *s);
8182
void readTablesToMap(void);
83+
void appDataReplayed(void);
84+
void warmStartDisabled(void);
8285
void insertToMap(std::string tableName, std::string key, std::vector<FieldValueTuple> fvVector, bool delete_key);
8386
void reconcile(void);
8487
bool isWarmStartInProgress(void)

0 commit comments

Comments
 (0)