Skip to content

Commit 9fda944

Browse files
jipanyangqiluo-msft
authored and committed
Warm reboot: Add support for orchagent pre-shutdown warm-restart state check (sonic-net#562)
* Add orchagent pre-warm-restart check mechanism * Add orchagent_restart_check options: --noFreeze & --skipPendingTaskCheck * Add waitTime option for response from orchagent * Fix build issue with latest master * adapt to new dvs.runcmd() signature * Move standard header before local headers
1 parent 41e61bd commit 9fda944

8 files changed

+342
-3
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ neighsyncd/neighsyncd
5252
portsyncd/portsyncd
5353
orchagent/orchagent
5454
orchagent/routeresync
55+
orchagent/orchagent_restart_check
5556
swssconfig/swssconfig
5657
swssconfig/swssplayer
5758
tests/tests

orchagent/Makefile.am

+5-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ dist_swss_DATA = \
1010
pfc_detect_barefoot.lua \
1111
pfc_restore.lua
1212

13-
bin_PROGRAMS = orchagent routeresync
13+
bin_PROGRAMS = orchagent routeresync orchagent_restart_check
1414

1515
if DEBUG
1616
DBGFLAGS = -ggdb -DDEBUG
@@ -86,3 +86,7 @@ routeresync_SOURCES = routeresync.cpp
8686
routeresync_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
8787
routeresync_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
8888
routeresync_LDADD = -lswsscommon
89+
90+
orchagent_restart_check_SOURCES = orchagent_restart_check.cpp
91+
orchagent_restart_check_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON)
92+
orchagent_restart_check_LDADD = -lhiredis -lswsscommon -lpthread

orchagent/orchagent_restart_check.cpp

+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#include <iostream>
2+
#include <sstream>
3+
4+
#include <unistd.h>
5+
#include <getopt.h>
6+
7+
#include "notificationproducer.h"
8+
#include "notificationconsumer.h"
9+
#include "select.h"
10+
#include "logger.h"
11+
12+
13+
void printUsage()
14+
{
15+
SWSS_LOG_ENTER();
16+
17+
std::cout << "Usage: orchagent_restart_check [-s] " << std::endl;
18+
std::cout << " -n --noFreeze" << std::endl;
19+
std::cout << " Don't freeze orchagent even if check succeeded" << std::endl;
20+
std::cout << " -s --skipPendingTaskCheck" << std::endl;
21+
std::cout << " Skip pending task dependency check for orchagent" << std::endl;
22+
std::cout << " -w --waitTime" << std::endl;
23+
std::cout << " Wait time for response from orchagent, in milliseconds. Default value: 1000" << std::endl;
24+
std::cout << " -h --help:" << std::endl;
25+
std::cout << " Print out this message" << std::endl;
26+
}
27+
28+
29+
/*
30+
* Before stopping orchagent for warm restart, basic state check is preferred to
31+
* ensure orchagent is not in transient state, so a deterministic state may be restored after restart.
32+
*
33+
* Here is to implement orchagent_restart_check binary which may talk to orchagent and
34+
* ask it to do self-check, return "READY" signal and freeze if everything is ok,
35+
* otherwise "NOT_READY" signal should be returned.
36+
*
37+
* Optionally:
38+
* if --noFreeze option is provided, orchagent won't freeze.
39+
* if --skipPendingTaskCheck option is provided, orchagent won't use
40+
* the existence of pending tasks as a state check criterion.
41+
*/
42+
int main(int argc, char **argv)
43+
{
44+
swss::Logger::getInstance().setMinPrio(swss::Logger::SWSS_INFO);
45+
SWSS_LOG_ENTER();
46+
47+
std::string skipPendingTaskCheck = "fasle";
48+
std::string noFreeze = "fasle";
49+
/* Default wait time is 1000 millisecond */
50+
int waitTime = 1000;
51+
52+
const char* const optstring = "nsw:";
53+
while(true)
54+
{
55+
static struct option long_options[] =
56+
{
57+
{ "noFreeze", no_argument, 0, 'n' },
58+
{ "skipPendingTaskCheck", no_argument, 0, 's' },
59+
{ "waitTime", required_argument, 0, 'w' }
60+
};
61+
62+
int option_index = 0;
63+
64+
int c = getopt_long(argc, argv, optstring, long_options, &option_index);
65+
66+
if (c == -1)
67+
{
68+
break;
69+
}
70+
71+
switch (c)
72+
{
73+
case 'n':
74+
SWSS_LOG_NOTICE("Won't freeze orchagent even if check succeeded");
75+
noFreeze = "true";
76+
break;
77+
case 's':
78+
SWSS_LOG_NOTICE("Skipping pending task check for orchagent");
79+
skipPendingTaskCheck = "true";
80+
break;
81+
case 'w':
82+
SWSS_LOG_NOTICE("Wait time for response from orchagent set to %s milliseconds", optarg);
83+
waitTime = atoi(optarg);
84+
break;
85+
case 'h':
86+
printUsage();
87+
exit(EXIT_SUCCESS);
88+
89+
case '?':
90+
SWSS_LOG_WARN("unknown option %c", optopt);
91+
printUsage();
92+
exit(EXIT_FAILURE);
93+
94+
default:
95+
SWSS_LOG_ERROR("getopt_long failure");
96+
exit(EXIT_FAILURE);
97+
}
98+
}
99+
100+
swss::DBConnector db(APPL_DB, swss::DBConnector::DEFAULT_UNIXSOCKET, 0);
101+
// Send warm restart query via "RESTARTCHECK" notification channel
102+
swss::NotificationProducer restartQuery(&db, "RESTARTCHECK");
103+
// Will listen for the reply on "RESTARTCHECKREPLY" channel
104+
swss::NotificationConsumer restartQueryReply(&db, "RESTARTCHECKREPLY");
105+
106+
std::vector<swss::FieldValueTuple> values;
107+
values.emplace_back("NoFreeze", noFreeze);
108+
values.emplace_back("SkipPendingTaskCheck", skipPendingTaskCheck);
109+
std::string op = "orchagent";
110+
SWSS_LOG_NOTICE("requested %s to do warm restart state check", op.c_str());
111+
restartQuery.send(op, op, values);
112+
113+
114+
swss::Select s;
115+
s.addSelectable(&restartQueryReply);
116+
swss::Selectable *sel;
117+
std::string op_ret, data;
118+
values.clear();
119+
int result = s.select(&sel, waitTime);
120+
if (result == swss::Select::OBJECT)
121+
{
122+
restartQueryReply.pop(op_ret, data, values);
123+
if (data == "READY")
124+
{
125+
SWSS_LOG_NOTICE("RESTARTCHECK success, %s is frozen and ready for warm restart", op_ret.c_str());
126+
std::cout << "RESTARTCHECK succeeded" << std::endl;
127+
return EXIT_SUCCESS;
128+
}
129+
else
130+
{
131+
SWSS_LOG_NOTICE("RESTARTCHECK failed, %s is not ready for warm restart with status %s",
132+
op_ret.c_str(), data.c_str());
133+
}
134+
}
135+
else if (result == swss::Select::TIMEOUT)
136+
{
137+
SWSS_LOG_NOTICE("RESTARTCHECK for %s timed out", op_ret.c_str());
138+
}
139+
else
140+
{
141+
SWSS_LOG_NOTICE("RESTARTCHECK for %s error", op_ret.c_str());
142+
}
143+
std::cout << "RESTARTCHECK failed" << std::endl;
144+
return EXIT_FAILURE;
145+
}

orchagent/orchdaemon.cpp

+60
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <unistd.h>
22
#include <unordered_map>
3+
#include <limits.h>
34
#include "orchdaemon.h"
45
#include "logger.h"
56
#include <sairedis.h>
@@ -343,6 +344,26 @@ void OrchDaemon::start()
343344
* is a good chance to flush the pipeline before next select happened.
344345
*/
345346
flush();
347+
348+
/*
349+
* Asked to check warm restart readiness.
350+
* Not doing this under Select::TIMEOUT condition because of
351+
* the existence of finer granularity ExecutableTimer with select
352+
*/
353+
if (gSwitchOrch->checkRestartReady())
354+
{
355+
bool ret = warmRestartCheck();
356+
if (ret)
357+
{
358+
// Orchagent is ready to perform warm restart, stop processing any new db data.
359+
// Should sleep here or continue handling timers and etc.??
360+
if (!gSwitchOrch->checkRestartNoFreeze())
361+
{
362+
SWSS_LOG_WARN("Orchagent is frozen for warm restart!");
363+
sleep(UINT_MAX);
364+
}
365+
}
366+
}
346367
}
347368
}
348369

@@ -435,3 +456,42 @@ bool OrchDaemon::warmRestoreValidation()
435456
WarmStart::setWarmStartState("orchagent", WarmStart::RESTORED);
436457
return true;
437458
}
459+
460+
/*
461+
* Reply with "READY" notification if no pending tasks, and return true.
462+
* Otherwise reply with "NOT_READY" notification and return false.
463+
* Further consideration is needed as to when orchagent is treated as warm restart ready.
464+
* For now, no pending task should exist in any orch agent.
465+
*/
466+
bool OrchDaemon::warmRestartCheck()
467+
{
468+
std::vector<swss::FieldValueTuple> values;
469+
std::string op = "orchagent";
470+
std::string data = "READY";
471+
bool ret = true;
472+
473+
vector<string> ts;
474+
getTaskToSync(ts);
475+
476+
if (ts.size() != 0)
477+
{
478+
SWSS_LOG_NOTICE("WarmRestart check found pending tasks: ");
479+
for(auto &s : ts)
480+
{
481+
SWSS_LOG_NOTICE(" %s", s.c_str());
482+
}
483+
if (!gSwitchOrch->skipPendingTaskCheck())
484+
{
485+
data = "NOT_READY";
486+
ret = false;
487+
}
488+
else
489+
{
490+
SWSS_LOG_NOTICE("Orchagent objects dependency check skipped");
491+
}
492+
}
493+
494+
SWSS_LOG_NOTICE("Restart check result: %s", data.c_str());
495+
gSwitchOrch->restartCheckReply(op, data, values);
496+
return ret;
497+
}

orchagent/orchdaemon.h

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class OrchDaemon
3939
bool warmRestoreAndSyncUp();
4040
void getTaskToSync(vector<string> &ts);
4141
bool warmRestoreValidation();
42+
43+
bool warmRestartCheck();
4244
private:
4345
DBConnector *m_applDb;
4446
DBConnector *m_configDb;

orchagent/switchorch.cpp

+55-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
#include "switchorch.h"
44
#include "converter.h"
5+
#include "notifier.h"
6+
#include "notificationproducer.h"
57

68
using namespace std;
79
using namespace swss;
@@ -27,8 +29,12 @@ const map<string, sai_packet_action_t> packet_action_map =
2729
};
2830

2931
SwitchOrch::SwitchOrch(DBConnector *db, string tableName) :
30-
Orch(db, tableName)
32+
Orch(db, tableName),
33+
m_db(db)
3134
{
35+
m_restartCheckNotificationConsumer = new NotificationConsumer(db, "RESTARTCHECK");
36+
auto restartCheckNotifier = new Notifier(m_restartCheckNotificationConsumer, this, "RESTARTCHECK");
37+
Orch::addExecutor(restartCheckNotifier);
3238
}
3339

3440
void SwitchOrch::doTask(Consumer &consumer)
@@ -122,3 +128,51 @@ void SwitchOrch::doTask(Consumer &consumer)
122128
}
123129
}
124130

131+
/*
 * Handle a notification: pop it, and if it arrived on the restart-check
 * consumer, record the request in m_warmRestartCheck.
 *
 * Every request first clears all three flags. Only a request whose op is
 * "orchagent" arms checkRestartReadyState; its "NoFreeze" and
 * "SkipPendingTaskCheck" fields enable the matching flag when their value
 * is exactly "true".
 */
void SwitchOrch::doTask(NotificationConsumer& consumer)
{
    SWSS_LOG_ENTER();

    std::string op;
    std::string data;
    std::vector<swss::FieldValueTuple> values;

    // The message is popped before the consumer identity is checked.
    consumer.pop(op, data, values);

    if (&consumer != m_restartCheckNotificationConsumer)
    {
        return;
    }

    // Start from a clean slate for every incoming request.
    m_warmRestartCheck.skipPendingTaskCheck = false;
    m_warmRestartCheck.noFreeze = false;
    m_warmRestartCheck.checkRestartReadyState = false;

    SWSS_LOG_NOTICE("RESTARTCHECK notification for %s ", op.c_str());
    if (op != "orchagent")
    {
        return;
    }

    m_warmRestartCheck.checkRestartReadyState = true;

    // Build an "op|field:value|field:value" summary while scanning the options.
    string summary = op;
    for (auto &fv : values)
    {
        summary += "|";
        summary += fvField(fv);
        summary += ":";
        summary += fvValue(fv);

        const bool enabled = (fvValue(fv) == "true");
        if (enabled && fvField(fv) == "NoFreeze")
        {
            m_warmRestartCheck.noFreeze = true;
        }
        if (enabled && fvField(fv) == "SkipPendingTaskCheck")
        {
            m_warmRestartCheck.skipPendingTaskCheck = true;
        }
    }
    SWSS_LOG_NOTICE("%s", summary.c_str());
}
172+
173+
void SwitchOrch::restartCheckReply(const string &op, const string &data, std::vector<FieldValueTuple> &values)
174+
{
175+
NotificationProducer restartRequestReply(m_db, "RESTARTCHECKREPLY");
176+
restartRequestReply.send(op, data, values);
177+
checkRestartReadyDone();
178+
}

orchagent/switchorch.h

+20
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,31 @@
22

33
#include "orch.h"
44

5+
// State of a pre-shutdown restart-check request, kept as three plain flags.
struct WarmRestartCheck
{
    // A restart-check request is waiting to be answered.
    bool    checkRestartReadyState;
    // The request asked not to freeze even if the check succeeds.
    bool    noFreeze;
    // The request asked to ignore pending tasks when judging readiness.
    bool    skipPendingTaskCheck;
};
11+
512
class SwitchOrch : public Orch
613
{
714
public:
815
SwitchOrch(DBConnector *db, string tableName);
916

17+
bool checkRestartReady() { return m_warmRestartCheck.checkRestartReadyState; }
18+
bool checkRestartNoFreeze() { return m_warmRestartCheck.noFreeze; }
19+
bool skipPendingTaskCheck() { return m_warmRestartCheck.skipPendingTaskCheck; }
20+
void checkRestartReadyDone() { m_warmRestartCheck.checkRestartReadyState = false; }
21+
void restartCheckReply(const string &op, const string &data, std::vector<FieldValueTuple> &values);
1022
private:
1123
void doTask(Consumer &consumer);
24+
25+
NotificationConsumer* m_restartCheckNotificationConsumer;
26+
void doTask(NotificationConsumer& consumer);
27+
DBConnector *m_db;
28+
29+
// Information contained in the request from
30+
// external program for orchagent pre-shutdown state check
31+
WarmRestartCheck m_warmRestartCheck = {false, false, false};
1232
};

0 commit comments

Comments
 (0)