Skip to content

Commit fbdcaae

Browse files
authored
[teammgrd]: Improve LAGs cleanup on shutdown: send SIGTERM directly to PID. (#1841)
This PR is intended to fix the LAG cleanup degradation caused by the python2.7 -> python3 migration. The approach is to replace the `teamd -k -t` call with a raw `SIGTERM` and add a PID alive check. This makes sure that `teammgrd` is stopped only after all managed processes have been killed. resolves: #8071 **What I did** * Replaced the `teamd -k -t` call with a raw `SIGTERM` * Added a PID alive check **Why I did it** * To fix the LAG cleanup timeout issue caused by the python2.7 -> python3 upgrade **How I verified it** 1. Configure 64 LAG RIFs 2. Reload config
1 parent 002bb1d commit fbdcaae

File tree

3 files changed

+43
-9
lines changed

3 files changed

+43
-9
lines changed

cfgmgr/teammgr.cpp

+40-5
Original file line numberDiff line numberDiff line change
@@ -112,18 +112,53 @@ void TeamMgr::doTask(Consumer &consumer)
112112
}
113113
}
114114

115-
116115
void TeamMgr::cleanTeamProcesses()
117116
{
118117
SWSS_LOG_ENTER();
119118
SWSS_LOG_NOTICE("Cleaning up LAGs during shutdown...");
120-
for (const auto& it: m_lagList)
119+
120+
std::unordered_map<std::string, pid_t> aliasPidMap;
121+
122+
for (const auto& alias: m_lagList)
123+
{
124+
std::string res;
125+
pid_t pid;
126+
127+
{
128+
std::stringstream cmd;
129+
cmd << "cat " << shellquote("/var/run/teamd/" + alias + ".pid");
130+
EXEC_WITH_ERROR_THROW(cmd.str(), res);
131+
132+
pid = static_cast<pid_t>(std::stoul(res, nullptr, 10));
133+
aliasPidMap[alias] = pid;
134+
135+
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
136+
}
137+
138+
{
139+
std::stringstream cmd;
140+
cmd << "kill -TERM " << pid;
141+
EXEC_WITH_ERROR_THROW(cmd.str(), res);
142+
143+
SWSS_LOG_INFO("Sent SIGTERM to port channel %s pid %d", alias.c_str(), pid);
144+
}
145+
}
146+
147+
for (const auto& cit: aliasPidMap)
121148
{
122-
//This will call team -k kill -t <teamdevicename> which internally send SIGTERM
123-
removeLag(it);
149+
const auto &alias = cit.first;
150+
const auto &pid = cit.second;
151+
152+
std::stringstream cmd;
153+
std::string res;
154+
155+
SWSS_LOG_NOTICE("Waiting for port channel %s pid %d to stop...", alias.c_str(), pid);
156+
157+
cmd << "tail -f --pid=" << pid << " /dev/null";
158+
EXEC_WITH_ERROR_THROW(cmd.str(), res);
124159
}
125160

126-
return;
161+
SWSS_LOG_NOTICE("LAGs cleanup is done");
127162
}
128163

129164
void TeamMgr::doLagTask(Consumer &consumer)

cfgmgr/teammgr.h

-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ class TeamMgr : public Orch
3232
ProducerStateTable m_appLagTable;
3333

3434
std::set<std::string> m_lagList;
35-
std::map<std::string, pid_t> m_lagPIDList;
3635

3736
MacAddress m_mac;
3837

@@ -50,7 +49,6 @@ class TeamMgr : public Orch
5049
bool setLagMtu(const std::string &alias, const std::string &mtu);
5150
bool setLagLearnMode(const std::string &alias, const std::string &learn_mode);
5251
bool setLagTpid(const std::string &alias, const std::string &tpid);
53-
5452

5553
bool isPortEnslaved(const std::string &);
5654
bool findPortMaster(std::string &, const std::string &);

cfgmgr/teammgrd.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ int main(int argc, char **argv)
6666
}
6767

6868
while (!received_sigterm)
69-
{
69+
{
7070
Selectable *sel;
7171
int ret;
7272

@@ -91,7 +91,8 @@ int main(int argc, char **argv)
9191
catch (const exception &e)
9292
{
9393
SWSS_LOG_ERROR("Runtime error: %s", e.what());
94+
return EXIT_FAILURE;
9495
}
9596

96-
return -1;
97+
return EXIT_SUCCESS;
9798
}

0 commit comments

Comments
 (0)