Skip to content

Commit f666011

Browse files
author
Shuotian Cheng
authored
[teammgrd]: Add retry logic when enslaving member port into team (#669)
* [teammgrd]: Add retry logic when enslaving member port into team When a port is not set as admin down, teamd will prevent the port from being added into the port channel. This could happen when some other processes or users are executing commands to bring up the port. This commit will try three times of enslaving before return a false. Example error message when adding a port which is admin up: libteamdctl: cli_usock_process_msg: usock: Error message received: "PortAddFail" libteamdctl: cli_usock_process_msg: usock: Error message content: "Failed to add port." command call failed (Invalid argument) * [teammgrd]: Check the admin status after the teamdctl command failure Instead of having a retry logic with time limitations, it is better to confirm the exact reason of the failure - the member port is admin up or not. The function checkPortIffUp() is added to get the netdev flags and check the IFF_UP flag. The add member logic will only retry under the circumstance that the teamdctl command fails and the member port is still UP. Signed-off-by: Shu0T1an ChenG <[email protected]>
1 parent 36e304d commit f666011

File tree

2 files changed

+89
-20
lines changed

2 files changed

+89
-20
lines changed

cfgmgr/teammgr.cpp

+86-19
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include <unistd.h>
2+
13
#include "exec.h"
24
#include "teammgr.h"
35
#include "logger.h"
@@ -8,6 +10,9 @@
810
#include <sstream>
911
#include <thread>
1012

13+
#include <sys/ioctl.h>
14+
#include <net/if.h>
15+
1116
using namespace std;
1217
using namespace swss;
1318

@@ -182,7 +187,7 @@ void TeamMgr::doLagMemberTask(Consumer &consumer)
182187
{
183188
KeyOpFieldsValuesTuple t = it->second;
184189

185-
auto tokens = tokenize(kfvKey(t), '|');
190+
auto tokens = tokenize(kfvKey(t), config_db_key_delimiter);
186191
auto lag = tokens[0];
187192
auto member = tokens[1];
188193

@@ -196,7 +201,11 @@ void TeamMgr::doLagMemberTask(Consumer &consumer)
196201
continue;
197202
}
198203

199-
addLagMember(lag, member);
204+
if (addLagMember(lag, member) == task_need_retry)
205+
{
206+
it++;
207+
continue;
208+
}
200209
}
201210
else if (op == DEL_COMMAND)
202211
{
@@ -207,6 +216,49 @@ void TeamMgr::doLagMemberTask(Consumer &consumer)
207216
}
208217
}
209218

219+
bool TeamMgr::checkPortIffUp(const string &port)
220+
{
221+
SWSS_LOG_ENTER();
222+
223+
struct ifreq ifr;
224+
memcpy(ifr.ifr_name, port.c_str(), strlen(port.c_str()));
225+
ifr.ifr_name[strlen(port.c_str())] = 0;
226+
227+
int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
228+
if (fd == -1 || ioctl(fd, SIOCGIFFLAGS, &ifr) == -1)
229+
{
230+
SWSS_LOG_ERROR("Failed to get port %s flags", port.c_str());
231+
return false;
232+
}
233+
234+
SWSS_LOG_INFO("Get port %s flags %i", port.c_str(), ifr.ifr_flags);
235+
236+
return ifr.ifr_flags & IFF_UP;
237+
}
238+
239+
bool TeamMgr::findPortMaster(string &master, const string &port)
240+
{
241+
SWSS_LOG_ENTER();
242+
243+
vector<string> keys;
244+
m_cfgLagMemberTable.getKeys(keys);
245+
246+
for (auto key: keys)
247+
{
248+
auto tokens = tokenize(key, config_db_key_delimiter);
249+
auto lag = tokens[0];
250+
auto member = tokens[1];
251+
252+
if (port == member)
253+
{
254+
master = lag;
255+
return true;
256+
}
257+
}
258+
259+
return false;
260+
}
261+
210262
// When a port gets removed and created again, notification is triggered
211263
// when state dabatabase gets updated. In this situation, the port needs
212264
// to be enslaved into the LAG again.
@@ -226,21 +278,13 @@ void TeamMgr::doPortUpdateTask(Consumer &consumer)
226278
{
227279
SWSS_LOG_INFO("Received port %s state update", alias.c_str());
228280

229-
vector<string> keys;
230-
m_cfgLagMemberTable.getKeys(keys);
231-
232-
for (auto key : keys)
281+
string lag;
282+
if (findPortMaster(lag, alias))
233283
{
234-
auto tokens = tokenize(key, '|');
235-
236-
auto lag = tokens[0];
237-
auto member = tokens[1];
238-
239-
// Find the master of the port
240-
if (alias == member)
284+
if (addLagMember(lag, alias) == task_need_retry)
241285
{
242-
addLagMember(lag, alias);
243-
break;
286+
it++;
287+
continue;
244288
}
245289
}
246290
}
@@ -291,7 +335,7 @@ bool TeamMgr::setLagMtu(const string &alias, const string &mtu)
291335

292336
for (auto key : keys)
293337
{
294-
auto tokens = tokenize(key, '|');
338+
auto tokens = tokenize(key, config_db_key_delimiter);
295339
auto lag = tokens[0];
296340
auto member = tokens[1];
297341

@@ -362,7 +406,7 @@ bool TeamMgr::removeLag(const string &alias)
362406
// Once a port is enslaved into a port channel, the port's MTU will
363407
// be inherited from the master's MTU while the port's admin status
364408
// will still be controlled separately.
365-
bool TeamMgr::addLagMember(const string &lag, const string &member)
409+
task_process_status TeamMgr::addLagMember(const string &lag, const string &member)
366410
{
367411
SWSS_LOG_ENTER();
368412

@@ -373,7 +417,29 @@ bool TeamMgr::addLagMember(const string &lag, const string &member)
373417
// ip link set dev <member> down;
374418
// teamdctl <port_channel_name> port add <member>;
375419
cmd << IP_CMD << " link set dev " << member << " down; ";
376-
cmd << TEAMDCTL_CMD << " " << lag << " port add " << member << "; ";
420+
cmd << TEAMDCTL_CMD << " " << lag << " port add " << member;
421+
422+
if (exec(cmd.str(), res) != 0)
423+
{
424+
// teamdctl port add command will fail when the member port is not
425+
// set to admin status down; it is possible that some other processes
426+
// or users (e.g. portmgrd) are executing the command to bring up the
427+
// member port while adding this port into the port channel. This piece
428+
// of code will check if the port is set to admin status up. If yes,
429+
// it will retry to add the port into the port channel.
430+
if (checkPortIffUp(member))
431+
{
432+
SWSS_LOG_INFO("Failed to add %s to port channel %s, retry...",
433+
member.c_str(), lag.c_str());
434+
return task_need_retry;
435+
}
436+
else
437+
{
438+
SWSS_LOG_ERROR("Failed to add %s to port channel %s",
439+
member.c_str(), lag.c_str());
440+
return task_failed;
441+
}
442+
}
377443

378444
vector<FieldValueTuple> fvs;
379445
m_cfgPortTable.get(member, fvs);
@@ -403,6 +469,7 @@ bool TeamMgr::addLagMember(const string &lag, const string &member)
403469
}
404470

405471
// ip link set dev <member> [up|down]
472+
cmd.str(string());
406473
cmd << IP_CMD << " link set dev " << member << " " << admin_status;
407474
EXEC_WITH_ERROR_THROW(cmd.str(), res);
408475

@@ -415,7 +482,7 @@ bool TeamMgr::addLagMember(const string &lag, const string &member)
415482

416483
SWSS_LOG_NOTICE("Add %s to port channel %s", member.c_str(), lag.c_str());
417484

418-
return true;
485+
return task_success;
419486
}
420487

421488
// Once a port is removed from from the master, both the admin status and the

cfgmgr/teammgr.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,14 @@ class TeamMgr : public Orch
4040

4141
bool addLag(const string &alias, int min_links, bool fall_back);
4242
bool removeLag(const string &alias);
43-
bool addLagMember(const string &lag, const string &member);
43+
task_process_status addLagMember(const string &lag, const string &member);
4444
bool removeLagMember(const string &lag, const string &member);
4545

4646
bool setLagAdminStatus(const string &alias, const string &admin_status);
4747
bool setLagMtu(const string &alias, const string &mtu);
4848

49+
bool findPortMaster(string &, const string &);
50+
bool checkPortIffUp(const string &);
4951
bool isPortStateOk(const string&);
5052
bool isLagStateOk(const string&);
5153
};

0 commit comments

Comments
 (0)