Skip to content

Commit a828e86

Browse files
lolyuyxieca
authored andcommitted
Fix inconsistent mux state (#92)
What is the motivation for this PR? This issue occurs when running config load_minigraph to load new configs at both ToRs. After write_standby.py, the muxorch will try to direct all traffic downstream to the SoC IP and server IP to the tunnel, but xcvrd might fail to set the adminforwardingstate of the port via gRPC because the gRPC channel could not be established because the other ToR is in standby state. After linkmgrd starts to run and receives heartbeats from itself, it will try to toggle to active[toggle#1], but xcvrd might not be able to make the hardware toggle at the moment, so linkmgrd will mux wait. Also, because the mux probe table is initialized with Unknown state, linkmgrd will handle the initial mux probe state to have the composite states (active, unknown, up) and tries to probe the mux state. As the muxorch has been toggled to active[toggle#1], the gRPC channel will be established at some point after, xcvrd will be able to answer the mux probe, so the linkmgrd will be able to change into (active, active, up) state. But the toggling to active[toggle#1] is only finished half way, the mux status in STATE_DB:MUX_CABLE_TABLE is not updated, so show mux status will show unknown for those ports. Signed-off-by: Longxiang Lyu [email protected] How did you do it? When the linkmgrd changes into states (active, active, up) and has the original mux state as unknown, it will toggle the mux to active again to have those DB tables updated: linkmgrd -> APP_DB:MUX_CABLE_TABLE -> swss -> APP_DB:HW_MUX_CABLE_TABLE -> xcvrd xcvrd -> STATE_DB:HW_MUX_CABLE_TABLE -> swss -> STATE_DB:MUX_CABLE_TABLE -> linkmgrd How did you verify/test it? On dualtor-mixed topo with icmp_responder running, do config load_minigraph on both ToRs, verify the show mux status on both ToRs. Signed-off-by: Longxiang Lyu <[email protected]>
1 parent 59334be commit a828e86

3 files changed

+75
-13
lines changed

src/link_manager/LinkManagerStateMachineActiveActive.cpp

+25-3
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,15 @@ void ActiveActiveStateMachine::initializeTransitionFunctionTable()
507507
MUXLOGWARNING("Initializing State Transition Table...");
508508
LinkManagerStateMachineBase::initializeTransitionFunctionTable();
509509

510+
mStateTransitionHandler[link_prober::LinkProberState::Label::Active]
511+
[mux_state::MuxState::Label::Active]
512+
[link_state::LinkState::Label::Up] =
513+
boost::bind(
514+
&ActiveActiveStateMachine::LinkProberActiveMuxActiveLinkUpTransitionFunction,
515+
this,
516+
boost::placeholders::_1
517+
);
518+
510519
mStateTransitionHandler[link_prober::LinkProberState::Label::Active]
511520
[mux_state::MuxState::Label::Standby]
512521
[link_state::LinkState::Label::Up] =
@@ -580,6 +589,19 @@ void ActiveActiveStateMachine::initializeTransitionFunctionTable()
580589
);
581590
}
582591

592+
//
593+
// ---> LinkProberActiveMuxActiveLinkUpTransitionFunction(CompositeState &nextState);
594+
//
595+
// transition function when entering {LinkProberActive, MuxActive, LinkUp} state
596+
//
597+
void ActiveActiveStateMachine::LinkProberActiveMuxActiveLinkUpTransitionFunction(CompositeState &nextState)
598+
{
599+
MUXLOGINFO(mMuxPortConfig.getPortName());
600+
if (ms(mCompositeState) == mux_state::MuxState::Unknown) {
601+
switchMuxState(nextState, mux_state::MuxState::Label::Active);
602+
}
603+
}
604+
583605
//
584606
// ---> LinkProberActiveMuxStandbyLinkUpTransitionFunction(CompositeState &nextState);
585607
//
@@ -1002,9 +1024,9 @@ void ActiveActiveStateMachine::handlePeerMuxWaitTimeout(boost::system::error_cod
10021024
}
10031025
}
10041026

1005-
//
1027+
//
10061028
// ---> handleDefaultRouteStateNotification(const DefaultRoute routeState);
1007-
//
1029+
//
10081030
// handle default route state notification from routeorch
10091031
//
10101032
void ActiveActiveStateMachine::handleDefaultRouteStateNotification(const DefaultRoute routeState)
@@ -1029,7 +1051,7 @@ void ActiveActiveStateMachine::shutdownOrRestartLinkProberOnDefaultRoute()
10291051
if (mMuxPortConfig.getMode() == common::MuxPortConfig::Mode::Auto && mDefaultRouteState == DefaultRoute::NA) {
10301052
mShutdownTxFnPtr();
10311053
} else {
1032-
// If mux mode is in manual/standby/active mode, we should restart link prober.
1054+
// If mux mode is in manual/standby/active mode, we should restart link prober.
10331055
// If default route state is "ok", we should retart link prober.
10341056
mRestartTxFnPtr();
10351057
}

src/link_manager/LinkManagerStateMachineActiveActive.h

+9
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,15 @@ class ActiveActiveStateMachine : public LinkManagerStateMachineBase,
236236
*/
237237
void initializeTransitionFunctionTable() override;
238238

239+
/**
240+
* @method LinkProberActiveMuxActiveLinkUpTransitionFunction
241+
*
242+
* @brief transition function when entering {LinkProberActive, MuxActive, LinkUp} state
243+
*
244+
* @param nextState reference to composite state
245+
*/
246+
void LinkProberActiveMuxActiveLinkUpTransitionFunction(CompositeState &nextState);
247+
239248
/**
240249
* @method LinkProberActiveMuxStandbyLinkUpTransitionFunction
241250
*

test/LinkManagerStateMachineActiveActiveTest.cpp

+41-10
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActiveLinkProberPeerUnknown)
364364
TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActiveConfigDetachedLinkProberPeerUnknown)
365365
{
366366
setMuxActive();
367-
367+
368368
postPeerLinkProberEvent(link_prober::LinkProberState::PeerActive);
369369
VALIDATE_PEER_STATE(PeerActive, Active);
370370

@@ -415,26 +415,57 @@ TEST_F(LinkManagerStateMachineActiveActiveTest, MuxStandbyLinkProberPeerUnknown)
415415
EXPECT_EQ(mDbInterfacePtr->mSetPeerMuxStateInvokeCount, 0);
416416
}
417417

418-
TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActivDefaultRouteState)
418+
TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActivDefaultRouteState)
419419
{
420420
setMuxActive();
421421

422422
postDefaultRouteEvent("ok", 1);
423423
EXPECT_FALSE(mMuxConfig.getIfEnableDefaultRouteFeature());
424-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,0);
425-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,1);
424+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 0);
425+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 1);
426426

427427
postDefaultRouteEvent("na", 1);
428-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,0);
429-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,2);
428+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 0);
429+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 2);
430430

431431
mMuxConfig.enableDefaultRouteFeature(true);
432432
postDefaultRouteEvent("na", 1);
433-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,1);
434-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,2);
433+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 1);
434+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 2);
435435

436436
postDefaultRouteEvent("ok", 1);
437-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,1);
438-
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,3);
437+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 1);
438+
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 3);
439439
}
440+
441+
TEST_F(LinkManagerStateMachineActiveActiveTest, LinkmgrdBootupSequenceHeartBeatFirst)
442+
{
443+
activateStateMachine();
444+
VALIDATE_STATE(Wait, Wait, Down);
445+
446+
postLinkEvent(link_state::LinkState::Up);
447+
VALIDATE_STATE(Wait, Wait, Up);
448+
449+
// the first toggle fails because the the inital mux state
450+
// is standby when linkmgrd first boots up
451+
postLinkProberEvent(link_prober::LinkProberState::Active, 4);
452+
VALIDATE_STATE(Active, Active, Up);
453+
EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 1);
454+
EXPECT_EQ(mDbInterfacePtr->mLastSetMuxState, mux_state::MuxState::Label::Active);
455+
456+
// now linkmgrd should be stuck in mux wait timeout
457+
458+
handleProbeMuxState("unknown", 3);
459+
VALIDATE_STATE(Active, Unknown, Up);
460+
461+
// now linkmgrd should be stuck in mux probe timeout
462+
runIoService(4);
463+
464+
// xcvrd now answers the mux probe
465+
handleProbeMuxState("active", 4);
466+
VALIDATE_STATE(Active, Active, Up);
467+
EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 2);
468+
EXPECT_EQ(mDbInterfacePtr->mLastSetMuxState, mux_state::MuxState::Label::Active);
469+
}
470+
440471
} /* namespace test */

0 commit comments

Comments
 (0)