Skip to content

Commit

Permalink
[active-standby] Fix the oscillation logic (#261)
Browse files Browse the repository at this point in the history
Approach
What is the motivation for this PR?
Fix the unexpected toggle introduced by the oscillation logic.

Signed-off-by: Longxiang Lyu lolv@microsoft.com

Work item tracking
Microsoft ADO (number only): 28397786
How did you do it?
The oscillation is introduced to allow the active side to toggle to standby if no heartbeat is received.
The workflow is described as the following:

(wait, active, up) ---> set oscillation timer [1]
...
(wait, active, up) ---> (wait, wait, up) [2]
(wait, wait, up) ---> (wait, active, up)
...
(wait, active, up) <--- oscillation timer expires, toggle to standby [3]
[1]: the ToR enters (wait, active, up), no heartbeats received, oscillation timer is set.
[2]: the ToR consistently probes the mux status, and transits between (wait, active, up) and (wait, wait, up).
[3]: when the oscillation timer expires and the ToR is (wait, active, up), make the toggle to standby request.

We need to ensure that, the ToR is only allowed to transits between (wait, active, up) and (wait, wait, up) during [2]. So any link prober active/standby, mux standby, or link down events should cancel the oscillation.

How did you verify/test it?
UT

Any platform specific information?
Documentation
  • Loading branch information
lolyu authored and mssonicbld committed Aug 23, 2024
1 parent 8427a89 commit ff0cc80
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 5 deletions.
31 changes: 28 additions & 3 deletions src/link_manager/LinkManagerStateMachineActiveStandby.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ ActiveStandbyStateMachine::ActiveStandbyStateMachine(
mMuxStateMachine.setWaitStateCause(mux_state::WaitState::WaitStateCause::SwssUpdate);
mMuxPortPtr->setMuxLinkmgrState(mLabel);
initializeTransitionFunctionTable();

mOscillationTimer.expires_from_now(boost::posix_time::seconds(1));
}

//
Expand Down Expand Up @@ -476,12 +474,16 @@ void ActiveStandbyStateMachine::handleStateChange(LinkProberEvent &event, link_p
mMuxPortPtr->postLinkProberMetricsEvent(link_manager::ActiveStandbyStateMachine::LinkProberMetrics::LinkProberActiveStart);

mStandbyUnknownUpCount = 0;

tryCancelOscillationTimerIfAlive();
}

if (state == link_prober::LinkProberState::Label::Standby) {
mMuxPortPtr->postLinkProberMetricsEvent(link_manager::ActiveStandbyStateMachine::LinkProberMetrics::LinkProberStandbyStart);

mActiveUnknownUpCount = 0;

tryCancelOscillationTimerIfAlive();
}

CompositeState nextState = mCompositeState;
Expand Down Expand Up @@ -542,6 +544,10 @@ void ActiveStandbyStateMachine::handleStateChange(MuxStateEvent &event, mux_stat
mStandbyUnknownUpCount = 0;
}

if (state == mux_state::MuxState::Label::Standby) {
tryCancelOscillationTimerIfAlive();
}

updateMuxLinkmgrState();
}

Expand Down Expand Up @@ -582,6 +588,8 @@ void ActiveStandbyStateMachine::handleStateChange(LinkStateEvent &event, link_st
mWaitActiveUpCount = 0;
mWaitStandbyUpBackoffFactor = 1;
mUnknownActiveUpBackoffFactor = 1;

tryCancelOscillationTimerIfAlive();
} else {
mStateTransitionHandler[ps(nextState)][ms(nextState)][ls(nextState)](nextState);
}
Expand Down Expand Up @@ -1064,6 +1072,8 @@ void ActiveStandbyStateMachine::handleMuxWaitTimeout(boost::system::error_code e
void ActiveStandbyStateMachine::startOscillationTimer()
{
// Note: This timer is started when Mux state is active and link prober is in wait state.
MUXLOGINFO(boost::format("%s: start the oscillation timer") % mMuxPortConfig.getPortName());
mOscillationTimerAlive = true;
mOscillationTimer.expires_from_now(boost::posix_time::seconds(
mMuxPortConfig.getOscillationInterval_sec()
));
Expand All @@ -1074,6 +1084,20 @@ void ActiveStandbyStateMachine::startOscillationTimer()
)));
}

//
// ---> tryCancelOscillationTimerIfAlive();
//
// cancel the oscillation timer if it is alive
//
void ActiveStandbyStateMachine::tryCancelOscillationTimerIfAlive()
{
if (mOscillationTimerAlive) {
MUXLOGINFO(boost::format("%s: cancel the oscillation timer") % mMuxPortConfig.getPortName());
mOscillationTimerAlive = false;
mOscillationTimer.cancel();
}
}

//
// ---> handleOscillationTimeout(boost::system::error_code errorCode);
//
Expand All @@ -1083,6 +1107,7 @@ void ActiveStandbyStateMachine::handleOscillationTimeout(boost::system::error_co
{
MUXLOGDEBUG(mMuxPortConfig.getPortName());

mOscillationTimerAlive = false;
if (mMuxPortConfig.getIfOscillationEnabled() &&
errorCode == boost::system::errc::success &&
ps(mCompositeState) == link_prober::LinkProberState::Label::Wait &&
Expand Down Expand Up @@ -1318,7 +1343,7 @@ void ActiveStandbyStateMachine::LinkProberWaitMuxActiveLinkUpTransitionFunction(
mSuspendTxFnPtr(mMuxPortConfig.getLinkWaitTimeout_msec());
}

if (mOscillationTimer.expires_at() < boost::posix_time::microsec_clock::local_time()) {
if (!mOscillationTimerAlive) {
startOscillationTimer();
}
}
Expand Down
10 changes: 10 additions & 0 deletions src/link_manager/LinkManagerStateMachineActiveStandby.h
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,15 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase,
*/
void startOscillationTimer();

/**
*@method tryCancelOscillationTimerIfAlive
*
*@brief cancel the oscillation timer if it is alive
*
*@return none
*/
inline void tryCancelOscillationTimerIfAlive();

/**
*@method handleOscillationTimeout
*
Expand Down Expand Up @@ -859,6 +868,7 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase,
boost::asio::deadline_timer mDeadlineTimer;
boost::asio::deadline_timer mWaitTimer;
boost::asio::deadline_timer mOscillationTimer;
bool mOscillationTimerAlive = false;

boost::function<void ()> mInitializeProberFnPtr;
boost::function<void ()> mStartProbingFnPtr;
Expand Down
39 changes: 37 additions & 2 deletions test/LinkManagerStateMachineTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
* Author: Tamer Ahmed
*/

#include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/thread/thread.hpp>

#include "LinkManagerStateMachineTest.h"
#include "link_prober/LinkProberStateMachineBase.h"
#include "common/MuxLogger.h"
Expand Down Expand Up @@ -50,7 +53,7 @@ LinkManagerStateMachineTest::LinkManagerStateMachineTest() :
mMuxConfig.setPositiveStateChangeRetryCount(mPositiveUpdateCount);
mMuxConfig.setMuxStateChangeRetryCount(mPositiveUpdateCount);
mMuxConfig.setLinkStateChangeRetryCount(mPositiveUpdateCount);
mMuxConfig.setOscillationInterval_sec(1,true);
mMuxConfig.setOscillationInterval_sec(2, true);
}

void LinkManagerStateMachineTest::runIoService(uint32_t count)
Expand Down Expand Up @@ -1468,7 +1471,7 @@ TEST_F(LinkManagerStateMachineTest, TimedOscillation)
handleMuxState("active", 3);
VALIDATE_STATE(Wait, Active, Up);

runIoService(2);
runIoService(1);
VALIDATE_STATE(Wait, Wait, Up);

handleProbeMuxState("active", 3);
Expand All @@ -1481,6 +1484,38 @@ TEST_F(LinkManagerStateMachineTest, TimedOscillation)
mMuxConfig.setTimeoutIpv4_msec(10);
}

TEST_F(LinkManagerStateMachineTest, TimedOscillationMuxProbeStandbyCancel)
{
setMuxStandby();

// set icmp timeout to be 500ms otherwise it will probe mux state endlessly and get no chance to do timed oscillation
mMuxConfig.setTimeoutIpv4_msec(500);

postLinkProberEvent(link_prober::LinkProberState::Unknown, 2);
VALIDATE_STATE(Wait, Wait, Up);
EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 1);
EXPECT_EQ(mDbInterfacePtr->mLastPostedSwitchCause, link_manager::ActiveStandbyStateMachine::SwitchCause::PeerHeartbeatMissing);

// swss notification
handleMuxState("active", 3);
VALIDATE_STATE(Wait, Active, Up);

runIoService(1);
VALIDATE_STATE(Wait, Wait, Up);

handleProbeMuxState("standby", 3);
VALIDATE_STATE(Wait, Standby, Up);

boost::this_thread::sleep(boost::posix_time::seconds(2));

// 3 mux probe active notifiations + 1 oscillation timeout + 1 mux probe timeout
handleProbeMuxState("active", 5);
VALIDATE_STATE(Wait, Active, Up);
EXPECT_EQ(mDbInterfacePtr->mLastPostedSwitchCause, link_manager::ActiveStandbyStateMachine::SwitchCause::PeerHeartbeatMissing);

mMuxConfig.setTimeoutIpv4_msec(10);
}

TEST_F(LinkManagerStateMachineTest, OrchagentRollback)
{
setMuxStandby();
Expand Down

0 comments on commit ff0cc80

Please sign in to comment.