From dd618447e2cc1838431d94f1f8bffa807ef65d89 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Thu, 14 Mar 2024 15:04:46 -0700 Subject: [PATCH 1/5] [active-standby] reset mux probing backoff factor in link flaps (#245) Description of PR Summary: Fixes # (issue) This PR is to address consistency in PXE boot scenario. In PXE boot, there is no ICMP heartbeats. During booting up, standby might switch to active. Mux probing is now the only way to find out mux toggles. But it's also expected to see mux probing fail when link is down, backoff factor can already be maximum when link is up. If we don't reset backoff factor, it might take ~40s for peer to correct the inconsistency. sign-off: Jing Zhang zhangjing@microsoft.com --- src/link_manager/LinkManagerStateMachineActiveStandby.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp index 52bd2d6..cfb5cea 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp @@ -567,6 +567,9 @@ void ActiveStandbyStateMachine::handleStateChange(LinkStateEvent &event, link_st // There is a problem with this approach as it will hide link flaps that result in lost heart-beats. initLinkProberState(nextState, true); // enterMuxWaitState(nextState); + + // reset mux probing backoff factor when link goes up + mMuxUnknownBackoffFactor = 1; } else if (ls(mCompositeState) == link_state::LinkState::Up && ls(nextState) == link_state::LinkState::Down && ms(mCompositeState) != mux_state::MuxState::Label::Standby) { @@ -575,6 +578,10 @@ void ActiveStandbyStateMachine::handleStateChange(LinkStateEvent &event, link_st mActiveUnknownUpCount = 0; mStandbyUnknownUpCount = 0; + + mWaitActiveUpCount = 0; + mWaitStandbyUpBackoffFactor = 1; + mUnknownActiveUpBackoffFactor = 1; } else { mStateTransitionHandler[ps(nextState)][ms(nextState)][ls(nextState)](nextState); } From f96d40cfbe4a771c60579935998ae2c52816c4ea Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Sat, 25 May 2024 02:48:14 -0700 Subject: [PATCH 2/5] [active-standby] add knob to enable/disable oscillation (#250) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Approach #### What is the motivation for this PR? To avoid test flakiness.   ##### Work item tracking - Microsoft ADO **(number only)**: - 28187403 #### How did you do it? 1. Add DB interface to enable/disable the oscillation feature 2. Add DB interface to config the oscillation interval #### How did you verify/test it? Tested on lab device: 1. Knob was default on 2. Turned it off 3. Turned it on 4. Changed the interval #### Any platform specific information? --- src/DbInterface.cpp | 28 +++++++++++ src/MuxManager.h | 22 ++++++++ src/common/MuxConfig.h | 24 ++++++++- src/common/MuxPortConfig.h | 9 ++++ .../LinkManagerStateMachineActiveStandby.cpp | 3 +- test/MuxManagerTest.cpp | 50 +++++++++++++++++++ test/MuxManagerTest.h | 7 +++ 7 files changed, 141 insertions(+), 2 deletions(-) diff --git a/src/DbInterface.cpp b/src/DbInterface.cpp index d5f4357..21688b7 100644 --- a/src/DbInterface.cpp +++ b/src/DbInterface.cpp @@ -1076,6 +1076,34 @@ void DbInterface::processMuxLinkmgrConfigNotifiction(std::deque fieldValues = kfvFieldsValues(entry); + + for (auto &fieldValue: fieldValues) { + std::string f = fvField(fieldValue); + std::string v = fvValue(fieldValue); + if (f == "oscillation_enabled") { + if (v == "true") { + mMuxManagerPtr->setOscillationEnabled(true); + } else if (v == "false"){ + mMuxManagerPtr->setOscillationEnabled(false); + } + } else if (f == "interval_sec") { + try { + mMuxManagerPtr->setOscillationInterval_sec(boost::lexical_cast (v)); + } catch (boost::bad_lexical_cast const &badLexicalCast) { + MUXLOGWARNING(boost::format("bad lexical cast: %s") % badLexicalCast.what()); + } + } + MUXLOGINFO(boost::format("key: %s, Operation: %s, f: %s, v: %s") % key % operation % diff --git a/src/MuxManager.h b/src/MuxManager.h index 3ae75b7..d49fe2c 100644 --- a/src/MuxManager.h +++ b/src/MuxManager.h @@ -123,6 +123,28 @@ class MuxManager */ inline void setTimeoutIpv6_msec(uint32_t timeout_msec) {mMuxConfig.setTimeoutIpv6_msec(timeout_msec);}; + /** + *@method setOscillationEnabled + * + *@brief setter for enable/disable oscillation + * + *@param enable (in) true to enable oscillation + * + *@return none + */ + inline void setOscillationEnabled(bool enable) {mMuxConfig.setOscillationEnabled(enable);}; + + /** + *@method setOscillationInterval_sec + * + *@brief setter for oscillation interval in sec + * + *@param interval_sec (in) interval in sec + * + *@return none + */ + inline void setOscillationInterval_sec(uint32_t interval_sec) {mMuxConfig.setOscillationInterval_sec(interval_sec);}; + /** *@method setLinkProberStatUpdateIntervalCount * diff --git a/src/common/MuxConfig.h b/src/common/MuxConfig.h index 588492c..8ad6454 100644 --- a/src/common/MuxConfig.h +++ b/src/common/MuxConfig.h @@ -97,6 +97,17 @@ class MuxConfig */ inline void setTimeoutIpv6_msec(uint32_t timeout_msec) {mTimeoutIpv6_msec = timeout_msec;}; + /** + *@method setOscillationEnabled + * + *@brief setter for enabling/disabling mux state oscillation + * + *@param enable (in) enable/disable mux state oscillation + * + *@return none + */ + inline void setOscillationEnabled(bool enable) {mEnableTimedOscillationWhenNoHeartbeat = enable;}; + /** *@method setLinkProberStatUpdateIntervalCount * @@ -274,6 +285,15 @@ class MuxConfig */ inline uint32_t getSuspendTimeout_msec() const {return (mNegativeStateChangeRetryCount + 1) * mTimeoutIpv4_msec;}; + /** + *@method getIfOscillationEnabled + * + *@brief getter for oscillation flag + * + *@return oscillation flag + */ + inline bool getIfOscillationEnabled() const {return mEnableTimedOscillationWhenNoHeartbeat;}; + /** *@method getOscillationInterval_sec * @@ -433,7 +453,6 @@ class MuxConfig private: uint8_t mNumberOfThreads = 5; - uint32_t mOscillationTimeout_sec = 300; uint32_t mTimeoutIpv4_msec = 100; uint32_t mTimeoutIpv6_msec = 1000; uint32_t mPositiveStateChangeRetryCount = 1; @@ -443,6 +462,9 @@ class MuxConfig uint32_t mMuxStateChangeRetryCount = 1; uint32_t mLinkStateChangeRetryCount = 1; + bool mEnableTimedOscillationWhenNoHeartbeat = true; + uint32_t mOscillationTimeout_sec = 300; + bool mEnableSwitchoverMeasurement = false; uint32_t mDecreasedTimeoutIpv4_msec = 10; diff --git a/src/common/MuxPortConfig.h b/src/common/MuxPortConfig.h index bc1f82e..fa8a8b6 100644 --- a/src/common/MuxPortConfig.h +++ b/src/common/MuxPortConfig.h @@ -192,6 +192,15 @@ class MuxPortConfig */ inline uint32_t getLinkWaitTimeout_msec() const {return mMuxConfig.getSuspendTimeout_msec();}; + /** + *@method getIfOscillationEnabled + * + *@brief getter for mux state oscillation enable flag + * + *@return oscillation enable flag + */ + inline bool getIfOscillationEnabled() const {return mMuxConfig.getIfOscillationEnabled();}; + /** *@method getOscillationInterval_sec * diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp index cfb5cea..e7c668c 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp @@ -1083,7 +1083,8 @@ void ActiveStandbyStateMachine::handleOscillationTimeout(boost::system::error_co { MUXLOGDEBUG(mMuxPortConfig.getPortName()); - if (errorCode == boost::system::errc::success && + if (mMuxPortConfig.getIfOscillationEnabled() && + errorCode == boost::system::errc::success && ps(mCompositeState) == link_prober::LinkProberState::Label::Wait && ms(mCompositeState) == mux_state::MuxState::Label::Active && ls(mCompositeState) == link_state::LinkState::Label::Up) { diff --git a/test/MuxManagerTest.cpp b/test/MuxManagerTest.cpp index 461b8ca..f4d5da9 100644 --- a/test/MuxManagerTest.cpp +++ b/test/MuxManagerTest.cpp @@ -479,6 +479,20 @@ void MuxManagerTest::resetUpdateEthernetFrameFn(const std::string &portName) } } +uint32_t MuxManagerTest::getOscillationInterval_sec(std::string port) +{ + std::shared_ptr muxPortPtr = mMuxManagerPtr->mPortMap[port]; + + return muxPortPtr->mMuxPortConfig.getOscillationInterval_sec(); +} + +bool MuxManagerTest::getOscillationEnabled(std::string port) +{ + std::shared_ptr muxPortPtr = mMuxManagerPtr->mPortMap[port]; + + return muxPortPtr->mMuxPortConfig.getIfOscillationEnabled(); +} + TEST_F(MuxManagerTest, UpdatePortCableTypeActiveStandby) { std::string port = MuxManagerTest::PortName; @@ -663,6 +677,42 @@ TEST_F(MuxManagerTest, SrcMacAddressUpdate) EXPECT_TRUE(mFakeLinkProber->mUpdateEthernetFrameCallCount == updateEthernetFrameCallCountBefore + 1); } +TEST_P(OscillationIntervalTest, OscillationInterval) +{ + std::string port = "Ethernet0"; + createPort(port); + + std::deque entries = { + {"TIMED_OSCILLATION", "SET", {{"interval_sec", std::get<0>(GetParam())}}} + }; + processMuxLinkmgrConfigNotifiction(entries); + + EXPECT_TRUE(getOscillationInterval_sec(port) == std::get<1>(GetParam())); +} + +INSTANTIATE_TEST_CASE_P( + OscillationInterval, + OscillationIntervalTest, + ::testing::Values( + std::make_tuple("1200", 1200), + std::make_tuple("1", 300), + std::make_tuple("invalid", 300) + ) +); + +TEST_F(MuxManagerTest, OscillationDisabled) +{ + std::string port = "Ethernet0"; + createPort(port); + + std::deque entries = { + {"TIMED_OSCILLATION", "SET", {{"oscillation_enabled", "false"}}} + }; + processMuxLinkmgrConfigNotifiction(entries); + + EXPECT_TRUE(getOscillationEnabled(port) == false); +} + TEST_F(MuxManagerTest, ServerMacAddress) { std::string port = "Ethernet0"; diff --git a/test/MuxManagerTest.h b/test/MuxManagerTest.h index 5874d4f..8d3fda9 100644 --- a/test/MuxManagerTest.h +++ b/test/MuxManagerTest.h @@ -56,9 +56,11 @@ class MuxManagerTest: public testing::Test uint32_t getTimeoutIpv4_msec(std::string port); uint32_t getTimeoutIpv6_msec(std::string port); uint32_t getLinkWaitTimeout_msec(std::string port); + uint32_t getOscillationInterval_sec(std::string port); bool getIfUseWellKnownMac(std::string port); bool setUseWellKnownMacActiveActive(bool use); bool getIfUseToRMac(std::string port); + bool getOscillationEnabled(std::string port); boost::asio::ip::address getBladeIpv4Address(std::string port); std::array getBladeMacAddress(std::string port); std::array getLastUpdatedMacAddress(std::string port); @@ -123,6 +125,11 @@ class MuxConfigUpdateTest: public MuxManagerTest, { }; +class OscillationIntervalTest: public MuxManagerTest, + public testing::WithParamInterface> +{ +}; + } /* namespace test */ #endif /* MUXMANAGERTEST_H_ */ From d0124f5254395ca1dc4aa0853769ac8efdbf380d Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Sat, 22 Jun 2024 01:34:39 +0800 Subject: [PATCH 3/5] [active-standby] Fix default route handler race condition (#254) What is the motivation for this PR? Fix the race condition of the default route notification. This is similar to #104 If there are multiple default route notifications received by linkmgrd, the mux port posts the default route handlers wrapped by strand. But boost asio doesn't guarantee the execution order of the default route handlers, so the final state machine default route could be any intermediate default route state. For example, for default route notifications like: [2024-06-20 08:28:57.872911] [warning] MuxPort.cpp:365 handleDefaultRouteState: port: EtherTest01, state db default route state: na [2024-06-20 08:28:57.872954] [warning] MuxPort.cpp:365 handleDefaultRouteState: port: EtherTest01, state db default route state: ok The final state machine default route state could be "ok" if the handler for "ok" is executed after the handler for "na". The final state machine default route state could be "na" if the handler for "ok" is executed before the handler for "na". Signed-off-by: Longxiang Lyu lolv@microsoft.com Work item tracking Microsoft ADO (number only): 28471183 How did you do it? post the default route handlers directly through strand instead of using strand::wrap, so the handlers are executed in the same order as the handlers' post order. How did you verify/test it? without this PR, UT fail: Signed-off-by: Longxiang Lyu --- src/MuxPort.cpp | 5 ++-- test/FakeMuxPort.h | 1 + test/LinkManagerStateMachineTest.cpp | 43 ++++++++++++++++++++++++++++ test/LinkManagerStateMachineTest.h | 5 ++++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/src/MuxPort.cpp b/src/MuxPort.cpp index ca1501c..a9961df 100644 --- a/src/MuxPort.cpp +++ b/src/MuxPort.cpp @@ -369,12 +369,11 @@ void MuxPort::handleDefaultRouteState(const std::string &routeState) state = link_manager::LinkManagerStateMachineBase::DefaultRoute::NA; } - boost::asio::io_service &ioService = mStrand.context(); - ioService.post(mStrand.wrap(boost::bind( + boost::asio::post(mStrand, boost::bind( &link_manager::LinkManagerStateMachineBase::handleDefaultRouteStateNotification, mLinkManagerStateMachinePtr.get(), state - ))); + )); } // diff --git a/test/FakeMuxPort.h b/test/FakeMuxPort.h index 8fc8072..07e5a0b 100644 --- a/test/FakeMuxPort.h +++ b/test/FakeMuxPort.h @@ -63,6 +63,7 @@ class FakeMuxPort : public ::mux::MuxPort { link_prober::LinkProberStateMachineBase* getLinkProberStateMachinePtr() { return getLinkManagerStateMachinePtr()->getLinkProberStateMachinePtr().get(); }; mux_state::MuxStateMachine& getMuxStateMachine() { return getLinkManagerStateMachinePtr()->getMuxStateMachine(); }; link_state::LinkStateMachine& getLinkStateMachine() { return getLinkManagerStateMachinePtr()->getLinkStateMachine(); }; + link_manager::LinkManagerStateMachineBase::DefaultRoute getDefaultRouteState() { return getLinkManagerStateMachinePtr()->getDefaultRouteState(); }; bool getPendingMuxModeChange() { return getActiveStandbyStateMachinePtr()->mPendingMuxModeChange; }; common::MuxPortConfig::Mode getTargetMuxMode() { return getActiveStandbyStateMachinePtr()->mTargetMuxMode; }; diff --git a/test/LinkManagerStateMachineTest.cpp b/test/LinkManagerStateMachineTest.cpp index c954c4a..507ad03 100644 --- a/test/LinkManagerStateMachineTest.cpp +++ b/test/LinkManagerStateMachineTest.cpp @@ -23,6 +23,7 @@ #include "LinkManagerStateMachineTest.h" #include "link_prober/LinkProberStateMachineBase.h" +#include "common/MuxLogger.h" #define VALIDATE_STATE(p, m, l) \ do { \ @@ -69,6 +70,21 @@ void LinkManagerStateMachineTest::runIoService(uint32_t count) } } +void LinkManagerStateMachineTest::runIoServiceThreaded(uint32_t count) +{ + mWork = std::make_unique(mIoService); + for (uint8_t i = 0; i < count; i++) { + mThreadGroup.create_thread(boost::bind(&boost::asio::io_service::run, &mIoService)); + } +} + +void LinkManagerStateMachineTest::stopIoServiceThreaded() +{ + mWork.reset(); + mIoService.stop(); + mThreadGroup.join_all(); +} + void LinkManagerStateMachineTest::postLinkProberEvent(link_prober::LinkProberState::Label label, uint32_t count, uint32_t detect_multiplier) { switch (label) { @@ -1529,4 +1545,31 @@ TEST_F(LinkManagerStateMachineTest, ProbeLinkInSuspendTimeout) EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mDetectLinkCallCount, 1); } +TEST_F(LinkManagerStateMachineTest, DefaultRouteStateRaceCondition) +{ + mFakeMuxPort.activateStateMachine(); + runIoServiceThreaded(3); + + mMuxConfig.enableDefaultRouteFeature(true); + for (int i = 0; i < 10000; ++i) + { + MUXLOGDEBUG(boost::format("Iteration %d") % i); + mFakeMuxPort.handleDefaultRouteState("na"); + mFakeMuxPort.handleDefaultRouteState("ok"); + + int check = 0; + while (((mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount < i + 1) || + (mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount < i + 1)) && (check < 10)) + { + usleep(1000); + ++check; + } + + EXPECT_EQ(mFakeMuxPort.getDefaultRouteState(), link_manager::LinkManagerStateMachineBase::DefaultRoute::OK); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, i + 1); + EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, i + 1); + } + stopIoServiceThreaded(); +} + } /* namespace test */ diff --git a/test/LinkManagerStateMachineTest.h b/test/LinkManagerStateMachineTest.h index 41fbe14..38c60fb 100644 --- a/test/LinkManagerStateMachineTest.h +++ b/test/LinkManagerStateMachineTest.h @@ -24,6 +24,7 @@ #ifndef LINKMANAGERSTATEMACHINETEST_H_ #define LINKMANAGERSTATEMACHINETEST_H_ +#include #include "gtest/gtest.h" #include "FakeMuxPort.h" @@ -39,6 +40,8 @@ class LinkManagerStateMachineTest: public ::testing::Test virtual ~LinkManagerStateMachineTest() = default; void runIoService(uint32_t count = 0); + void runIoServiceThreaded(uint32_t count = 3); + void stopIoServiceThreaded(); void postLinkProberEvent(link_prober::LinkProberState::Label label, uint32_t count = 0, uint32_t detect_multiplier = 0); void postMuxEvent(mux_state::MuxState::Label label, uint32_t count = 0); void postLinkEvent(link_state::LinkState::Label label, uint32_t count = 0); @@ -58,6 +61,8 @@ class LinkManagerStateMachineTest: public ::testing::Test public: boost::asio::io_service mIoService; + std::unique_ptr mWork; + boost::thread_group mThreadGroup; common::MuxConfig mMuxConfig; std::shared_ptr mDbInterfacePtr; std::string mPortName = "EtherTest01"; From 9768268cf11cdfab1eccb194c0ce762e372aa536 Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Wed, 26 Jun 2024 13:01:19 +0800 Subject: [PATCH 4/5] Fix default route race condition UT (#258) Signed-off-by: Longxiang Lyu --- test/LinkManagerStateMachineTest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/LinkManagerStateMachineTest.cpp b/test/LinkManagerStateMachineTest.cpp index 507ad03..b611c9c 100644 --- a/test/LinkManagerStateMachineTest.cpp +++ b/test/LinkManagerStateMachineTest.cpp @@ -1551,7 +1551,7 @@ TEST_F(LinkManagerStateMachineTest, DefaultRouteStateRaceCondition) runIoServiceThreaded(3); mMuxConfig.enableDefaultRouteFeature(true); - for (int i = 0; i < 10000; ++i) + for (uint i = 0; i < 10000; ++i) { MUXLOGDEBUG(boost::format("Iteration %d") % i); mFakeMuxPort.handleDefaultRouteState("na"); @@ -1559,9 +1559,9 @@ TEST_F(LinkManagerStateMachineTest, DefaultRouteStateRaceCondition) int check = 0; while (((mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount < i + 1) || - (mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount < i + 1)) && (check < 10)) + (mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount < i + 1)) && (check < 4000)) { - usleep(1000); + usleep(2000); ++check; } From 2b7e4f975ff0f2a5d6cd77282e74490928db8000 Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:36:20 +0800 Subject: [PATCH 5/5] [active-standby] Fix the oscillation logic (#261) Approach What is the motivation for this PR? Fix the unexpected toggle introduced by the oscillation logic. Signed-off-by: Longxiang Lyu lolv@microsoft.com Work item tracking Microsoft ADO (number only): 28397786 How did you do it? The oscillation is introduced to allow the active side to toggle to standby if no heartbeat is received. The workflow is described as the following: (wait, active, up) ---> set oscillation timer [1] ... (wait, active, up) ---> (wait, wait, up) [2] (wait, wait, up) ---> (wait, active, up) ... (wait, active, up) <--- oscillation timer expires, toggle to standby [3] [1]: the ToR enters (wait, active, up), no heartbeats received, oscillation timer is set. [2]: the ToR consistently probes the mux status, and transits between (wait, active, up) and (wait, wait, up). [3]: when the oscillation timer expires and the ToR is (wait, active, up), make the toggle to standby request. We need to ensure that, the ToR is only allowed to transits between (wait, active, up) and (wait, wait, up) during [2]. So any link prober active/standby, mux standby, or link down events should cancel the oscillation. How did you verify/test it? UT Any platform specific information? Documentation --- .../LinkManagerStateMachineActiveStandby.cpp | 31 +++++++++++++-- .../LinkManagerStateMachineActiveStandby.h | 10 +++++ test/LinkManagerStateMachineTest.cpp | 39 ++++++++++++++++++- 3 files changed, 75 insertions(+), 5 deletions(-) diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp index e7c668c..9045bd4 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp @@ -61,8 +61,6 @@ ActiveStandbyStateMachine::ActiveStandbyStateMachine( mMuxStateMachine.setWaitStateCause(mux_state::WaitState::WaitStateCause::SwssUpdate); mMuxPortPtr->setMuxLinkmgrState(mLabel); initializeTransitionFunctionTable(); - - mOscillationTimer.expires_from_now(boost::posix_time::seconds(1)); } // @@ -476,12 +474,16 @@ void ActiveStandbyStateMachine::handleStateChange(LinkProberEvent &event, link_p mMuxPortPtr->postLinkProberMetricsEvent(link_manager::ActiveStandbyStateMachine::LinkProberMetrics::LinkProberActiveStart); mStandbyUnknownUpCount = 0; + + tryCancelOscillationTimerIfAlive(); } if (state == link_prober::LinkProberState::Label::Standby) { mMuxPortPtr->postLinkProberMetricsEvent(link_manager::ActiveStandbyStateMachine::LinkProberMetrics::LinkProberStandbyStart); mActiveUnknownUpCount = 0; + + tryCancelOscillationTimerIfAlive(); } CompositeState nextState = mCompositeState; @@ -542,6 +544,10 @@ void ActiveStandbyStateMachine::handleStateChange(MuxStateEvent &event, mux_stat mStandbyUnknownUpCount = 0; } + if (state == mux_state::MuxState::Label::Standby) { + tryCancelOscillationTimerIfAlive(); + } + updateMuxLinkmgrState(); } @@ -582,6 +588,8 @@ void ActiveStandbyStateMachine::handleStateChange(LinkStateEvent &event, link_st mWaitActiveUpCount = 0; mWaitStandbyUpBackoffFactor = 1; mUnknownActiveUpBackoffFactor = 1; + + tryCancelOscillationTimerIfAlive(); } else { mStateTransitionHandler[ps(nextState)][ms(nextState)][ls(nextState)](nextState); } @@ -1064,6 +1072,8 @@ void ActiveStandbyStateMachine::handleMuxWaitTimeout(boost::system::error_code e void ActiveStandbyStateMachine::startOscillationTimer() { // Note: This timer is started when Mux state is active and link prober is in wait state. + MUXLOGINFO(boost::format("%s: start the oscillation timer") % mMuxPortConfig.getPortName()); + mOscillationTimerAlive = true; mOscillationTimer.expires_from_now(boost::posix_time::seconds( mMuxPortConfig.getOscillationInterval_sec() )); @@ -1074,6 +1084,20 @@ void ActiveStandbyStateMachine::startOscillationTimer() ))); } +// +// ---> tryCancelOscillationTimerIfAlive(); +// +// cancel the oscillation timer if it is alive +// +void ActiveStandbyStateMachine::tryCancelOscillationTimerIfAlive() +{ + if (mOscillationTimerAlive) { + MUXLOGINFO(boost::format("%s: cancel the oscillation timer") % mMuxPortConfig.getPortName()); + mOscillationTimerAlive = false; + mOscillationTimer.cancel(); + } +} + // // ---> handleOscillationTimeout(boost::system::error_code errorCode); // @@ -1083,6 +1107,7 @@ void ActiveStandbyStateMachine::handleOscillationTimeout(boost::system::error_co { MUXLOGDEBUG(mMuxPortConfig.getPortName()); + mOscillationTimerAlive = false; if (mMuxPortConfig.getIfOscillationEnabled() && errorCode == boost::system::errc::success && ps(mCompositeState) == link_prober::LinkProberState::Label::Wait && @@ -1318,7 +1343,7 @@ void ActiveStandbyStateMachine::LinkProberWaitMuxActiveLinkUpTransitionFunction( mSuspendTxFnPtr(mMuxPortConfig.getLinkWaitTimeout_msec()); } - if (mOscillationTimer.expires_at() < boost::posix_time::microsec_clock::local_time()) { + if (!mOscillationTimerAlive) { startOscillationTimer(); } } diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.h b/src/link_manager/LinkManagerStateMachineActiveStandby.h index 147b952..c1b1d58 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.h +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.h @@ -464,6 +464,15 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase, */ void startOscillationTimer(); + /** + *@method tryCancelOscillationTimerIfAlive + * + *@brief cancel the oscillation timer if it is alive + * + *@return none + */ + inline void tryCancelOscillationTimerIfAlive(); + /** *@method handleOscillationTimeout * @@ -859,6 +868,7 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase, boost::asio::deadline_timer mDeadlineTimer; boost::asio::deadline_timer mWaitTimer; boost::asio::deadline_timer mOscillationTimer; + bool mOscillationTimerAlive = false; boost::function mInitializeProberFnPtr; boost::function mStartProbingFnPtr; diff --git a/test/LinkManagerStateMachineTest.cpp b/test/LinkManagerStateMachineTest.cpp index b611c9c..64001a2 100644 --- a/test/LinkManagerStateMachineTest.cpp +++ b/test/LinkManagerStateMachineTest.cpp @@ -21,6 +21,9 @@ * Author: Tamer Ahmed */ +#include +#include + #include "LinkManagerStateMachineTest.h" #include "link_prober/LinkProberStateMachineBase.h" #include "common/MuxLogger.h" @@ -50,7 +53,7 @@ LinkManagerStateMachineTest::LinkManagerStateMachineTest() : mMuxConfig.setPositiveStateChangeRetryCount(mPositiveUpdateCount); mMuxConfig.setMuxStateChangeRetryCount(mPositiveUpdateCount); mMuxConfig.setLinkStateChangeRetryCount(mPositiveUpdateCount); - mMuxConfig.setOscillationInterval_sec(1,true); + mMuxConfig.setOscillationInterval_sec(2, true); } void LinkManagerStateMachineTest::runIoService(uint32_t count) @@ -1468,7 +1471,7 @@ TEST_F(LinkManagerStateMachineTest, TimedOscillation) handleMuxState("active", 3); VALIDATE_STATE(Wait, Active, Up); - runIoService(2); + runIoService(1); VALIDATE_STATE(Wait, Wait, Up); handleProbeMuxState("active", 3); @@ -1481,6 +1484,38 @@ TEST_F(LinkManagerStateMachineTest, TimedOscillation) mMuxConfig.setTimeoutIpv4_msec(10); } +TEST_F(LinkManagerStateMachineTest, TimedOscillationMuxProbeStandbyCancel) +{ + setMuxStandby(); + + // set icmp timeout to be 500ms otherwise it will probe mux state endlessly and get no chance to do timed oscillation + mMuxConfig.setTimeoutIpv4_msec(500); + + postLinkProberEvent(link_prober::LinkProberState::Unknown, 2); + VALIDATE_STATE(Wait, Wait, Up); + EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 1); + EXPECT_EQ(mDbInterfacePtr->mLastPostedSwitchCause, link_manager::ActiveStandbyStateMachine::SwitchCause::PeerHeartbeatMissing); + + // swss notification + handleMuxState("active", 3); + VALIDATE_STATE(Wait, Active, Up); + + runIoService(1); + VALIDATE_STATE(Wait, Wait, Up); + + handleProbeMuxState("standby", 3); + VALIDATE_STATE(Wait, Standby, Up); + + boost::this_thread::sleep(boost::posix_time::seconds(2)); + + // 3 mux probe active notifiations + 1 oscillation timeout + 1 mux probe timeout + handleProbeMuxState("active", 5); + VALIDATE_STATE(Wait, Active, Up); + EXPECT_EQ(mDbInterfacePtr->mLastPostedSwitchCause, link_manager::ActiveStandbyStateMachine::SwitchCause::PeerHeartbeatMissing); + + mMuxConfig.setTimeoutIpv4_msec(10); +} + TEST_F(LinkManagerStateMachineTest, OrchagentRollback) { setMuxStandby();