[mle] simplify router/leader role restoration upon reset (#10798)

This commit introduces the `RouterRoleRestorer` class, nested within
`MleRouter`, to manage router/leader role restoration after an MLE
operation restart (e.g., a device reboot) by sending multicast Link
Requests. This class simplifies the code and centralizes role
restoration logic.

Specific changes:

- A new member variable `mLastSavedRole` is added to track the last
  attached role (saved in non-volatile memory). This is used by
  `RouterRoleRestorer` to determine the number of Link Request
  attempts. This variable replaces the previous `mWasLeader`, which
  was only updated after a reboot and would not account for role
  changes afterward.
- The `AttachTimer` is now used for role restoration instead of the
  retransmission timer, as role restoration always occurs while the
  device is detached and before any attach attempts.
- The `kLinkRequestTimeout` is used for the last attempt before
  considering restoration failure.
- The `mChallengeTimeout` mechanism is now removed (in earlier Thread
  specification versions, multicast Link Requests could be used while
  the device was attached, but this is no longer used or needed).
- `test-012-reset-recovery.py` is updated to validate the role
  restoration behavior. `test_detach` is also updated and fixed.
This commit is contained in:
Abtin Keshavarzian
2024-10-10 21:47:34 -07:00
committed by GitHub
parent 057b87fdd1
commit 19b2d6bbc4
9 changed files with 182 additions and 84 deletions
+17 -29
View File
@@ -52,10 +52,8 @@ Mle::Mle(Instance &aInstance)
, mReceivedResponseFromParent(false)
, mDetachingGracefully(false)
, mInitiallyAttachedAsSleepy(false)
#if OPENTHREAD_FTD
, mWasLeader(false)
#endif
, mRole(kRoleDisabled)
, mLastSavedRole(kRoleDisabled)
, mDeviceMode(DeviceMode::kModeRxOnWhenIdle)
, mAttachState(kAttachStateIdle)
, mReattachState(kReattachStop)
@@ -68,9 +66,6 @@ Mle::Mle(Instance &aInstance)
, mDataRequestAttempts(0)
, mAnnounceChannel(0)
, mAlternateChannel(0)
#if OPENTHREAD_FTD
, mLinkRequestAttempts(0)
#endif
, mRloc16(kInvalidRloc16)
, mPreviousParentRloc(kInvalidRloc16)
, mAttachCounter(0)
@@ -228,6 +223,10 @@ void Mle::Stop(StopMode aMode)
Get<ThreadNetif>().RemoveUnicastAddress(mMeshLocalRloc);
Get<ThreadNetif>().RemoveUnicastAddress(mMeshLocalEid);
#if OPENTHREAD_FTD
Get<MleRouter>().mRouterRoleRestorer.Stop();
#endif
SetRole(kRoleDisabled);
exit:
@@ -367,7 +366,9 @@ void Mle::Restore(void)
// force re-attach when version mismatch.
VerifyOrExit(networkInfo.GetVersion() == kThreadVersion);
switch (networkInfo.GetRole())
mLastSavedRole = static_cast<DeviceRole>(networkInfo.GetRole());
switch (mLastSavedRole)
{
case kRoleChild:
case kRoleRouter:
@@ -426,8 +427,6 @@ void Mle::Restore(void)
Get<MleRouter>().SetPreviousPartitionId(networkInfo.GetPreviousPartitionId());
Get<ChildTable>().Restore();
}
mWasLeader = networkInfo.GetRole() == kRoleLeader;
#endif
// Successfully restored the network information from
@@ -457,6 +456,7 @@ Error Mle::Store(void)
networkInfo.SetExtAddress(Get<Mac::Mac>().GetExtAddress());
networkInfo.SetMeshLocalIid(mMeshLocalEid.GetAddress().GetIid());
networkInfo.SetVersion(kThreadVersion);
mLastSavedRole = mRole;
if (IsChild())
{
@@ -1393,6 +1393,14 @@ void Mle::HandleAttachTimer(void)
ExitNow();
}
#if OPENTHREAD_FTD
if (IsDetached() && Get<MleRouter>().mRouterRoleRestorer.IsActive())
{
Get<MleRouter>().mRouterRoleRestorer.HandleTimer();
ExitNow();
}
#endif
// First, check if we are waiting to receive parent responses and
// found an acceptable parent candidate.
@@ -1799,13 +1807,6 @@ void Mle::ScheduleMessageTransmissionTimer(void)
{
uint32_t interval = 0;
#if OPENTHREAD_FTD
if (mRole == kRoleDetached && mLinkRequestAttempts > 0)
{
ExitNow(interval = Random::NonCrypto::GetUint32InRange(kMulticastRetxDelayMin, kMulticastRetxDelayMax));
}
#endif
switch (mChildUpdateRequestState)
{
case kChildUpdateRequestNone:
@@ -1860,19 +1861,6 @@ void Mle::HandleMessageTransmissionTimer(void)
// - Retransmission of "Child Update Request",
// - Retransmission of "Data Request" on a child,
// - Sending periodic keep-alive "Child Update Request" messages on a non-sleepy (rx-on) child.
// - Retransmission of "Link Request" after router reset
#if OPENTHREAD_FTD
// Retransmit multicast link request if no response has been received
// and maximum transmission limit has not been reached.
if (mRole == kRoleDetached && mLinkRequestAttempts > 0)
{
IgnoreError(Get<MleRouter>().SendLinkRequest(nullptr));
mLinkRequestAttempts--;
ScheduleMessageTransmissionTimer();
ExitNow();
}
#endif
switch (mChildUpdateRequestState)
{
+6 -11
View File
@@ -1373,11 +1373,9 @@ private:
bool mReceivedResponseFromParent : 1;
bool mDetachingGracefully : 1;
bool mInitiallyAttachedAsSleepy : 1;
#if OPENTHREAD_FTD
bool mWasLeader : 1;
#endif
DeviceRole mRole;
DeviceRole mLastSavedRole;
DeviceMode mDeviceMode;
AttachState mAttachState;
ReattachState mReattachState;
@@ -1386,14 +1384,11 @@ private:
AddressRegistrationMode mAddressRegistrationMode;
ChildUpdateRequestState mChildUpdateRequestState;
uint8_t mParentRequestCounter;
uint8_t mChildUpdateAttempts;
uint8_t mDataRequestAttempts;
uint8_t mAnnounceChannel;
uint8_t mAlternateChannel;
#if OPENTHREAD_FTD
uint8_t mLinkRequestAttempts;
#endif
uint8_t mParentRequestCounter;
uint8_t mChildUpdateAttempts;
uint8_t mDataRequestAttempts;
uint8_t mAnnounceChannel;
uint8_t mAlternateChannel;
uint16_t mRloc16;
uint16_t mPreviousParentRloc;
uint16_t mAttachCounter;
+82 -35
View File
@@ -50,7 +50,6 @@ MleRouter::MleRouter(Instance &aInstance)
, mCcmEnabled(false)
, mThreadVersionCheckEnabled(true)
#endif
, mChallengeTimeout(0)
, mNetworkIdTimeout(kNetworkIdTimeout)
, mRouterUpgradeThreshold(kRouterUpgradeThreshold)
, mRouterDowngradeThreshold(kRouterDowngradeThreshold)
@@ -70,6 +69,7 @@ MleRouter::MleRouter(Instance &aInstance)
, mAdvertiseTrickleTimer(aInstance, MleRouter::HandleAdvertiseTrickleTimer)
, mChildTable(aInstance)
, mRouterTable(aInstance)
, mRouterRoleRestorer(aInstance)
{
mDeviceMode.Set(mDeviceMode.Get() | DeviceMode::kModeFullThreadDevice | DeviceMode::kModeFullNetworkData);
@@ -224,18 +224,7 @@ Error MleRouter::BecomeRouter(ThreadStatusTlv::Status aStatus)
switch (mRole)
{
case kRoleDetached:
// If router had more than `kMinCriticalChildrenCount` children
// or was a leader prior to reset we treat the multicast Link
// Request as a critical message.
mLinkRequestAttempts =
(mWasLeader || mChildTable.GetNumChildren(Child::kInStateValidOrRestoring) >= kMinCriticalChildrenCount)
? kMaxCriticalTxCount
: kMaxTxCount;
SuccessOrExit(error = SendLinkRequest(nullptr));
mLinkRequestAttempts--;
ScheduleMessageTransmissionTimer();
Get<TimeTicker>().RegisterReceiver(TimeTicker::kMleRouter);
mRouterRoleRestorer.Start(mLastSavedRole);
break;
case kRoleChild:
@@ -435,12 +424,12 @@ void MleRouter::SetStateRouterOrLeader(DeviceRole aRole, uint16_t aRloc16, Leade
Get<ThreadNetif>().SubscribeAllRoutersMulticast();
mPreviousPartitionIdRouter = mLeaderData.GetPartitionId();
Get<Mac::Mac>().SetBeaconEnabled(true);
Get<TimeTicker>().RegisterReceiver(TimeTicker::kMleRouter);
if (aRole == kRoleLeader)
{
GetLeaderAloc(mLeaderAloc.GetAddress());
Get<ThreadNetif>().AddUnicastAddress(mLeaderAloc);
Get<TimeTicker>().RegisterReceiver(TimeTicker::kMleRouter);
Get<NetworkData::Leader>().Start(aStartMode);
Get<MeshCoP::ActiveDatasetManager>().StartLeader();
Get<MeshCoP::PendingDatasetManager>().StartLeader();
@@ -580,8 +569,6 @@ Error MleRouter::SendLinkRequest(Neighbor *aNeighbor)
TxMessage *message = nullptr;
Ip6::Address destination;
VerifyOrExit(mChallengeTimeout == 0);
destination.Clear();
VerifyOrExit((message = NewMleMessage(kCommandLinkRequest)) != nullptr, error = kErrorNoBufs);
@@ -623,10 +610,8 @@ Error MleRouter::SendLinkRequest(Neighbor *aNeighbor)
if (aNeighbor == nullptr)
{
mChallenge.GenerateRandom();
mChallengeTimeout = kChallengeTimeout;
SuccessOrExit(error = message->AppendChallengeTlv(mChallenge));
mRouterRoleRestorer.GenerateRandomChallenge();
SuccessOrExit(error = message->AppendChallengeTlv(mRouterRoleRestorer.GetChallenge()));
destination.SetToLinkLocalAllRoutersMulticast();
}
else
@@ -894,7 +879,7 @@ Error MleRouter::HandleLinkAccept(RxInfo &aRxInfo, bool aRequest)
break;
case Neighbor::kStateInvalid:
VerifyOrExit((mLinkRequestAttempts > 0 || mChallengeTimeout > 0) && (response == mChallenge),
VerifyOrExit(mRouterRoleRestorer.IsActive() && (response == mRouterRoleRestorer.GetChallenge()),
error = kErrorSecurity);
OT_FALL_THROUGH;
@@ -955,7 +940,7 @@ Error MleRouter::HandleLinkAccept(RxInfo &aRxInfo, bool aRequest)
SetStateRouter(GetRloc16());
}
mLinkRequestAttempts = 0;
mRouterRoleRestorer.Stop();
mRetrieveNewNetworkData = true;
IgnoreError(SendDataRequest(aRxInfo.mMessageInfo.GetPeerAddr()));
shouldUpdateRoutes = true;
@@ -1327,8 +1312,7 @@ Error MleRouter::HandleAdvertisementOnFtd(RxInfo &aRxInfo, uint16_t aSourceAddre
// Send unicast link request if no link to router and no
// unicast/multicast link request in progress
if (!router->IsStateValid() && !router->IsStateLinkRequest() && (mChallengeTimeout == 0) &&
(linkMargin >= kLinkRequestMinMargin))
if (!router->IsStateValid() && !router->IsStateLinkRequest() && (linkMargin >= kLinkRequestMinMargin))
{
InitNeighbor(*router, aRxInfo);
router->SetState(Neighbor::kStateLinkRequest);
@@ -1485,11 +1469,6 @@ void MleRouter::HandleTimeTick(void)
VerifyOrExit(IsFullThreadDevice(), Get<TimeTicker>().UnregisterReceiver(TimeTicker::kMleRouter));
if (mChallengeTimeout > 0)
{
mChallengeTimeout--;
}
if (mPreviousPartitionIdTimeout > 0)
{
mPreviousPartitionIdTimeout--;
@@ -1503,12 +1482,6 @@ void MleRouter::HandleTimeTick(void)
switch (mRole)
{
case kRoleDetached:
if (mChallengeTimeout == 0 && mLinkRequestAttempts == 0)
{
IgnoreError(BecomeDetached());
ExitNow();
}
break;
case kRoleChild:
@@ -3876,6 +3849,80 @@ exit:
return expired;
}
//----------------------------------------------------------------------------------------------------------------------
// RouterRoleRestorer
// Constructs the restorer with no remaining Link Request attempts
// (`mAttempts` is zero until `Start()` assigns a budget).
MleRouter::RouterRoleRestorer::RouterRoleRestorer(Instance &aInstance)
    : InstanceLocator(aInstance)
    , mAttempts(0)
{
}
// Begins role restoration by choosing the Link Request attempt budget
// from `aPreviousRole` and sending the first multicast Link Request.
//
// - Previous leader: `kMaxCriticalTxCount` attempts.
// - Previous router: `kMaxCriticalTxCount` attempts when the child
//   table holds at least `kMinCriticalChildrenCount` children (counted
//   with `Child::kInStateValidOrRestoring`), `kMaxTxCount` otherwise.
// - Any other previous role: zero attempts, in which case
//   `SendMulticastLinkRequest()` does not send a Link Request.
void MleRouter::RouterRoleRestorer::Start(DeviceRole aPreviousRole)
{
    mAttempts = 0;

    switch (aPreviousRole)
    {
    case kRoleLeader:
        mAttempts = kMaxCriticalTxCount;
        break;

    case kRoleRouter:
        if (Get<MleRouter>().mChildTable.GetNumChildren(Child::kInStateValidOrRestoring) >= kMinCriticalChildrenCount)
        {
            mAttempts = kMaxCriticalTxCount;
        }
        else
        {
            mAttempts = kMaxTxCount;
        }
        break;

    case kRoleChild:
    case kRoleDetached:
    case kRoleDisabled:
        // Not a router/leader before: leave the attempt budget at zero.
        break;
    }

    SendMulticastLinkRequest();
}
// Timer callback used while restoration is in progress: consumes one
// attempt (never going below zero) and then lets
// `SendMulticastLinkRequest()` either retransmit or give up.
void MleRouter::RouterRoleRestorer::HandleTimer(void)
{
    if (mAttempts != 0)
    {
        --mAttempts;
    }

    SendMulticastLinkRequest();
}
// Performs one step of the restoration sequence:
//
// - If the device is no longer detached, the attempt budget is cleared
//   and nothing is sent.
// - If no attempts remain, `BecomeDetached()` is invoked on `Mle`.
// - Otherwise a multicast Link Request is sent and the attach timer is
//   started for the next step.
void MleRouter::RouterRoleRestorer::SendMulticastLinkRequest(void)
{
    if (!Get<Mle>().IsDetached())
    {
        mAttempts = 0;
    }
    else if (mAttempts == 0)
    {
        IgnoreError(Get<Mle>().BecomeDetached());
    }
    else
    {
        uint32_t delay;

        IgnoreError(Get<MleRouter>().SendLinkRequest(nullptr));

        if (mAttempts == 1)
        {
            // Last attempt: wait `kLinkRequestTimeout` for a response
            // before the next timer fire ends the restoration.
            delay = kLinkRequestTimeout;
        }
        else
        {
            delay = Random::NonCrypto::GetUint32InRange(kMulticastRetxDelayMin, kMulticastRetxDelayMax);
        }

        Get<Mle>().mAttachTimer.Start(delay);
    }
}
} // namespace Mle
} // namespace ot
+27 -3
View File
@@ -512,7 +512,6 @@ private:
static constexpr uint8_t kRouterDowngradeThreshold = 23;
static constexpr uint8_t kRouterUpgradeThreshold = 16;
static constexpr uint16_t kDiscoveryMaxJitter = 250; // Max jitter delay Discovery Responses (in msec).
static constexpr uint16_t kChallengeTimeout = 2; // Challenge timeout (in sec).
static constexpr uint16_t kUnsolicitedDataResponseJitter = 500; // Max delay for unsol Data Response (in msec).
static constexpr uint8_t kLeaderDowngradeExtraDelay = 10; // Extra delay to downgrade leader (in sec).
static constexpr uint8_t kDefaultLeaderWeight = 64;
@@ -559,6 +558,32 @@ private:
uint8_t mJitter;
};
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class RouterRoleRestorer : public InstanceLocator
{
    // Attempts to restore the router or leader role after an MLE
    // restart (e.g., after a device reboot) by sending multicast
    // Link Requests.

public:
    // `explicit` prevents an `Instance` from being implicitly
    // converted into a `RouterRoleRestorer`.
    explicit RouterRoleRestorer(Instance &aInstance);

    // Restoration is in progress while there are attempts remaining.
    bool IsActive(void) const { return mAttempts > 0; }

    // Selects the number of attempts based on `aPreviousRole` and
    // sends the first multicast Link Request.
    void Start(DeviceRole aPreviousRole);

    // Clears the remaining attempts so that `IsActive()` returns false.
    void Stop(void) { mAttempts = 0; }

    // Uses up one attempt and then retransmits or gives up.
    void HandleTimer(void);

    // Challenge included in the multicast Link Request and compared
    // against the response carried by a received Link Accept.
    void               GenerateRandomChallenge(void) { mChallenge.GenerateRandom(); }
    const TxChallenge &GetChallenge(void) const { return mChallenge; }

private:
    void SendMulticastLinkRequest(void);

    uint8_t     mAttempts;  // Remaining Link Request attempts (zero when inactive).
    TxChallenge mChallenge; // Challenge used in the multicast Link Request.
};
//------------------------------------------------------------------------------------------------------------------
// Methods
@@ -648,7 +673,6 @@ private:
uint8_t mRouterId;
uint8_t mPreviousRouterId;
uint8_t mChallengeTimeout;
uint8_t mNetworkIdTimeout;
uint8_t mRouterUpgradeThreshold;
uint8_t mRouterDowngradeThreshold;
@@ -670,7 +694,7 @@ private:
TrickleTimer mAdvertiseTrickleTimer;
ChildTable mChildTable;
RouterTable mRouterTable;
TxChallenge mChallenge;
RouterRoleRestorer mRouterRoleRestorer;
RouterRoleTransition mRouterRoleTransition;
Ip6::Netif::UnicastAddress mLeaderAloc;
#if OPENTHREAD_CONFIG_MLE_DEVICE_PROPERTY_LEADER_WEIGHT_ENABLE
@@ -133,8 +133,7 @@ class PublishMeshCopService(thread_cert.TestCase):
br1.stop()
br1.set_active_dataset(updateExisting=True, network_name='ot-br1-1')
br1.start()
self.simulator.go(config.BORDER_ROUTER_STARTUP_DELAY)
self.simulator.go(5) # Needs to wait extra some time to update meshcop service on state changes.
self.simulator.go(config.LEADER_REBOOT_DELAY)
self.check_meshcop_service(br1, host)
# verify that there are two meshcop services
@@ -142,7 +141,7 @@ class PublishMeshCopService(thread_cert.TestCase):
br2.start()
br2.disable_backbone_router()
br2.enable_br()
self.simulator.go(config.BORDER_ROUTER_STARTUP_DELAY)
self.simulator.go(config.LEADER_REBOOT_DELAY)
service_instances = host.browse_mdns_services('_meshcop._udp')
self.assertEqual(len(service_instances), 2)
+1
View File
@@ -138,6 +138,7 @@ SECURITY_POLICY = [672, 'onrc']
LEADER_STARTUP_DELAY = 12
ROUTER_STARTUP_DELAY = 10
LEADER_REBOOT_DELAY = 40
ED_STARTUP_DELAY = 5
BORDER_ROUTER_STARTUP_DELAY = 20
MAX_NEIGHBOR_AGE = 100
+2 -3
View File
@@ -145,10 +145,9 @@ class TestDetach(thread_cert.TestCase):
self.assertEqual(leader.get_state(), 'disabled')
leader.start()
# leader didn't become leader after the last start(), so it re-syncs in a non-critical manner thus taking ROUTER_RESET_DELAY to recover
self.simulator.go(config.ROUTER_RESET_DELAY / 2)
self.simulator.go(config.LEADER_RESET_DELAY / 2)
self.assertEqual(leader.get_state(), 'detached')
self.simulator.go(config.ROUTER_RESET_DELAY / 2)
self.simulator.go(config.LEADER_RESET_DELAY / 2)
self.assertEqual(leader.get_state(), 'leader')
router1.start()
self.simulator.go(config.ROUTER_RESET_DELAY)
+3
View File
@@ -487,6 +487,9 @@ class Node(object):
def get_ip_counters(self):
    """Run the `counters ip` CLI command and return its output parsed by `Node.parse_list`."""
    output = self.cli('counters ip')
    return Node.parse_list(output)
def get_mac_counters(self):
    """Run the `counters mac` CLI command and return its output parsed by `Node.parse_list`."""
    output = self.cli('counters mac')
    return Node.parse_list(output)
def get_br_counter_unicast_outbound_packets(self):
outputs = self.cli('counters br')
for line in outputs:
@@ -153,6 +153,48 @@ def check_child1_state():
verify_within(check_child1_state, 10)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Remove all nodes and restart `router` on its own
# Drop every existing node object, then bring `router` back up by itself
# using the same node index as before.
del leader
del child1
del child2
del router

router = cli.Node(index=2)
router.interface_up()
router.thread_start()


def check_router_become_leader():
    # With no other node around, `router` is expected to end up as leader.
    verify(router.get_state() == 'leader')


verify_within(check_router_become_leader, 10)

# Router device should attempt 3 Link Request to restore its
# previous role, before sending Parent Request (7 times)

counters = router.get_mac_counters()
# Use `verify()` (not `print()`) so that a too-small broadcast count
# actually fails the test instead of only being printed.
verify(int(counters['TxBroadcast']) >= 10)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Remove all nodes and restart `leader` on its own
# Drop the remaining node object, then bring `leader` back up by itself
# using the same node index as before.
del router

leader = cli.Node(index=1)
leader.interface_up()
leader.thread_start()

verify_within(check_leader_state, 10)

# Leader device should attempt 6 Link Request to restore its
# previous role, before sending Parent Request (7 times)

counters = leader.get_mac_counters()
# Use `verify()` (not `print()`) so that a too-small broadcast count
# actually fails the test instead of only being printed.
verify(int(counters['TxBroadcast']) >= 13)
# -----------------------------------------------------------------------------------------------------------------------
# Test finished