Skip to content

Commit a16a3f6

Browse files
Simplify peer selection actions
Removed some code which tested peer status variable with various 'wasWarm' 'isCold' 'not notCold' booleans and made the flow more explicit for dealing with races between peer selection activities and 'peerMonitoringLoop' which separately call into the relevant functions in 'PeerSelectionActions'
1 parent 12cf310 commit a16a3f6

File tree

4 files changed

+144
-170
lines changed

4 files changed

+144
-170
lines changed

ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs

Lines changed: 133 additions & 166 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ module Ouroboros.Network.PeerSelection.PeerStateActions
3434
import Control.Applicative (Alternative)
3535
import Control.Concurrent.Class.MonadSTM.Strict
3636
import Control.Exception (SomeAsyncException (..), assert)
37-
import Control.Monad (when, (<=<))
37+
import Control.Monad (join, when, (<=<))
3838
import Control.Monad.Class.MonadAsync
3939
import Control.Monad.Class.MonadFork
4040
import Control.Monad.Class.MonadThrow
@@ -622,10 +622,6 @@ withPeerStateActions PeerStateActionsArguments {
622622
then return False
623623
else writeTVar stateVar newState >> return True
624624

625-
isNotCoolingOrCold :: StrictTVar m PeerStatus -> STM m Bool
626-
isNotCoolingOrCold stateVar =
627-
(> PeerCooling) <$> readTVar stateVar
628-
629625
peerMonitoringLoop
630626
:: PeerConnectionHandle muxMode responderCtx peerAddr versionData ByteString m a b
631627
-> m ()
@@ -672,9 +668,7 @@ withPeerStateActions PeerStateActionsArguments {
672668
Just (WithSomeProtocolTemperature (WithHot MiniProtocolError{})) -> do
673669
-- current `pchPeerStatus` must be 'HotPeer'
674670
state <- atomically $ do
675-
peerState <- readTVar pchPeerStatus
676-
_ <- updateUnlessCoolingOrCold pchPeerStatus PeerCooling
677-
return peerState
671+
readTVar pchPeerStatus <* updateUnlessCoolingOrCold pchPeerStatus PeerCooling
678672
case state of
679673
PeerCold -> return ()
680674
PeerCooling -> return ()
@@ -690,12 +684,11 @@ withPeerStateActions PeerStateActionsArguments {
690684
-- update 'pchPeerStatus' and log (as the two other transition to
691685
-- cold state.
692686
state <- atomically $ do
693-
peerState <- readTVar pchPeerStatus
694-
_ <- updateUnlessCoolingOrCold pchPeerStatus PeerCooling
695-
pure peerState
687+
readTVar pchPeerStatus <* updateUnlessCoolingOrCold pchPeerStatus PeerCooling
696688
case state of
697-
PeerCold -> return ()
698-
PeerCooling -> return ()
689+
PeerCold -> return ()
690+
PeerCooling -> return ()
691+
PeerWarmWait -> return () -- ^ the relevant trace will be performed by deactivatePeerConnection
699692
PeerWarm -> traceWith spsTracer (PeerStatusChanged (WarmToCooling pchConnectionId))
700693
PeerHot -> traceWith spsTracer (PeerStatusChanged (HotToCooling pchConnectionId))
701694
peerMonitoringLoop pch
@@ -919,35 +912,22 @@ withPeerStateActions PeerStateActionsArguments {
919912
pchPeerStatus,
920913
pchAppHandles,
921914
pchPromotedHotVar } = do
922-
-- quiesce warm peer protocols and set hot ones in 'Continue' mode.
923-
wasWarm <- atomically $ do
924-
-- if the peer is cold we can't activate it.
925-
notCold <- isNotCoolingOrCold pchPeerStatus
926-
when notCold $ do
927-
writeTVar (getControlVar SingHot pchAppHandles) Continue
928-
writeTVar (getControlVar SingWarm pchAppHandles) Quiesce
929-
return notCold
930-
when (not wasWarm) $ do
931-
traceWith spsTracer (PeerStatusChangeFailure
932-
(WarmToHot pchConnectionId)
933-
ActiveCold)
934-
throwIO $ ColdActivationException pchConnectionId
935-
936-
-- start hot peer protocols
937-
startProtocols SingHot isBigLedgerPeer connHandle
938-
939-
-- Only set the status to PeerHot if the peer isn't PeerCold.
940-
-- This can happen asynchronously between the check above and now.
941-
wasWarm' <- atomically $ updateUnlessCoolingOrCold pchPeerStatus PeerHot
942-
if wasWarm'
943-
then do
944-
atomically . writeTVar pchPromotedHotVar . (Just $!) =<< getMonotonicTime
945-
traceWith spsTracer (PeerStatusChanged (WarmToHot pchConnectionId))
946-
else do
947-
traceWith spsTracer (PeerStatusChangeFailure
948-
(WarmToHot pchConnectionId)
949-
ActiveCold)
950-
throwIO $ ColdActivationException pchConnectionId
915+
join . atomically $ do
916+
peerStatus <- readTVar pchPeerStatus
917+
case peerStatus of
918+
PeerWarm -> do
919+
writeTVar (getControlVar SingHot pchAppHandles) Continue
920+
writeTVar (getControlVar SingWarm pchAppHandles) Quiesce
921+
writeTVar pchPeerStatus PeerHot
922+
return $ do
923+
startProtocols SingHot isBigLedgerPeer connHandle
924+
atomically . writeTVar pchPromotedHotVar . (Just $!) =<< getMonotonicTime
925+
traceWith spsTracer (PeerStatusChanged (WarmToHot pchConnectionId))
926+
_otherwise -> return $ do
927+
traceWith spsTracer (PeerStatusChangeFailure
928+
(WarmToHot pchConnectionId)
929+
(ActiveCold peerStatus))
930+
throwIO $ ColdActivationException pchConnectionId
951931

952932

953933
-- Take a hot peer and demote it to a warm one.
@@ -960,75 +940,64 @@ withPeerStateActions PeerStateActionsArguments {
960940
pchAppHandles,
961941
pchPromotedHotVar
962942
} = do
963-
wasCold <- atomically $ do
964-
notCold <- isNotCoolingOrCold pchPeerStatus
965-
when notCold $ do
966-
writeTVar (getControlVar SingHot pchAppHandles) Terminate
967-
writeTVar (getControlVar SingWarm pchAppHandles) Continue
968-
return (not notCold)
969-
when wasCold $ do
970-
-- The governor attempted to demote an already cold peer.
971-
traceWith spsTracer (PeerStatusChangeFailure
972-
(HotToWarm pchConnectionId)
973-
ActiveCold)
974-
throwIO $ ColdDeactivationException pchConnectionId
975-
976-
977-
-- Hot protocols should stop within 'spsDeactivateTimeout'.
978-
res <-
979-
timeout spsDeactivateTimeout
980-
(atomically $ awaitAllResults SingHot pchAppHandles)
981-
982-
pchPromotedHot <- atomically . stateTVar pchPromotedHotVar $ (, Nothing)
983-
case pchPromotedHot of
984-
Just t1 -> do
985-
dt <- diffTime <$> getMonotonicTime <*> pure t1
986-
traceWith spsTracer (PeerHotDuration pchConnectionId dt)
987-
Nothing -> pure ()
988-
989-
case res of
990-
Nothing -> do
991-
Mux.stop pchMux
992-
atomically (writeTVar pchPeerStatus PeerCooling)
993-
traceWith spsTracer (PeerStatusChangeFailure
994-
(HotToCooling pchConnectionId)
995-
TimeoutError)
996-
throwIO (DeactivationTimeout pchConnectionId)
997-
998-
-- some of the hot mini-protocols errored
999-
Just (SomeErrored errs) -> do
1000-
-- we don't need to notify the connection manager, we can instead
1001-
-- relay on mux property: if any of the mini-protocols errors, mux
1002-
-- throws an exception as well.
1003-
atomically (writeTVar pchPeerStatus PeerCooling)
1004-
traceWith spsTracer (PeerStatusChangeFailure
1005-
(HotToCooling pchConnectionId)
1006-
(ApplicationFailure errs))
1007-
throwIO (MiniProtocolExceptions errs)
1008-
1009-
-- all hot mini-protocols succeeded
1010-
Just (AllSucceeded results) -> do
1011-
-- we don't notify the connection manager as this connection is still
1012-
-- useful to the outbound governor (warm peer).
1013-
wasWarm <- atomically $ do
1014-
-- Only set the status to PeerWarm if the peer isn't cold
1015-
-- (can happen asynchronously).
1016-
notCold <- updateUnlessCoolingOrCold pchPeerStatus PeerWarm
1017-
when notCold $ do
1018-
-- We need to update hot protocols to indicate that they are not
1019-
-- running. Preserve the results returned by their previous
1020-
-- execution.
1021-
modifyTVar (getMiniProtocolsVar SingHot pchAppHandles)
1022-
(\_ -> Map.map (pure . NotRunning . Right) results)
1023-
return notCold
1024-
1025-
if wasWarm
1026-
then traceWith spsTracer (PeerStatusChanged (HotToWarm pchConnectionId))
1027-
else do
1028-
traceWith spsTracer (PeerStatusChangeFailure
1029-
(HotToWarm pchConnectionId)
1030-
ActiveCold)
1031-
throwIO $ ColdDeactivationException pchConnectionId
943+
join . atomically $ do
944+
peerStatus <- readTVar pchPeerStatus
945+
case peerStatus of
946+
PeerHot -> do
947+
writeTVar (getControlVar SingHot pchAppHandles) Terminate
948+
writeTVar (getControlVar SingWarm pchAppHandles) Continue
949+
writeTVar pchPeerStatus PeerWarmWait
950+
pchPromotedHot <- stateTVar pchPromotedHotVar (, Nothing)
951+
return $ do
952+
-- Hot protocols should stop within 'spsDeactivateTimeout'.
953+
res <- timeout spsDeactivateTimeout
954+
$ atomically $ do
955+
res <- awaitAllResults SingHot pchAppHandles
956+
res <$ case res of
957+
AllSucceeded results -> do
958+
modifyTVar (getMiniProtocolsVar SingHot pchAppHandles)
959+
(\_ -> Map.map (pure . NotRunning . Right) results)
960+
writeTVar pchPeerStatus PeerWarm
961+
SomeErrored _ -> writeTVar pchPeerStatus PeerCooling
962+
963+
case pchPromotedHot of
964+
Just t1 -> do
965+
dt <- diffTime <$> getMonotonicTime <*> pure t1
966+
traceWith spsTracer (PeerHotDuration pchConnectionId dt)
967+
Nothing -> pure ()
968+
969+
case res of
970+
Nothing -> do
971+
Mux.stop pchMux
972+
atomically (writeTVar pchPeerStatus PeerCooling)
973+
traceWith spsTracer (PeerStatusChangeFailure
974+
(HotToCooling pchConnectionId)
975+
TimeoutError)
976+
throwIO (DeactivationTimeout pchConnectionId)
977+
Just (SomeErrored errs) -> do
978+
traceWith spsTracer (PeerStatusChangeFailure
979+
(HotToCooling pchConnectionId)
980+
(ApplicationFailure errs))
981+
throwIO (MiniProtocolExceptions errs)
982+
Just (AllSucceeded {}) -> do
983+
traceWith spsTracer (PeerStatusChanged (HotToWarm pchConnectionId))
984+
985+
-- either the peer monitoring loop or peer selection demotion lost the race
986+
PeerWarmWait -> do
987+
peerStatus' <- readTVar pchPeerStatus
988+
check (peerStatus' /= PeerWarmWait)
989+
return $ do
990+
case peerStatus' of
991+
PeerWarm -> return () -- ^ successful demotion by the winner
992+
-- in this case the race winner traces the error
993+
_otherwise -> throwIO $ ColdDeactivationException pchConnectionId
994+
995+
_otherwise ->
996+
return $ do
997+
traceWith spsTracer (PeerStatusChangeFailure
998+
(HotToWarm pchConnectionId)
999+
(ActiveCold peerStatus))
1000+
throwIO $ ColdDeactivationException pchConnectionId
10321001

10331002

10341003
closePeerConnection :: PeerConnectionHandle muxMode responderCtx peerAddr versionData ByteString m a b
@@ -1041,64 +1010,62 @@ withPeerStateActions PeerStateActionsArguments {
10411010
pchMux,
10421011
pchPromotedHotVar
10431012
} = do
1044-
atomically $ do
1013+
(peerStatus, pchPromotedHot) <- atomically $ do
10451014
writeTVar (getControlVar SingWarm pchAppHandles) Terminate
10461015
writeTVar (getControlVar SingEstablished pchAppHandles) Terminate
10471016
writeTVar (getControlVar SingHot pchAppHandles) Terminate
1017+
(,) <$> stateTVar pchPeerStatus (, PeerCooling) <*> stateTVar pchPromotedHotVar (, Nothing)
1018+
1019+
case peerStatus of
1020+
ps@PeerCooling -> return ps
1021+
ps@PeerCold -> return ps
1022+
_otherwise -> do
1023+
res <-
1024+
timeout spsCloseConnectionTimeout
1025+
(atomically $
1026+
(\a b c -> a <> b <> c)
1027+
-- note: we use last to finish on hot, warm and
1028+
-- established mini-protocols since 'closePeerConnection'
1029+
-- is also used by asynchronous demotions, not just
1030+
-- /warm → cold/ transition.
1031+
<$> awaitAllResults SingHot pchAppHandles
1032+
<*> awaitAllResults SingWarm pchAppHandles
1033+
<*> awaitAllResults SingEstablished pchAppHandles)
1034+
1035+
case pchPromotedHot of
1036+
Just t1 -> do
1037+
dt <- diffTime <$> getMonotonicTime <*> pure t1
1038+
traceWith spsTracer (PeerHotDuration pchConnectionId dt)
1039+
Nothing -> pure ()
1040+
1041+
PeerCooling <$ case res of
1042+
Nothing -> do
1043+
-- timeout fired
1044+
Mux.stop pchMux
1045+
traceWith spsTracer (PeerStatusChangeFailure
1046+
(WarmToCooling pchConnectionId)
1047+
TimeoutError)
1048+
1049+
Just (SomeErrored errs) -> do
1050+
-- some mini-protocol errored
1051+
--
1052+
-- we don't need to notify the connection manager, we can instead
1053+
-- rely on mux property: if any of the mini-protocols errors, mux
1054+
-- throws an exception as well.
1055+
traceWith spsTracer (PeerStatusChangeFailure
1056+
(WarmToCooling pchConnectionId)
1057+
(ApplicationFailure errs))
1058+
throwIO (MiniProtocolExceptions errs)
1059+
1060+
Just AllSucceeded {} -> do
1061+
-- all mini-protocols terminated cleanly
1062+
--
1063+
-- 'unregisterOutboundConnection' could only fail to demote the peer if
1064+
-- connection manager would simultaneously promote it, but this is not
1065+
-- possible.
1066+
_ <- releaseOutboundConnection spsConnectionManager pchConnectionId
1067+
traceWith spsTracer (PeerStatusChanged (WarmToCooling pchConnectionId))
10481068

1049-
res <-
1050-
timeout spsCloseConnectionTimeout
1051-
(atomically $
1052-
(\a b c -> a <> b <> c)
1053-
-- note: we use last to finish on hot, warm and
1054-
-- established mini-protocols since 'closePeerConnection'
1055-
-- is also used by asynchronous demotions, not just
1056-
-- /warm → cold/ transition.
1057-
<$> awaitAllResults SingHot pchAppHandles
1058-
<*> awaitAllResults SingWarm pchAppHandles
1059-
<*> awaitAllResults SingEstablished pchAppHandles)
1060-
1061-
1062-
pchPromotedHot <- atomically . stateTVar pchPromotedHotVar $ (, Nothing)
1063-
case pchPromotedHot of
1064-
Just t1 -> do
1065-
dt <- diffTime <$> getMonotonicTime <*> pure t1
1066-
traceWith spsTracer (PeerHotDuration pchConnectionId dt)
1067-
Nothing -> pure ()
1068-
1069-
wasWarm <- atomically (updateUnlessCoolingOrCold pchPeerStatus PeerCooling)
1070-
case res of
1071-
Nothing -> do
1072-
-- timeout fired
1073-
Mux.stop pchMux
1074-
when wasWarm $
1075-
traceWith spsTracer (PeerStatusChangeFailure
1076-
(WarmToCooling pchConnectionId)
1077-
TimeoutError)
1078-
readTVarIO pchPeerStatus
1079-
1080-
Just (SomeErrored errs) -> do
1081-
-- some mini-protocol errored
1082-
--
1083-
-- we don't need to notify the connection manager, we can instead
1084-
-- rely on mux property: if any of the mini-protocols errors, mux
1085-
-- throws an exception as well.
1086-
when wasWarm $
1087-
traceWith spsTracer (PeerStatusChangeFailure
1088-
(WarmToCooling pchConnectionId)
1089-
(ApplicationFailure errs))
1090-
throwIO (MiniProtocolExceptions errs)
1091-
1092-
Just AllSucceeded {} -> do
1093-
-- all mini-protocols terminated cleanly
1094-
--
1095-
-- 'unregisterOutboundConnection' could only fail to demote the peer if
1096-
-- connection manager would simultaneously promote it, but this is not
1097-
-- possible.
1098-
when wasWarm $ do
1099-
_ <- releaseOutboundConnection spsConnectionManager pchConnectionId
1100-
traceWith spsTracer (PeerStatusChanged (WarmToCooling pchConnectionId))
1101-
readTVarIO pchPeerStatus
11021069

11031070
--
11041071
-- Utilities
@@ -1203,7 +1170,7 @@ data FailureType versionNumber =
12031170
| HandleFailure !SomeException
12041171
| MuxStoppedFailure
12051172
| TimeoutError
1206-
| ActiveCold
1173+
| ActiveCold !PeerStatus
12071174
| ApplicationFailure ![MiniProtocolException]
12081175
deriving Show
12091176

ouroboros-network/src/Ouroboros/Network/PeerSelection/Types.hs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ data PeerStatus =
3636
-- errors.
3737
--
3838
| PeerWarm
39+
| PeerWarmWait
40+
-- ^ Either peer monitoring loop or peer selection can demote a hot peer.
41+
-- In case of a race, the loser waits for the winner to perform the work.
42+
-- ie. it acts as a semaphore
3943
| PeerHot
4044
deriving (Eq, Ord, Show)
4145

ouroboros-network/testlib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3881,7 +3881,7 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber =
38813881
$ evs'
38823882
numOfActiveColdErrors = length
38833883
. filter (\case
3884-
(PeerStatusChangeFailure HotToWarm{} ActiveCold)
3884+
(PeerStatusChangeFailure HotToWarm{} ActiveCold{})
38853885
-> True
38863886
_ -> False)
38873887
$ evs'
@@ -3904,7 +3904,7 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber =
39043904
in
39053905
conjoin (zipWith (curry (\case
39063906
ev@( WithTime _ (PeerStatusChangeFailure (HotToWarm _) TimeoutError)
3907-
, WithTime _ (PeerStatusChangeFailure (HotToWarm _) ActiveCold)
3907+
, WithTime _ (PeerStatusChangeFailure (HotToWarm _) ActiveCold{})
39083908
)
39093909
-> counterexample (show ev)
39103910
$ counterexample (unlines $ map show peerSelectionActionsEvents)

0 commit comments

Comments
 (0)