From e9fc044722820f70fee3f5ef40c521051390e770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C3=B3jtowicz?= Date: Thu, 26 Jun 2025 11:34:00 +0200 Subject: [PATCH 1/3] Trace hot outbound connection durations A trace is emitted whenever a hot outbound peer is demoted or closed (possibly due to an error), giving the duration in seconds of how long the peer has been in hot mode. Some other refactoring to aid comprehensibility --- .../Network/PeerSelection/PeerStateActions.hs | 322 +++++++++--------- .../Network/Diffusion/Testnet/Cardano.hs | 9 +- 2 files changed, 167 insertions(+), 164 deletions(-) diff --git a/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs b/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs index f9a91386fb..9df9d5d333 100644 --- a/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs +++ b/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs @@ -1,7 +1,9 @@ +{-# LANGUAGE BlockArguments #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE KindSignatures #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} @@ -34,10 +36,11 @@ module Ouroboros.Network.PeerSelection.PeerStateActions import Control.Applicative (Alternative) import Control.Concurrent.Class.MonadSTM.Strict import Control.Exception (SomeAsyncException (..), assert) -import Control.Monad (when, (<=<)) +import Control.Monad (join, when, (<=<)) import Control.Monad.Class.MonadAsync import Control.Monad.Class.MonadFork import Control.Monad.Class.MonadThrow +import Control.Monad.Class.MonadTime.SI import Control.Monad.Class.MonadTimer.SI import Control.Concurrent.JobPool (Job (..), JobPool) @@ -428,11 +431,12 @@ awaitAllResults tok bundle = do -- together with their state 'StrictTVar's. -- data PeerConnectionHandle (muxMode :: Mux.Mode) responderCtx peerAddr versionData bytes m a b = PeerConnectionHandle { - pchConnectionId :: ConnectionId peerAddr, - pchPeerStatus :: StrictTVar m PeerStatus, - pchMux :: Mux.Mux muxMode m, - pchAppHandles :: TemperatureBundle (ApplicationHandle muxMode responderCtx peerAddr bytes m a b), - pchVersionData :: !versionData + pchConnectionId :: !(ConnectionId peerAddr), + pchPeerStatus :: !(StrictTVar m PeerStatus), + pchMux :: !(Mux.Mux muxMode m), + pchAppHandles :: !(TemperatureBundle (ApplicationHandle muxMode responderCtx peerAddr bytes m a b)), + pchVersionData :: !versionData, + pchPromotedHotVar :: !(StrictTVar m (Maybe Time)) } mkInitiatorContext :: MonadSTM m @@ -620,14 +624,10 @@ withPeerStateActions PeerStateActionsArguments { then return False else writeTVar stateVar newState >> return True - isNotCoolingOrCold :: StrictTVar m PeerStatus -> STM m Bool - isNotCoolingOrCold stateVar = - (> PeerCooling) <$> readTVar stateVar - peerMonitoringLoop :: PeerConnectionHandle muxMode responderCtx peerAddr versionData ByteString m a b -> m () - peerMonitoringLoop pch@PeerConnectionHandle { pchConnectionId, pchPeerStatus, pchAppHandles } = do + peerMonitoringLoop pch@PeerConnectionHandle { pchConnectionId, pchPeerStatus, pchAppHandles, pchPromotedHotVar } = do -- A first-to-finish synchronisation on all the bundles; As a result -- this is a first-to-finish synchronisation between all the -- mini-protocols runs toward the given peer. @@ -669,10 +669,8 @@ withPeerStateActions PeerStateActionsArguments { Just (WithSomeProtocolTemperature (WithHot MiniProtocolError{})) -> do -- current `pchPeerStatus` must be 'HotPeer' - state <- atomically $ do - peerState <- readTVar pchPeerStatus - _ <- updateUnlessCoolingOrCold pchPeerStatus PeerCooling - return peerState + state <- atomically do + readTVar pchPeerStatus <* updateUnlessCoolingOrCold pchPeerStatus PeerCooling case state of PeerCold -> return () PeerCooling -> return () @@ -687,10 +685,9 @@ withPeerStateActions PeerStateActionsArguments { Just (WithSomeProtocolTemperature (WithEstablished MiniProtocolError{})) -> do -- update 'pchPeerStatus' and log (as the two other transition to -- cold state. - state <- atomically $ do - peerState <- readTVar pchPeerStatus - _ <- updateUnlessCoolingOrCold pchPeerStatus PeerCooling - pure peerState + state <- atomically do + readTVar pchPeerStatus <* updateUnlessCoolingOrCold pchPeerStatus PeerCooling + case state of PeerCold -> return () PeerCooling -> return () @@ -731,7 +728,13 @@ withPeerStateActions PeerStateActionsArguments { -- peerMonitingLoop exit -- - Nothing -> + Nothing -> do + pchPromotedHot <- atomically $ stateTVar pchPromotedHotVar (, Nothing) + case pchPromotedHot of + Just t1 -> do + dt <- diffTime <$> getMonotonicTime <*> pure t1 + traceWith spsTracer (PeerHotDuration pchConnectionId dt) + Nothing -> pure () traceWith spsTracer (PeerStatusChanged (CoolingToCold pchConnectionId)) establishPeerConnection :: JobPool () m (Maybe SomeException) @@ -768,7 +771,8 @@ withPeerStateActions PeerStateActionsArguments { writeTVar (projectBundle SingWarm controlMessageBundle) Continue writeTVar (projectBundle SingEstablished controlMessageBundle) Continue - awaitVarBundle <- atomically $ mkAwaitVars muxBundle + awaitVarBundle <- atomically $ mkAwaitVars muxBundle + pchPromotedHotVar <- newTVarIO Nothing let connHandle = PeerConnectionHandle { @@ -779,7 +783,8 @@ withPeerStateActions PeerStateActionsArguments { muxBundle controlMessageBundle awaitVarBundle, - pchVersionData = versionData + pchVersionData = versionData, + pchPromotedHotVar } startProtocols SingWarm isBigLedgerPeer connHandle @@ -796,9 +801,15 @@ withPeerStateActions PeerStateActionsArguments { Just SomeAsyncException {} -> Nothing Nothing -> Just e) (\e -> do - atomically $ do + promotedHot <- atomically $ do waitForOutboundDemotion spsConnectionManager connId writeTVar peerStateVar PeerCold + stateTVar pchPromotedHotVar (, Nothing) + case promotedHot of + Just t1 -> do + dt <- diffTime <$> getMonotonicTime <*> pure t1 + traceWith spsTracer (PeerHotDuration connId dt) + Nothing -> pure () traceWith spsTracer (PeerMonitoringError connId e) throwIO e) (peerMonitoringLoop connHandle $> Nothing)) @@ -913,106 +924,95 @@ withPeerStateActions PeerStateActionsArguments { connHandle@PeerConnectionHandle { pchConnectionId, pchPeerStatus, - pchAppHandles } = do - -- quiesce warm peer protocols and set hot ones in 'Continue' mode. - wasWarm <- atomically $ do - -- if the peer is cold we can't activate it. - notCold <- isNotCoolingOrCold pchPeerStatus - when notCold $ do - writeTVar (getControlVar SingHot pchAppHandles) Continue - writeTVar (getControlVar SingWarm pchAppHandles) Quiesce - return notCold - when (not wasWarm) $ do - traceWith spsTracer (PeerStatusChangeFailure - (WarmToHot pchConnectionId) - ActiveCold) - throwIO $ ColdActivationException pchConnectionId - - -- start hot peer protocols - startProtocols SingHot isBigLedgerPeer connHandle - - -- Only set the status to PeerHot if the peer isn't PeerCold. - -- This can happen asynchronously between the check above and now. - wasWarm' <- atomically $ updateUnlessCoolingOrCold pchPeerStatus PeerHot - if wasWarm' - then traceWith spsTracer (PeerStatusChanged (WarmToHot pchConnectionId)) - else do - traceWith spsTracer (PeerStatusChangeFailure - (WarmToHot pchConnectionId) - ActiveCold) - throwIO $ ColdActivationException pchConnectionId + pchAppHandles, + pchPromotedHotVar } = do + join . atomically $ do + peerStatus <- readTVar pchPeerStatus + case peerStatus of + PeerWarm -> do + writeTVar (getControlVar SingHot pchAppHandles) Continue + writeTVar (getControlVar SingWarm pchAppHandles) Quiesce + writeTVar pchPeerStatus PeerHot + return $ do + startProtocols SingHot isBigLedgerPeer connHandle + atomically . writeTVar pchPromotedHotVar . (Just $!) =<< getMonotonicTime + traceWith spsTracer (PeerStatusChanged (WarmToHot pchConnectionId)) + _otherwise -> return $ do + traceWith spsTracer (PeerStatusChangeFailure + (WarmToHot pchConnectionId) + (ActiveCold peerStatus)) + throwIO $ ColdActivationException pchConnectionId -- Take a hot peer and demote it to a warm one. + -- this can be raced by 'peerMonitoringLoop' and peer selection demotion activity deactivatePeerConnection :: PeerConnectionHandle muxMode responderCtx peerAddr versionData ByteString m a b -> m () deactivatePeerConnection PeerConnectionHandle { pchConnectionId, pchPeerStatus, pchMux, - pchAppHandles + pchAppHandles, + pchPromotedHotVar } = do - wasCold <- atomically $ do - notCold <- isNotCoolingOrCold pchPeerStatus - when notCold $ do - writeTVar (getControlVar SingHot pchAppHandles) Terminate - writeTVar (getControlVar SingWarm pchAppHandles) Continue - return (not notCold) - when wasCold $ do - -- The governor attempted to demote an already cold peer. - traceWith spsTracer (PeerStatusChangeFailure - (HotToWarm pchConnectionId) - ActiveCold) - throwIO $ ColdDeactivationException pchConnectionId - - - -- Hot protocols should stop within 'spsDeactivateTimeout'. - res <- - timeout spsDeactivateTimeout - (atomically $ awaitAllResults SingHot pchAppHandles) - case res of - Nothing -> do - Mux.stop pchMux - atomically (writeTVar pchPeerStatus PeerCooling) - traceWith spsTracer (PeerStatusChangeFailure - (HotToCooling pchConnectionId) - TimeoutError) - throwIO (DeactivationTimeout pchConnectionId) - - -- some of the hot mini-protocols errored - Just (SomeErrored errs) -> do - -- we don't need to notify the connection manager, we can instead - -- relay on mux property: if any of the mini-protocols errors, mux - -- throws an exception as well. - atomically (writeTVar pchPeerStatus PeerCooling) - traceWith spsTracer (PeerStatusChangeFailure - (HotToCooling pchConnectionId) - (ApplicationFailure errs)) - throwIO (MiniProtocolExceptions errs) - - -- all hot mini-protocols succeeded - Just (AllSucceeded results) -> do - -- we don't notify the connection manager as this connection is still - -- useful to the outbound governor (warm peer). - wasWarm <- atomically $ do - -- Only set the status to PeerWarm if the peer isn't cold - -- (can happen asynchronously). - notCold <- updateUnlessCoolingOrCold pchPeerStatus PeerWarm - when notCold $ do - -- We need to update hot protocols to indicate that they are not - -- running. Preserve the results returned by their previous - -- execution. - modifyTVar (getMiniProtocolsVar SingHot pchAppHandles) - (\_ -> Map.map (pure . NotRunning . Right) results) - return notCold - - if wasWarm - then traceWith spsTracer (PeerStatusChanged (HotToWarm pchConnectionId)) - else do - traceWith spsTracer (PeerStatusChangeFailure - (HotToWarm pchConnectionId) - ActiveCold) - throwIO $ ColdDeactivationException pchConnectionId + join . atomically $ do + peerStatus <- readTVar pchPeerStatus + case peerStatus of + PeerHot -> do + writeTVar (getControlVar SingHot pchAppHandles) Terminate + writeTVar (getControlVar SingWarm pchAppHandles) Continue + pchPromotedHot <- stateTVar pchPromotedHotVar (, Nothing) + return $ do + case pchPromotedHot of + Just t1 -> do + dt <- diffTime <$> getMonotonicTime <*> pure t1 + traceWith spsTracer (PeerHotDuration pchConnectionId dt) + Nothing -> pure () + + -- Hot protocols should stop within 'spsDeactivateTimeout'. + res <- + timeout spsDeactivateTimeout + $ join . atomically $ do + res <- awaitAllResults SingHot pchAppHandles + case res of + AllSucceeded results -> do + modifyTVar (getMiniProtocolsVar SingHot pchAppHandles) + (\_ -> Map.map (pure . NotRunning . Right) results) + stateTVar pchPeerStatus \case + PeerHot -> (traceWith spsTracer (PeerStatusChanged (HotToWarm pchConnectionId)) + , PeerWarm) + x -> (pure () , x) + SomeErrored errs -> + stateTVar pchPeerStatus \status -> + if status <= PeerCooling then + (throwIO (MiniProtocolExceptions errs), status) + else ( traceWith spsTracer (PeerStatusChangeFailure + (HotToCooling pchConnectionId) + (ApplicationFailure errs)) + >> throwIO (MiniProtocolExceptions errs) + , PeerCooling) + + case res of + Nothing -> do + Mux.stop pchMux + trace <- atomically $ updateUnlessCoolingOrCold pchPeerStatus PeerCooling + when trace do + traceWith spsTracer (PeerStatusChangeFailure + (HotToCooling pchConnectionId) + TimeoutError) + throwIO (DeactivationTimeout pchConnectionId) + Just _ -> return () + + -- we could genuinly hit this case due to a race between 'peerMonitoringLoop' + -- and peer selection demotion job + PeerWarm -> return $ pure () + + _otherwise -> + return $ do + traceWith spsTracer (PeerStatusChangeFailure + (HotToWarm pchConnectionId) + (ActiveCold peerStatus)) + throwIO $ ColdDeactivationException pchConnectionId closePeerConnection :: PeerConnectionHandle muxMode responderCtx peerAddr versionData ByteString m a b @@ -1024,57 +1024,56 @@ withPeerStateActions PeerStateActionsArguments { pchAppHandles, pchMux } = do - atomically $ do + peerStatus <- atomically $ do writeTVar (getControlVar SingWarm pchAppHandles) Terminate writeTVar (getControlVar SingEstablished pchAppHandles) Terminate writeTVar (getControlVar SingHot pchAppHandles) Terminate + readTVar pchPeerStatus <* updateUnlessCoolingOrCold pchPeerStatus PeerCooling + + case peerStatus of + ps@PeerCooling -> return ps + ps@PeerCold -> return ps + _otherwise -> do + res <- + timeout spsCloseConnectionTimeout + (atomically $ + (\a b c -> a <> b <> c) + -- note: we use last to finish on hot, warm and + -- established mini-protocols since 'closePeerConnection' + -- is also used by asynchronous demotions, not just + -- /warm → cold/ transition. + <$> awaitAllResults SingHot pchAppHandles + <*> awaitAllResults SingWarm pchAppHandles + <*> awaitAllResults SingEstablished pchAppHandles) + + PeerCooling <$ case res of + Nothing -> do + -- timeout fired + Mux.stop pchMux + traceWith spsTracer (PeerStatusChangeFailure + (WarmToCooling pchConnectionId) + TimeoutError) + + Just (SomeErrored errs) -> do + -- some mini-protocol errored + -- + -- we don't need to notify the connection manager, we can instead + -- rely on mux property: if any of the mini-protocols errors, mux + -- throws an exception as well. + traceWith spsTracer (PeerStatusChangeFailure + (WarmToCooling pchConnectionId) + (ApplicationFailure errs)) + throwIO (MiniProtocolExceptions errs) + + Just AllSucceeded {} -> do + -- all mini-protocols terminated cleanly + -- + -- 'unregisterOutboundConnection' could only fail to demote the peer if + -- connection manager would simultaneously promote it, but this is not + -- possible. + _ <- releaseOutboundConnection spsConnectionManager pchConnectionId + traceWith spsTracer (PeerStatusChanged (WarmToCooling pchConnectionId)) - res <- - timeout spsCloseConnectionTimeout - (atomically $ - (\a b c -> a <> b <> c) - -- note: we use last to finish on hot, warm and - -- established mini-protocols since 'closePeerConnection' - -- is also used by asynchronous demotions, not just - -- /warm → cold/ transition. - <$> awaitAllResults SingHot pchAppHandles - <*> awaitAllResults SingWarm pchAppHandles - <*> awaitAllResults SingEstablished pchAppHandles) - case res of - Nothing -> do - -- timeout fired - Mux.stop pchMux - wasWarm <- atomically (updateUnlessCoolingOrCold pchPeerStatus PeerCooling) - when wasWarm $ - traceWith spsTracer (PeerStatusChangeFailure - (WarmToCooling pchConnectionId) - TimeoutError) - readTVarIO pchPeerStatus - - Just (SomeErrored errs) -> do - -- some mini-protocol errored - -- - -- we don't need to notify the connection manager, we can instead - -- rely on mux property: if any of the mini-protocols errors, mux - -- throws an exception as well. - wasWarm <- atomically (updateUnlessCoolingOrCold pchPeerStatus PeerCooling) - when wasWarm $ - traceWith spsTracer (PeerStatusChangeFailure - (WarmToCooling pchConnectionId) - (ApplicationFailure errs)) - throwIO (MiniProtocolExceptions errs) - - Just AllSucceeded {} -> do - -- all mini-protocols terminated cleanly - -- - -- 'unregisterOutboundConnection' could only fail to demote the peer if - -- connection manager would simultaneously promote it, but this is not - -- possible. - wasWarm <- atomically (updateUnlessCoolingOrCold pchPeerStatus PeerCooling) - when wasWarm $ do - _ <- releaseOutboundConnection spsConnectionManager pchConnectionId - traceWith spsTracer (PeerStatusChanged (WarmToCooling pchConnectionId)) - readTVarIO pchPeerStatus -- -- Utilities @@ -1179,7 +1178,7 @@ data FailureType versionNumber = | HandleFailure !SomeException | MuxStoppedFailure | TimeoutError - | ActiveCold + | ActiveCold !PeerStatus | ApplicationFailure ![MiniProtocolException] deriving Show @@ -1207,4 +1206,5 @@ data PeerSelectionActionsTrace peerAddr vNumber = | PeerMonitoringError (ConnectionId peerAddr) SomeException | PeerMonitoringResult (ConnectionId peerAddr) (Maybe (WithSomeProtocolTemperature FirstToFinishResult)) | AcquireConnectionError SomeException + | PeerHotDuration (ConnectionId peerAddr) DiffTime deriving Show diff --git a/ouroboros-network/testlib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs b/ouroboros-network/testlib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs index f29c11470c..07d55fe2bc 100644 --- a/ouroboros-network/testlib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs +++ b/ouroboros-network/testlib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs @@ -1259,6 +1259,8 @@ prop_peer_selection_action_trace_coverage defaultBearerInfo diffScript = = "AcquireConnectionError: " ++ show (ioe_type ioe) | otherwise = "AcquireConnectionError: " ++ show e + peerSelectionActionsTraceMap (PeerHotDuration _id _dt) = + "PeerHotDuration" eventsSeenNames = map peerSelectionActionsTraceMap events @@ -3879,7 +3881,7 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber = $ evs' numOfActiveColdErrors = length . filter (\case - (PeerStatusChangeFailure HotToWarm{} ActiveCold) + (PeerStatusChangeFailure HotToWarm{} ActiveCold{}) -> True _ -> False) $ evs' @@ -3902,7 +3904,7 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber = in conjoin (zipWith (curry (\case ev@( WithTime _ (PeerStatusChangeFailure (HotToWarm _) TimeoutError) - , WithTime _ (PeerStatusChangeFailure (HotToWarm _) ActiveCold) + , WithTime _ (PeerStatusChangeFailure (HotToWarm _) ActiveCold{}) ) -> counterexample (show ev) $ counterexample (unlines $ map show peerSelectionActionsEvents) @@ -3969,7 +3971,8 @@ prop_diffusion_peer_selection_actions_no_dodgy_traces ioSimTrace traceNumber = WithTime _ (PeerStatusChangeFailure type_ _) -> getConnId type_ WithTime _ (PeerMonitoringError connId _) -> Just connId WithTime _ (PeerMonitoringResult connId _) -> Just connId - WithTime _ (AcquireConnectionError _) -> Nothing) + WithTime _ (AcquireConnectionError _) -> Nothing + WithTime _ (PeerHotDuration connId _) -> Just connId) $ peerSelectionActionsEvents ) From bfcf8afbcbdfccda937e871fa2dbdd5147a639d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C3=B3jtowicz?= Date: Fri, 27 Jun 2025 11:58:45 +0200 Subject: [PATCH 2/3] Added hot connection durations to peer selection debug state Traces durations upon receiving sigusr1 interrupt. --- .../Network/PeerSelection/LedgerPeers/Type.hs | 2 +- .../Cardano/Network/Diffusion.hs | 4 ++- .../Cardano/Network/Diffusion/Handlers.hs | 10 +++++--- .../Network/PeerSelection/Governor/Types.hs | 25 ++++++++++++++++--- .../Network/PeerSelection/PeerStateActions.hs | 10 ++++++++ 5 files changed, 41 insertions(+), 10 deletions(-) diff --git a/ouroboros-network-api/src/Ouroboros/Network/PeerSelection/LedgerPeers/Type.hs b/ouroboros-network-api/src/Ouroboros/Network/PeerSelection/LedgerPeers/Type.hs index 6805c7b2ea..b43801120c 100644 --- a/ouroboros-network-api/src/Ouroboros/Network/PeerSelection/LedgerPeers/Type.hs +++ b/ouroboros-network-api/src/Ouroboros/Network/PeerSelection/LedgerPeers/Type.hs @@ -253,7 +253,7 @@ data IsLedgerPeer = IsLedgerPeer data IsBigLedgerPeer = IsBigLedgerPeer | IsNotBigLedgerPeer - deriving Eq + deriving (Eq, Show) -- | Return ledger state information and ledger peers. -- diff --git a/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion.hs b/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion.hs index 8042494323..4ebdcb9ed8 100644 --- a/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion.hs +++ b/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion.hs @@ -41,6 +41,7 @@ import Ouroboros.Network.NodeToNode (NodeToNodeVersionData (..), RemoteAddress, import Ouroboros.Network.NodeToNode qualified as NodeToNode import Ouroboros.Network.PeerSelection.LedgerPeers.Type (LedgerPeersConsensusInterface (..)) +import Ouroboros.Network.PeerSelection.PeerStateActions import Ouroboros.Network.Protocol.Handshake -- | Main entry point for Cardano data diffusion service. It allows to: @@ -136,7 +137,8 @@ run CardanoNodeArguments { (Diffusion.dcPeerSharing config) readUseBootstrapPeers (Cardano.getLedgerStateJudgement (lpExtraAPI ledgerPeersAPI)) - churnMetrics, + churnMetrics + getPromotedHotTime, daPeerSelectionGovernorArgs = Cardano.Types.cardanoPeerSelectionGovernorArgs Cardano.ExtraPeerSelectionActions { diff --git a/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion/Handlers.hs b/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion/Handlers.hs index abe2eae369..663bc22cf8 100644 --- a/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion/Handlers.hs +++ b/ouroboros-network/cardano-diffusion/Cardano/Network/Diffusion/Handlers.hs @@ -8,6 +8,8 @@ module Cardano.Network.Diffusion.Handlers where +import Control.Monad.Class.MonadTime.SI + import Cardano.Network.PeerSelection.Bootstrap (UseBootstrapPeers) import Cardano.Network.PeerSelection.Governor.PeerSelectionState qualified as Cardano import Cardano.Network.Types (LedgerStateJudgement) @@ -19,7 +21,6 @@ import Ouroboros.Network.PeerSelection.LedgerPeers.Type (UseLedgerPeers) import Ouroboros.Network.PeerSelection.PeerMetric import Ouroboros.Network.PeerSelection.PeerSharing (PeerSharing) #ifdef POSIX -import Control.Monad.Class.MonadTime.SI import Control.Tracer (traceWith) import Ouroboros.Network.ConnectionManager.Core (Trace (..)) import Ouroboros.Network.PeerSelection.Governor.Types @@ -40,6 +41,7 @@ sigUSR1Handler -> STM IO UseBootstrapPeers -> STM IO LedgerStateJudgement -> PeerMetrics IO ntnAddr + -> (peerconn -> STM IO (Maybe Time)) -> ConnectionManager muxMode socket ntnAddr handle handleError IO -> StrictTVar IO (PeerSelectionState @@ -49,7 +51,7 @@ sigUSR1Handler -> IO () #ifdef POSIX sigUSR1Handler tracersExtra getUseLedgerPeers ownPeerSharing getBootstrapPeers - getLedgerStateJudgement metrics connectionManager dbgStateVar = do + getLedgerStateJudgement metrics getPromotedHotTime connectionManager dbgStateVar = do _ <- Signals.installHandler Signals.sigUSR1 (Signals.Catch @@ -66,7 +68,7 @@ sigUSR1Handler tracersExtra getUseLedgerPeers ownPeerSharing getBootstrapPeers useBootstrapPeers <*> readTVar dbgStateVar - let dbgState = makeDebugPeerSelectionState ps up bp lsj am + dbgState <- makeDebugPeerSelectionState ps up bp lsj am getPromotedHotTime now traceWith (dtConnectionManagerTracer tracersExtra) (TrState state) @@ -77,5 +79,5 @@ sigUSR1Handler tracersExtra getUseLedgerPeers ownPeerSharing getBootstrapPeers Nothing return () #else -sigUSR1Handler _ _ _ _ _ _ _ _ = pure () +sigUSR1Handler _ _ _ _ _ _ _ _ _ = pure () #endif diff --git a/ouroboros-network/src/Ouroboros/Network/PeerSelection/Governor/Types.hs b/ouroboros-network/src/Ouroboros/Network/PeerSelection/Governor/Types.hs index 7d806467ce..7e0cb3f75d 100644 --- a/ouroboros-network/src/Ouroboros/Network/PeerSelection/Governor/Types.hs +++ b/ouroboros-network/src/Ouroboros/Network/PeerSelection/Governor/Types.hs @@ -769,18 +769,24 @@ data DebugPeerSelectionState extraState extraFlags extraPeers peeraddr = dpssUpstreamyness :: !(Map peeraddr Int), dpssFetchynessBlocks :: !(Map peeraddr Int), dpssAssociationMode :: !AssociationMode, + dpssHotDurations :: !(Map peeraddr (IsBigLedgerPeer, DiffTime)), dpssExtraState :: !extraState } deriving Show makeDebugPeerSelectionState - :: PeerSelectionState extraState extraFlags extraPeers peeraddr peerconn + :: (Ord peeraddr, MonadSTM m) + => PeerSelectionState extraState extraFlags extraPeers peeraddr peerconn -> Map peeraddr Int -> Map peeraddr Int -> extraDebugState -> AssociationMode - -> DebugPeerSelectionState extraDebugState extraFlags extraPeers peeraddr -makeDebugPeerSelectionState PeerSelectionState {..} up bp es am = - DebugPeerSelectionState { + -> (peerconn -> STM m (Maybe Time)) + -> Time + -> m (DebugPeerSelectionState extraDebugState extraFlags extraPeers peeraddr) +makeDebugPeerSelectionState PeerSelectionState {..} up bp es am getPromotedHotTime now = do + let activeMap = EstablishedPeers.toMap establishedPeers `Map.restrictKeys` activePeers + dpssHotDurations <- Map.traverseMaybeWithKey getDiffTimes activeMap + return DebugPeerSelectionState { dpssTargets = targets , dpssLocalRootPeers = localRootPeers , dpssPublicRootPeers = publicRootPeers @@ -802,8 +808,19 @@ makeDebugPeerSelectionState PeerSelectionState {..} up bp es am = , dpssUpstreamyness = up , dpssFetchynessBlocks = bp , dpssAssociationMode = am + , dpssHotDurations , dpssExtraState = es } + where + getDiffTimes peeraddr peerconn = do + t1 <- atomically $ getPromotedHotTime peerconn + case t1 of + Nothing -> return Nothing + Just t1' -> + let !dt = now `diffTime` t1' + in if Set.member peeraddr (PublicRootPeers.getBigLedgerPeers publicRootPeers) + then return . Just $ (IsBigLedgerPeer, dt) + else return . Just $ (IsNotBigLedgerPeer, dt) -- | Public 'PeerSelectionState' that can be accessed by Peer Sharing -- mechanisms without any problem. diff --git a/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs b/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs index 9df9d5d333..6942d5f976 100644 --- a/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs +++ b/ouroboros-network/src/Ouroboros/Network/PeerSelection/PeerStateActions.hs @@ -20,6 +20,7 @@ module Ouroboros.Network.PeerSelection.PeerStateActions -- * Create PeerStateActions PeerStateActionsArguments (..) , PeerConnectionHandle + , getPromotedHotTime , withPeerStateActions , pchPeerSharing -- * Exceptions @@ -439,6 +440,15 @@ data PeerConnectionHandle (muxMode :: Mux.Mode) responderCtx peerAddr versionDat pchPromotedHotVar :: !(StrictTVar m (Maybe Time)) } +-- | Retrieve the time the remote peer has been promoted to hot state +-- or Nothing if either the peer was not promoted or is being currently demoted +-- +getPromotedHotTime :: (MonadSTM m) + => PeerConnectionHandle muxMode responderCtx peerAddr versionData bytes m a b + -> STM m (Maybe Time) +getPromotedHotTime PeerConnectionHandle { pchPromotedHotVar } = + readTVar pchPromotedHotVar + mkInitiatorContext :: MonadSTM m => SingProtocolTemperature pt -> IsBigLedgerPeer From c7188631c086f355515135f67732502b67bb88a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C3=B3jtowicz?= Date: Thu, 26 Jun 2025 11:36:31 +0200 Subject: [PATCH 3/3] changelog update --- ouroboros-network/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ouroboros-network/CHANGELOG.md b/ouroboros-network/CHANGELOG.md index 62efee83d2..b090302ac6 100644 --- a/ouroboros-network/CHANGELOG.md +++ b/ouroboros-network/CHANGELOG.md @@ -39,6 +39,9 @@ - Renamed `Applications` to `DiffusionApplications` - `runM` function now receives `ExtraParameters` as an argument - Configurable Mux Egress Poll Interval +- Added `pchPromotedHotVar` to `PeerConnectionHandle` to track when a peer has been promoted to hot +- Added tag `PeerHotDuration` to `PeerSelectionActionsTrace` to indicate how long a remote + peer has been in hot mode until it was either demoted or closed. ## 0.21.2.0 -- 2025-06-02