Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rethrow policy for 0.19 release #5076

Merged
merged 2 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ouroboros-network/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@

### Non-breaking changes

## 0.19.0.3 -- 2025-03-11

### Non-breaking changes

* `IOError` rethrow policy changed.

## 0.19.0.2 -- 2025-02-03

### Breaking changes
Expand Down
2 changes: 1 addition & 1 deletion ouroboros-network/ouroboros-network.cabal
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cabal-version: 3.0
name: ouroboros-network
version: 0.19.0.2
version: 0.19.0.3
synopsis: A networking layer for the Ouroboros blockchain protocol
description: A networking layer for the Ouroboros blockchain protocol.
license: Apache-2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ import Test.QuickCheck
import Test.QuickCheck.Monoids
import Test.Tasty
import Test.Tasty.QuickCheck
import Ouroboros.Network.Diffusion.P2P (isFatal)

tests :: TestTree
tests =
Expand Down Expand Up @@ -1576,7 +1575,6 @@ unit_4191 = testWithIOSim prop_diffusion_dns_can_recover 125000 absInfo script
--
prop_connect_failure :: AbsIOError -> Property
prop_connect_failure (AbsIOError ioerr) =
label (if isFatal (ioe_type ioerr) then "fatal IOError" else "non-fatal IOError") $
testWithIOSim
(\trace _ ->
let -- events generated by `nodeAddr`
Expand All @@ -1594,18 +1592,10 @@ prop_connect_failure (AbsIOError ioerr) =
evs = eventsToList (selectDiffusionSimulationTrace events)
in counterexample (Trace.ppTrace show (ppSimEvent 0 0 0) . Trace.take noEvents $ trace)
. counterexample (show evs)
. (if isFatal (ioe_type ioerr)
then -- verify that the node was killed by the right exception
any (\case
TrErrored e | Just (ExceptionInHandler _ e') <- fromException e
, Just e'' <- fromException e'
, e'' == ioerr
-> True
_ -> False)
else -- verify that the ndoe was not killed by the `ioerr` exception
all (\case
TrErrored {} -> False
_ -> True)
. ( -- verify that the node was not killed by the `IOError`
all (\case
TrErrored {} -> False
_ -> True)
)
. map snd
$ evs
Expand Down
66 changes: 22 additions & 44 deletions ouroboros-network/src/Ouroboros/Network/Diffusion/P2P.hs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ module Ouroboros.Network.Diffusion.P2P
, Interfaces (..)
, runM
, NodeToNodePeerConnectionHandle
, isFatal
-- * Re-exports
, AbstractTransitionTrace
, RemoteTransitionTrace
Expand Down Expand Up @@ -56,7 +55,7 @@ import Data.Maybe (catMaybes, maybeToList)
import Data.Proxy (Proxy (..))
import Data.Typeable (Typeable)
import Data.Void (Void)
import GHC.IO.Exception (IOException (..), IOErrorType (..))
import GHC.IO.Exception (IOException (..))
import System.Exit (ExitCode)
import System.Random (StdGen, newStdGen, split)
#ifdef POSIX
Expand Down Expand Up @@ -737,11 +736,28 @@ runM Interfaces
Just (_ :: IOManagerError) -> ShutdownNode
Nothing -> mempty)
<>
-- IOError rethrow-policy
--
-- After a critical bug, we decided that `IOError` policy should only
-- kill the connection which thrown it. `IOError`s are not propagated.
-- There's a risk that one could arm an attack if one discovers
-- a mechanism to trigger fatal `IOError`s, e.g. through a kernel bug.
--
-- It is responsibility for an SPO to monitor the node if it is making
-- progress and have enough resources to do so, e.g. if it has enough
-- memory, file descriptors.
--
-- The `ouroboros-network` guarantees running on a fixed number of file
-- descriptors given a topology file, see
-- https://github.com/IntersectMBO/ouroboros-network/issues/4585#issuecomment-1591777447
-- There's also a calculation for `ouroboros-consensus`, see
-- https://github.com/IntersectMBO/ouroboros-consensus/issues/20#issuecomment-1514554680
-- File descriptors could be drained by the tracing system in
-- `cardano-node` (such a bug existed), or even an external process.
--
RethrowPolicy (\_ctx err ->
case fromException err of
Just IOError { ioe_type } ->
if isFatal ioe_type then ShutdownNode
else mempty
case fromException err :: Maybe IOException of
Just {} -> mempty
Nothing -> mempty)
<>
RethrowPolicy (\ctx err -> case (ctx, fromException err) of
Expand Down Expand Up @@ -1311,44 +1327,6 @@ run tracers tracersExtra args argsExtra apps appsExtra = do
}
tracers tracersExtra args argsExtra apps appsExtra

--
-- Fatal Errors
--
-- If we are out of file descriptors (either because we exhausted
-- process or system limit) we should shut down the node and let the
-- operator investigate.
--
-- Refs:
-- * https://hackage.haskell.org/package/ghc-internal-9.1001.0/docs/src/GHC.Internal.Foreign.C.Error.html#errnoToIOError
-- * man socket.2
-- * man connect.2
-- * man accept.2
-- NOTE: many `connect` and `accept` exceptions are classified as
-- `OtherError`, here we only distinguish fatal IO errors (e.g.
-- ones that propagate to the main thread).
-- NOTE: we don't use the rethrow policy for `accept` calls, where
-- all but `ECONNABORTED` are fatal exceptions.
--
-- NOTE: UnsupportedOperation error type is not considered fatal since it can
-- happen on a race condition between the connect and accept call between two
-- nodes that have each other as local roots.
--
isFatal :: IOErrorType -> Bool
isFatal ResourceExhausted = True
-- EAGAIN -- connect, accept
-- EMFILE -- socket, accept
-- ENFILE -- socket, accept
-- ENOBUFS -- socket, accept
-- ENOMEM -- socket, accept
isFatal InvalidArgument = True
-- EINVAL -- socket, accept
-- ENOTSOCK -- connect
-- EBADF -- connect, accept
isFatal ProtocolError = True
-- EPROTONOSUPPOPRT -- socket
-- EPROTO -- accept
isFatal _ = False

--
-- Data flow
--
Expand Down
Loading