Test suite: Add coverage for some "unhappy" paths (error paths)

JuliaParallel · Feb 9, 2025 · 29acb23 · 29acb23
1 parent 19e1470
commit 29acb23
Show file tree

Hide file tree

Showing 4 changed files with 131 additions and 1 deletion.
diff --git a/test/error_path_intentionally_fail.jl b/test/error_path_intentionally_fail.jl
@@ -0,0 +1,38 @@
+# Unhappy path: the `srun` found on PATH fails right away.
+#
+# A stub `srun` (a bash script that echoes one line to stdout, one line to stderr,
+# and then exits with status 1) is written into a temporary `bin` directory, which
+# is placed at the front of PATH. `Distributed.addprocs` is then expected to throw.
+mktempdir() do scratch_dir
+  stub_bindir = joinpath(scratch_dir, "bin")
+  mkpath(stub_bindir)
+  stub_srun = joinpath(stub_bindir, "srun")
+
+  # One entry per line of the generated script.
+  # (Insert "set -x" after the `set` line to trace the stub while debugging.)
+  script_lines = [
+    "#!/usr/bin/env bash",
+    "set -euf -o pipefail",
+    "echo [stdout] fake-srun: INTENTIONALLY ERROR-ING",
+    "echo [stderr] fake-srun: INTENTIONALLY ERROR-ING >&2",
+    "exit 1",
+  ]
+  open(stub_srun, "w") do io
+    foreach(line -> println(io, line), script_lines)
+  end
+  chmod(stub_srun, 0o700) # chmod +x
+
+  # PATH entries are joined with ';' on Windows and ':' everywhere else.
+  path_list_separator = Sys.iswindows() ? ';' : ':'
+  search_path = if haskey(ENV, "PATH")
+    string(stub_bindir, path_list_separator, ENV["PATH"])
+  else
+    stub_bindir
+  end
+
+  # NOTE(review): the SLURM_* values are presumably what SlurmManager reads to
+  # believe it is inside a Slurm allocation -- confirm against the package source.
+  new_env = Dict{String, String}(
+    "SLURM_NTASKS" => "8",
+    "SLURM_JOB_ID" => "1234",
+    "PATH" => search_path,
+  )
+
+  @info "with old PATH" Sys.which("srun")
+  withenv(new_env...) do
+    @info "with new PATH" Sys.which("srun")
+
+    # The exception type expected from `addprocs` depends on the Julia version.
+    T_expected = Base.VERSION >= v"1.2-" ? TaskFailedException : Base.IOError
+
+    mgr = SlurmClusterManager.SlurmManager()
+    @test_throws T_expected Distributed.addprocs(mgr)
+  end
+end
diff --git a/test/error_path_manager_timeout.jl b/test/error_path_manager_timeout.jl
@@ -0,0 +1,54 @@
+# Unhappy path: the manager's launch timeout expires before `srun` produces output.
+#
+# A stub `srun` is placed at the front of PATH. It writes a notice to stderr, sleeps
+# for 15 seconds, and only afterwards prints to stdout/stderr and exits with status 1.
+# The manager is created with `launch_timeout = 2.0`, and `Distributed.addprocs` is
+# expected to throw with an innermost `ErrorException("launch_timeout exceeded")`.
+mktempdir() do scratch_dir
+  stub_bindir = joinpath(scratch_dir, "bin")
+  mkpath(stub_bindir)
+  stub_srun = joinpath(stub_bindir, "srun")
+
+  # One entry per line of the generated script.
+  # (Insert "set -x" after the `set` line to trace the stub while debugging.)
+  script_lines = [
+    "#!/usr/bin/env bash",
+    "set -euf -o pipefail",
+    # Before sleeping, write to stderr only: printing to stdout here would take a
+    # different error path than the one this test is meant to exercise.
+    "echo [stderr] fake-srun: sleeping for 15 seconds... >&2",
+    "sleep 15",
+    "echo [stdout] fake-srun: INTENTIONALLY ERROR-ING",
+    "echo [stderr] fake-srun: INTENTIONALLY ERROR-ING >&2",
+    "exit 1",
+  ]
+  open(stub_srun, "w") do io
+    foreach(line -> println(io, line), script_lines)
+  end
+  chmod(stub_srun, 0o700) # chmod +x
+
+  # PATH entries are joined with ';' on Windows and ':' everywhere else.
+  path_list_separator = Sys.iswindows() ? ';' : ':'
+  search_path = if haskey(ENV, "PATH")
+    string(stub_bindir, path_list_separator, ENV["PATH"])
+  else
+    stub_bindir
+  end
+
+  # NOTE(review): the SLURM_* values are presumably what SlurmManager reads to
+  # believe it is inside a Slurm allocation -- confirm against the package source.
+  new_env = Dict{String, String}(
+    "SLURM_NTASKS" => "8",
+    "SLURM_JOB_ID" => "1234",
+    "PATH" => search_path,
+  )
+
+  @info "with old PATH" Sys.which("srun")
+  withenv(new_env...) do
+    @info "with new PATH" Sys.which("srun")
+
+    # The innermost exception is the same on every Julia version; only the type of
+    # the exception that `addprocs` throws directly differs.
+    expected_inner_ex_INSTANCE = ErrorException("launch_timeout exceeded")
+    expected_outer_ex_T = Base.VERSION >= v"1.2-" ? TaskFailedException : ErrorException
+
+    # 2.0 is well below the stub's 15-second sleep (presumably seconds -- confirm).
+    mgr = SlurmClusterManager.SlurmManager(; launch_timeout = 2.0)
+    test_result = @test_throws expected_outer_ex_T Distributed.addprocs(mgr)
+
+    # Additionally check the wrapped exception, not just the outer type.
+    cfg = ConfigForTestingTaskFailedException(;
+      expected_outer_ex_T = expected_outer_ex_T,
+      expected_inner_ex_INSTANCE = expected_inner_ex_INSTANCE,
+    )
+    test_task_failed_exception(test_result, cfg)
+  end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -6,7 +6,7 @@ import Distributed
 import Test
 
 # Bring some names into scope, just for convenience:
-using Test: @testset, @test, @test_throws, @test_logs
+using Test: @testset, @test, @test_throws, @test_logs, @test_skip, @test_broken
 
 const original_JULIA_DEBUG = strip(get(ENV, "JULIA_DEBUG", ""))
 if isempty(original_JULIA_DEBUG)
@@ -15,10 +15,21 @@ else
   ENV["JULIA_DEBUG"] = original_JULIA_DEBUG * ",SlurmClusterManager"
 end
 
+include("util.jl")
+
 @testset "SlurmClusterManager.jl" begin
   @testset "Unit tests" begin
     include("unit.jl")
   end
+
+  @testset "Test some unhappy paths (error paths)" begin
+    @testset "intentionally fail" begin
+      include("error_path_intentionally_fail.jl")
+    end
+    @testset "manager's launch timeout" begin
+      include("error_path_manager_timeout.jl")
+    end
+  end
 
   # test that slurm is available
   @test !(Sys.which("sinfo") === nothing)

diff --git a/test/util.jl b/test/util.jl
@@ -0,0 +1,27 @@
+# Return the `value` field of a passing test result.
+function extract_test_result_value(test_result::Test.Pass)
+  return test_result.value
+end
+
+# Peel task-failure wrappers off an exception until a leaf exception is reached.
+#
+# `ErrorException` and `Base.IOError` are treated as leaves and returned unchanged.
+# No method exists for any other exception type.
+recursively_unwrap_ex(ex::Union{ErrorException, Base.IOError}) = ex
+
+# This method is only defined on Julia versions that satisfy the guard below.
+@static if Base.VERSION >= v"1.2-"
+  # Continue unwrapping with the exception recorded on the failed task.
+  recursively_unwrap_ex(outer_ex::TaskFailedException) =
+    recursively_unwrap_ex(outer_ex.task.exception)
+end
+
+# Expectations consumed by `test_task_failed_exception`.
+# Both fields are unconstrained (`Any`); `Base.@kwdef` supplies the keyword constructor.
+Base.@kwdef struct ConfigForTestingTaskFailedException
+  # Type that the outermost thrown exception is checked against with `isa`.
+  expected_outer_ex_T::Any
+  # Instance that the fully unwrapped exception is compared with (by type and by `==`).
+  expected_inner_ex_INSTANCE::Any
+end
+
+# Check the exception held by a passing test result against `cfg`.
+#
+# Three `@test`s are recorded: the outer exception is an instance of
+# `cfg.expected_outer_ex_T`; the unwrapped exception has the same type as
+# `cfg.expected_inner_ex_INSTANCE`; and the unwrapped exception is `==` to it.
+# Returns `nothing`.
+function test_task_failed_exception(
+  test_result::Test.Pass,
+  cfg::ConfigForTestingTaskFailedException,
+)
+  # The outer exception is whatever the passing test result stored in `value`.
+  observed_outer_ex = test_result.value
+  @test observed_outer_ex isa cfg.expected_outer_ex_T
+
+  # Strip any task-failure wrappers before comparing with the expected instance.
+  observed_inner_ex = recursively_unwrap_ex(observed_outer_ex)
+  @test observed_inner_ex isa typeof(cfg.expected_inner_ex_INSTANCE)
+  @test observed_inner_ex == cfg.expected_inner_ex_INSTANCE
+
+  return nothing
+end