Skip to content

Commit

Permalink
debug
Browse files Browse the repository at this point in the history
Signed-off-by: Dean Roehrich <[email protected]>
  • Loading branch information
roehrich-hpe committed Apr 1, 2024
1 parent f9f9cc6 commit e5a6aa8
Showing 1 changed file with 30 additions and 28 deletions.
58 changes: 30 additions & 28 deletions testsuite/integration/src/tests/slurmctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@

import os
import time
import docker
import re
import docker
from tenacity import *

# Submitting jobs can fail, occasionally, when the DWS webhook rejects the
Expand Down Expand Up @@ -53,7 +53,7 @@ def exec_run(self, cmd):
def submit_job(self, script_path):
# The --wait option could be used here. However, other tests need to
# asynchronously track the job status
cmd = f"sbatch -v --output={script_path}.out --error={script_path}.error.out {script_path}"
cmd = f"sbatch -vv --output={script_path}.out --error={script_path}.error.out {script_path}"
rc, out = self.exec_run(cmd)
if rc != 0:
print("BEGIN Job submission error")
Expand Down Expand Up @@ -108,33 +108,35 @@ def scontrol_show_job(self, jobId):
return key_val[1], out
assert False, "Could not parse state from: " + out

# NOTE(review): this span is a rendered diff whose +/- markers and indentation
# were stripped, so two successive forms of the same method appear back to
# back. Going by the "30 additions & 28 deletions" summary, the @retry
# decorator and the first body (through the "END slurmctld log" print) look
# like the removed side and the `cnt` polling loop looks like the added side;
# the `def` line and the final assert are shared. Confirm against the
# repository before relying on this split.
#
# Earlier form: retry policy supplied by tenacity -- a fixed 2 second wait
# between attempts, stopping after 5 attempts. Presumably the retry is
# triggered by the trailing assert raising; verify against tenacity defaults.
@retry(
wait=wait_fixed(2),
stop=stop_after_attempt(5)
)
def wait_until_job_has_been_x(self, jobId, job_state_wanted, script_path):
# Check that job `jobId` has reached `job_state_wanted`, asserting if not.
# `script_path` is only used to locate the job's "<script_path>.out" and
# "<script_path>.error.out" files when diagnostics are printed.
#
# Earlier form: one lookup per call. scontrol_show_job() yields the parsed
# job state plus the raw text it was parsed from.
job_state, out = self.scontrol_show_job(jobId)
print(f"Found \"{job_state}\" in JobState")
if job_state == "FAILED" and job_state_wanted == "COMPLETED":
# We're in the weeds. Drop a clue.
print("BEGIN scontrol show job")
print(out)
print("END scontrol show job")
print("BEGIN get workflows")
rc,out = self.exec_run("kubectl --kubeconfig /etc/slurm/slurm-dws.kubeconfig get workflows -A")
print(f"rc = {rc}\n{out}")
print("END get workflows")
print("BEGIN job output file")
# NOTE(review): `out` is captured here but never printed, so nothing appears
# between the BEGIN/END markers unless exec_run() echoes output itself --
# confirm; a print(out) looks to be missing.
rc,out = self.exec_run(f"cat {script_path}.out")
print("END job output file")
print("BEGIN job error output file")
# NOTE(review): same as above -- the captured `out` is not printed.
rc,out = self.exec_run(f"cat {script_path}.error.out")
print("END job error output file")
print("BEGIN slurmctld log")
cmd = ['docker', 'logs', 'slurmctld']
os.system(' '.join(cmd))
print("END slurmctld log")

# Later form: poll in-line instead of relying on the decorator -- at most
# 5 lookups with a 2 second sleep after each unsuccessful one.
cnt = 0
while cnt < 5:
job_state, out = self.scontrol_show_job(jobId)
print(f"Found \"{job_state}\" in JobState")
if job_state == job_state_wanted:
# Reached the wanted state; leave the loop and fall through to the final assert.
break
if job_state == "FAILED" and job_state_wanted == "COMPLETED":
# We're in the weeds. Drop a clue.
print("BEGIN scontrol show job")
print(out)
print("END scontrol show job")
print("BEGIN get workflows")
rc,out = self.exec_run("kubectl --kubeconfig /etc/slurm/slurm-dws.kubeconfig get workflows -A")
print(f"rc = {rc}\n{out}")
print("END get workflows")
print("BEGIN job output file")
# NOTE(review): `out` is captured here but never printed, so nothing appears
# between the BEGIN/END markers unless exec_run() echoes output itself --
# confirm; a print(out) looks to be missing.
rc,out = self.exec_run(f"cat {script_path}.out")
print("END job output file")
print("BEGIN job error output file")
# NOTE(review): same as above -- the captured `out` is not printed.
rc,out = self.exec_run(f"cat {script_path}.error.out")
print("END job error output file")
print("BEGIN slurmctld log")
# "2>&1" folds the command's stderr into stdout.
os.system("docker logs slurmctld 2>&1")
print("END slurmctld log")
# This branch is only entered when the state is FAILED and COMPLETED was
# wanted, so the assert below always fails here: the diagnostics are
# printed once and polling ends immediately.
assert job_state == job_state_wanted # stop looping now

cnt += 1
time.sleep(2)
# Final verdict, shared by both forms: fails if the wanted state was not observed.
assert job_state == job_state_wanted

@retry(
Expand Down

0 comments on commit e5a6aa8

Please sign in to comment.