Monitor: importance of seeing events from job, workflow and node contexts
A job failed with the following error message:
Traceback (most recent call last):
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/events/contexts.py", line 88, in _context
yield execinfo
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/events/contexts.py", line 54, in job_context
yield execinfo
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewoks/bindings.py", line 78, in execute_graph
result = mod.execute_graph(graph, execinfo=execinfo, **execute_options)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/events/contexts.py", line 25, in wrapper
return method(*args, execinfo=execinfo, **kw)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewoksppf/bindings.py", line 600, in execute_graph
return ppfgraph.run(**execute_options)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewoksppf/bindings.py", line 569, in run
raise RuntimeError(ex["errorMessage"])
RuntimeError: Task 'dozor submit SLURM job' failed
This is not very informative: it only tells you which task failed, not why it failed. When you look at the error tracebacks of the events from the other contexts, for example with:
from ewoksjob.events.readers import instantiate_reader
reader = instantiate_reader("redis://ewoksserver1:25004/0")
for event in reader.get_events(job_id = "2f3b56d2-503b-48e0-9f9f-56d334f9677f"):
error_traceback = event["error_traceback"]
if error_traceback:
print("---------------", event["context"])
print(error_traceback)
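you get more information about what the actual problem is (here the node and workflow contexts reveal the root cause, `FileNotFoundError: [Errno 2] No such file or directory: 'sbatch'`, which the job context does not show):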
--------------- node
Traceback (most recent call last):
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/task.py", line 401, in execute
self.run()
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/ppftasks.py", line 30, in run
result = method(**method_kwargs)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/bes/src/bes/actors/submitSLURMJob.py", line 50, in run
slurmScriptPath, slurmJobId, stdout, stderr = mxnice.submitJobToSLURM(
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/bes/src/bes/workflow_lib/mxnice.py", line 85, in submitJobToSLURM
stdout, stderr = runCommand("sbatch {0}".format(slurmScriptPath))
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/bes/src/bes/workflow_lib/mxnice.py", line 37, in runCommand
proc = subprocess.Popen(
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/subprocess.py", line 951, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/subprocess.py", line 1821, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'sbatch'
--------------- workflow
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/task.py", line 401, in execute
self.run()
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/ppftasks.py", line 30, in run
result = method(**method_kwargs)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/bes/src/bes/actors/submitSLURMJob.py", line 50, in run
slurmScriptPath, slurmJobId, stdout, stderr = mxnice.submitJobToSLURM(
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/bes/src/bes/workflow_lib/mxnice.py", line 85, in submitJobToSLURM
stdout, stderr = runCommand("sbatch {0}".format(slurmScriptPath))
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/bes/src/bes/workflow_lib/mxnice.py", line 37, in runCommand
proc = subprocess.Popen(
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/subprocess.py", line 951, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/subprocess.py", line 1821, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'sbatch'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/concurrent/futures/process.py", line 246, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/pypushflow/concurrent/interrupt/process.py", line 14, in task_main
return fn(*args, **kwargs)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewoksppf/ppfrunscript.py", line 27, in run
task.execute()
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/task.py", line 407, in execute
raise RuntimeError(f"Task '{self.label}' failed") from e
RuntimeError: Task 'dozor submit SLURM job' failed
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/pypushflow/concurrent/process.py", line 78, in cb
result = future.result()
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/concurrent/futures/_base.py", line 439, in result
return self.__get_result()
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/concurrent/futures/_base.py", line 391, in __get_result
raise self._exception
RuntimeError: Task 'dozor submit SLURM job' failed
--------------- job
Traceback (most recent call last):
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/events/contexts.py", line 88, in _context
yield execinfo
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/events/contexts.py", line 54, in job_context
yield execinfo
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewoks/bindings.py", line 78, in execute_graph
result = mod.execute_graph(graph, execinfo=execinfo, **execute_options)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewokscore/events/contexts.py", line 25, in wrapper
return method(*args, execinfo=execinfo, **kw)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewoksppf/bindings.py", line 600, in execute_graph
return ppfgraph.run(**execute_options)
File "/cvmfs/sb.esrf.fr/software/packages/ubuntu20.04/x86_64/bes/20230110/id30a2/miniconda3/envs/id30a2/lib/python3.9/site-packages/ewoksppf/bindings.py", line 569, in run
raise RuntimeError(ex["errorMessage"])
RuntimeError: Task 'dozor submit SLURM job' failed