Slurm traceback not complete
On Slurm we get the full traceback, including the cause of the exception:
INFO: underlay of /etc/localtime required more than 50 (113) bind mounts
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/ewokscore/task.py", line 401, in execute
self.run()
File "/home/esrf/denolf/ewoks_parallel/tasks_config.py", line 113, in run
res = ai.integrate1d(
File "/usr/local/lib/python3.10/dist-packages/pyFAI/azimuthalIntegrator.py", line 1328, in integrate1d_ng
integr = method.class_funct_ng.klass(csr_integr.lut,
File "/usr/local/lib/python3.10/dist-packages/pyFAI/opencl/azim_csr.py", line 170, in __init__
self.compile_kernels()
File "/usr/local/lib/python3.10/dist-packages/pyFAI/opencl/azim_csr.py", line 283, in compile_kernels
OpenclProcessing.compile_kernels(self, kernels, compile_options)
File "/usr/local/lib/python3.10/dist-packages/silx/opencl/processing.py", line 306, in compile_kernels
self.program = pyopencl.Program(self.ctx, kernel_src).build(
File "/usr/lib/python3/dist-packages/pyopencl/__init__.py", line 536, in build
self._prg, was_cached = self._build_and_catch_errors(
File "/usr/lib/python3/dist-packages/pyopencl/__init__.py", line 584, in _build_and_catch_errors
raise err
pyopencl._cl.RuntimeError: clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE
Build on <pyopencl.Device 'pthread-AMD EPYC 9654 96-Core Processor' on 'Portable Computing Language' at 0x55ee54fdee30>:
error: unknown target CPU 'generic'
Device pthread-AMD EPYC 9654 96-Core Processor failed to build the program, log: error: unknown target CPU 'generic'
(options: -D NBINS=2000 -D NIMAGE=1023183 -I /usr/lib/python3/dist-packages/pyopencl/cl)
(source saved as /tmp/tmp62bo0en5.cl)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<stdin>", line 50, in <module>
File "/usr/local/lib/python3.10/dist-packages/ewoks/bindings.py", line 78, in execute_graph
result = mod.execute_graph(graph, execinfo=execinfo, **execute_options)
File "/usr/local/lib/python3.10/dist-packages/ewokscore/events/contexts.py", line 25, in wrapper
return method(*args, execinfo=execinfo, **kw)
File "/usr/local/lib/python3.10/dist-packages/ewokscore/bindings.py", line 56, in execute_graph
return sequential.execute_graph(taskgraph.graph, **execute_options)
File "/usr/local/lib/python3.10/dist-packages/ewokscore/graph/execute/sequential.py", line 122, in execute_graph
task.execute(
File "/usr/local/lib/python3.10/dist-packages/ewokscore/task.py", line 407, in execute
raise RuntimeError(f"Task '{self.label}' failed") from e
RuntimeError: Task 'node_openintegratesave' failed
In the worker logs, the cause of the exception is missing: we only see the top-level exception and its traceback.
[2024-02-23 20:20:51,035: ERROR/MainProcess] Task ewoksjob.apps.ewoks.execute_graph[d7314a07-5ad9-405c-a0b8-8479f6ab44f9] raised unexpected: RuntimeError("Task 'node_openintegratesave' failed")
Traceback (most recent call last):
File "/home/denolf/dev/celery/celery/app/trace.py", line 453, in trace_task
R = retval = fun(*args, **kwargs)
File "/home/denolf/dev/celery/celery/app/trace.py", line 736, in __protected_call__
return self.run(*args, **kwargs)
File "/home/denolf/dev/ewoksjob/src/ewoksjob/apps/ewoks.py", line 26, in new_celery_task
return celery_task(self, *args, **kwargs)
File "/home/denolf/dev/ewoksjob/src/ewoksjob/apps/ewoks.py", line 52, in new_celery_task
return executor(ewoks_task, *args, **kwargs)
File "/home/denolf/dev/ewoksjob/src/ewoksjob/worker/slurm.py", line 82, in executor
return future.result()
File "/home/denolf/dev/pyslurmutils/src/pyslurmutils/client/job_io/tcp_io.py", line 67, in result
errors.reraise_remote_exception_from_tb(*result, tb)
File "/home/denolf/dev/pyslurmutils/src/pyslurmutils/client/errors.py", line 42, in reraise_remote_exception_from_tb
raise remote_exception_from_tb(exc_cls, exc_msg, tb)
File "/usr/local/lib/python3.10/dist-packages/ewokscore/task.py", line 401, in execute
File "/home/esrf/denolf/ewoks_parallel/tasks_config.py", line 113, in run
File "/usr/local/lib/python3.10/dist-packages/pyFAI/azimuthalIntegrator.py", line 1328, in integrate1d_ng
File "/usr/local/lib/python3.10/dist-packages/pyFAI/opencl/azim_csr.py", line 170, in __init__
File "/usr/local/lib/python3.10/dist-packages/pyFAI/opencl/azim_csr.py", line 283, in compile_kernels
File "/usr/local/lib/python3.10/dist-packages/silx/opencl/processing.py", line 306, in compile_kernels
File "/usr/lib/python3/dist-packages/pyopencl/__init__.py", line 536, in build
File "/usr/lib/python3/dist-packages/pyopencl/__init__.py", line 584, in _build_and_catch_errors
RuntimeError: Task 'node_openintegratesave' failed
Edited by Wout De Nolf