Skip to content

Commit 5a74eee

Browse files
rst0git authored and avagin committed
cuda: unlock on timeout error
When attempting to checkpoint a container with CUDA processes, CRIU could fail with the following error:

    Error (criu/cr-dump.c:1791): Timeout reached. Try to interrupt: 1
    Error (cuda_plugin.c:143): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call
    Error (cuda_plugin.c:384): cuda_plugin: PAUSE_DEVICES failed with

In this situation, the target process is locked, but CRIU fails due to a timeout and exits with an error. We need to make sure that the target PID is unlocked in such a case.

Signed-off-by: Radostin Stoyanov <[email protected]>
1 parent 5ba1f84 commit 5a74eee

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

plugins/cuda/cuda_plugin.c

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
#include "cr_options.h"
55
#include "pid.h"
66
#include "proc_parse.h"
7+
#include "seize.h"
78

89
#include <common/list.h>
910
#include <compel/infect.h>
@@ -379,18 +380,23 @@ int cuda_plugin_pause_devices(int pid)
379380
int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf));
380381
if (status) {
381382
pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
383+
if (alarm_timeouted())
384+
goto unlock;
382385
return -1;
383386
}
387+
384388
if (add_pid_to_buf(&cuda_pids, pid)) {
385389
pr_err("unable to track paused pid %d\n", pid);
386-
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
387-
if (status) {
388-
pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
389-
}
390-
return -1;
390+
goto unlock;
391391
}
392392

393393
return 0;
394+
unlock:
395+
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
396+
if (status) {
397+
pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
398+
}
399+
return -1;
394400
}
395401
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)
396402

0 commit comments

Comments
 (0)