Skip to content

Commit c81c11a

Browse files
committed
fix for sending SIGTERM to main process
1 parent 9ccbc6f commit c81c11a

File tree

3 files changed

+8
-1
lines changed

3 files changed

+8
-1
lines changed

metaflow/plugins/aws/batch/batch_decorator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ def task_pre_step(
299299
self._save_logs_sidecar.start()
300300

301301
# Start spot termination monitor sidecar.
302+
# TODO: Find a nicer way to pass the main process ID to a sidecar, so that sidecars can send signals back to the main process.
303+
os.environ["MF_MAIN_PID"] = str(os.getpid())
302304
current._update_env(
303305
{"spot_termination_notice": "/tmp/spot_termination_notice"}
304306
)

metaflow/plugins/kubernetes/kubernetes_decorator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,8 @@ def task_pre_step(
559559
self._save_logs_sidecar.start()
560560

561561
# Start spot termination monitor sidecar.
562+
# TODO: Find a nicer way to pass the main process ID to a sidecar, so that sidecars can send signals back to the main process.
563+
os.environ["MF_MAIN_PID"] = str(os.getpid())
562564
current._update_env(
563565
{"spot_termination_notice": "/tmp/spot_termination_notice"}
564566
)

metaflow/plugins/kubernetes/spot_monitor_sidecar.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def __init__(self):
2121
self._token = None
2222
self._token_expiry = 0
2323

24+
# Due to process nesting, os.getppid() is not reliable for fetching the main task's PID.
25+
self.main_pid = int(os.getenv("MF_MAIN_PID", os.getppid()))
26+
2427
if self._is_aws_spot_instance():
2528
self._process = Process(target=self._monitor_loop)
2629
self._process.start()
@@ -71,7 +74,7 @@ def _monitor_loop(self):
7174
if response.status_code == 200:
7275
termination_time = response.text
7376
self._emit_termination_metadata(termination_time)
74-
os.kill(os.getppid(), signal.SIGTERM)
77+
os.kill(self.main_pid, signal.SIGTERM)
7578
break
7679
except (requests.exceptions.RequestException, requests.exceptions.Timeout):
7780
pass

0 commit comments

Comments
 (0)