Skip to content

Commit 3b08fb7

Browse files
feat: Add trial_timeout parameter to Katib tune API
- Add optional trial_timeout parameter to tune() function - Support timeout for both Job and PyTorchJob trial types - Set active_deadline_seconds on Job spec for Job-based trials - Set active_deadline_seconds on RunPolicy for PyTorchJob-based trials - Add comprehensive documentation and usage examples - Add test cases for both Job and PyTorchJob timeout scenarios - Maintain backward compatibility with existing code This feature prevents individual trials from running indefinitely and consuming cluster resources by allowing users to specify per-trial timeouts in seconds.
1 parent c9528e7 commit 3b08fb7

File tree

3 files changed

+71
-2
lines changed

3 files changed

+71
-2
lines changed

sdk/python/v1beta1/kubeflow/katib/api/katib_client.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def tune(
216216
packages_to_install: List[str] = None,
217217
pip_index_url: str = "https://pypi.org/simple",
218218
metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"},
219+
trial_timeout: Optional[int] = None,
219220
):
220221
"""
221222
Create HyperParameter Tuning Katib Experiment using one of the following
@@ -355,6 +356,11 @@ class name in this argument.
355356
metrics_collector_config: Specify the config of metrics collector,
356357
for example, `metrics_collector_config = {"kind": "Push"}`.
357358
Currently, we only support `StdOut` and `Push` metrics collector.
359+
trial_timeout: Optional timeout in seconds for each trial. If None, no timeout is
360+
applied. For Job-based trials, this sets the activeDeadlineSeconds field.
361+
For PyTorchJob-based trials, this sets the activeDeadlineSeconds field on the
362+
RunPolicy. This prevents individual trials from running indefinitely and
363+
consuming resources.
358364
359365
Raises:
360366
ValueError: Function arguments have incorrect type or value.
@@ -511,13 +517,15 @@ class name in this argument.
511517
resources_per_trial,
512518
training_utils.get_pod_template_spec(containers=[container_spec]),
513519
training_utils.get_pod_template_spec(containers=[container_spec]),
520+
trial_timeout,
514521
)
515522
# Otherwise, Trial uses Job for model training.
516523
else:
517524
trial_template = utils.get_trial_template_with_job(
518525
retain_trials,
519526
trial_parameters,
520527
training_utils.get_pod_template_spec(containers=[container_spec]),
528+
trial_timeout,
521529
)
522530

523531
# If users choose to use external models and datasets.
@@ -694,6 +702,7 @@ class name in this argument.
694702
resources_per_trial,
695703
master_pod_template_spec,
696704
worker_pod_template_spec,
705+
trial_timeout,
697706
)
698707

699708
# Add parameters to the Katib Experiment.

sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,34 @@ def create_experiment(
541541
},
542542
TEST_RESULT_SUCCESS,
543543
),
544+
(
545+
"valid flow with trial_timeout for Job-based trials",
546+
{
547+
"name": "tune_test",
548+
"objective": lambda x: print(f"a={x}"),
549+
"parameters": {"a": katib.search.int(min=10, max=100)},
550+
"objective_metric_name": "a",
551+
"resources_per_trial": {"gpu": "2"},
552+
"trial_timeout": 3600, # 1 hour timeout
553+
},
554+
TEST_RESULT_SUCCESS,
555+
),
556+
(
557+
"valid flow with trial_timeout for PyTorchJob-based trials",
558+
{
559+
"name": "tune_test",
560+
"objective": lambda x: print(f"a={x}"),
561+
"parameters": {"a": katib.search.int(min=10, max=100)},
562+
"objective_metric_name": "a",
563+
"resources_per_trial": types.TrainerResources(
564+
num_workers=2,
565+
num_procs_per_worker=2,
566+
resources_per_worker={"gpu": "2"},
567+
),
568+
"trial_timeout": 7200, # 2 hours timeout
569+
},
570+
TEST_RESULT_SUCCESS,
571+
),
544572
]
545573

546574

@@ -667,6 +695,33 @@ def test_tune(katib_client, test_name, kwargs, expected_output):
667695
KubeflowOrgV1PyTorchJob,
668696
)
669697

698+
elif (
699+
test_name
700+
== "valid flow with trial_timeout for Job-based trials"
701+
):
702+
# Verify trial_timeout is set on Job spec
703+
job_spec = experiment.spec.trial_template.trial_spec.spec
704+
assert job_spec.active_deadline_seconds == 3600
705+
# Verify other Job-specific fields
706+
assert isinstance(experiment.spec.trial_template.trial_spec, V1Job)
707+
args_content = "".join(
708+
experiment.spec.trial_template.trial_spec.spec.template.spec.containers[
709+
0
710+
].args
711+
)
712+
assert "'a': '${trialParameters.a}'" in args_content
713+
714+
elif (
715+
test_name
716+
== "valid flow with trial_timeout for PyTorchJob-based trials"
717+
):
718+
# Verify trial_timeout is set on PyTorchJob run_policy
719+
pytorch_spec = experiment.spec.trial_template.trial_spec.spec
720+
assert pytorch_spec.run_policy.active_deadline_seconds == 7200
721+
# Verify PyTorchJob type
722+
trial_spec = experiment.spec.trial_template.trial_spec
723+
assert isinstance(trial_spec, KubeflowOrgV1PyTorchJob)
724+
670725
elif test_name == "valid flow with external model tuning":
671726
# Verify input_params
672727
args_content = "".join(

sdk/python/v1beta1/kubeflow/katib/utils/utils.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ def get_trial_template_with_job(
284284
retain_trials: bool,
285285
trial_parameters: List[models.V1beta1TrialParameterSpec],
286286
pod_template_spec: client.V1PodTemplateSpec,
287+
trial_timeout: Optional[int] = None,
287288
) -> models.V1beta1TrialTemplate:
288289
"""
289290
Get Trial template with Job as a Trial's Worker
@@ -296,7 +297,7 @@ def get_trial_template_with_job(
296297
job = client.V1Job(
297298
api_version="batch/v1",
298299
kind="Job",
299-
spec=client.V1JobSpec(template=pod_template_spec),
300+
spec=client.V1JobSpec(template=pod_template_spec, active_deadline_seconds=trial_timeout),
300301
)
301302

302303
trial_template = models.V1beta1TrialTemplate(
@@ -314,6 +315,7 @@ def get_trial_template_with_pytorchjob(
314315
resources_per_trial: types.TrainerResources,
315316
master_pod_template_spec: models.V1PodTemplateSpec,
316317
worker_pod_template_spec: models.V1PodTemplateSpec,
318+
trial_timeout: Optional[int] = None,
317319
) -> models.V1beta1TrialTemplate:
318320
"""
319321
Get Trial template with PyTorchJob as a Trial's Worker
@@ -324,7 +326,10 @@ def get_trial_template_with_pytorchjob(
324326
api_version=API_VERSION,
325327
kind=PYTORCHJOB_KIND,
326328
spec=training_models.KubeflowOrgV1PyTorchJobSpec(
327-
run_policy=training_models.KubeflowOrgV1RunPolicy(clean_pod_policy=None),
329+
run_policy=training_models.KubeflowOrgV1RunPolicy(
330+
clean_pod_policy=None,
331+
active_deadline_seconds=trial_timeout,
332+
),
328333
nproc_per_node=str(resources_per_trial.num_procs_per_worker),
329334
pytorch_replica_specs={
330335
"Master": training_models.KubeflowOrgV1ReplicaSpec(

0 commit comments

Comments (0)