Skip to content

Commit 052fd34

Browse files
droctothorpeshipengcheng1230andreyvelich
committed
[SDK] Enable resource specification for trial containers
Co-authored-by: shipengcheng1230 <[email protected]> Co-authored-by: Andrey Velichkevich <[email protected]>
1 parent c749d27 commit 052fd34

File tree

1 file changed

+27
-3
lines changed

1 file changed

+27
-3
lines changed

sdk/python/v1beta1/kubeflow/katib/api/katib_client.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import multiprocessing
1717
import textwrap
1818
import time
19-
from typing import Any, Callable, Dict, List, Optional
19+
from typing import Any, Callable, Dict, List, Optional, Union
2020

2121
import grpc
2222
import kubeflow.katib.katib_api_pb2 as katib_api_pb2
@@ -144,6 +144,7 @@ def tune(
144144
max_trial_count: int = None,
145145
parallel_trial_count: int = None,
146146
max_failed_trial_count: int = None,
147+
resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None,
147148
retain_trials: bool = False,
148149
packages_to_install: List[str] = None,
149150
pip_index_url: str = "https://pypi.org/simple",
@@ -177,6 +178,21 @@ def tune(
177178
values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec.
178179
parallel_trial_count: Number of Trials that Experiment runs in parallel.
179180
max_failed_trial_count: Maximum number of Trials allowed to fail.
181+
resources_per_trial: A parameter that lets you specify how many
182+
resources each trial container should have. You can either specify a
183+
kubernetes.client.V1ResourceRequirements object (documented here:
184+
https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
185+
or a dictionary that includes one or more of the following keys:
186+
`cpu`, `memory`, or `gpu` (`gpu` is translated to `nvidia.com/gpu`; any other keys are passed through unchanged as both requests and limits). Appropriate
187+
values for these keys are documented here:
188+
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
189+
For example:
190+
{
191+
"cpu": "1",
192+
"gpu": "1",
193+
"memory": "2Gi",
194+
}
195+
This parameter is optional and defaults to None.
180196
retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state.
181197
packages_to_install: List of Python packages to install in addition
182198
to the base image packages. These packages are installed before
@@ -280,6 +296,15 @@ def tune(
280296
+ exec_script
281297
)
282298

299+
if isinstance(resources_per_trial, dict):
300+
if "gpu" in resources_per_trial:
301+
resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu")
302+
303+
resources_per_trial = client.V1ResourceRequirements(
304+
requests=resources_per_trial,
305+
limits=resources_per_trial,
306+
)
307+
283308
# Create Trial specification.
284309
trial_spec = client.V1Job(
285310
api_version="batch/v1",
@@ -297,6 +322,7 @@ def tune(
297322
image=base_image,
298323
command=["bash", "-c"],
299324
args=[exec_script],
325+
resources=resources_per_trial,
300326
)
301327
],
302328
),
@@ -640,7 +666,6 @@ def wait_for_experiment_condition(
640666
namespace = namespace or self.namespace
641667

642668
for _ in range(round(timeout / polling_interval)):
643-
644669
# We should get Experiment only once per cycle and check the statuses.
645670
experiment = self.get_experiment(name, namespace, apiserver_timeout)
646671

@@ -1175,7 +1200,6 @@ def get_trial_metrics(
11751200
)
11761201

11771202
with katib_api_pb2.beta_create_DBManager_stub(channel) as client:
1178-
11791203
try:
11801204
# When metric name is empty, we select all logs from the Katib DB.
11811205
observation_logs = client.GetObservationLog(

0 commit comments

Comments
 (0)