Skip to content

Commit 0440fc3

Browse files
droctothorpe, shipengcheng1230, and andreyvelich
committed
[SDK] Enable resource specification for trial containers
Co-authored-by: shipengcheng1230 <[email protected]> Co-authored-by: Andrey Velichkevich <[email protected]>
1 parent c749d27 commit 0440fc3

File tree

1 file changed

+32
-1
lines changed

1 file changed

+32
-1
lines changed

sdk/python/v1beta1/kubeflow/katib/api/katib_client.py

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
import multiprocessing
1717
import textwrap
1818
import time
19-
from typing import Any, Callable, Dict, List, Optional
19+
from typing import Any, Callable, Dict, List, Optional, Union
2020

2121
import grpc
2222
import kubeflow.katib.katib_api_pb2 as katib_api_pb2
@@ -144,6 +144,7 @@ def tune(
144144
max_trial_count: int = None,
145145
parallel_trial_count: int = None,
146146
max_failed_trial_count: int = None,
147+
resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None,
147148
retain_trials: bool = False,
148149
packages_to_install: List[str] = None,
149150
pip_index_url: str = "https://pypi.org/simple",
@@ -177,6 +178,21 @@ def tune(
177178
values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec.
178179
parallel_trial_count: Number of Trials that Experiment runs in parallel.
179180
max_failed_trial_count: Maximum number of Trials allowed to fail.
181+
resources_per_trial: A parameter that lets you specify how much
182+
resources each trial container should have. You can either specify a
183+
kubernetes.client.V1ResourceRequirements object (documented here:
184+
https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
185+
or a dictionary that includes one or more of the following keys:
186+
`cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate
187+
values for these keys are documented here:
188+
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
189+
For example:
190+
{
191+
"cpu": "1",
192+
"gpu": "1",
193+
"memory": "2Gi",
194+
}
195+
This parameter is optional and defaults to None.
180196
retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state.
181197
packages_to_install: List of Python packages to install in addition
182198
to the base image packages. These packages are installed before
@@ -280,6 +296,20 @@ def tune(
280296
+ exec_script
281297
)
282298

299+
if isinstance(resources_per_trial, dict):
300+
requests = {
301+
"cpu": "200m",
302+
"memory": "256Mi",
303+
}
304+
if "gpu" in resources_per_trial:
305+
resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu")
306+
requests.update(resources_per_trial)
307+
308+
resources_per_trial = client.V1ResourceRequirements(
309+
requests=requests,
310+
limits=requests,
311+
)
312+
283313
# Create Trial specification.
284314
trial_spec = client.V1Job(
285315
api_version="batch/v1",
@@ -297,6 +327,7 @@ def tune(
297327
image=base_image,
298328
command=["bash", "-c"],
299329
args=[exec_script],
330+
resources=resources_per_trial,
300331
)
301332
],
302333
),

0 commit comments

Comments
 (0)