[SDK] Enable resource specification for trial containers

droctothorpe · shipengcheng1230 · andreyvelich · droctothorpe · commit 0440fc383b7b · 2023-08-04T13:23:29.000-04:00
Co-authored-by: shipengcheng1230 <shipengcheng1230@gmail.com>
Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -16,7 +16,7 @@
 import multiprocessing
 import textwrap
 import time
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import grpc
 import kubeflow.katib.katib_api_pb2 as katib_api_pb2
@@ -144,6 +144,7 @@ def tune(
         max_trial_count: int = None,
         parallel_trial_count: int = None,
         max_failed_trial_count: int = None,
+        resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None,
         retain_trials: bool = False,
         packages_to_install: List[str] = None,
         pip_index_url: str = "https://pypi.org/simple",
@@ -177,6 +178,21 @@ def tune(
                 values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec.
             parallel_trial_count: Number of Trials that Experiment runs in parallel.
             max_failed_trial_count: Maximum number of Trials allowed to fail.
+            resources_per_trial: Resources for each Trial container. Specify either a
+                kubernetes.client.V1ResourceRequirements object (documented here:
+                https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
+                or a dictionary with one or more of the keys `cpu`, `memory`, or `gpu`,
+                which is applied as both requests and limits. `cpu` defaults to `200m`
+                and `memory` to `256Mi`; `gpu` is mapped to `nvidia.com/gpu`; other keys
+                are passed through unchanged. Appropriate values are documented here:
+                https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
+                For example:
+                    {
+                        "cpu": "1",
+                        "gpu": "1",
+                        "memory": "2Gi",
+                    }
+                This parameter is optional and defaults to None.
             retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state.
             packages_to_install: List of Python packages to install in addition
                 to the base image packages. These packages are installed before
@@ -280,6 +296,20 @@ def tune(
                 + exec_script
             )
 
+        if isinstance(resources_per_trial, dict):
+            requests = {
+                "cpu": "200m",
+                "memory": "256Mi",
+            }
+            if "gpu" in resources_per_trial:
+                resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu")
+            requests.update(resources_per_trial)
+
+            resources_per_trial = client.V1ResourceRequirements(
+                requests=requests,
+                limits=requests,
+            )
+
         # Create Trial specification.
         trial_spec = client.V1Job(
             api_version="batch/v1",
@@ -297,6 +327,7 @@ def tune(
                                 image=base_image,
                                 command=["bash", "-c"],
                                 args=[exec_script],
+                                resources=resources_per_trial,
                             )
                         ],
                     ),