Skip to content

Commit b43f603

Browse files
refactor(test/sdk): add run-e2e-tune-api.py.
Signed-off-by: Electronic-Waste <[email protected]>
1 parent 153cdef commit b43f603

File tree

5 files changed

+242
-138
lines changed

5 files changed

+242
-138
lines changed

test/e2e/v1beta1/scripts/gh-actions/build-load.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ TAG="e2e-test"
3434
VERSION="v1beta1"
3535
CMD_PREFIX="cmd"
3636
SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu" "pytorch-mnist-cpu")
37+
DEFAULT_IMAGE_FOR_TUNE="docker.io/tensorflow/tensorflow:2.13.0"
3738

3839
IFS="," read -r -a TRIAL_IMAGE_ARRAY <<< "$TRIAL_IMAGES"
3940
IFS="," read -r -a EXPERIMENT_ARRAY <<< "$EXPERIMENTS"
@@ -162,6 +163,10 @@ for name in "${TRIAL_IMAGE_ARRAY[@]}"; do
162163
run "$name" "examples/$VERSION/trial-images/$name/Dockerfile"
163164
done
164165

166+
# Testing image for tune function
167+
echo -e "\nPulling testing image for tune function..."
168+
docker pull $DEFAULT_IMAGE_FOR_TUNE
169+
165170
echo -e "\nCleanup Build Cache...\n"
166171
docker buildx prune -f
167172

test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py

Lines changed: 1 addition & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import argparse
22
import logging
3-
import time
43

54
from kubeflow.katib import ApiClient
65
from kubeflow.katib import KatibClient
@@ -9,6 +8,7 @@
98
from kubeflow.katib.constants import constants
109
from kubeflow.katib.utils.utils import FakeResponse
1110
from kubernetes import client
11+
from verify import verify_experiment_results
1212
import yaml
1313

1414
# Experiment timeout is 40 min.
@@ -18,143 +18,6 @@
1818
logging.basicConfig(level=logging.INFO)
1919

2020

21-
def verify_experiment_results(
    katib_client: KatibClient,
    experiment: models.V1beta1Experiment,
    exp_name: str,
    exp_namespace: str,
):
    """Verify that a finished Katib Experiment produced consistent results.

    Checks, in order:
      1. The best objective metric exists in the current optimal Trial.
      2. If the Experiment succeeded via "ExperimentMaxTrialsReached",
         all Trials (succeeded + early-stopped) equal ``max_trial_count``.
      3. If it succeeded via "ExperimentGoalReached", the best metric
         actually meets the objective goal.
      4. The Suggestion lifecycle matches the resume policy, including
         cleanup of the Suggestion Deployment/Service for Never/FromVolume
         and PVC retention for FromVolume.

    Raises:
        Exception: if any of the above invariants is violated.
    """

    # Get the best objective metric from the current optimal Trial.
    best_objective_metric = None
    for metric in experiment.status.current_optimal_trial.observation.metrics:
        if metric.name == experiment.spec.objective.objective_metric_name:
            best_objective_metric = metric
            break

    if best_objective_metric is None:
        raise Exception(
            "Unable to get the best metrics for objective: {}. Current Optimal Trial: {}".format(
                experiment.spec.objective.objective_metric_name,
                experiment.status.current_optimal_trial,
            )
        )

    # Get Experiment Succeeded reason.
    # BUG FIX: initialize to None so a missing Succeeded condition leads to a
    # clean comparison below instead of an unbound-name NameError.
    succeeded_reason = None
    for c in experiment.status.conditions:
        if (
            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
            and c.status == constants.CONDITION_STATUS_TRUE
        ):
            succeeded_reason = c.reason
            break

    trials_completed = experiment.status.trials_succeeded or 0
    trials_completed += experiment.status.trials_early_stopped or 0
    max_trial_count = experiment.spec.max_trial_count

    # If Experiment is Succeeded because of Max Trial Reached, all Trials must be completed.
    if (
        succeeded_reason == "ExperimentMaxTrialsReached"
        and trials_completed != max_trial_count
    ):
        raise Exception(
            "All Trials must be Completed. Max Trial count: {}, Experiment status: {}".format(
                max_trial_count, experiment.status
            )
        )

    # If Experiment is Succeeded because of Goal reached, the metrics must be correct.
    if succeeded_reason == "ExperimentGoalReached" and (
        (
            experiment.spec.objective.type == "minimize"
            and float(best_objective_metric.min) > float(experiment.spec.objective.goal)
        )
        or (
            experiment.spec.objective.type == "maximize"
            and float(best_objective_metric.max) < float(experiment.spec.objective.goal)
        )
    ):
        raise Exception(
            "Experiment goal is reached, but metrics are incorrect. "
            f"Experiment objective: {experiment.spec.objective}. "
            f"Experiment best objective metric: {best_objective_metric}"
        )

    # Verify Suggestion's resources. Suggestion name = Experiment name.
    suggestion = katib_client.get_suggestion(exp_name, exp_namespace)

    # For the Never or FromVolume resume policies Suggestion must be Succeeded.
    # For the LongRunning resume policy Suggestion must be always Running.
    for c in suggestion.status.conditions:
        if (
            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
            and c.status == constants.CONDITION_STATUS_TRUE
            and experiment.spec.resume_policy == "LongRunning"
        ):
            raise Exception(
                f"Suggestion is Succeeded while Resume Policy is {experiment.spec.resume_policy}."
                f"Suggestion conditions: {suggestion.status.conditions}"
            )
        elif (
            c.type == constants.EXPERIMENT_CONDITION_RUNNING
            and c.status == constants.CONDITION_STATUS_TRUE
            and experiment.spec.resume_policy != "LongRunning"
        ):
            raise Exception(
                f"Suggestion is Running while Resume Policy is {experiment.spec.resume_policy}."
                f"Suggestion conditions: {suggestion.status.conditions}"
            )

    # For Never and FromVolume resume policies verify Suggestion's resources.
    if (
        experiment.spec.resume_policy == "Never"
        or experiment.spec.resume_policy == "FromVolume"
    ):
        resource_name = exp_name + "-" + experiment.spec.algorithm.algorithm_name

        # Suggestion's Service and Deployment should be deleted.
        # BUG FIX: the original checked `if i == 10` after `for i in range(10)`,
        # which can never be true (i stops at 9), so the "still alive" error was
        # unreachable. With for/else, the else branch runs only when the loop
        # never hit the 404 break, i.e. the Deployment survived every retry.
        for _ in range(10):
            try:
                client.AppsV1Api().read_namespaced_deployment(
                    resource_name, exp_namespace
                )
            except client.ApiException as e:
                if e.status == 404:
                    break
                else:
                    raise e
            # Deployment deletion might take some time.
            time.sleep(1)
        else:
            raise Exception(
                "Suggestion Deployment is still alive for Resume Policy: {}".format(
                    experiment.spec.resume_policy
                )
            )

        try:
            client.CoreV1Api().read_namespaced_service(resource_name, exp_namespace)
        except client.ApiException as e:
            if e.status != 404:
                raise e
        else:
            raise Exception(
                "Suggestion Service is still alive for Resume Policy: {}".format(
                    experiment.spec.resume_policy
                )
            )

        # For FromVolume resume policy PVC should not be deleted.
        if experiment.spec.resume_policy == "FromVolume":
            try:
                client.CoreV1Api().read_namespaced_persistent_volume_claim(
                    resource_name, exp_namespace
                )
            except client.ApiException:
                raise Exception("PVC is deleted for FromVolume Resume Policy")
156-
157-
15821
def run_e2e_experiment(
15922
katib_client: KatibClient,
16023
experiment: models.V1beta1Experiment,

test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,7 @@ for exp_name in "${EXPERIMENT_FILE_ARRAY[@]}"; do
4848
--verbose || (kubectl get pods -n kubeflow && exit 1)
4949
done
5050

51+
python run-e2e-tune-api.py --namespace default \
52+
--verbose || (kubectl get pods -n kubeflow && exit 1)
53+
5154
exit 0
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import argparse
2+
import logging
3+
4+
from kubeflow.katib import KatibClient
5+
from kubeflow.katib import search
6+
from verify import verify_experiment_results
7+
import yaml
8+
9+
# Experiment timeout is 40 min.
10+
EXPERIMENT_TIMEOUT = 60 * 40
11+
12+
# The default logging config.
13+
logging.basicConfig(level=logging.INFO)
14+
15+
16+
def run_e2e_experiment_create_by_tune(
    katib_client: KatibClient,
    exp_name: str,
    exp_namespace: str,
):
    """Create an Experiment through ``KatibClient.tune`` and verify it.

    Runs the getting-started tuning example end to end: submits the
    Experiment, waits until it reaches the Succeeded condition, verifies
    the reported results, and logs the final Experiment and Suggestion.
    """
    # Create Katib Experiment and wait until it is finished.
    logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))

    # Use the test case from get-started tutorial.
    # https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk
    # [1] Create an objective function.
    # NOTE: tune() ships this function's source to the Trial container, so
    # its text is kept exactly as in the tutorial.
    def objective(parameters):
        result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
        print(f"result={result}")

    # [2] Create hyperparameter search space.
    search_space = {
        "a": search.int(min=10, max=20),
        "b": search.double(min=0.1, max=0.2),
    }

    # [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial,
    # then wait until the Experiment reaches the Succeeded condition.
    katib_client.tune(
        name=exp_name,
        namespace=exp_namespace,
        objective=objective,
        parameters=search_space,
        objective_metric_name="result",
        max_trial_count=4,
        resources_per_trial={"cpu": "2"},
    )
    finished_experiment = katib_client.wait_for_experiment_condition(
        exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
    )

    # Verify the Experiment results.
    verify_experiment_results(
        katib_client, finished_experiment, exp_name, exp_namespace
    )

    # Print the Experiment and Suggestion.
    logging.debug(katib_client.get_experiment(exp_name, exp_namespace))
    logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))
58+
59+
60+
if __name__ == "__main__":
61+
parser = argparse.ArgumentParser()
62+
parser.add_argument(
63+
"--namespace", type=str, required=True, help="Namespace for the Katib E2E test",
64+
)
65+
parser.add_argument(
66+
"--trial-pod-annotations", type=str, help="Annotation for the pod created by trial",
67+
)
68+
parser.add_argument(
69+
"--verbose", action="store_true", help="Verbose output for the Katib E2E test",
70+
)
71+
args = parser.parse_args()
72+
73+
katib_client = KatibClient()
74+
75+
# Test with run_e2e_experiment_create_by_tune
76+
exp_name = "tune-example"
77+
exp_namespace = args.namespace
78+
try:
79+
run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace)
80+
logging.info("---------------------------------------------------------------")
81+
logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
82+
except Exception as e:
83+
logging.info("---------------------------------------------------------------")
84+
logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
85+
raise e
86+
finally:
87+
# Delete the Experiment.
88+
logging.info("---------------------------------------------------------------")
89+
logging.info("---------------------------------------------------------------")
90+
katib_client.delete_experiment(exp_name, exp_namespace)

0 commit comments

Comments (0)