|
1 | 1 | import argparse
|
2 | 2 | import logging
|
3 |
| -import time |
4 | 3 |
|
5 | 4 | from kubeflow.katib import ApiClient
|
6 | 5 | from kubeflow.katib import KatibClient
|
7 | 6 | from kubeflow.katib import models
|
8 | 7 | from kubeflow.katib.constants import constants
|
9 | 8 | from kubeflow.katib.utils.utils import FakeResponse
|
10 | 9 | from kubernetes import client
|
| 10 | +from verify import verify_experiment_results |
11 | 11 | import yaml
|
12 | 12 |
|
13 | 13 | # Experiment timeout is 40 min.
|
|
17 | 17 | logging.basicConfig(level=logging.INFO)
|
18 | 18 |
|
19 | 19 |
|
20 |
def verify_experiment_results(
    katib_client: KatibClient,
    experiment: models.V1beta1Experiment,
    exp_name: str,
    exp_namespace: str,
):
    """Verify the final state of a Succeeded Experiment and of its Suggestion.

    The following checks are performed, in order:

    1. The current optimal Trial reports the Experiment's objective metric.
    2. The Experiment has a Succeeded condition with status True.
    3. If the Succeeded reason is "ExperimentMaxTrialsReached", the number of
       succeeded plus early-stopped Trials equals ``max_trial_count``.
    4. If the Succeeded reason is "ExperimentGoalReached", the best objective
       metric satisfies the goal for the objective type (minimize/maximize).
    5. The Suggestion conditions agree with the resume policy: not Succeeded
       for "LongRunning", not Running for any other policy.
    6. For the "Never" and "FromVolume" policies, the Suggestion's Deployment
       and Service are gone; for "FromVolume" the PVC can still be read.

    Args:
        katib_client: Client used to fetch the Suggestion.
        experiment: The Experiment object to verify.
        exp_name: Experiment name (the Suggestion has the same name).
        exp_namespace: Namespace of the Experiment and its resources.

    Raises:
        Exception: If any of the checks above fails.
        kubernetes.client.ApiException: If reading the Deployment or Service
            fails with a status other than 404.
    """
    objective = experiment.spec.objective
    resume_policy = experiment.spec.resume_policy

    # Get the best objective metric.
    best_objective_metric = None
    for metric in experiment.status.current_optimal_trial.observation.metrics:
        if metric.name == objective.objective_metric_name:
            best_objective_metric = metric
            break

    if best_objective_metric is None:
        raise Exception(
            "Unable to get the best metrics for objective: {}. Current Optimal Trial: {}".format(
                objective.objective_metric_name,
                experiment.status.current_optimal_trial,
            )
        )

    # Get Experiment Succeeded reason.
    # Start from None so that a missing Succeeded condition is reported
    # explicitly instead of failing later with an UnboundLocalError.
    succeeded_reason = None
    for c in experiment.status.conditions:
        if (
            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
            and c.status == constants.CONDITION_STATUS_TRUE
        ):
            succeeded_reason = c.reason
            break

    if succeeded_reason is None:
        raise Exception(
            "Experiment is not Succeeded. "
            f"Experiment conditions: {experiment.status.conditions}"
        )

    # The counters are treated as 0 when they are unset (None).
    trials_completed = experiment.status.trials_succeeded or 0
    trials_completed += experiment.status.trials_early_stopped or 0
    max_trial_count = experiment.spec.max_trial_count

    # If Experiment is Succeeded because of Max Trial Reached, all Trials must be completed.
    if (
        succeeded_reason == "ExperimentMaxTrialsReached"
        and trials_completed != max_trial_count
    ):
        raise Exception(
            "All Trials must be Completed. Max Trial count: {}, Experiment status: {}".format(
                max_trial_count, experiment.status
            )
        )

    # If Experiment is Succeeded because of Goal reached, the metrics must be correct.
    if succeeded_reason == "ExperimentGoalReached" and (
        (
            objective.type == "minimize"
            and float(best_objective_metric.min) > float(objective.goal)
        )
        or (
            objective.type == "maximize"
            and float(best_objective_metric.max) < float(objective.goal)
        )
    ):
        raise Exception(
            "Experiment goal is reached, but metrics are incorrect. "
            f"Experiment objective: {experiment.spec.objective}. "
            f"Experiment best objective metric: {best_objective_metric}"
        )

    # Verify Suggestion's resources. Suggestion name = Experiment name.
    suggestion = katib_client.get_suggestion(exp_name, exp_namespace)

    # For the Never or FromVolume resume policies Suggestion must be Succeeded.
    # For the LongRunning resume policy Suggestion must be always Running.
    for c in suggestion.status.conditions:
        if (
            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
            and c.status == constants.CONDITION_STATUS_TRUE
            and resume_policy == "LongRunning"
        ):
            raise Exception(
                f"Suggestion is Succeeded while Resume Policy is {experiment.spec.resume_policy}."
                f"Suggestion conditions: {suggestion.status.conditions}"
            )
        elif (
            c.type == constants.EXPERIMENT_CONDITION_RUNNING
            and c.status == constants.CONDITION_STATUS_TRUE
            and resume_policy != "LongRunning"
        ):
            raise Exception(
                f"Suggestion is Running while Resume Policy is {experiment.spec.resume_policy}."
                f"Suggestion conditions: {suggestion.status.conditions}"
            )

    # For Never and FromVolume resume policies verify Suggestion's resources.
    if resume_policy in ("Never", "FromVolume"):
        resource_name = exp_name + "-" + experiment.spec.algorithm.algorithm_name

        # Suggestion's Service and Deployment should be deleted.
        apps_api = client.AppsV1Api()
        for _ in range(10):
            try:
                apps_api.read_namespaced_deployment(resource_name, exp_namespace)
            except client.ApiException as e:
                if e.status == 404:
                    # Deployment is gone, which is the expected state.
                    break
                raise
            # Deployment deletion might take some time.
            time.sleep(1)
        else:
            # The loop finished without a 404 on any attempt, so the
            # Deployment still exists after every retry.
            raise Exception(
                "Suggestion Deployment is still alive for Resume Policy: {}".format(
                    experiment.spec.resume_policy
                )
            )

        try:
            client.CoreV1Api().read_namespaced_service(resource_name, exp_namespace)
        except client.ApiException as e:
            if e.status != 404:
                raise
        else:
            # A successful read means the Service was not deleted.
            raise Exception(
                "Suggestion Service is still alive for Resume Policy: {}".format(
                    experiment.spec.resume_policy
                )
            )

        # For FromVolume resume policy PVC should not be deleted.
        if resume_policy == "FromVolume":
            try:
                client.CoreV1Api().read_namespaced_persistent_volume_claim(
                    resource_name, exp_namespace
                )
            except client.ApiException as e:
                # Keep the API error as the cause so the real failure is visible.
                raise Exception("PVC is deleted for FromVolume Resume Policy") from e
155 |
| - |
156 |
| - |
157 | 20 | def run_e2e_experiment(
|
158 | 21 | katib_client: KatibClient,
|
159 | 22 | experiment: models.V1beta1Experiment,
|
|
0 commit comments