|
1 | 1 | import argparse
|
2 | 2 | import logging
|
3 |
| -import time |
4 | 3 |
|
5 | 4 | from kubeflow.katib import ApiClient
|
6 | 5 | from kubeflow.katib import KatibClient
|
|
9 | 8 | from kubeflow.katib.constants import constants
|
10 | 9 | from kubeflow.katib.utils.utils import FakeResponse
|
11 | 10 | from kubernetes import client
|
| 11 | +from verify import verify_experiment_results |
12 | 12 | import yaml
|
13 | 13 |
|
14 | 14 | # Experiment timeout is 40 min.
|
|
18 | 18 | logging.basicConfig(level=logging.INFO)
|
19 | 19 |
|
20 | 20 |
|
def verify_experiment_results(
    katib_client: KatibClient,
    experiment: models.V1beta1Experiment,
    exp_name: str,
    exp_namespace: str,
):
    """Verify the final state of a finished Experiment and its Suggestion.

    The checks run in this order:

    1. The current optimal Trial reports the objective metric.
    2. The Experiment has a Succeeded condition with status True.
    3. If the reason is ``ExperimentMaxTrialsReached``, the number of
       succeeded plus early-stopped Trials equals ``max_trial_count``.
    4. If the reason is ``ExperimentGoalReached``, the best metric actually
       satisfies the objective goal.
    5. The Suggestion conditions agree with the Experiment resume policy.
    6. For the ``Never`` and ``FromVolume`` resume policies, the Suggestion
       Deployment and Service are gone; for ``FromVolume`` the PVC remains.

    Args:
        katib_client: Client used to fetch the Suggestion.
        experiment: The Experiment object to verify.
        exp_name: Experiment name (the Suggestion has the same name).
        exp_namespace: Namespace of the Experiment.

    Raises:
        Exception: If any of the checks above fails.
        kubernetes.client.ApiException: If reading the Suggestion Deployment
            or Service fails with a status other than 404.
    """
    objective = experiment.spec.objective
    resume_policy = experiment.spec.resume_policy

    # Get the best objective metric.
    best_objective_metric = None
    for metric in experiment.status.current_optimal_trial.observation.metrics:
        if metric.name == objective.objective_metric_name:
            best_objective_metric = metric
            break

    if best_objective_metric is None:
        raise Exception(
            "Unable to get the best metrics for objective: {}. Current Optimal Trial: {}".format(
                objective.objective_metric_name,
                experiment.status.current_optimal_trial,
            )
        )

    # Get Experiment Succeeded reason. Start from None so that a missing
    # condition is reported explicitly instead of failing on an unbound name.
    succeeded_reason = None
    for c in experiment.status.conditions or []:
        if (
            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
            and c.status == constants.CONDITION_STATUS_TRUE
        ):
            succeeded_reason = c.reason
            break

    if succeeded_reason is None:
        raise Exception(
            "Experiment has no Succeeded condition with status True. "
            f"Experiment conditions: {experiment.status.conditions}"
        )

    trials_completed = experiment.status.trials_succeeded or 0
    trials_completed += experiment.status.trials_early_stopped or 0
    max_trial_count = experiment.spec.max_trial_count

    # If Experiment is Succeeded because of Max Trial Reached, all Trials must be completed.
    if (
        succeeded_reason == "ExperimentMaxTrialsReached"
        and trials_completed != max_trial_count
    ):
        raise Exception(
            "All Trials must be Completed. Max Trial count: {}, Experiment status: {}".format(
                max_trial_count, experiment.status
            )
        )

    # If Experiment is Succeeded because of Goal reached, the metrics must be correct.
    if succeeded_reason == "ExperimentGoalReached" and (
        (
            objective.type == "minimize"
            and float(best_objective_metric.min) > float(objective.goal)
        )
        or (
            objective.type == "maximize"
            and float(best_objective_metric.max) < float(objective.goal)
        )
    ):
        raise Exception(
            "Experiment goal is reached, but metrics are incorrect. "
            f"Experiment objective: {experiment.spec.objective}. "
            f"Experiment best objective metric: {best_objective_metric}"
        )

    # Verify Suggestion's resources. Suggestion name = Experiment name.
    suggestion = katib_client.get_suggestion(exp_name, exp_namespace)

    # For the Never or FromVolume resume policies Suggestion must be Succeeded.
    # For the LongRunning resume policy Suggestion must be always Running.
    for c in suggestion.status.conditions:
        if (
            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
            and c.status == constants.CONDITION_STATUS_TRUE
            and resume_policy == "LongRunning"
        ):
            raise Exception(
                f"Suggestion is Succeeded while Resume Policy is {resume_policy}. "
                f"Suggestion conditions: {suggestion.status.conditions}"
            )
        elif (
            c.type == constants.EXPERIMENT_CONDITION_RUNNING
            and c.status == constants.CONDITION_STATUS_TRUE
            and resume_policy != "LongRunning"
        ):
            raise Exception(
                f"Suggestion is Running while Resume Policy is {resume_policy}. "
                f"Suggestion conditions: {suggestion.status.conditions}"
            )

    # For Never and FromVolume resume policies verify Suggestion's resources.
    if resume_policy == "Never" or resume_policy == "FromVolume":
        resource_name = exp_name + "-" + experiment.spec.algorithm.algorithm_name

        # Suggestion's Service and Deployment should be deleted.
        # Poll up to 10 times; the `else` branch runs only when the loop was
        # never broken, i.e. the Deployment was still readable every time.
        for _ in range(10):
            try:
                client.AppsV1Api().read_namespaced_deployment(
                    resource_name, exp_namespace
                )
            except client.ApiException as e:
                if e.status == 404:
                    break
                raise
            # Deployment deletion might take some time.
            time.sleep(1)
        else:
            raise Exception(
                "Suggestion Deployment is still alive for Resume Policy: {}".format(
                    resume_policy
                )
            )

        try:
            client.CoreV1Api().read_namespaced_service(resource_name, exp_namespace)
        except client.ApiException as e:
            # 404 is the expected outcome: the Service has been deleted.
            if e.status != 404:
                raise
        else:
            raise Exception(
                "Suggestion Service is still alive for Resume Policy: {}".format(
                    resume_policy
                )
            )

        # For FromVolume resume policy PVC should not be deleted.
        if resume_policy == "FromVolume":
            try:
                client.CoreV1Api().read_namespaced_persistent_volume_claim(
                    resource_name, exp_namespace
                )
            except client.ApiException as e:
                # Keep the API error as the cause so the real status is visible.
                raise Exception("PVC is deleted for FromVolume Resume Policy") from e
156 |
| - |
157 |
| - |
158 | 21 | def run_e2e_experiment(
|
159 | 22 | katib_client: KatibClient,
|
160 | 23 | experiment: models.V1beta1Experiment,
|
|
0 commit comments