File tree Expand file tree Collapse file tree 4 files changed +104
-1
lines changed
manifests/v1beta1/katib-controller Expand file tree Collapse file tree 4 files changed +104
-1
lines changed Original file line number Diff line number Diff line change @@ -384,8 +384,14 @@ gcr.io/kubeflow-ci/pytorch-dist-mnist-test
384
384
gcr.io/kubeflow-ci/tf-mnist-with-summaries
385
385
` ` `
386
386
387
- - FPGA XGBoost Parameter Tuning example, [source](https://github.com/inaccel/jupyter/blob/master/lab/dot/XGBoost/parameter-tuning.py)
387
+ - FPGA XGBoost Parameter Tuning example, [source](https://github.com/inaccel/jupyter/blob/master/lab/dot/XGBoost/parameter-tuning.py).
388
388
389
389
```
390
390
docker.io/inaccel/jupyter:lab
391
391
```
392
+
393
+ - MPI Operator Horovod MNIST example, [source](https://github.com/kubeflow/mpi-operator/tree/master/examples/horovod).
394
+
395
+ ```
396
+ docker.io/kubeflow/mpi-horovod-mnist
397
+ ```
Original file line number Diff line number Diff line change
# Katib Experiment that tunes the Horovod MNIST example, run as an MPIJob,
# using random search over the learning rate and the number of steps.
apiVersion: "kubeflow.org/v1beta1"
kind: Experiment
metadata:
  namespace: kubeflow
  name: mpi-horovod-mnist
spec:
  # Minimize the `loss` metric; 0.01 is the goal value.
  objective:
    type: minimize
    goal: 0.01
    objectiveMetricName: loss
  algorithm:
    algorithmName: random
  parallelTrialCount: 2
  maxTrialCount: 6
  maxFailedTrialCount: 3
  # Search space. Bounds are quoted so they are passed as strings.
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: "0.001"
        max: "0.003"
    - name: num-steps
      parameterType: int
      feasibleSpace:
        min: "50"
        max: "150"
        step: "10"
  trialTemplate:
    # The primary pod/container settings point at the Launcher replica
    # defined below (container name `mpi-launcher`).
    primaryPodLabels:
      mpi-job-role: launcher
    primaryContainerName: mpi-launcher
    # Conditions are quoted because they contain `#`, `|` and `"`.
    successCondition: 'status.conditions.#(type=="Succeeded")#|#(status=="True")#'
    failureCondition: 'status.conditions.#(type=="Failed")#|#(status=="True")#'
    # Each trial parameter maps a search-space parameter (`reference`) to the
    # `${trialParameters.<name>}` placeholder used in the launcher args.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
      - name: numberSteps
        description: Number of training steps
        reference: num-steps
    trialSpec:
      apiVersion: kubeflow.org/v1
      kind: MPIJob
      spec:
        slotsPerWorker: 1
        cleanPodPolicy: Running
        mpiReplicaSpecs:
          Launcher:
            replicas: 1
            template:
              spec:
                containers:
                  - image: docker.io/kubeflow/mpi-horovod-mnist
                    name: mpi-launcher
                    command:
                      - mpirun
                    args:
                      # 2 processes = Worker replicas (2) x slotsPerWorker (1).
                      - -np
                      - "2"
                      - --allow-run-as-root
                      - -bind-to
                      - none
                      - -map-by
                      - slot
                      - -x
                      - LD_LIBRARY_PATH
                      - -x
                      - PATH
                      - -mca
                      - pml
                      - ob1
                      - -mca
                      - btl
                      - ^openib
                      - python
                      - /examples/tensorflow_mnist.py
                      - --lr
                      - ${trialParameters.learningRate}
                      - --num-steps
                      - ${trialParameters.numberSteps}
                    resources:
                      limits:
                        cpu: 500m
                        memory: 2Gi
          Worker:
            replicas: 2
            template:
              spec:
                containers:
                  - image: docker.io/kubeflow/mpi-horovod-mnist
                    name: mpi-worker
                    resources:
                      limits:
                        cpu: 500m
                        memory: 4Gi
Original file line number Diff line number Diff line change 25
25
command : ["./katib-controller"]
26
26
args :
27
27
- " --webhook-port=8443"
28
+ - " --trial-resources=MPIJob.v1.kubeflow.org"
28
29
ports :
29
30
- containerPort : 8443
30
31
name : webhook
Original file line number Diff line number Diff line change 70
70
resources :
71
71
- tfjobs
72
72
- pytorchjobs
73
+ - mpijobs
73
74
verbs :
74
75
- " *"
75
76
---
You can’t perform that action at this time.
0 commit comments