Add MPI operator horovod example (#1342)

andreyvelich · web-flow · commit 8ebba43c0d42 · 2020-10-16T17:02:13.000-07:00
* Add MPI Job horovod example

* Add link to Docker image

* Change mpi example docker hub registry

* Remove istio sidecar annotation
diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md
@@ -384,8 +384,14 @@ gcr.io/kubeflow-ci/pytorch-dist-mnist-test
 gcr.io/kubeflow-ci/tf-mnist-with-summaries
 ```
 
-- FPGA XGBoost Parameter Tuning example, [source](https://github.com/inaccel/jupyter/blob/master/lab/dot/XGBoost/parameter-tuning.py)
+- FPGA XGBoost Parameter Tuning example, [source](https://github.com/inaccel/jupyter/blob/master/lab/dot/XGBoost/parameter-tuning.py).
 
 ```
 docker.io/inaccel/jupyter:lab
 ```
+
+- MPI operator Horovod MNIST example, [source](https://github.com/kubeflow/mpi-operator/tree/master/examples/horovod).
+
+```
+docker.io/kubeflow/mpi-horovod-mnist
+```
diff --git a/examples/v1beta1/mpijob-horovod.yaml b/examples/v1beta1/mpijob-horovod.yaml
@@ -0,0 +1,95 @@
+apiVersion: "kubeflow.org/v1beta1"
+kind: Experiment  # Katib experiment: searches lr and num-steps for the MPIJob in trialTemplate
+metadata:
+  namespace: kubeflow
+  name: mpi-horovod-mnist
+spec:
+  objective:
+    type: minimize  # smaller values of the objective metric are better
+    goal: 0.01  # target value for the "loss" metric named below
+    objectiveMetricName: loss
+  algorithm:
+    algorithmName: random  # random search over the feasible spaces under parameters
+  parallelTrialCount: 2  # number of trials run at the same time
+  maxTrialCount: 6  # overall trial budget
+  maxFailedTrialCount: 3  # NOTE(review): presumably the experiment fails once this many trials fail - confirm
+  parameters:
+    - name: lr  # referenced by the learningRate entry in trialTemplate.trialParameters
+      parameterType: double
+      feasibleSpace:
+        min: "0.001"  # bounds are written as quoted strings throughout this file
+        max: "0.003"
+    - name: num-steps  # referenced by the numberSteps entry in trialTemplate.trialParameters
+      parameterType: int
+      feasibleSpace:
+        min: "50"
+        max: "150"
+        step: "10"  # NOTE(review): presumably the increment between candidate values - confirm
+  trialTemplate:  # how one trial is launched; success/failure are read from status.conditions
+    primaryPodLabels:
+      mpi-job-role: launcher  # NOTE(review): presumably selects the launcher pod as the trial's primary pod - confirm
+    primaryContainerName: mpi-launcher  # same name as the Launcher container in trialSpec
+    successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")#
+    failureCondition: status.conditions.#(type=="Failed")#|#(status=="True")#
+    trialParameters:
+      - name: learningRate  # used as the trialParameters.learningRate placeholder in trialSpec
+        description: Learning rate for the training model
+        reference: lr  # name of the search parameter under spec.parameters
+      - name: numberSteps  # used as the trialParameters.numberSteps placeholder in trialSpec
+        description: Number of training steps
+        reference: num-steps  # name of the search parameter under spec.parameters
+    trialSpec:  # template of the resource created per trial
+      apiVersion: kubeflow.org/v1
+      kind: MPIJob  # one launcher running mpirun plus two workers
+      spec:
+        slotsPerWorker: 1
+        cleanPodPolicy: Running  # NOTE(review): presumably removes still-running pods when the job ends - confirm
+        mpiReplicaSpecs:
+          Launcher:
+            replicas: 1
+            template:
+              spec:
+                containers:
+                  - image: docker.io/kubeflow/mpi-horovod-mnist  # no tag is pinned on this image
+                    name: mpi-launcher  # must stay equal to trialTemplate.primaryContainerName
+                    command:
+                      - mpirun
+                    args:
+                      - -np
+                      - "2"  # equals Worker replicas (2) times slotsPerWorker (1)
+                      - --allow-run-as-root
+                      - -bind-to
+                      - none
+                      - -map-by
+                      - slot
+                      - -x  # each -x is followed by an environment variable name
+                      - LD_LIBRARY_PATH
+                      - -x
+                      - PATH
+                      - -mca  # each -mca is followed by a key and a value
+                      - pml
+                      - ob1
+                      - -mca
+                      - btl
+                      - ^openib  # NOTE(review): a leading ^ presumably excludes the openib component - confirm
+                      - python  # everything from here on is the training command started by mpirun
+                      - /examples/tensorflow_mnist.py
+                      - --lr
+                      - ${trialParameters.learningRate}  # placeholder for the trial's lr value
+                      - --num-steps
+                      - ${trialParameters.numberSteps}  # placeholder for the trial's num-steps value
+                    resources:
+                      limits:
+                        cpu: 500m
+                        memory: 2Gi
+          Worker:
+            replicas: 2  # keep in sync with the -np value passed to mpirun
+            template:
+              spec:
+                containers:
+                  - image: docker.io/kubeflow/mpi-horovod-mnist  # same image as the launcher
+                    name: mpi-worker
+                    resources:
+                      limits:
+                        cpu: 500m
+                        memory: 4Gi  # workers get twice the launcher's memory limit
diff --git a/manifests/v1beta1/katib-controller/katib-controller.yaml b/manifests/v1beta1/katib-controller/katib-controller.yaml
@@ -25,6 +25,7 @@ spec:
           command: ["./katib-controller"]
           args:
             - "--webhook-port=8443"
+            - "--trial-resources=MPIJob.v1.kubeflow.org"
           ports:
             - containerPort: 8443
               name: webhook
diff --git a/manifests/v1beta1/katib-controller/rbac.yaml b/manifests/v1beta1/katib-controller/rbac.yaml
@@ -70,6 +70,7 @@ rules:
     resources:
       - tfjobs
       - pytorchjobs
+      - mpijobs
     verbs:
       - "*"
 ---