Add Kubeflow MXJob example (#1688)

andreyvelich · web-flow · commit 60baacd0fd95 · 2021-10-07T17:27:23.000-07:00
* Add Kubeflow MXJob example

* Reduce num examples

* Update image link

* Fix FPGA doc

* Add BytePS image
diff --git a/docs/images-location.md b/docs/images-location.md
@@ -293,6 +293,17 @@ The following table shows images for training containers which are used in the
         <a href="https://github.com/kubeflow/training-operator/blob/2712f5667ec78f17d22288630f8719f0c08990ba/examples/tensorflow/mnist_with_summaries/Dockerfile">Dockerfile</a>
       </td>
     </tr>
+    <tr align="center">
+      <td>
+        <code>docker.io/bytepsimage/mxnet</code>
+      </td>
+      <td>
+        Distributed BytePS example for MXJob
+      </td>
+      <td>
+        <a href="https://github.com/bytedance/byteps/blob/v0.2.5/docker/Dockerfile">Dockerfile</a>
+      </td>
+    </tr>
     <tr align="center">
       <td>
         <code>docker.io/kubeflowkatib/xgboost-lightgbm</code>
diff --git a/examples/v1beta1/fpga/README.md b/examples/v1beta1/fpga/README.md
@@ -7,12 +7,12 @@ If you want to read more about provisioning FPGA resources and deploying
 accelerated applications (e.g. Kubeflow Pipelines) on any Kubernetes cluster,
 visit the [InAccel](https://docs.inaccel.com) documentation.
 
-## Simplifying FPGA management in EKS* (Elastic Kubernetes Serice)
+## Simplifying FPGA management in EKS\* (Elastic Kubernetes Service)
 
-**For development and testing purposes you can still [deploy Kubeflow Katib
+\*_For development and testing purposes you can still [deploy Kubeflow Katib
 using Minikube](https://kubeflow.org/docs/started/workstation/minikube-linux) in
 a single AMI instance. In production environments, Amazon's managed Kubernetes
-service ([EKS](https://aws.amazon.com/eks)) is recommended.*
+service ([EKS](https://aws.amazon.com/eks)) is recommended._
 
 The InAccel FPGA Operator allows administrators of Kubernetes clusters to manage
 FPGA nodes just like CPU nodes in the cluster. Instead of provisioning a special
diff --git a/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml b/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml
@@ -0,0 +1,85 @@
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  namespace: kubeflow
+  name: mxjob-byteps
+spec:
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: Train-accuracy
+  algorithm:
+    algorithmName: random
+  parallelTrialCount: 1
+  maxTrialCount: 4
+  maxFailedTrialCount: 3
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.1"
+        max: "0.11"
+  trialTemplate:
+    primaryContainerName: mxnet
+    # In this example we can collect metrics only from the Worker pods.
+    primaryPodLabels:
+      replica-type: worker
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+    trialSpec:
+      apiVersion: kubeflow.org/v1
+      kind: MXJob
+      spec:
+        jobMode: MXTrain
+        runPolicy:
+          cleanPodPolicy: None
+        mxReplicaSpecs:
+          Scheduler:
+            replicas: 1
+            restartPolicy: Never
+            template:
+              spec:
+                containers:
+                  - name: mxnet
+                    image: docker.io/bytepsimage/mxnet
+                    command: ["bpslaunch"]
+          Server:
+            replicas: 1
+            restartPolicy: Never
+            template:
+              spec:
+                containers:
+                  - name: mxnet
+                    image: docker.io/bytepsimage/mxnet
+                    command: ["bpslaunch"]
+          Worker:
+            replicas: 1
+            restartPolicy: Never
+            template:
+              spec:
+                containers:
+                  - name: mxnet
+                    image: docker.io/bytepsimage/mxnet
+                    command: ["bpslaunch"]
+                    args:
+                      [
+                        "python3",
+                        "/usr/local/byteps/example/mxnet/train_imagenet_byteps.py",
+                        "--benchmark",
+                        "1",
+                        "--lr=${trialParameters.learningRate}",
+                        "--num-examples=1000",
+                        "--num-epochs=4",
+                      ]
+                    volumeMounts:
+                      - mountPath: /dev/shm
+                        name: dshm
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 1
+                volumes:
+                  - name: dshm
+                    emptyDir:
+                      medium: Memory
diff --git a/manifests/v1beta1/components/controller/controller.yaml b/manifests/v1beta1/components/controller/controller.yaml
@@ -30,6 +30,7 @@ spec:
             - "--trial-resources=PyTorchJob.v1.kubeflow.org"
             - "--trial-resources=MPIJob.v1.kubeflow.org"
             - "--trial-resources=XGBoostJob.v1.kubeflow.org"
+            - "--trial-resources=MXJob.v1.kubeflow.org"
           ports:
             - containerPort: 8443
               name: webhook
diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml
@@ -54,6 +54,7 @@ rules:
       - pytorchjobs
       - mpijobs
       - xgboostjobs
+      - mxjobs
     verbs:
       - "*"
 ---
diff --git a/pkg/apis/controller/experiments/v1beta1/constants.go b/pkg/apis/controller/experiments/v1beta1/constants.go
@@ -29,14 +29,22 @@ const (
 	// DefaultJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Job.
 	DefaultJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
 
-	// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Job.
+	// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Training Job.
 	DefaultKubeflowJobSuccessCondition = "status.conditions.#(type==\"Succeeded\")#|#(status==\"True\")#"
 
-	// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Job.
+	// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Training Job.
 	DefaultKubeflowJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
 )
 
 var (
-	// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Job.
+	// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Training Job.
 	DefaultKubeflowJobPrimaryPodLabels = map[string]string{"job-role": "master"}
+
+	// KubeflowJobKinds is the set of Kubeflow Training Job kinds, keyed by kind name.
+	KubeflowJobKinds = map[string]bool{
+		"TFJob":      true,
+		"PyTorchJob": true,
+		"XGBoostJob": true,
+		"MXJob":      true,
+	}
 )
diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go
@@ -106,7 +106,7 @@ func (e *Experiment) setDefaultTrialTemplate() {
 			if t.FailureCondition == "" {
 				t.FailureCondition = DefaultJobFailureCondition
 			}
-		} else if jobKind == consts.JobKindTF || jobKind == consts.JobKindPyTorch || jobKind == consts.JobKindXGBoost {
+		} else if _, ok := KubeflowJobKinds[jobKind]; ok {
 			if t.SuccessCondition == "" {
 				t.SuccessCondition = DefaultKubeflowJobSuccessCondition
 			}
diff --git a/pkg/controller.v1beta1/consts/const.go b/pkg/controller.v1beta1/consts/const.go
@@ -130,12 +130,6 @@ const (
 
 	// JobKindJob is the kind of the Kubernetes Job.
 	JobKindJob = "Job"
-	// JobKindTF is the kind of TFJob.
-	JobKindTF = "TFJob"
-	// JobKindPyTorch is the kind of PyTorchJob.
-	JobKindPyTorch = "PyTorchJob"
-	// JobKindXGBoost is the kind of XGBoostJob.
-	JobKindXGBoost = "XGBoostJob"
 
 	// AnnotationIstioSidecarInjectName is the annotation of Istio Sidecar
 	AnnotationIstioSidecarInjectName = "sidecar.istio.io/inject"
diff --git a/pkg/controller.v1beta1/experiment/experiment_controller_test.go b/pkg/controller.v1beta1/experiment/experiment_controller_test.go
@@ -458,10 +458,9 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
 			},
 			ResumePolicy: experimentsv1beta1.NeverResume,
 			TrialTemplate: &experimentsv1beta1.TrialTemplate{
-				PrimaryPodLabels:     experimentsv1beta1.DefaultKubeflowJobPrimaryPodLabels,
 				PrimaryContainerName: primaryContainer,
-				SuccessCondition:     experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
-				FailureCondition:     experimentsv1beta1.DefaultKubeflowJobFailureCondition,
+				SuccessCondition:     experimentsv1beta1.DefaultJobSuccessCondition,
+				FailureCondition:     experimentsv1beta1.DefaultJobFailureCondition,
 				TrialParameters: []experimentsv1beta1.TrialParameterSpec{
 					{
 						Name:        "learningRate",
diff --git a/pkg/ui/v1beta1/frontend/src/reducers/general.js b/pkg/ui/v1beta1/frontend/src/reducers/general.js
@@ -50,14 +50,14 @@ const initialState = {
       value: 'status.conditions.#(type=="Complete")#|#(status=="True")#',
       description: `Condition when Trial custom resource is succeeded.
       Default value for k8s BatchJob: status.conditions.#(type=="Complete")#|#(status=="True")#.
-      Default value for Kubeflow Job (TFJob, PyTorchJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`,
+      Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`,
     },
     {
       name: 'FailureCondition',
       value: 'status.conditions.#(type=="Failed")#|#(status=="True")#',
       description: `Condition when Trial custom resource is failed.
       Default value for k8s BatchJob: status.conditions.#(type=="Failed")#|#(status=="True")#.
-      Default value for Kubeflow Job (TFJob, PyTorchJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`,
+      Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`,
     },
     {
       name: 'Retain',
diff --git a/pkg/webhook/v1beta1/experiment/validator/validator.go b/pkg/webhook/v1beta1/experiment/validator/validator.go
@@ -317,7 +317,7 @@ func (g *DefaultValidator) validateTrialTemplate(instance *experimentsv1beta1.Ex
 		return fmt.Errorf("APIVersion and Kind in spec.trialTemplate must be specified")
 	}
 
-	// Check if Job can be converted to Batch Job/TFJob/PyTorchJob
+	// Check if Job can be converted to Batch Job
 	// Other CRDs are not validated
 	if err := g.validateTrialJob(runSpec); err != nil {
 		return fmt.Errorf("invalid spec.trialTemplate: %v", err)
@@ -329,7 +329,7 @@ func (g *DefaultValidator) validateTrialTemplate(instance *experimentsv1beta1.Ex
 func (g *DefaultValidator) validateTrialJob(runSpec *unstructured.Unstructured) error {
 	gvk := runSpec.GroupVersionKind()
 
-	// Validate only Job, TFJob and PyTorchJob
+	// Validate only Job
 	switch gvk.Kind {
 	case consts.JobKindJob:
 		batchJob := batchv1.Job{}
@@ -359,7 +359,7 @@ func validatePatchJob(runSpec *unstructured.Unstructured, job interface{}, jobTy
 	// Not necessary to check error job must be valid JSON
 	runSpecAfter, _ := json.Marshal(job)
 
-	// Create Patch on tranformed Job (e.g: Job, TFJob) using unstructured JSON
+	// Create Patch on transformed Job (e.g. Job) using unstructured JSON
 	runSpecPatchOperations, err := jsonPatch.CreatePatch(runSpecAfter, runSpecBefore)
 	if err != nil {
 		return fmt.Errorf("create patch error: %v", err)
diff --git a/pkg/webhook/v1beta1/experiment/validator/validator_test.go b/pkg/webhook/v1beta1/experiment/validator/validator_test.go
@@ -1089,8 +1089,8 @@ func newFakeTrialTemplate(trialJob interface{}, trialParameters []experimentsv1b
 
 	return &experimentsv1beta1.TrialTemplate{
 		PrimaryContainerName: "training-container",
-		SuccessCondition:     experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
-		FailureCondition:     experimentsv1beta1.DefaultKubeflowJobFailureCondition,
+		SuccessCondition:     experimentsv1beta1.DefaultJobSuccessCondition,
+		FailureCondition:     experimentsv1beta1.DefaultJobFailureCondition,
 		TrialSource: experimentsv1beta1.TrialSource{
 			TrialSpec: trialSpec,
 		},
diff --git a/pkg/webhook/v1beta1/pod/inject_webhook_test.go b/pkg/webhook/v1beta1/pod/inject_webhook_test.go
@@ -67,9 +67,8 @@ func TestWrapWorkerContainer(t *testing.T) {
 				},
 			},
 			PrimaryContainerName: primaryContainer,
-			PrimaryPodLabels:     experimentsv1beta1.DefaultKubeflowJobPrimaryPodLabels,
-			SuccessCondition:     experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
-			FailureCondition:     experimentsv1beta1.DefaultKubeflowJobFailureCondition,
+			SuccessCondition:     experimentsv1beta1.DefaultJobSuccessCondition,
+			FailureCondition:     experimentsv1beta1.DefaultJobFailureCondition,
 		},
 	}
 

Original file line number	Diff line number	Diff line change
`@@ -106,7 +106,7 @@ func (e *Experiment) setDefaultTrialTemplate() {`
`106`	`106`	`if t.FailureCondition == "" {`
`107`	`107`	`t.FailureCondition = DefaultJobFailureCondition`
`108`	`108`	`}`
`109`		`- } else if jobKind == consts.JobKindTF \|\| jobKind == consts.JobKindPyTorch \|\| jobKind == consts.JobKindXGBoost {`
	`109`	`+ } else if _, ok := KubeflowJobKinds[jobKind]; ok {`
`110`	`110`	`if t.SuccessCondition == "" {`
`111`	`111`	`t.SuccessCondition = DefaultKubeflowJobSuccessCondition`
`112`	`112`	`}`