Skip to content

Commit 60baacd

Browse files
authored
Add Kubeflow MXJob example (#1688)
* Add Kubeflow MXJob example
* Reduce num examples
* Update image link
* Fix FPGA doc
* Add BytePS image
1 parent 983a867 commit 60baacd

File tree

13 files changed

+124
-26
lines changed

13 files changed

+124
-26
lines changed

docs/images-location.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,17 @@ The following table shows images for training containers which are used in the
293293
<a href="https://github.com/kubeflow/training-operator/blob/2712f5667ec78f17d22288630f8719f0c08990ba/examples/tensorflow/mnist_with_summaries/Dockerfile">Dockerfile</a>
294294
</td>
295295
</tr>
296+
<tr align="center">
297+
<td>
298+
<code>docker.io/bytepsimage/mxnet</code>
299+
</td>
300+
<td>
301+
Distributed BytePS example for MXJob
302+
</td>
303+
<td>
304+
<a href="https://github.com/bytedance/byteps/blob/v0.2.5/docker/Dockerfile">Dockerfile</a>
305+
</td>
306+
</tr>
296307
<tr align="center">
297308
<td>
298309
<code>docker.io/kubeflowkatib/xgboost-lightgbm</code>

examples/v1beta1/fpga/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ If you want to read more about provisioning FPGA resources and deploying
77
accelerated applications (e.g. Kubeflow Pipelines) on any Kubernetes cluster,
88
visit the [InAccel](https://docs.inaccel.com) documentation.
99

10-
## Simplifying FPGA management in EKS* (Elastic Kubernetes Serice)
10+
## Simplifying FPGA management in EKS\* (Elastic Kubernetes Service)
1111

12-
**For development and testing purposes you can still [deploy Kubeflow Katib
12+
\*_For development and testing purposes you can still [deploy Kubeflow Katib
1313
using Minikube](https://kubeflow.org/docs/started/workstation/minikube-linux) in
1414
a single AMI instance. In production environments, Amazon's managed Kubernetes
15-
service ([EKS](https://aws.amazon.com/eks)) is recommended.*
15+
service ([EKS](https://aws.amazon.com/eks)) is recommended._
1616

1717
The InAccel FPGA Operator allows administrators of Kubernetes clusters to manage
1818
FPGA nodes just like CPU nodes in the cluster. Instead of provisioning a special
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
apiVersion: kubeflow.org/v1beta1
2+
kind: Experiment
3+
metadata:
4+
namespace: kubeflow
5+
name: mxjob-byteps
6+
spec:
7+
objective:
8+
type: maximize
9+
goal: 0.99
10+
objectiveMetricName: Train-accuracy
11+
algorithm:
12+
algorithmName: random
13+
parallelTrialCount: 1
14+
maxTrialCount: 4
15+
maxFailedTrialCount: 3
16+
parameters:
17+
- name: lr
18+
parameterType: double
19+
feasibleSpace:
20+
min: "0.1"
21+
max: "0.11"
22+
trialTemplate:
23+
primaryContainerName: mxnet
24+
# In this example we can collect metrics only from the Worker pods.
25+
primaryPodLabels:
26+
replica-type: worker
27+
trialParameters:
28+
- name: learningRate
29+
description: Learning rate for the training model
30+
reference: lr
31+
trialSpec:
32+
apiVersion: kubeflow.org/v1
33+
kind: MXJob
34+
spec:
35+
jobMode: MXTrain
36+
runPolicy:
37+
cleanPodPolicy: None
38+
mxReplicaSpecs:
39+
Scheduler:
40+
replicas: 1
41+
restartPolicy: Never
42+
template:
43+
spec:
44+
containers:
45+
- name: mxnet
46+
image: docker.io/bytepsimage/mxnet
47+
command: ["bpslaunch"]
48+
Server:
49+
replicas: 1
50+
restartPolicy: Never
51+
template:
52+
spec:
53+
containers:
54+
- name: mxnet
55+
image: docker.io/bytepsimage/mxnet
56+
command: ["bpslaunch"]
57+
Worker:
58+
replicas: 1
59+
restartPolicy: Never
60+
template:
61+
spec:
62+
containers:
63+
- name: mxnet
64+
image: docker.io/bytepsimage/mxnet
65+
command: ["bpslaunch"]
66+
args:
67+
[
68+
"python3",
69+
"/usr/local/byteps/example/mxnet/train_imagenet_byteps.py",
70+
"--benchmark",
71+
"1",
72+
"--lr=${trialParameters.learningRate}",
73+
"--num-examples=1000",
74+
"--num-epochs=4",
75+
]
76+
volumeMounts:
77+
- mountPath: /dev/shm
78+
name: dshm
79+
resources:
80+
limits:
81+
nvidia.com/gpu: 1
82+
volumes:
83+
- name: dshm
84+
emptyDir:
85+
medium: Memory

manifests/v1beta1/components/controller/controller.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ spec:
3030
- "--trial-resources=PyTorchJob.v1.kubeflow.org"
3131
- "--trial-resources=MPIJob.v1.kubeflow.org"
3232
- "--trial-resources=XGBoostJob.v1.kubeflow.org"
33+
- "--trial-resources=MXJob.v1.kubeflow.org"
3334
ports:
3435
- containerPort: 8443
3536
name: webhook

manifests/v1beta1/components/controller/rbac.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ rules:
5454
- pytorchjobs
5555
- mpijobs
5656
- xgboostjobs
57+
- mxjobs
5758
verbs:
5859
- "*"
5960
---

pkg/apis/controller/experiments/v1beta1/constants.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,22 @@ const (
2929
// DefaultJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Job.
3030
DefaultJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
3131

32-
// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Job.
32+
// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Training Job.
3333
DefaultKubeflowJobSuccessCondition = "status.conditions.#(type==\"Succeeded\")#|#(status==\"True\")#"
3434

35-
// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Job.
35+
// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Training Job.
3636
DefaultKubeflowJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
3737
)
3838

3939
var (
40-
// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Job.
40+
// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Training Job.
4141
DefaultKubeflowJobPrimaryPodLabels = map[string]string{"job-role": "master"}
42+
43+
// KubeflowJobKinds is the list of Kubeflow Training Job kinds.
44+
KubeflowJobKinds = map[string]bool{
45+
"TFJob": true,
46+
"PyTorchJob": true,
47+
"XGBoostJob": true,
48+
"MXJob": true,
49+
}
4250
)

pkg/apis/controller/experiments/v1beta1/experiment_defaults.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ func (e *Experiment) setDefaultTrialTemplate() {
106106
if t.FailureCondition == "" {
107107
t.FailureCondition = DefaultJobFailureCondition
108108
}
109-
} else if jobKind == consts.JobKindTF || jobKind == consts.JobKindPyTorch || jobKind == consts.JobKindXGBoost {
109+
} else if _, ok := KubeflowJobKinds[jobKind]; ok {
110110
if t.SuccessCondition == "" {
111111
t.SuccessCondition = DefaultKubeflowJobSuccessCondition
112112
}

pkg/controller.v1beta1/consts/const.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,6 @@ const (
130130

131131
// JobKindJob is the kind of the Kubernetes Job.
132132
JobKindJob = "Job"
133-
// JobKindTF is the kind of TFJob.
134-
JobKindTF = "TFJob"
135-
// JobKindPyTorch is the kind of PyTorchJob.
136-
JobKindPyTorch = "PyTorchJob"
137-
// JobKindXGBoost is the kind of XGBoostJob.
138-
JobKindXGBoost = "XGBoostJob"
139133

140134
// AnnotationIstioSidecarInjectName is the annotation of Istio Sidecar
141135
AnnotationIstioSidecarInjectName = "sidecar.istio.io/inject"

pkg/controller.v1beta1/experiment/experiment_controller_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -458,10 +458,9 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
458458
},
459459
ResumePolicy: experimentsv1beta1.NeverResume,
460460
TrialTemplate: &experimentsv1beta1.TrialTemplate{
461-
PrimaryPodLabels: experimentsv1beta1.DefaultKubeflowJobPrimaryPodLabels,
462461
PrimaryContainerName: primaryContainer,
463-
SuccessCondition: experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
464-
FailureCondition: experimentsv1beta1.DefaultKubeflowJobFailureCondition,
462+
SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition,
463+
FailureCondition: experimentsv1beta1.DefaultJobFailureCondition,
465464
TrialParameters: []experimentsv1beta1.TrialParameterSpec{
466465
{
467466
Name: "learningRate",

pkg/ui/v1beta1/frontend/src/reducers/general.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,14 @@ const initialState = {
5050
value: 'status.conditions.#(type=="Complete")#|#(status=="True")#',
5151
description: `Condition when Trial custom resource is succeeded.
5252
Default value for k8s BatchJob: status.conditions.#(type=="Complete")#|#(status=="True")#.
53-
Default value for Kubeflow Job (TFJob, PyTorchJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`,
53+
Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`,
5454
},
5555
{
5656
name: 'FailureCondition',
5757
value: 'status.conditions.#(type=="Failed")#|#(status=="True")#',
5858
description: `Condition when Trial custom resource is failed.
5959
Default value for k8s BatchJob: status.conditions.#(type=="Failed")#|#(status=="True")#.
60-
Default value for Kubeflow Job (TFJob, PyTorchJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`,
60+
Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`,
6161
},
6262
{
6363
name: 'Retain',

0 commit comments

Comments
 (0)