Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/images-location.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,17 @@ The following table shows images for training containers which are used in the
<a href="https://github.com/kubeflow/training-operator/blob/2712f5667ec78f17d22288630f8719f0c08990ba/examples/tensorflow/mnist_with_summaries/Dockerfile">Dockerfile</a>
</td>
</tr>
<tr align="center">
<td>
<code>docker.io/bytepsimage/mxnet</code>
</td>
<td>
Distributed BytePS example for MXJob
</td>
<td>
<a href="https://github.com/bytedance/byteps/blob/v0.2.5/docker/Dockerfile">Dockerfile</a>
</td>
</tr>
<tr align="center">
<td>
<code>docker.io/kubeflowkatib/xgboost-lightgbm</code>
Expand Down
6 changes: 3 additions & 3 deletions examples/v1beta1/fpga/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ If you want to read more about provisioning FPGA resources and deploying
accelerated applications (e.g. Kubeflow Pipelines) on any Kubernetes cluster,
visit the [InAccel](https://docs.inaccel.com) documentation.

## Simplifying FPGA management in EKS* (Elastic Kubernetes Serice)
## Simplifying FPGA management in EKS\* (Elastic Kubernetes Service)

**For development and testing purposes you can still [deploy Kubeflow Katib
\*_For development and testing purposes you can still [deploy Kubeflow Katib
using Minikube](https://kubeflow.org/docs/started/workstation/minikube-linux) in
a single AMI instance. In production environments, Amazon's managed Kubernetes
service ([EKS](https://aws.amazon.com/eks)) is recommended.*
service ([EKS](https://aws.amazon.com/eks)) is recommended._

The InAccel FPGA Operator allows administrators of Kubernetes clusters to manage
FPGA nodes just like CPU nodes in the cluster. Instead of provisioning a special
Expand Down
85 changes: 85 additions & 0 deletions examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Katib Experiment that tunes the learning rate of a distributed BytePS
# training job launched as a Kubeflow MXJob.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  name: mxjob-byteps
spec:
  # Maximize the "Train-accuracy" metric; the goal value is 0.99.
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: Train-accuracy
  algorithm:
    algorithmName: random
  # Trial budget: one Trial at a time, at most 4 Trials, at most 3 failures.
  parallelTrialCount: 1
  maxTrialCount: 4
  maxFailedTrialCount: 3
  # Search space: a single double-valued hyperparameter "lr" in [0.1, 0.11].
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: "0.1"
        max: "0.11"
  trialTemplate:
    primaryContainerName: mxnet
    # In this example we can collect metrics only from the Worker pods.
    primaryPodLabels:
      replica-type: worker
    # "learningRate" is the template placeholder bound to the "lr" parameter;
    # it is substituted into the Worker args below.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
    trialSpec:
      apiVersion: kubeflow.org/v1
      kind: MXJob
      spec:
        jobMode: MXTrain
        runPolicy:
          cleanPodPolicy: None
        # One replica of each role; every role uses the same image and the
        # "bpslaunch" entrypoint. Only the Worker is given training args.
        mxReplicaSpecs:
          Scheduler:
            replicas: 1
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: mxnet
                    image: docker.io/bytepsimage/mxnet
                    command: ["bpslaunch"]
          Server:
            replicas: 1
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: mxnet
                    image: docker.io/bytepsimage/mxnet
                    command: ["bpslaunch"]
          Worker:
            replicas: 1
            restartPolicy: Never
            template:
              spec:
                containers:
                  - name: mxnet
                    image: docker.io/bytepsimage/mxnet
                    command: ["bpslaunch"]
                    args:
                      [
                        "python3",
                        "/usr/local/byteps/example/mxnet/train_imagenet_byteps.py",
                        "--benchmark",
                        "1",
                        "--lr=${trialParameters.learningRate}",
                        "--num-examples=1000",
                        "--num-epochs=4",
                      ]
                    # Memory-backed emptyDir mounted at /dev/shm (see "volumes").
                    volumeMounts:
                      - mountPath: /dev/shm
                        name: dshm
                    resources:
                      limits:
                        nvidia.com/gpu: 1
                volumes:
                  - name: dshm
                    emptyDir:
                      medium: Memory
1 change: 1 addition & 0 deletions manifests/v1beta1/components/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ spec:
- "--trial-resources=PyTorchJob.v1.kubeflow.org"
- "--trial-resources=MPIJob.v1.kubeflow.org"
- "--trial-resources=XGBoostJob.v1.kubeflow.org"
- "--trial-resources=MXJob.v1.kubeflow.org"
ports:
- containerPort: 8443
name: webhook
Expand Down
1 change: 1 addition & 0 deletions manifests/v1beta1/components/controller/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ rules:
- pytorchjobs
- mpijobs
- xgboostjobs
- mxjobs
verbs:
- "*"
---
Expand Down
14 changes: 11 additions & 3 deletions pkg/apis/controller/experiments/v1beta1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,22 @@ const (
// DefaultJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Job.
DefaultJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"

// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Job.
// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Training Job.
DefaultKubeflowJobSuccessCondition = "status.conditions.#(type==\"Succeeded\")#|#(status==\"True\")#"

// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Job.
// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Training Job.
DefaultKubeflowJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
)

var (
// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Training Job.
DefaultKubeflowJobPrimaryPodLabels = map[string]string{"job-role": "master"}

// KubeflowJobKinds is the set of Kubeflow Training Job kinds, keyed by the
// resource Kind string. setDefaultTrialTemplate tests membership by key
// presence to decide whether the Kubeflow Training Job defaults apply.
KubeflowJobKinds = map[string]bool{
"TFJob": true,
"PyTorchJob": true,
"XGBoostJob": true,
"MXJob": true,
}
)
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ func (e *Experiment) setDefaultTrialTemplate() {
if t.FailureCondition == "" {
t.FailureCondition = DefaultJobFailureCondition
}
} else if jobKind == consts.JobKindTF || jobKind == consts.JobKindPyTorch || jobKind == consts.JobKindXGBoost {
} else if _, ok := KubeflowJobKinds[jobKind]; ok {
if t.SuccessCondition == "" {
t.SuccessCondition = DefaultKubeflowJobSuccessCondition
}
Expand Down
6 changes: 0 additions & 6 deletions pkg/controller.v1beta1/consts/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,6 @@ const (

// JobKindJob is the kind of the Kubernetes Job.
JobKindJob = "Job"
// JobKindTF is the kind of TFJob.
JobKindTF = "TFJob"
// JobKindPyTorch is the kind of PyTorchJob.
JobKindPyTorch = "PyTorchJob"
// JobKindXGBoost is the kind of XGBoostJob.
JobKindXGBoost = "XGBoostJob"

// AnnotationIstioSidecarInjectName is the annotation of Istio Sidecar
AnnotationIstioSidecarInjectName = "sidecar.istio.io/inject"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -458,10 +458,9 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
},
ResumePolicy: experimentsv1beta1.NeverResume,
TrialTemplate: &experimentsv1beta1.TrialTemplate{
PrimaryPodLabels: experimentsv1beta1.DefaultKubeflowJobPrimaryPodLabels,
PrimaryContainerName: primaryContainer,
SuccessCondition: experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
FailureCondition: experimentsv1beta1.DefaultKubeflowJobFailureCondition,
SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition,
FailureCondition: experimentsv1beta1.DefaultJobFailureCondition,
TrialParameters: []experimentsv1beta1.TrialParameterSpec{
{
Name: "learningRate",
Expand Down
4 changes: 2 additions & 2 deletions pkg/ui/v1beta1/frontend/src/reducers/general.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ const initialState = {
value: 'status.conditions.#(type=="Complete")#|#(status=="True")#',
description: `Condition when Trial custom resource is succeeded.
Default value for k8s BatchJob: status.conditions.#(type=="Complete")#|#(status=="True")#.
Default value for Kubeflow Job (TFJob, PyTorchJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`,
Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`,
},
{
name: 'FailureCondition',
value: 'status.conditions.#(type=="Failed")#|#(status=="True")#',
description: `Condition when Trial custom resource is failed.
Default value for k8s BatchJob: status.conditions.#(type=="Failed")#|#(status=="True")#.
Default value for Kubeflow Job (TFJob, PyTorchJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`,
Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`,
},
{
name: 'Retain',
Expand Down
6 changes: 3 additions & 3 deletions pkg/webhook/v1beta1/experiment/validator/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ func (g *DefaultValidator) validateTrialTemplate(instance *experimentsv1beta1.Ex
return fmt.Errorf("APIVersion and Kind in spec.trialTemplate must be specified")
}

// Check if Job can be converted to Batch Job/TFJob/PyTorchJob
// Check if Job can be converted to Batch Job
// Other CRDs are not validated
if err := g.validateTrialJob(runSpec); err != nil {
return fmt.Errorf("invalid spec.trialTemplate: %v", err)
Expand All @@ -329,7 +329,7 @@ func (g *DefaultValidator) validateTrialTemplate(instance *experimentsv1beta1.Ex
func (g *DefaultValidator) validateTrialJob(runSpec *unstructured.Unstructured) error {
gvk := runSpec.GroupVersionKind()

// Validate only Job, TFJob and PyTorchJob
// Validate only Job
switch gvk.Kind {
case consts.JobKindJob:
batchJob := batchv1.Job{}
Expand Down Expand Up @@ -359,7 +359,7 @@ func validatePatchJob(runSpec *unstructured.Unstructured, job interface{}, jobTy
// Not necessary to check error job must be valid JSON
runSpecAfter, _ := json.Marshal(job)

// Create Patch on transformed Job (e.g: Job, TFJob) using unstructured JSON
// Create Patch on transformed Job (e.g: Job) using unstructured JSON
runSpecPatchOperations, err := jsonPatch.CreatePatch(runSpecAfter, runSpecBefore)
if err != nil {
return fmt.Errorf("create patch error: %v", err)
Expand Down
4 changes: 2 additions & 2 deletions pkg/webhook/v1beta1/experiment/validator/validator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1089,8 +1089,8 @@ func newFakeTrialTemplate(trialJob interface{}, trialParameters []experimentsv1b

return &experimentsv1beta1.TrialTemplate{
PrimaryContainerName: "training-container",
SuccessCondition: experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
FailureCondition: experimentsv1beta1.DefaultKubeflowJobFailureCondition,
SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition,
FailureCondition: experimentsv1beta1.DefaultJobFailureCondition,
TrialSource: experimentsv1beta1.TrialSource{
TrialSpec: trialSpec,
},
Expand Down
5 changes: 2 additions & 3 deletions pkg/webhook/v1beta1/pod/inject_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,8 @@ func TestWrapWorkerContainer(t *testing.T) {
},
},
PrimaryContainerName: primaryContainer,
PrimaryPodLabels: experimentsv1beta1.DefaultKubeflowJobPrimaryPodLabels,
SuccessCondition: experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
FailureCondition: experimentsv1beta1.DefaultKubeflowJobFailureCondition,
SuccessCondition: experimentsv1beta1.DefaultJobSuccessCondition,
FailureCondition: experimentsv1beta1.DefaultJobFailureCondition,
},
}

Expand Down