Add out-of-the-box support for TrainJob (kubeflow#2560)

ram4444 · andreyvelich · tenzen-y · web-flow · commit c9528e7d4e74 · 2025-08-23T00:01:07.000Z
* Out-of-the-box support TrainJob Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Example for Pytorch Distributed Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Update examples/v1beta1/kubeflow-training-operator/trainjob-pytorch.yaml Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Create folder for Trainer as suggested Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Move the example of trainjob to the new folder Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Ref the primaryContainerName to that of ClusterTrainingRuntime Signed-off-by: Ram Lau <ramwt4444@gmail.com> * tenzen-y steps down from Katib approver role (kubeflow#2561) Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com> Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Set Default value for TrainJob Success, Failure Condition and PrimaryPodLabels in the trial Template Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Enhance Handling for default value of Success, Fail Cond & Pod Label Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Bug fix for default value condition Signed-off-by: Ram Lau <ramwt4444@gmail.com> * code format by hack/update-gofmt.sh Signed-off-by: Ram Lau <ramwt4444@gmail.com> * add TrainJob trial Resources to cert manager config Signed-off-by: Ram Lau <ramwt4444@gmail.com> * add trainjob to controller rbac Signed-off-by: Ram Lau <ramwt4444@gmail.com> * Grant JobSet permission to Katib controller Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Remove create/delete RBAC for TrainJob Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> * Fix docker build with libpcre2 Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> --------- Signed-off-by: Ram Lau <ramwt4444@gmail.com> Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com> Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com> Co-authored-by: Yuki Iwai 
<yuki.iwai.tz@gmail.com>
diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile
@@ -11,8 +11,7 @@ ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/
 WORKDIR  ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}
 
 RUN if [ "${TARGETARCH}" = "arm64" ]; then \
-    apt-get -y update && \
-    apt-get -y install gfortran libpcre3 libpcre3-dev && \
+    apt-get -y update && apt-get -y install gfortran libpcre2-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*; \
     fi
diff --git a/examples/v1beta1/kubeflow-trainer/trainjob-pytorch.yaml b/examples/v1beta1/kubeflow-trainer/trainjob-pytorch.yaml
@@ -0,0 +1,51 @@
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  namespace: kubeflow
+  name: torch-distributed-example
+spec:
+  parallelTrialCount: 3
+  maxTrialCount: 12
+  maxFailedTrialCount: 3
+  objective:
+    type: minimize
+    goal: 0.001
+    objectiveMetricName: loss
+  algorithm:
+    algorithmName: random
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.01"
+        max: "0.05"
+    - name: momentum
+      parameterType: double
+      feasibleSpace:
+        min: "0.5"
+        max: "0.9"
+  trialTemplate:
+    primaryContainerName: node
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+      - name: momentum
+        description: Momentum for the training model
+        reference: momentum
+    trialSpec:
+      apiVersion: trainer.kubeflow.org/v1alpha1
+      kind: TrainJob
+      spec:
+        runtimeRef:
+          name: torch-distributed
+        trainer:
+          numNodes: 2
+          image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
+          command:
+            - "python3"
+            - "/opt/pytorch-mnist/mnist.py"
+            - "--epochs=1"
+            - "--lr=${trialParameters.learningRate}"
+            - "--momentum=${trialParameters.momentum}"
diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu
@@ -10,7 +10,7 @@ WORKDIR  ${TARGET_DIR}
 
 RUN if [ "${TARGETARCH}" = "arm64" ]; then \
     apt-get -y update && \
-    apt-get -y install gfortran libpcre3 libpcre3-dev && \
+    apt-get -y install gfortran libpcre2-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*; \
     fi
diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile
@@ -8,7 +8,7 @@ WORKDIR /opt/tf-mnist-with-summaries
 
 RUN if [ "${TARGETARCH}" = "arm64" ]; then \
     apt-get -y update && \
-    apt-get -y install gfortran libpcre3 libpcre3-dev && \
+    apt-get -y install gfortran libpcre2-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*; \
     fi
diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml
@@ -90,6 +90,22 @@ rules:
       - "watch"
       - "create"
       - "delete"
+  - apiGroups:
+      - jobset.x-k8s.io
+    resources:
+      - jobsets
+    verbs:
+      - "get"
+      - "list"
+      - "watch"
+  - apiGroups:
+      - trainer.kubeflow.org
+    resources:
+      - trainjobs
+    verbs:
+      - "get"
+      - "list"
+      - "watch"
   - apiGroups:
       - kubeflow.org
     resources:
diff --git a/manifests/v1beta1/installs/katib-cert-manager/katib-config.yaml b/manifests/v1beta1/installs/katib-cert-manager/katib-config.yaml
@@ -5,6 +5,7 @@ init:
   controller:
     webhookPort: 8443
     trialResources:
+      - TrainJob.v1alpha1.trainer.kubeflow.org
       - Job.v1.batch
       - TFJob.v1.kubeflow.org
       - PyTorchJob.v1.kubeflow.org
diff --git a/manifests/v1beta1/installs/katib-standalone/katib-config.yaml b/manifests/v1beta1/installs/katib-standalone/katib-config.yaml
@@ -6,6 +6,7 @@ init:
   controller:
     webhookPort: 8443
     trialResources:
+      - TrainJob.v1alpha1.trainer.kubeflow.org
       - Job.v1.batch
       - TFJob.v1.kubeflow.org
       - PyTorchJob.v1.kubeflow.org
diff --git a/pkg/apis/controller/experiments/v1beta1/constants.go b/pkg/apis/controller/experiments/v1beta1/constants.go
@@ -34,17 +34,27 @@ const (
 
 	// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Training Job.
 	DefaultKubeflowJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
+
+	// DefaultTrainJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Trainer TrainJob.
+	DefaultTrainJobSuccessCondition = "status.conditions.#(type==\"Complete\")#|#(status==\"True\")#"
+
+	// DefaultTrainJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Trainer TrainJob.
+	DefaultTrainJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
 )
 
 var (
 	// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Training Job.
 	DefaultKubeflowJobPrimaryPodLabels = map[string]string{"training.kubeflow.org/job-role": "master"}
 
+	// DefaultTrainJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Trainer TrainJob.
+	DefaultTrainJobPrimaryPodLabels = map[string]string{"jobset.sigs.k8s.io/replicatedjob-name": "node", "batch.kubernetes.io/job-completion-index": "0"}
+
 	// KubeflowJobKinds is the list of Kubeflow Training Job kinds.
 	KubeflowJobKinds = map[string]bool{
 		"TFJob":      true,
 		"PyTorchJob": true,
 		"XGBoostJob": true,
 		"MPIJob":     true,
+		"TrainJob":   true,
 	}
 )
diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go
@@ -109,14 +109,27 @@ func (e *Experiment) setDefaultTrialTemplate() {
 			}
 		} else if _, ok := KubeflowJobKinds[jobKind]; ok {
 			if t.SuccessCondition == "" {
-				t.SuccessCondition = DefaultKubeflowJobSuccessCondition
+				if jobKind == "TrainJob" {
+					t.SuccessCondition = DefaultTrainJobSuccessCondition
+				} else {
+					t.SuccessCondition = DefaultKubeflowJobSuccessCondition
+				}
 			}
 			if t.FailureCondition == "" {
-				t.FailureCondition = DefaultKubeflowJobFailureCondition
+				if jobKind == "TrainJob" {
+					t.FailureCondition = DefaultTrainJobFailureCondition
+				} else {
+					t.FailureCondition = DefaultKubeflowJobFailureCondition
+				}
 			}
 			// For Kubeflow Job also set default PrimaryPodLabels
 			if len(t.PrimaryPodLabels) == 0 {
-				t.PrimaryPodLabels = DefaultKubeflowJobPrimaryPodLabels
+				if jobKind == "TrainJob" {
+					t.PrimaryPodLabels = DefaultTrainJobPrimaryPodLabels
+				} else {
+					t.PrimaryPodLabels = DefaultKubeflowJobPrimaryPodLabels
+				}
+
 			}
 		}
 	}

Original file line number	Diff line number	Diff line change
`@@ -109,14 +109,27 @@ func (e *Experiment) setDefaultTrialTemplate() {`
`109`	`109`	`}`
`110`	`110`	`} else if _, ok := KubeflowJobKinds[jobKind]; ok {`
`111`	`111`	`if t.SuccessCondition == "" {`
`112`		`- t.SuccessCondition = DefaultKubeflowJobSuccessCondition`
	`112`	`+ if jobKind == "TrainJob" {`
	`113`	`+ t.SuccessCondition = DefaultTrainJobSuccessCondition`
	`114`	`+ } else {`
	`115`	`+ t.SuccessCondition = DefaultKubeflowJobSuccessCondition`
	`116`	`+ }`
`113`	`117`	`}`
`114`	`118`	`if t.FailureCondition == "" {`
`115`		`- t.FailureCondition = DefaultKubeflowJobFailureCondition`
	`119`	`+ if jobKind == "TrainJob" {`
	`120`	`+ t.FailureCondition = DefaultTrainJobFailureCondition`
	`121`	`+ } else {`
	`122`	`+ t.FailureCondition = DefaultKubeflowJobFailureCondition`
	`123`	`+ }`
`116`	`124`	`}`
`117`	`125`	`// For Kubeflow Job also set default PrimaryPodLabels`
`118`	`126`	`if len(t.PrimaryPodLabels) == 0 {`
`119`		`- t.PrimaryPodLabels = DefaultKubeflowJobPrimaryPodLabels`
	`127`	`+ if jobKind == "TrainJob" {`
	`128`	`+ t.PrimaryPodLabels = DefaultTrainJobPrimaryPodLabels`
	`129`	`+ } else {`
	`130`	`+ t.PrimaryPodLabels = DefaultKubeflowJobPrimaryPodLabels`
	`131`	`+ }`
	`132`	`+`
`120`	`133`	`}`
`121`	`134`	`}`
`122`	`135`	`}`