Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/publish-core-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
dockerfile: build/images/training-operator/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
tag-prefix: v1
- component-name: training-operator
- component-name: training-operator-v2
dockerfile: cmd/training-operator.v2alpha1/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
tag-prefix: v2alpha1
Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust
output:crd:artifacts:config=manifests/base/crds \
output:rbac:artifacts:config=manifests/base/rbac \
output:webhook:artifacts:config=manifests/base/webhook
$(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" "webhook" paths="./pkg/apis/kubeflow.org/v2alpha1/...;./pkg/webhook.v2/..." \
$(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" rbac:roleName=training-operator-v2 webhook \
paths="./pkg/apis/kubeflow.org/v2alpha1/...;./pkg/controller.v2/...;./pkg/runtime.v2/...;./pkg/webhook.v2/...;./pkg/cert/..." \
output:crd:artifacts:config=manifests/v2/base/crds \
output:rbac:artifacts:config=manifests/v2/base/rbac \
output:webhook:artifacts:config=manifests/v2/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
Expand Down
7 changes: 5 additions & 2 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ import (
const (
// EnvKubeflowNamespace is an environment variable for namespace when deployed on kubernetes
EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE"

webhookConfigurationName = "validator.training-operator.kubeflow.org"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should change it, since those parameters define the particular webhook configuration, not the name of the validating webhook configuration object. For example:
https://github.com/kubeflow/training-operator/blob/4bab32ab8898f33fbae45c694165afce69257c32/manifests/v2/base/webhook/manifests.yaml#L35

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, you're right.
This will be used in

- op: replace
  path: /metadata/name
  value: validator.training-operator-v2.kubeflow.org

)

var (
Expand Down Expand Up @@ -150,8 +152,9 @@ func main() {
certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookConfigurationName: webhookConfigurationName,
}
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
Expand Down
9 changes: 7 additions & 2 deletions cmd/training-operator.v2alpha1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ import (
webhookv2 "github.com/kubeflow/training-operator/pkg/webhook.v2"
)

const (
webhookConfigurationName = "validator.training-operator-v2.kubeflow.org"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

)

var (
scheme = apiruntime.NewScheme()
setupLog = ctrl.Log.WithName("setup")
Expand Down Expand Up @@ -124,8 +128,9 @@ func main() {

certsReady := make(chan struct{})
if err = cert.ManageCerts(mgr, cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookConfigurationName: webhookConfigurationName,
}, certsReady); err != nil {
setupLog.Error(err, "unable to set up cert rotation")
os.Exit(1)
Expand Down
9 changes: 9 additions & 0 deletions manifests/v2/base/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# We can't set namespace in the overlays since we use remote JobSet manifests in the resources.
namespace: kubeflow-system
resources:
- ./crds
- ./rbac
- ./webhook
- ./manager
2 changes: 2 additions & 0 deletions manifests/v2/base/manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
resources:
- manager.yaml
70 changes: 70 additions & 0 deletions manifests/v2/base/manager/manager.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: training-operator-v2
labels:
app.kubernetes.io/name: training
app.kubernetes.io/component: manager
app.kubernetes.io/part-of: kubeflow
spec:
selector:
matchLabels:
app.kubernetes.io/name: training
app.kubernetes.io/component: manager
app.kubernetes.io/part-of: kubeflow
template:
metadata:
labels:
app.kubernetes.io/name: training
app.kubernetes.io/component: manager
app.kubernetes.io/part-of: kubeflow
spec:
containers:
- name: manager
image: kubeflow/training-operator-v2
env:
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
volumeMounts:
- mountPath: /tmp/k8s-webhook-server/serving-certs
name: cert
readOnly: true
livenessProbe:
httpGet:
path: /healthz
port: 8081
initialDelaySeconds: 15
periodSeconds: 20
timeoutSeconds: 3
readinessProbe:
httpGet:
path: /readyz
port: 8081
initialDelaySeconds: 10
periodSeconds: 15
timeoutSeconds: 3
serviceAccountName: training-operator-v2
volumes:
- name: cert
secret:
defaultMode: 420
secretName: training-operator-v2-webhook-cert
---
apiVersion: v1
kind: Service
metadata:
name: training-operator-v2
spec:
ports:
- name: monitoring-port
port: 8080
targetPort: 8080
- name: webhook-server
port: 443
protocol: TCP
targetPort: 9443
selector:
app.kubernetes.io/component: manager
4 changes: 4 additions & 0 deletions manifests/v2/base/rbac/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
resources:
- role.yaml
- role_binding.yaml
- service_account.yaml
78 changes: 78 additions & 0 deletions manifests/v2/base/rbac/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: training-operator-v2
rules:
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- list
- update
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- list
- update
- watch
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets
verbs:
- create
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- clustertrainingruntimes
verbs:
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- trainingruntimes
verbs:
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- trainjobs
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- kubeflow.org
resources:
- trainjobs/status
verbs:
- get
- patch
- update
- apiGroups:
- scheduling.x-k8s.io
resources:
- podgroups
verbs:
- create
- get
- list
- watch
12 changes: 12 additions & 0 deletions manifests/v2/base/rbac/role_binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: training-operator-v2
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: training-operator-v2
subjects:
- kind: ServiceAccount
name: training-operator-v2
5 changes: 5 additions & 0 deletions manifests/v2/base/rbac/service_account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: training-operator-v2
10 changes: 10 additions & 0 deletions manifests/v2/base/webhook/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- manifests.yaml
patches:
- path: patch.yaml
target:
group: admissionregistration.k8s.io
version: v1
kind: ValidatingWebhookConfiguration
configurations:
- kustomizeconfig.yaml
10 changes: 10 additions & 0 deletions manifests/v2/base/webhook/kustomizeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# The following config teaches kustomize where to look when substituting vars.
# It requires kustomize v2.1.0 or newer to work properly.
namespace:
- kind: ValidatingWebhookConfiguration
group: admissionregistration.k8s.io
path: webhooks/clientConfig/service/namespace
create: true

varReference:
- path: metadata/annotations
12 changes: 12 additions & 0 deletions manifests/v2/base/webhook/patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- op: replace
path: /webhooks/0/clientConfig/service/name
value: training-operator-v2
- op: replace
path: /webhooks/1/clientConfig/service/name
value: training-operator-v2
- op: replace
path: /webhooks/2/clientConfig/service/name
value: training-operator-v2
- op: replace
path: /metadata/name
value: validator.training-operator-v2.kubeflow.org
15 changes: 15 additions & 0 deletions manifests/v2/overlays/standalone/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- ../../base
# TODO (andreyvelich): JobSet should support kubeflow-system namespace.
- https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml
images:
- name: kubeflow/training-operator-v2
newTag: latest
secretGenerator:
- name: training-operator-v2-webhook-cert
namespace: kubeflow-system
options:
disableNameSuffixHash: true
4 changes: 4 additions & 0 deletions manifests/v2/overlays/standalone/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: kubeflow-system
12 changes: 6 additions & 6 deletions pkg/cert/cert.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,19 @@ import (

const (
certDir = "/tmp/k8s-webhook-server/serving-certs"
vwcName = "validator.training-operator.kubeflow.org"
caName = "training-operator-ca"
caOrganization = "training-operator"
defaultOperatorNamespace = "kubeflow"
)

type Config struct {
WebhookServiceName string
WebhookSecretName string
WebhookServiceName string
WebhookSecretName string
WebhookConfigurationName string
}

// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;update
// +kubebuilder:rbac:groups="admissionregistration.k8s.io",resources=validatingwebhookconfigurations,verbs=get;list;watch;update
//+kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;update
//+kubebuilder:rbac:groups="admissionregistration.k8s.io",resources=validatingwebhookconfigurations,verbs=get;list;watch;update

// ManageCerts creates all certs for webhooks.
func ManageCerts(mgr ctrl.Manager, cfg Config, setupFinished chan struct{}) error {
Expand All @@ -61,7 +61,7 @@ func ManageCerts(mgr ctrl.Manager, cfg Config, setupFinished chan struct{}) erro
IsReady: setupFinished,
Webhooks: []cert.WebhookInfo{{
Type: cert.Validating,
Name: vwcName,
Name: cfg.WebhookConfigurationName,
}},
// When training-operator is running in the leader election mode,
// we expect webhook server will run in primary and secondary instance
Expand Down
3 changes: 3 additions & 0 deletions pkg/controller.v2/trainjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ func NewTrainJobReconciler(client client.Client, recorder record.EventRecorder)
}
}

//+kubebuilder:rbac:groups=kubeflow.org,resources=trainjobs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=kubeflow.org,resources=trainjobs/status,verbs=get;update;patch

func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
var trainJob kubeflowv2.TrainJob
if err := r.client.Get(ctx, req.NamespacedName, &trainJob); err != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/runtime.v2/core/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ import (
runtime "github.com/kubeflow/training-operator/pkg/runtime.v2"
)

//+kubebuilder:rbac:groups=kubeflow.org,resources=trainingruntimes,verbs=get;list;watch
//+kubebuilder:rbac:groups=kubeflow.org,resources=clustertrainingruntimes,verbs=get;list;watch

func New(ctx context.Context, client client.Client, indexer client.FieldIndexer) (map[string]runtime.Runtime, error) {
registry := NewRuntimeRegistry()
runtimes := make(map[string]runtime.Runtime, len(registry))
Expand Down
2 changes: 2 additions & 0 deletions pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ var (

const Name = "CoScheduling"

//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create

func New(ctx context.Context, c client.Client, indexer client.FieldIndexer) (framework.Plugin, error) {
if err := indexer.IndexField(ctx, &kubeflowv2.TrainingRuntime{}, TrainingRuntimeContainerRuntimeClassKey,
IndexTrainingRuntimeContainerRuntimeClass); err != nil {
Expand Down
2 changes: 2 additions & 0 deletions pkg/runtime.v2/framework/plugins/jobset/jobset.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ var _ framework.ComponentBuilderPlugin = (*JobSet)(nil)

const Name = "JobSet"

//+kubebuilder:rbac:groups=jobset.x-k8s.io,resources=jobsets,verbs=get;list;watch;create

func New(ctx context.Context, c client.Client, _ client.FieldIndexer) (framework.Plugin, error) {
return &JobSet{
client: c,
Expand Down
2 changes: 1 addition & 1 deletion test/integration/framework/framework.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func (f *Framework) Init() *rest.Config {
f.testEnv = &envtest.Environment{
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "v2", "base", "crds")},
WebhookInstallOptions: envtest.WebhookInstallOptions{
Paths: []string{filepath.Join("..", "..", "..", "manifests", "v2", "base", "webhook")},
Paths: []string{filepath.Join("..", "..", "..", "manifests", "v2", "base", "webhook", "manifests.yaml")},
},
ErrorIfCRDPathMissing: true,
}
Expand Down
Loading