Skip to content

Commit d3009e9

Browse files
committed
set up for new gpu class, creating notebooks, localqueue config to point to clusterqueues, and observability for jobs through rolebinding
1 parent b59331c commit d3009e9

File tree

6 files changed

+472
-3
lines changed

6 files changed

+472
-3
lines changed

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
repos:
22
- repo: https://github.com/Lucas-C/pre-commit-hooks
3-
rev: v1.5.4
3+
rev: v1.5.5
44
hooks:
55
- id: remove-tabs
66

77
- repo: https://github.com/pre-commit/pre-commit-hooks
8-
rev: v4.5.0
8+
rev: v6.0.0
99
hooks:
1010
- id: trailing-whitespace
1111
- id: check-merge-conflict
@@ -18,7 +18,7 @@ repos:
1818
- id: detect-private-key
1919

2020
- repo: https://github.com/adrienverge/yamllint.git
21-
rev: v1.32.0
21+
rev: v1.37.1
2222
hooks:
2323
- id: yamllint
2424
files: \.(yaml|yml)$

gpu-class/clusterqueue_rb.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Cluster-wide, read-only access to Kueue ClusterQueue objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kueue-clusterq-reader
rules:
  - apiGroups: ["kueue.x-k8s.io"]
    resources: ["clusterqueues"]
    verbs: ["get", "list", "watch"]
---
# Grants the ClusterRole above to the system:serviceaccounts group.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kueue-clusterq-reader
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  # Must equal metadata.name of the ClusterRole defined in this file;
  # a binding that names a non-existent role grants nothing.
  # NOTE: roleRef is immutable, so a binding that was already applied with
  # the old name has to be deleted before this one can be applied.
  name: kueue-clusterq-reader
subjects:
  - kind: Group
    name: system:serviceaccounts
    apiGroup: rbac.authorization.k8s.io

gpu-class/gpu-class-setup.sh

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/bin/bash
#
# Settings shared by the functions in this script.

# Prefix of the namespaces to provision ("<CLASS_NAME>-...").
CLASS_NAME="csw991"

# Command that receives processed template output on stdin.
create_resource_command=(oc create -f -)

# Dashboard URL handed to the notebook template as OPENSHIFT_URL.
openshift_url="https://rhods-dashboard-redhat-ods-applications.apps.ocp-test.nerc.mghpcc.org/projects/ope-test?section=workbenches"

# split openshift url to provide as parameters
# (strip the shortest trailing "/projects..." part, leaving the dashboard host URL)
host="${openshift_url%/projects*}"
hub_host="$host"

# Values passed to the notebook template as RUN_NAME and IMAGE_NAME.
run_name="gpu_class_test"
image_name="csw-dev-f25"
12+
13+
# Create a workbench from notebook_resource.yaml for one namespace.
#
# Arguments:
#   $1 - target namespace; the username is taken from the second
#        '-'-separated field of this name.
# Outputs:
#   stdout - the generated notebook name only. Output of the create command
#            is redirected to stderr so callers can capture the name with
#            $(create_wb ...).
# Returns:
#   0 when both template processing and resource creation succeeded,
#   1 otherwise (the name is still printed, as before, so existing callers
#   that ignore the status see unchanged output).
create_wb() {
    local namespace=$1
    local random_id username notebook_name
    local -a params pipeline_status

    # random 6-hex-digit suffix appended to the notebook name
    random_id=$(openssl rand -hex 3)

    # get student username from namespace
    username=$(echo "$namespace" | awk -F'-' '{print $2}')

    # give notebook within namespace a name (username lower-cased)
    notebook_name=${username,,}-${random_id}

    params=(
        -p NOTEBOOK_NAME="$notebook_name"
        -p RUN_NAME="$run_name"
        -p USERNAME="$username"
        -p NAMESPACE="$namespace"
        -p IMAGE_NAME="$image_name"
        -p OPENSHIFT_URL="$openshift_url"
        -p HUB_HOST="$hub_host"
    )

    oc process -f notebook_resource.yaml --local "${params[@]}" | "${create_resource_command[@]}" 1>&2
    # Capture both pipeline stages immediately: PIPESTATUS is overwritten by
    # the next command, and $? alone would only report the last stage.
    pipeline_status=("${PIPESTATUS[@]}")

    echo "$notebook_name"

    if (( pipeline_status[0] != 0 || pipeline_status[1] != 0 )); then
        echo "create_wb: failed to create workbench '$notebook_name' in namespace '$namespace'" >&2
        return 1
    fi
}
39+
40+
# Create the LocalQueue objects from localqueue.yaml in one namespace.
#
# Arguments:
#   $1 - namespace passed to the template as NAMESPACE.
# Outputs:
#   Output of the create command is redirected to stderr.
apply_localqueue() {
    # local: do not overwrite a caller's global "namespace"
    local namespace=$1
    local -a local_params

    local_params=(
        -p NAMESPACE="$namespace"
    )

    # --local processes the template client-side, matching create_wb.
    oc process -f localqueue.yaml --local "${local_params[@]}" | "${create_resource_command[@]}" --as system:admin 1>&2
}
49+
50+
# Create the resources from rb.yaml for one notebook's service account.
#
# Arguments:
#   $1 - namespace passed to the template as NAMESPACE.
#   $2 - notebook name passed to the template as SERVICE_ACCOUNT_NB.
# Returns:
#   1 without calling oc when either argument is empty; otherwise the
#   status of the process/create pipeline.
apply_rolebinding() {
    # local: do not overwrite the caller's globals of the same names
    local namespace=$1
    local notebook_name=$2
    local -a rb_params

    # An empty value here would mean the workbench name was never produced.
    if [ -z "$namespace" ] || [ -z "$notebook_name" ]; then
        echo "apply_rolebinding: namespace and notebook name are required" >&2
        return 1
    fi

    rb_params=(
        -p NAMESPACE="$namespace"
        -p SERVICE_ACCOUNT_NB="$notebook_name"
    )

    # --local processes the template client-side, matching create_wb.
    oc process -f rb.yaml --local "${rb_params[@]}" | "${create_resource_command[@]}" --as system:admin
}
62+
63+
# Apply clusterqueue_rb.yaml while impersonating system:admin.
# Takes no arguments.
apply_clusterq() {
    oc apply --as system:admin -f clusterqueue_rb.yaml
}
67+
68+
# Apply the cluster-scoped RBAC once, before touching any namespace.
apply_clusterq

# Provision every namespace whose name starts with "<CLASS_NAME>-".
# read -r keeps backslashes in the input literal.
oc get ns | grep "^${CLASS_NAME}-" | awk '{print $1}' | while read -r ns; do
    # Resources created without an explicit namespace land in the current
    # project, so do not provision anything if the switch failed.
    if ! oc project "$ns"; then
        echo "skipping '$ns': could not switch to project" >&2
        continue
    fi

    # create a workbench and save the name of the notebook to apply rolebindings
    if nb_name="$(create_wb "$ns")" && [ -n "$nb_name" ]; then
        apply_rolebinding "$ns" "$nb_name"
    else
        echo "skipping rolebinding for '$ns': workbench creation failed" >&2
    fi

    # The local queues do not depend on the workbench, so always apply them.
    apply_localqueue "$ns"
done

gpu-class/localqueue.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Template producing three Kueue LocalQueues in ${NAMESPACE}, one per
# ClusterQueue (v100, a100, h100).
apiVersion: template.openshift.io/v1
kind: Template
metadata:
  name: localqueue
parameters:
  # Namespace in which all three LocalQueues are created.
  - name: NAMESPACE
    required: true
objects:
  # v100
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: v100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: v100-clusterqueue

  # a100
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: a100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: a100-clusterqueue

  # h100
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: h100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: h100-clusterqueue

gpu-class/notebook_resource.yaml

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
# Template producing one Kubeflow Notebook (a notebook container plus an
# oauth-proxy container) and the PersistentVolumeClaim it mounts.
apiVersion: template.openshift.io/v1
kind: Template
parameters:
  # Name shared by the Notebook, its main container, PVC and service account.
  - name: NOTEBOOK_NAME
    required: true
  # Value of the "ope-run" label on both objects.
  - name: RUN_NAME
    required: true
  - name: USERNAME
    required: true
  - name: IMAGE_NAME
    required: true
  # Namespace the objects are created in; also used in URLs and probe paths.
  - name: NAMESPACE
    required: true
  - name: OPENSHIFT_URL
    required: true
  - name: IMAGE_REPO
    required: true
    value: "image-registry.openshift-image-registry.svc:5000/redhat-ods-applications"
  - name: HUB_HOST
    required: true
  - name: PVC_SIZE
    required: true
    value: "20Gi"
  # Optional; when omitted, --ServerApp.token is rendered with an empty value.
  - name: TOKEN
    required: false
objects:
  - apiVersion: kubeflow.org/v1beta1
    kind: Notebook
    metadata:
      annotations:
        notebooks.opendatahub.io/inject-oauth: 'true'
        notebooks.opendatahub.io/last-image-selection: ${IMAGE_NAME}
        notebooks.opendatahub.io/last-size-selection: Small
        notebooks.opendatahub.io/oauth-logout-url: >-
          ${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
        opendatahub.io/username: ${USERNAME}
        openshift.io/description: ''
        openshift.io/display-name: ${NOTEBOOK_NAME}
      name: ${NOTEBOOK_NAME}
      # Explicit namespace so creation does not depend on the caller's
      # currently selected project.
      namespace: ${NAMESPACE}
      labels:
        ope-run: ${RUN_NAME}
        app: ${NOTEBOOK_NAME}
        opendatahub.io/dashboard: 'true'
        opendatahub.io/odh-managed: 'true'
        opendatahub.io/user: ${USERNAME}
    spec:
      template:
        spec:
          containers:
            # Notebook server container.
            - resources:
                limits:
                  cpu: '2'
                  memory: 8Gi
                requests:
                  cpu: '1'
                  memory: 8Gi
              readinessProbe:
                failureThreshold: 3
                httpGet:
                  path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
                  port: notebook-port
                  scheme: HTTP
                initialDelaySeconds: 10
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              name: ${NOTEBOOK_NAME}
              livenessProbe:
                failureThreshold: 3
                httpGet:
                  path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
                  port: notebook-port
                  scheme: HTTP
                initialDelaySeconds: 10
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              env:
                # base_url must stay in sync with the probe paths above.
                - name: NOTEBOOK_ARGS
                  value: |-
                    --ServerApp.port=8888
                    --ServerApp.token=${TOKEN}
                    --ServerApp.password=''
                    --ServerApp.base_url=/notebook/${NAMESPACE}/${NOTEBOOK_NAME}
                    --ServerApp.quit_button=False
                    --ServerApp.tornado_settings={"user":"${USERNAME}","hub_host":"${HUB_HOST}","hub_prefix":"projects/${NAMESPACE}"}
                - name: JUPYTER_IMAGE
                  value: >-
                    ${IMAGE_REPO}/${IMAGE_NAME}
              ports:
                - containerPort: 8888
                  name: notebook-port
                  protocol: TCP
              imagePullPolicy: Always
              volumeMounts:
                - mountPath: /opt/app-root/src
                  name: ${NOTEBOOK_NAME}
                - mountPath: /dev/shm
                  name: shm
              image: >-
                ${IMAGE_REPO}/${IMAGE_NAME}
              workingDir: /opt/app-root/src
            # OAuth proxy sidecar; forwards to the notebook on localhost:8888.
            - resources:
                limits:
                  cpu: 100m
                  memory: 64Mi
                requests:
                  cpu: 100m
                  memory: 64Mi
              readinessProbe:
                failureThreshold: 3
                httpGet:
                  path: /oauth/healthz
                  port: oauth-proxy
                  scheme: HTTPS
                initialDelaySeconds: 5
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              name: oauth-proxy
              livenessProbe:
                failureThreshold: 3
                httpGet:
                  path: /oauth/healthz
                  port: oauth-proxy
                  scheme: HTTPS
                initialDelaySeconds: 30
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              env:
                # Referenced as $(NAMESPACE) in the --openshift-sar arg below.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.namespace
              ports:
                - containerPort: 8443
                  name: oauth-proxy
                  protocol: TCP
              imagePullPolicy: Always
              volumeMounts:
                - mountPath: /etc/oauth/config
                  name: oauth-config
                - mountPath: /etc/tls/private
                  name: tls-certificates
              image: >-
                registry.redhat.io/openshift4/ose-oauth-proxy@sha256:4bef31eb993feb6f1096b51b4876c65a6fb1f4401fee97fa4f4542b6b7c9bc46
              args:
                - '--provider=openshift'
                - '--https-address=:8443'
                - '--http-address='
                - '--openshift-service-account=${NOTEBOOK_NAME}'
                - '--cookie-secret-file=/etc/oauth/config/cookie_secret'
                - '--cookie-expire=24h0m0s'
                - '--tls-cert=/etc/tls/private/tls.crt'
                - '--tls-key=/etc/tls/private/tls.key'
                - '--upstream=http://localhost:8888'
                - '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
                - '--email-domain=*'
                - '--skip-provider-button'
                # $(NAMESPACE) is not a template parameter reference; it is
                # left as-is for the container env var defined above.
                - >-
                  --openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"${NOTEBOOK_NAME}","namespace":"$(NAMESPACE)"}
                # Same URL as the oauth-logout-url annotation, including the
                # "/" between ${OPENSHIFT_URL} and ${NAMESPACE}.
                - >-
                  --logout-url=${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
          enableServiceLinks: false
          serviceAccountName: ${NOTEBOOK_NAME}
          volumes:
            - name: ${NOTEBOOK_NAME}
              persistentVolumeClaim:
                claimName: ${NOTEBOOK_NAME}
            - emptyDir:
                medium: Memory
              name: shm
            - name: oauth-config
              secret:
                # 420 decimal == 0644 octal
                defaultMode: 420
                secretName: ${NOTEBOOK_NAME}-oauth-config
            - name: tls-certificates
              secret:
                defaultMode: 420
                secretName: ${NOTEBOOK_NAME}-tls
  # Storage mounted at /opt/app-root/src in the notebook container.
  - apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: ${NOTEBOOK_NAME}
      namespace: ${NAMESPACE}
      labels:
        app: ${NOTEBOOK_NAME}
        notebook-name: ${NOTEBOOK_NAME}
        ope-run: ${RUN_NAME}
        opendatahub.io/dashboard: 'true'
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: "${PVC_SIZE}"

0 commit comments

Comments
 (0)