27 changes: 24 additions & 3 deletions .github/actions/kfp-cluster/action.yml
@@ -30,6 +30,15 @@ inputs:
image_registry:
required: true
description: "Image Registry address of the images"
multi_user:
description: "If KFP should be deployed in multi-user mode"
required: false
default: 'false'
storage_backend:
description: "Storage backend to use (minio or seaweedfs)"
required: false
default: 'seaweedfs'

runs:
using: "composite"
steps:
@@ -68,10 +77,22 @@ runs:

if [ "${{ inputs.proxy }}" = "true" ]; then
ARGS="${ARGS} --proxy"
elif [ "${{inputs.cache_enabled }}" = "false" ]; then
fi

if [ "${{inputs.cache_enabled }}" = "false" ]; then
ARGS="${ARGS} --cache-disabled"
elif [ "${{inputs.pipeline_store }}" = "kubernetes" ]; then
ARGS="${ARGS} --deploy-k8s-native"
fi

if [ "${{inputs.pipeline_store }}" = "kubernetes" ]; then
ARGS="${ARGS} --deploy-k8s-native"
fi

if [ "${{ inputs.multi_user }}" = "true" ]; then
ARGS="${ARGS} --multi-user"
fi

if [ "${{ inputs.storage_backend }}" != "seaweedfs" ]; then
ARGS="${ARGS} --storage ${{ inputs.storage_backend }}"
fi

./.github/resources/scripts/deploy-kfp.sh $ARGS
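As a rough illustration of how the step above maps inputs to flags (the input values here are hypothetical, not defaults from this PR except where noted), a run with multi_user set to "true" and storage_backend set to "minio" would assemble something like:

```bash
# Sketch only: assumes multi_user=true and storage_backend=minio, with every
# other input left at its default, so no additional flags are appended.
ARGS=""
ARGS="${ARGS} --multi-user"      # appended because multi_user is "true"
ARGS="${ARGS} --storage minio"   # appended only because the backend differs from the seaweedfs default
./.github/resources/scripts/deploy-kfp.sh $ARGS
```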
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-pipeline
spec:
  template:
    spec:
      containers:
      - name: ml-pipeline-api-server
        env:
        - name: V2_DRIVER_IMAGE
          value: kind-registry:5000/driver
        - name: V2_LAUNCHER_IMAGE
          value: kind-registry:5000/launcher
        - name: LOG_LEVEL
          value: "debug"
@@ -0,0 +1,19 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- ../../../../../../manifests/kustomize/env/platform-agnostic-multi-user-minio

images:
- name: ghcr.io/kubeflow/kfp-api-server
  newName: kind-registry:5000/apiserver
  newTag: latest
- name: ghcr.io/kubeflow/kfp-persistence-agent
  newName: kind-registry:5000/persistenceagent
  newTag: latest
- name: ghcr.io/kubeflow/kfp-scheduled-workflow-controller
  newName: kind-registry:5000/scheduledworkflow
  newTag: latest

patchesStrategicMerge:
- apiserver-env.yaml
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-pipeline
spec:
  template:
    spec:
      containers:
      - name: ml-pipeline-api-server
        env:
        - name: V2_DRIVER_IMAGE
          value: kind-registry:5000/driver
        - name: V2_LAUNCHER_IMAGE
          value: kind-registry:5000/launcher
        - name: LOG_LEVEL
          value: "debug"
@@ -0,0 +1,19 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- ../../../../../../manifests/kustomize/env/platform-agnostic-multi-user

images:
- name: ghcr.io/kubeflow/kfp-api-server
  newName: kind-registry:5000/apiserver
  newTag: latest
- name: ghcr.io/kubeflow/kfp-persistence-agent
  newName: kind-registry:5000/persistenceagent
  newTag: latest
- name: ghcr.io/kubeflow/kfp-scheduled-workflow-controller
  newName: kind-registry:5000/scheduledworkflow
  newTag: latest

patchesStrategicMerge:
- apiserver-env.yaml
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-pipeline
spec:
  template:
    spec:
      containers:
      - name: ml-pipeline-api-server
        env:
        - name: V2_DRIVER_IMAGE
          value: kind-registry:5000/driver
        - name: V2_LAUNCHER_IMAGE
          value: kind-registry:5000/launcher
        - name: LOG_LEVEL
          value: "debug"
@@ -0,0 +1,19 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- ../../../../../../manifests/kustomize/env/platform-agnostic-minio

images:
- name: ghcr.io/kubeflow/kfp-api-server
  newName: kind-registry:5000/apiserver
  newTag: latest
- name: ghcr.io/kubeflow/kfp-persistence-agent
  newName: kind-registry:5000/persistenceagent
  newTag: latest
- name: ghcr.io/kubeflow/kfp-scheduled-workflow-controller
  newName: kind-registry:5000/scheduledworkflow
  newTag: latest

patches:
- path: apiserver-env.yaml
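To sanity-check any of the new overlays locally before CI runs them, they can be rendered or applied with kustomize. The paths below are assumptions inferred from the TEST_MANIFESTS layout used by deploy-kfp.sh (.github/resources/manifests/argo/overlays/...); adjust them if the overlays live elsewhere:

```bash
# Render the multi-user + MinIO overlay to stdout for inspection (no cluster needed).
kubectl kustomize .github/resources/manifests/argo/overlays/multi-user-minio

# Apply an overlay against a kind cluster that already serves kind-registry:5000.
kubectl apply -k .github/resources/manifests/argo/overlays/multi-user
kubectl apply -k .github/resources/manifests/argo/overlays/no-proxy-minio
```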
@@ -17,4 +17,3 @@ images:

patches:
- path: apiserver-env.yaml
- path: workflow-disable-logs-patch.yaml

This file was deleted.

@@ -13,4 +13,6 @@ spec:
- name: HTTPS_PROXY
value: "http://squid.squid.svc.cluster.local:3128"
- name: NO_PROXY
value: "localhost,127.0.0.1,.svc.cluster.local,kubernetes.default.svc,metadata-grpc-service,0,1,2,3,4,5,6,7,8,9"
value: "localhost,127.0.0.1,.svc.cluster.local,kubernetes.default.svc,minio-service.kubeflow,metadata-grpc-service,metadata-grpc-service.kubeflow,ml-pipeline.kubeflow"
- name: OBJECTSTORECONFIG_HOST
value: "minio-service.kubeflow.svc.cluster.local"
69 changes: 66 additions & 3 deletions .github/resources/scripts/deploy-kfp.sh
@@ -28,6 +28,8 @@ TEST_MANIFESTS=".github/resources/manifests/argo"
PIPELINES_STORE="database"
USE_PROXY=false
CACHE_DISABLED=false
MULTI_USER=false
STORAGE_BACKEND="seaweedfs"

# Loop over script arguments passed. This uses a single switch-case
# block with default value in case we want to make alternative deployments
@@ -46,6 +48,14 @@ while [ "$#" -gt 0 ]; do
CACHE_DISABLED=true
shift
;;
--multi-user)
MULTI_USER=true
shift
;;
--storage)
STORAGE_BACKEND="$2"
shift 2
;;
esac
done

@@ -54,10 +64,19 @@ if [ "${USE_PROXY}" == "true" ] && [ "${PIPELINES_STORE}" == "kubernetes" ]; the
exit 1
fi

kubectl apply -k "manifests/kustomize/cluster-scoped-resources/"
if [ "${MULTI_USER}" == "true" ] && [ "${USE_PROXY}" == "true" ]; then
echo "ERROR: Multi-user mode cannot be deployed with proxy support."
exit 1
fi

if [ "${STORAGE_BACKEND}" != "minio" ] && [ "${STORAGE_BACKEND}" != "seaweedfs" ]; then
echo "ERROR: Storage backend must be either 'minio' or 'seaweedfs'."
exit 1
fi

kubectl apply -k "manifests/kustomize/cluster-scoped-resources/" || EXIT_CODE=$?
kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
if [[ $EXIT_CODE -ne 0 ]]; then
echo "Failed to deploy cluster-scoped resources."
exit $EXIT_CODE
fi
@@ -73,13 +92,47 @@ if [ "${PIPELINES_STORE}" == "kubernetes" ]; then
fi
fi


# Deploy multi-user prerequisites if multi-user mode is enabled
if [ "${MULTI_USER}" == "true" ]; then
echo "Installing Istio..."
kubectl apply -k https://github.com/kubeflow/manifests//common/istio/istio-crds/base?ref=master
kubectl apply -k https://github.com/kubeflow/manifests//common/istio/istio-namespace/base?ref=master
kubectl apply -k https://github.com/kubeflow/manifests//common/istio/istio-install/base?ref=master
echo "Waiting for all Istio Pods to become ready..."
kubectl wait --for=condition=Ready pods --all -n istio-system --timeout=300s

echo "Deploying Metacontroller CRD..."
kubectl apply -f manifests/kustomize/third-party/metacontroller/base/crd.yaml
kubectl wait --for condition=established --timeout=30s crd/compositecontrollers.metacontroller.k8s.io

echo "Installing Profile Controller Resources..."
kubectl apply -k https://github.com/kubeflow/manifests/applications/profiles/upstream/overlays/kubeflow?ref=master
kubectl -n kubeflow wait --for=condition=Ready pods -l kustomize.component=profiles --timeout 180s

echo "Creating KF Profile..."
kubectl apply -f test/seaweedfs/test-profiles.yaml

echo "Applying kubeflow-edit ClusterRole with proper aggregation..."
kubectl apply -f test/seaweedfs/kubeflow-edit-clusterrole.yaml

echo "Applying network policy to allow user namespace access to kubeflow services..."
kubectl apply -f test/seaweedfs/allow-user-namespace-access.yaml
fi

# Manifests will be deployed according to the flag provided
if $CACHE_DISABLED; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/cache-disabled"
elif $USE_PROXY; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/proxy"
elif [ "${PIPELINES_STORE}" == "kubernetes" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/kubernetes-native"
elif [ "${MULTI_USER}" == "true" ] && [ "${STORAGE_BACKEND}" == "seaweedfs" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/multi-user"
elif [ "${MULTI_USER}" == "true" ] && [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/multi-user-minio"
elif [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/no-proxy-minio"
else
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/no-proxy"
fi
@@ -101,6 +154,16 @@
exit 1
fi

# Verify pipeline integration for multi-user mode
if [ "${MULTI_USER}" == "true" ]; then
echo "Verifying Pipeline Integration..."
KF_PROFILE=kubeflow-user-example-com
if ! kubectl get secret mlpipeline-minio-artifact -n $KF_PROFILE > /dev/null 2>&1; then
echo "Error: Secret mlpipeline-minio-artifact not found in namespace $KF_PROFILE"
fi
kubectl get secret mlpipeline-minio-artifact -n "$KF_PROFILE" -o json | jq -r '.data | keys[] as $k | "\($k): \(. | .[$k] | @base64d)"' | tr '\n' ' '
fi

collect_artifacts kubeflow

echo "Finished KFP deployment."
49 changes: 49 additions & 0 deletions .github/resources/scripts/free-disk-space.sh
@@ -0,0 +1,49 @@
#!/bin/bash
set -euo pipefail

# This script frees up disk space on GitHub Actions runners.
# Several GHA workflows were failing with "no space left on device" errors.
# This script is only meant to run in GitHub Actions CI environment.

# Safety check: Only run on GitHub Actions
if [[ "${GITHUB_ACTIONS:-false}" != "true" ]]; then
echo "ERROR: This script is for GitHub Actions runners only!"
exit 1
fi

echo "=== Initial disk usage ==="
df -h

echo "=== Freeing up disk space ==="

# Remove large directories not needed for KFP tests
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /usr/share/swift

# Selectively remove large tools from hostedtoolcache while preserving Go, Node, Python
# Remove these specific large tools that aren't needed for KFP tests
sudo rm -rf /opt/hostedtoolcache/CodeQL || true
sudo rm -rf /opt/hostedtoolcache/Java_* || true
sudo rm -rf /opt/hostedtoolcache/Ruby || true
sudo rm -rf /opt/hostedtoolcache/PyPy || true
sudo rm -rf /opt/hostedtoolcache/boost || true

# Clean package manager
sudo apt-get autoremove -y
sudo apt-get autoclean

# Clean Docker
docker system prune -af --volumes
docker image prune -af

# Clean containerd
sudo systemctl stop containerd || true
sudo rm -rf /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/* || true
sudo systemctl start containerd || true

echo "=== Final disk usage ==="
df -h
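A minimal sketch of how this cleanup script would be wired into a workflow, and of the guard behaviour outside CI (the step name is illustrative, not part of this PR):

```bash
# In a GitHub Actions job this would typically run as an early step, e.g.:
#   - name: Free up disk space
#     run: ./.github/resources/scripts/free-disk-space.sh
#
# Outside of Actions the GITHUB_ACTIONS guard refuses to run:
GITHUB_ACTIONS=false ./.github/resources/scripts/free-disk-space.sh  # prints the error and exits 1
```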
17 changes: 14 additions & 3 deletions .github/resources/scripts/kfp-readiness/wait_for_pods.py
@@ -49,8 +49,17 @@ def get_pod_statuses():


def all_pods_ready(statuses):
return all(pod_status == 'Running' and ready == total
for pod_status, ready, total, _ in statuses.values())
def is_pod_ready(pod_status, ready, total):
# Jobs/CronJobs are ready when they succeed
if pod_status == 'Succeeded':
return True
# Regular pods are ready when running and all containers are ready
if pod_status == 'Running' and ready == total:
return True
return False

return all(is_pod_ready(pod_status, ready, total)
for _, (pod_status, ready, total, _) in statuses.items())


def print_get_pods():
Expand Down Expand Up @@ -107,7 +116,9 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):

logging.info("Final pod statuses:")
for pod_name, (pod_status, ready, total, _) in previous_statuses.items():
if pod_status == 'Running' and ready == total:
if pod_status == 'Succeeded':
logging.info(f"Pod {pod_name} completed successfully (Job/CronJob)")
elif pod_status == 'Running' and ready == total:
logging.info(f"Pod {pod_name} is fully ready ({ready}/{total})")
else:
logging.info(f"Pod {pod_name} is not ready (Status: {pod_status}, Ready: {ready}/{total})")