27 changes: 24 additions & 3 deletions .github/actions/kfp-cluster/action.yml
@@ -30,6 +30,15 @@ inputs:
image_registry:
required: true
description: "Image Registry address of the images"
multi_user:
description: "If KFP should be deployed in multi-user mode"
required: false
default: 'false'
storage_backend:
description: "Storage backend to use (minio or seaweedfs)"
required: false
default: 'seaweedfs'

runs:
using: "composite"
steps:
@@ -68,10 +77,22 @@ runs:

if [ "${{ inputs.proxy }}" = "true" ]; then
ARGS="${ARGS} --proxy"
elif [ "${{inputs.cache_enabled }}" = "false" ]; then
fi

if [ "${{inputs.cache_enabled }}" = "false" ]; then
ARGS="${ARGS} --cache-disabled"
elif [ "${{inputs.pipeline_store }}" = "kubernetes" ]; then
ARGS="${ARGS} --deploy-k8s-native"
fi

if [ "${{inputs.pipeline_store }}" = "kubernetes" ]; then
ARGS="${ARGS} --deploy-k8s-native"
fi

if [ "${{ inputs.multi_user }}" = "true" ]; then
ARGS="${ARGS} --multi-user"
fi

if [ "${{ inputs.storage_backend }}" != "seaweedfs" ]; then
ARGS="${ARGS} --storage ${{ inputs.storage_backend }}"
fi

./.github/resources/scripts/deploy-kfp.sh $ARGS
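As a rough illustration of how the step above maps inputs to flags (the input values here are hypothetical, not defaults from this PR except where noted), a run with multi_user set to "true" and storage_backend set to "minio" would assemble something like:

```bash
# Sketch only: assumes multi_user=true and storage_backend=minio, with every
# other input left at its default, so no additional flags are appended.
ARGS=""
ARGS="${ARGS} --multi-user"      # appended because multi_user is "true"
ARGS="${ARGS} --storage minio"   # appended only because the backend differs from the seaweedfs default
./.github/resources/scripts/deploy-kfp.sh $ARGS
```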
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-pipeline
spec:
  template:
    spec:
      containers:
      - name: ml-pipeline-api-server
        env:
        - name: V2_DRIVER_IMAGE
          value: kind-registry:5000/driver
        - name: V2_LAUNCHER_IMAGE
          value: kind-registry:5000/launcher
        - name: LOG_LEVEL
          value: "debug"
@@ -0,0 +1,19 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- ../../../../../../manifests/kustomize/env/platform-agnostic-multi-user-minio

images:
- name: ghcr.io/kubeflow/kfp-api-server
  newName: kind-registry:5000/apiserver
  newTag: latest
- name: ghcr.io/kubeflow/kfp-persistence-agent
  newName: kind-registry:5000/persistenceagent
  newTag: latest
- name: ghcr.io/kubeflow/kfp-scheduled-workflow-controller
  newName: kind-registry:5000/scheduledworkflow
  newTag: latest

patchesStrategicMerge:
- apiserver-env.yaml
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-pipeline
spec:
  template:
    spec:
      containers:
      - name: ml-pipeline-api-server
        env:
        - name: V2_DRIVER_IMAGE
          value: kind-registry:5000/driver
        - name: V2_LAUNCHER_IMAGE
          value: kind-registry:5000/launcher
        - name: LOG_LEVEL
          value: "debug"
@@ -0,0 +1,19 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- ../../../../../../manifests/kustomize/env/platform-agnostic-multi-user

images:
- name: ghcr.io/kubeflow/kfp-api-server
  newName: kind-registry:5000/apiserver
  newTag: latest
- name: ghcr.io/kubeflow/kfp-persistence-agent
  newName: kind-registry:5000/persistenceagent
  newTag: latest
- name: ghcr.io/kubeflow/kfp-scheduled-workflow-controller
  newName: kind-registry:5000/scheduledworkflow
  newTag: latest

patchesStrategicMerge:
- apiserver-env.yaml
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-pipeline
spec:
  template:
    spec:
      containers:
      - name: ml-pipeline-api-server
        env:
        - name: V2_DRIVER_IMAGE
          value: kind-registry:5000/driver
        - name: V2_LAUNCHER_IMAGE
          value: kind-registry:5000/launcher
        - name: LOG_LEVEL
          value: "debug"
@@ -0,0 +1,19 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- ../../../../../../manifests/kustomize/env/platform-agnostic-minio

images:
- name: ghcr.io/kubeflow/kfp-api-server
  newName: kind-registry:5000/apiserver
  newTag: latest
- name: ghcr.io/kubeflow/kfp-persistence-agent
  newName: kind-registry:5000/persistenceagent
  newTag: latest
- name: ghcr.io/kubeflow/kfp-scheduled-workflow-controller
  newName: kind-registry:5000/scheduledworkflow
  newTag: latest

patches:
- path: apiserver-env.yaml
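To sanity-check any of the new overlays locally before CI runs them, they can be rendered or applied with kustomize. The paths below are assumptions inferred from the TEST_MANIFESTS layout used by deploy-kfp.sh (.github/resources/manifests/argo/overlays/...); adjust them if the overlays live elsewhere:

```bash
# Render the multi-user + MinIO overlay to stdout for inspection (no cluster needed).
kubectl kustomize .github/resources/manifests/argo/overlays/multi-user-minio

# Apply an overlay against a kind cluster that already serves kind-registry:5000.
kubectl apply -k .github/resources/manifests/argo/overlays/multi-user
kubectl apply -k .github/resources/manifests/argo/overlays/no-proxy-minio
```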
@@ -17,4 +17,3 @@ images:

patches:
- path: apiserver-env.yaml
- path: workflow-disable-logs-patch.yaml

This file was deleted.

@@ -13,4 +13,6 @@ spec:
- name: HTTPS_PROXY
value: "http://squid.squid.svc.cluster.local:3128"
- name: NO_PROXY
value: "localhost,127.0.0.1,.svc.cluster.local,kubernetes.default.svc,metadata-grpc-service,0,1,2,3,4,5,6,7,8,9"
value: "localhost,127.0.0.1,.svc.cluster.local,kubernetes.default.svc,minio-service.kubeflow,metadata-grpc-service,metadata-grpc-service.kubeflow,ml-pipeline.kubeflow"
- name: OBJECTSTORECONFIG_HOST
value: "minio-service.kubeflow.svc.cluster.local"
69 changes: 66 additions & 3 deletions .github/resources/scripts/deploy-kfp.sh
@@ -28,6 +28,8 @@ TEST_MANIFESTS=".github/resources/manifests/argo"
PIPELINES_STORE="database"
USE_PROXY=false
CACHE_DISABLED=false
MULTI_USER=false
STORAGE_BACKEND="seaweedfs"

# Loop over script arguments passed. This uses a single switch-case
# block with default value in case we want to make alternative deployments
@@ -46,6 +48,14 @@ while [ "$#" -gt 0 ]; do
CACHE_DISABLED=true
shift
;;
--multi-user)
MULTI_USER=true
shift
;;
--storage)
STORAGE_BACKEND="$2"
shift 2
;;
esac
done

@@ -54,10 +64,19 @@ if [ "${USE_PROXY}" == "true" ] && [ "${PIPELINES_STORE}" == "kubernetes" ]; the
exit 1
fi

kubectl apply -k "manifests/kustomize/cluster-scoped-resources/"
if [ "${MULTI_USER}" == "true" ] && [ "${USE_PROXY}" == "true" ]; then
echo "ERROR: Multi-user mode cannot be deployed with proxy support."
exit 1
fi

if [ "${STORAGE_BACKEND}" != "minio" ] && [ "${STORAGE_BACKEND}" != "seaweedfs" ]; then
echo "ERROR: Storage backend must be either 'minio' or 'seaweedfs'."
exit 1
fi

kubectl apply -k "manifests/kustomize/cluster-scoped-resources/" || EXIT_CODE=$?
kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
if [[ $EXIT_CODE -ne 0 ]]; then
echo "Failed to deploy cluster-scoped resources."
exit $EXIT_CODE
fi
@@ -73,13 +92,47 @@ if [ "${PIPELINES_STORE}" == "kubernetes" ]; then
fi
fi


# Deploy multi-user prerequisites if multi-user mode is enabled
if [ "${MULTI_USER}" == "true" ]; then
echo "Installing Istio..."
kubectl apply -k https://github.com/kubeflow/manifests//common/istio/istio-crds/base?ref=master
kubectl apply -k https://github.com/kubeflow/manifests//common/istio/istio-namespace/base?ref=master
kubectl apply -k https://github.com/kubeflow/manifests//common/istio/istio-install/base?ref=master
echo "Waiting for all Istio Pods to become ready..."
kubectl wait --for=condition=Ready pods --all -n istio-system --timeout=300s

echo "Deploying Metacontroller CRD..."
kubectl apply -f manifests/kustomize/third-party/metacontroller/base/crd.yaml
kubectl wait --for condition=established --timeout=30s crd/compositecontrollers.metacontroller.k8s.io

echo "Installing Profile Controller Resources..."
kubectl apply -k https://github.com/kubeflow/manifests/applications/profiles/upstream/overlays/kubeflow?ref=master
kubectl -n kubeflow wait --for=condition=Ready pods -l kustomize.component=profiles --timeout 180s

echo "Creating KF Profile..."
kubectl apply -f test/seaweedfs/test-profiles.yaml

echo "Applying kubeflow-edit ClusterRole with proper aggregation..."
kubectl apply -f test/seaweedfs/kubeflow-edit-clusterrole.yaml

echo "Applying network policy to allow user namespace access to kubeflow services..."
kubectl apply -f test/seaweedfs/allow-user-namespace-access.yaml
fi

# Manifests will be deployed according to the flag provided
if $CACHE_DISABLED; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/cache-disabled"
elif $USE_PROXY; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/proxy"
elif [ "${PIPELINES_STORE}" == "kubernetes" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/kubernetes-native"
elif [ "${MULTI_USER}" == "true" ] && [ "${STORAGE_BACKEND}" == "seaweedfs" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/multi-user"
elif [ "${MULTI_USER}" == "true" ] && [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/multi-user-minio"
elif [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/no-proxy-minio"
else
TEST_MANIFESTS="${TEST_MANIFESTS}/overlays/no-proxy"
fi
@@ -101,6 +154,16 @@
exit 1
fi

# Verify pipeline integration for multi-user mode
if [ "${MULTI_USER}" == "true" ]; then
echo "Verifying Pipeline Integration..."
KF_PROFILE=kubeflow-user-example-com
if ! kubectl get secret mlpipeline-minio-artifact -n $KF_PROFILE > /dev/null 2>&1; then
echo "Error: Secret mlpipeline-minio-artifact not found in namespace $KF_PROFILE"
fi
kubectl get secret mlpipeline-minio-artifact -n "$KF_PROFILE" -o json | jq -r '.data | keys[] as $k | "\($k): \(. | .[$k] | @base64d)"' | tr '\n' ' '
fi

collect_artifacts kubeflow

echo "Finished KFP deployment."
49 changes: 49 additions & 0 deletions .github/resources/scripts/free-disk-space.sh
@@ -0,0 +1,49 @@
#!/bin/bash
set -euo pipefail

# This script frees up disk space on GitHub Actions runners.
# Several GHA workflows were failing with "no space left on device" errors.
# This script is only meant to run in GitHub Actions CI environment.

# Safety check: Only run on GitHub Actions
if [[ "${GITHUB_ACTIONS:-false}" != "true" ]]; then
echo "ERROR: This script is for GitHub Actions runners only!"
exit 1
fi

echo "=== Initial disk usage ==="
df -h

echo "=== Freeing up disk space ==="

# Remove large directories not needed for KFP tests
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /usr/share/swift

# Selectively remove large tools from hostedtoolcache while preserving Go, Node, Python
# Remove these specific large tools that aren't needed for KFP tests
sudo rm -rf /opt/hostedtoolcache/CodeQL || true
sudo rm -rf /opt/hostedtoolcache/Java_* || true
sudo rm -rf /opt/hostedtoolcache/Ruby || true
sudo rm -rf /opt/hostedtoolcache/PyPy || true
sudo rm -rf /opt/hostedtoolcache/boost || true

# Clean package manager
sudo apt-get autoremove -y
sudo apt-get autoclean

# Clean Docker
docker system prune -af --volumes
docker image prune -af

# Clean containerd
sudo systemctl stop containerd || true
sudo rm -rf /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/* || true
sudo systemctl start containerd || true

echo "=== Final disk usage ==="
df -h
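A minimal sketch of how this cleanup script would be wired into a workflow, and of the guard behaviour outside CI (the step name is illustrative, not part of this PR):

```bash
# In a GitHub Actions job this would typically run as an early step, e.g.:
#   - name: Free up disk space
#     run: ./.github/resources/scripts/free-disk-space.sh
#
# Outside of Actions the GITHUB_ACTIONS guard refuses to run:
GITHUB_ACTIONS=false ./.github/resources/scripts/free-disk-space.sh  # prints the error and exits 1
```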
17 changes: 14 additions & 3 deletions .github/resources/scripts/kfp-readiness/wait_for_pods.py
@@ -49,8 +49,17 @@ def get_pod_statuses():


def all_pods_ready(statuses):
return all(pod_status == 'Running' and ready == total
for pod_status, ready, total, _ in statuses.values())
def is_pod_ready(pod_status, ready, total):
# Jobs/CronJobs are ready when they succeed
if pod_status == 'Succeeded':
return True
# Regular pods are ready when running and all containers are ready
if pod_status == 'Running' and ready == total:
return True
return False

return all(is_pod_ready(pod_status, ready, total)
for _, (pod_status, ready, total, _) in statuses.items())


def print_get_pods():
Expand Down Expand Up @@ -107,7 +116,9 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):

logging.info("Final pod statuses:")
for pod_name, (pod_status, ready, total, _) in previous_statuses.items():
if pod_status == 'Running' and ready == total:
if pod_status == 'Succeeded':
logging.info(f"Pod {pod_name} completed successfully (Job/CronJob)")
elif pod_status == 'Running' and ready == total:
logging.info(f"Pod {pod_name} is fully ready ({ready}/{total})")
else:
logging.info(f"Pod {pod_name} is not ready (Status: {pod_status}, Ready: {ready}/{total})")