refactor: auto-scaling and observability components #290
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # CI workflow: quick lint/schema/unit checks, K3s-based integration and observability tests, and documentation validation. | |
| name: CI tests | |
| # Runs on pushes to main and on pull requests that target main. | |
| on: | |
| push: | |
| branches: | |
| - main | |
| pull_request: | |
| branches: | |
| - main | |
| # Tool versions referenced by the jobs below through the env context (${{ env.HELM_VERSION }}, ${{ env.PGO_VERSION }}). | |
| env: | |
| HELM_VERSION: v3.15.2 | |
| PGO_VERSION: 5.7.4 | |
| jobs: | |
| fast-checks: | |
| # Cluster-free checks: linters, Helm values schema validation and Helm unit tests. | |
| name: Simple tests | |
| runs-on: ubuntu-latest | |
| steps: | |
| - uses: actions/checkout@v5 | |
| # Node.js provides npm, which installs ajv-cli further down. | |
| - uses: actions/setup-node@v6 | |
| with: | |
| node-version: "24" | |
| # Helm version comes from the workflow-level HELM_VERSION variable. | |
| - name: Install Helm | |
| uses: azure/setup-helm@v4 | |
| with: | |
| version: ${{ env.HELM_VERSION }} | |
| - name: Setup Helm dependencies | |
| run: ./scripts/deploy.sh setup | |
| # Installed globally; presumably consumed by `make validate-schema` (the Makefile is not visible here). | |
| - name: Install ajv-cli | |
| run: npm install -g ajv-cli ajv-formats | |
| - name: Run linters | |
| run: make lint | |
| - name: Validate Helm values schema | |
| run: make validate-schema | |
| - name: Run Helm unit tests | |
| run: make tests | |
| integration-tests: | |
| # Deploys eoAPI into a throwaway K3s cluster and runs the integration suite against it. | |
| name: Integration tests | |
| # needs: fast-checks | |
| # NOTE(review): github.event.pull_request is only populated for pull_request events, so this condition looks false on pushes to main and the job would be skipped there — confirm that is intended. | |
| if: github.event.pull_request.head.repo.full_name == github.repository | |
| runs-on: ubuntu-latest | |
| steps: | |
| - uses: actions/checkout@v5 | |
| - name: Start K3s cluster | |
| uses: jupyterhub/action-k3s-helm@v4 | |
| with: | |
| k3s-channel: latest | |
| helm-version: ${{ env.HELM_VERSION }} | |
| metrics-enabled: false | |
| docker-enabled: true | |
| # Release name is "eoapi-" plus the first 8 characters of the commit SHA; GITHUB_ENV exposes it to later steps. | |
| - name: Set release name | |
| run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV" | |
| - name: Wait for K3s readiness | |
| run: | | |
| echo "=== Waiting for K3s cluster to be ready ===" | |
| # The K3s action has already configured the kubectl context; these calls only confirm it works | |
| kubectl cluster-info | |
| kubectl get nodes | |
| # Block until the cluster DNS and Traefik pods report Ready | |
| kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s | |
| kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s | |
| # Poll for the Traefik CRDs every 3s; the 300s budget is shared by both CRDs (the counter is not reset) | |
| budget=300; waited=0 | |
| for crd in "middlewares.traefik.io" "ingressroutes.traefik.io"; do | |
| until [ $waited -ge $budget ] || kubectl get crd "$crd" &>/dev/null; do | |
| sleep 3; waited=$((waited + 3)) | |
| done | |
| if [ $waited -ge $budget ]; then | |
| echo "❌ Timeout waiting for $crd" | |
| exit 1 | |
| fi | |
| done | |
| echo "✅ K3s cluster ready" | |
| - name: Deploy eoAPI | |
| id: deploy | |
| run: | | |
| echo "=== eoAPI Deployment ===" | |
| # Variables exported for scripts/deploy.sh | |
| export RELEASE_NAME="$RELEASE_NAME" | |
| export PGO_VERSION="${{ env.PGO_VERSION }}" | |
| export CI_MODE=true | |
| # Consolidated deploy script, invoked in CI mode | |
| ./scripts/deploy.sh --ci | |
| - name: Validate deployment | |
| run: | | |
| echo "=== Post-deployment validation ===" | |
| ./scripts/test.sh check-deployment | |
| - name: Run integration tests | |
| run: | | |
| export RELEASE_NAME="$RELEASE_NAME" | |
| ./scripts/test.sh integration --debug | |
| # Runs only when an earlier step failed. | |
| - name: Debug failed deployment | |
| if: failure() | |
| run: ./scripts/debug-deployment.sh | |
| # Best-effort teardown: runs regardless of earlier results and never fails the job. | |
| - name: Cleanup | |
| if: always() | |
| run: | | |
| helm uninstall "$RELEASE_NAME" -n eoapi || true | |
| kubectl delete namespace eoapi || true | |
| validate-docs: | |
| # Validates the Markdown under docs/: relative links, external links (advisory) and frontmatter. | |
| name: Validate documentation | |
| runs-on: ubuntu-latest | |
| steps: | |
| # Action versions aligned with the other jobs in this workflow. | |
| - uses: actions/checkout@v5 | |
| - uses: actions/setup-node@v6 | |
| with: | |
| node-version: '20' | |
| - name: Check internal links | |
| run: | | |
| broken=0 | |
| # Both loops read from process substitutions rather than pipes: a piped `while` runs in a | |
| # subshell, so `broken=1` was lost there and this step always exited 0. | |
| while IFS= read -r file; do | |
| while IFS=: read -r line link; do | |
| path=$(echo "$link" | sed -n 's/.*](\.\///; s/).*//p') | |
| # Drop any "#anchor" suffix so only the target file is tested for existence | |
| path="${path%%#*}" | |
| # Every "./" link is resolved against docs/ (the former images/ branch did exactly the same) | |
| full="docs/$path" | |
| if [[ ! -e "$full" ]]; then | |
| echo "❌ $file:$line -> $path" | |
| broken=1 | |
| fi | |
| done < <(grep -n "](\./" "$file" 2>/dev/null) | |
| done < <(find docs -name "*.md") | |
| exit "$broken" | |
| - name: Check external links | |
| run: | | |
| # TODO(review): the version pin was obfuscated to "[email protected]" in this copy of the workflow; | |
| # restore the exact markdown-link-check@<version> pin from the repository. | |
| npm install -g markdown-link-check | |
| echo '{"timeout":"10s","retryCount":2,"aliveStatusCodes":[200,301,302,403,999]}' > .mlc.json | |
| # Advisory only: `|| true` keeps dead external links from failing the job | |
| find docs -name "*.md" -exec timeout 30 markdown-link-check {} --config .mlc.json \; || true | |
| - name: Check frontmatter | |
| run: | | |
| missing=0 | |
| # Same subshell fix as the internal-link check: keep `missing` in the current shell | |
| while IFS= read -r file; do | |
| if ! head -1 "$file" | grep -q "^---$"; then | |
| echo "❌ Missing frontmatter: $file" | |
| missing=1 | |
| fi | |
| done < <(find docs -name "*.md" -not -path "docs/_includes/*") | |
| exit "$missing" | |
| observability-tests: | |
| # Deploys eoAPI with the monitoring stack into a throwaway K3s cluster and runs the observability and autoscaling tests. | |
| name: Observability tests | |
| # NOTE(review): github.event.pull_request is only populated for pull_request events, so this condition looks false on pushes to main and the job would be skipped there — confirm that is intended. | |
| if: github.event.pull_request.head.repo.full_name == github.repository | |
| # NOTE(review): id-token: write permits requesting an OIDC token; no step visible in this job uses one — confirm it is needed. | |
| permissions: | |
| contents: 'read' | |
| id-token: 'write' | |
| # needs: integration-tests | |
| runs-on: ubuntu-latest | |
| steps: | |
| - uses: actions/checkout@v5 | |
| - name: Start K3s cluster | |
| uses: jupyterhub/action-k3s-helm@v4 | |
| with: | |
| k3s-channel: latest | |
| helm-version: ${{ env.HELM_VERSION }} | |
| metrics-enabled: false | |
| docker-enabled: true | |
| # Release name is "eoapi-" plus the first 8 characters of the commit SHA; GITHUB_ENV exposes it to later steps. | |
| - name: Set release name | |
| run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV" | |
| - name: Wait for K3s to be fully ready | |
| run: | | |
| echo "=== Waiting for K3s to be fully ready ===" | |
| kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s | |
| kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s | |
| kubectl get nodes | |
| kubectl get pods --all-namespaces | |
| sleep 10 | |
| echo "✅ K3s is ready" | |
| - name: Deploy eoAPI with monitoring | |
| run: | | |
| echo "=== Deploying eoAPI with monitoring stack ===" | |
| export RELEASE_NAME="$RELEASE_NAME" | |
| export PGO_VERSION="${{ env.PGO_VERSION }}" | |
| export GITHUB_SHA="${{ github.sha }}" | |
| export CI_MODE=true | |
| export OBSERVABILITY_MODE=true | |
| # Deploy using consolidated script with observability mode enabled | |
| ./scripts/deploy.sh --ci | |
| - name: Wait for monitoring stack to be ready | |
| run: | | |
| echo "=== Waiting for monitoring components ===" | |
| # Each wait is best-effort: a timeout is only logged, the step keeps going | |
| # Wait for metrics-server first (required for HPA) | |
| kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=metrics-server -n eoapi --timeout=300s || echo "metrics-server not ready" | |
| # Wait for Prometheus server | |
| kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server,app.kubernetes.io/name=prometheus -n eoapi --timeout=300s || echo "Prometheus server not ready" | |
| # Wait for Grafana | |
| kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana -n eoapi --timeout=300s || echo "Grafana not ready" | |
| # Wait for prometheus-adapter | |
| kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus-adapter -n eoapi --timeout=300s || echo "prometheus-adapter not ready" | |
| # Give time for HPA to be created and configured | |
| echo "=== Waiting for HPA creation ===" | |
| sleep 60 | |
| echo "=== Final monitoring stack status ===" | |
| kubectl get pods -n eoapi | grep -E "(prometheus|grafana|metrics-server)" || true | |
| kubectl get hpa -n eoapi || echo "No HPA resources found yet" | |
| - name: Validate core eoAPI services | |
| run: | | |
| echo "=== Validating core eoAPI services ===" | |
| # Wait for core application pods to be ready; unlike the monitoring waits, a timeout here fails the step | |
| kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-stac -n eoapi --timeout=300s | |
| kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-raster -n eoapi --timeout=300s | |
| kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-vector -n eoapi --timeout=300s | |
| echo "✅ Core eoAPI services are ready" | |
| - name: Run observability tests | |
| run: | | |
| echo "=== Running observability test suite ===" | |
| export RELEASE_NAME="$RELEASE_NAME" | |
| export NAMESPACE="eoapi" | |
| # Install python dependencies for testing | |
| python -m pip install --upgrade pip | |
| pip install pytest requests psycopg2-binary | |
| # Run observability tests | |
| python -m pytest .github/workflows/tests/test_observability.py -v --tb=short | |
| # Run autoscaling tests (tests marked "slow" are deselected) | |
| python -m pytest .github/workflows/tests/test_autoscaling.py -v --tb=short -m "not slow" | |
| # Diagnostics only; runs when an earlier step failed. Every command tolerates failure so the dump always completes. | |
| - name: Debug observability stack on failure | |
| if: failure() | |
| run: | | |
| echo "=== Observability Debug Information ===" | |
| export RELEASE_NAME="$RELEASE_NAME" | |
| export NAMESPACE="eoapi" | |
| echo "=== All Pods in namespace ===" | |
| kubectl get pods -n eoapi -o wide || true | |
| echo "=== Monitoring Pods Status ===" | |
| kubectl get pods -n eoapi | grep -E "(prometheus|grafana|metrics-server|adapter)" || true | |
| echo "=== Core eoAPI Pods Status ===" | |
| kubectl get pods -n eoapi | grep -E "(stac|raster|vector|postgres)" || true | |
| echo "=== HPA Status ===" | |
| kubectl get hpa -n eoapi -o wide || true | |
| kubectl describe hpa -n eoapi || true | |
| echo "=== Custom Metrics API ===" | |
| kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" || echo "Custom metrics API not available" | |
| echo "=== Metrics Server API ===" | |
| kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" || echo "Metrics server API not available" | |
| echo "=== Pod Metrics ===" | |
| kubectl top pods -n eoapi || echo "Pod metrics not available" | |
| echo "=== Services ===" | |
| kubectl get svc -n eoapi || true | |
| echo "=== Recent Events ===" | |
| kubectl get events -n eoapi --sort-by='.lastTimestamp' | tail -30 || true | |
| echo "=== Failed Pod Logs ===" | |
| # Selects every pod whose phase is not Running | |
| for pod in $(kubectl get pods -n eoapi --field-selector=status.phase!=Running --no-headers -o custom-columns=":metadata.name" 2>/dev/null || echo ""); do | |
| if [ -n "$pod" ]; then | |
| echo "=== Logs for failed pod: $pod ===" | |
| kubectl logs "$pod" -n eoapi --tail=50 || true | |
| fi | |
| done | |
| echo "=== Component Logs ===" | |
| kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi --tail=50 || echo "No prometheus-adapter logs" | |
| kubectl logs -l app.kubernetes.io/name=grafana -n eoapi --tail=30 || echo "No grafana logs" | |
| kubectl logs -l app.kubernetes.io/name=metrics-server -n eoapi --tail=30 || echo "No metrics-server logs" | |
| # Best-effort teardown. The release is addressed in the eoapi namespace (the namespace every | |
| # other command in this job targets) and the namespace is removed, matching the | |
| # integration-tests cleanup; without -n, helm looks in the current context's namespace. | |
| - name: Cleanup observability test | |
| if: always() | |
| run: | | |
| helm uninstall "$RELEASE_NAME" -n eoapi || true | |
| kubectl delete namespace eoapi || true | |