Skip to content

refactor: auto-scaling and observability components #290

refactor: auto-scaling and observability components

refactor: auto-scaling and observability components #290

Workflow file for this run

# CI workflow: fast lint/unit checks, cluster-based integration tests,
# documentation validation and observability tests.
name: CI tests

# Triggered by pushes to main and by pull requests targeting main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# Tool versions referenced by the jobs below via `env.*`.
env:
  HELM_VERSION: 'v3.15.2'
  PGO_VERSION: '5.7.4'

jobs:
# Cluster-free checks: linting, Helm values schema validation and Helm unit
# tests, all driven through the repository's Makefile targets.
fast-checks:
  name: Simple tests
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v5

    - uses: actions/setup-node@v6
      with:
        node-version: "24"

    # Helm is pinned to the workflow-level HELM_VERSION.
    - name: Install Helm
      uses: azure/setup-helm@v4
      with:
        version: ${{ env.HELM_VERSION }}

    - name: Setup Helm dependencies
      run: ./scripts/deploy.sh setup

    # Installed globally ahead of the schema validation step
    # (presumably consumed by `make validate-schema` — confirm in the Makefile).
    - name: Install ajv-cli
      run: npm install -g ajv-cli ajv-formats

    - name: Run linters
      run: make lint

    - name: Validate Helm values schema
      run: make validate-schema

    - name: Run Helm unit tests
      run: make tests
# End-to-end job: starts a K3s cluster, deploys eoAPI with scripts/deploy.sh,
# runs the integration suite, then always tears the release down.
integration-tests:
  name: Integration tests
  # needs: fast-checks
  # True only when the pull request's head repository is this repository.
  # NOTE(review): push events carry no pull_request payload, so this job
  # appears to be skipped on pushes to main — confirm that is intended.
  if: github.event.pull_request.head.repo.full_name == github.repository
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v5

    - name: Start K3s cluster
      uses: jupyterhub/action-k3s-helm@v4
      with:
        k3s-channel: latest
        helm-version: ${{ env.HELM_VERSION }}
        metrics-enabled: false
        docker-enabled: true

    # Release name is "eoapi-" plus the first 8 characters of the commit SHA;
    # it is written to GITHUB_ENV so later steps can read $RELEASE_NAME.
    - name: Set release name
      run: |
        short_sha="$(echo "${{ github.sha }}" | cut -c1-8)"
        echo "RELEASE_NAME=eoapi-${short_sha}" >> "$GITHUB_ENV"

    - name: Wait for K3s readiness
      run: |
        echo "=== Waiting for K3s cluster to be ready ==="
        # kubectl context comes from the K3s action; just confirm it responds
        kubectl cluster-info
        kubectl get nodes
        # Block until the core system pods report Ready
        kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s
        # Poll for the Traefik CRDs every 3 seconds; the 300-second budget is
        # shared by both CRDs because the counter is not reset between them
        limit=300
        elapsed=0
        for crd in "middlewares.traefik.io" "ingressroutes.traefik.io"; do
          until [ $elapsed -ge $limit ] || kubectl get crd "$crd" &>/dev/null; do
            sleep 3
            elapsed=$((elapsed + 3))
          done
          if [ $elapsed -ge $limit ]; then
            echo "❌ Timeout waiting for $crd"
            exit 1
          fi
        done
        echo "✅ K3s cluster ready"

    - name: Deploy eoAPI
      id: deploy
      run: |
        echo "=== eoAPI Deployment ==="
        export RELEASE_NAME="${RELEASE_NAME}"
        export PGO_VERSION="${{ env.PGO_VERSION }}"
        export CI_MODE=true
        # The consolidated deploy script is invoked in CI mode
        ./scripts/deploy.sh --ci

    - name: Validate deployment
      run: |
        echo "=== Post-deployment validation ==="
        ./scripts/test.sh check-deployment

    - name: Run integration tests
      run: |
        export RELEASE_NAME="$RELEASE_NAME"
        ./scripts/test.sh integration --debug

    # Runs only when an earlier step in this job has failed.
    - name: Debug failed deployment
      if: failure()
      run: |
        ./scripts/debug-deployment.sh

    # Runs regardless of job outcome; each command tolerates failure.
    - name: Cleanup
      if: always()
      run: |
        helm uninstall "$RELEASE_NAME" -n eoapi || true
        kubectl delete namespace eoapi || true
# Documentation checks for Markdown files under docs/: relative links,
# external links (best effort, never fails the job) and frontmatter presence.
validate-docs:
  name: Validate documentation
  runs-on: ubuntu-latest
  steps:
    # Action versions match the other jobs in this workflow.
    - uses: actions/checkout@v5

    - uses: actions/setup-node@v6
      with:
        node-version: "20"

    - name: Check internal links
      run: |
        broken=0
        # The loops read from process substitution rather than a pipe: a
        # piped `while` body runs in a subshell, so `broken=1` set there is
        # lost and the step could never fail.
        while IFS= read -r file; do
          # grep -no emits one "<line>:](./target)" record per relative link,
          # so every link on a line is checked, not only the last one.
          while IFS=: read -r line link; do
            path=${link#'](./'}
            path=${path%')'}
            # Drop an in-page anchor before testing for the target file.
            path=${path%%#*}
            # NOTE(review): targets are resolved against docs/, not against
            # the linking file's directory — confirm docs/ has no nested
            # pages that rely on file-relative links.
            full="docs/$path"
            if [[ ! -e "$full" ]]; then
              echo "❌ $file:$line -> $path"
              broken=1
            fi
          done < <(grep -no '](\./[^)]*)' "$file" 2>/dev/null)
        done < <(find docs -name "*.md")
        exit "$broken"

    - name: Check external links
      run: |
        # NOTE(review): the package spec was mangled by e-mail obfuscation in
        # the copy this file was recovered from; the pinned version below is
        # reconstructed — confirm it against the repository.
        npm install -g markdown-link-check@3.12.2
        echo '{"timeout":"10s","retryCount":2,"aliveStatusCodes":[200,301,302,403,999]}' > .mlc.json
        # Best effort: each file is limited to 30 seconds and failures are
        # deliberately ignored so flaky external sites do not break CI.
        find docs -name "*.md" -exec timeout 30 markdown-link-check {} --config .mlc.json \; || true

    - name: Check frontmatter
      run: |
        missing=0
        # Process substitution keeps `missing` in the current shell so the
        # final exit status reflects what the loop found.
        while IFS= read -r file; do
          # A file has frontmatter when its first line is exactly "---".
          if ! head -1 "$file" | grep -q "^---$"; then
            echo "❌ Missing frontmatter: $file"
            missing=1
          fi
        done < <(find docs -name "*.md" -not -path "docs/_includes/*")
        exit "$missing"
# Observability job: starts a K3s cluster, deploys eoAPI with
# OBSERVABILITY_MODE enabled, waits for the monitoring components, then runs
# the observability and autoscaling pytest suites.
observability-tests:
  name: Observability tests
  # True only when the pull request's head repository is this repository.
  if: github.event.pull_request.head.repo.full_name == github.repository
  permissions:
    contents: read
    id-token: write
  # needs: integration-tests
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v5

    - name: Start K3s cluster
      uses: jupyterhub/action-k3s-helm@v4
      with:
        k3s-channel: latest
        helm-version: ${{ env.HELM_VERSION }}
        metrics-enabled: false
        docker-enabled: true

    # Release name is "eoapi-" plus the first 8 characters of the commit SHA;
    # it is written to GITHUB_ENV so later steps can read $RELEASE_NAME.
    - name: Set release name
      run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV"

    - name: Wait for K3s to be fully ready
      run: |
        echo "=== Waiting for K3s to be fully ready ==="
        kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s
        kubectl get nodes
        kubectl get pods --all-namespaces
        sleep 10
        echo "✅ K3s is ready"

    - name: Deploy eoAPI with monitoring
      run: |
        echo "=== Deploying eoAPI with monitoring stack ==="
        export RELEASE_NAME="$RELEASE_NAME"
        export PGO_VERSION="${{ env.PGO_VERSION }}"
        export GITHUB_SHA="${{ github.sha }}"
        export CI_MODE=true
        export OBSERVABILITY_MODE=true
        # Deploy using consolidated script with observability mode enabled
        ./scripts/deploy.sh --ci

    # Each wait is best effort: a component that is not Ready only logs a
    # message here, and the later test steps decide whether the job fails.
    - name: Wait for monitoring stack to be ready
      run: |
        echo "=== Waiting for monitoring components ==="
        # Wait for metrics-server first (required for HPA)
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=metrics-server -n eoapi --timeout=300s || echo "metrics-server not ready"
        # Wait for Prometheus server
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server,app.kubernetes.io/name=prometheus -n eoapi --timeout=300s || echo "Prometheus server not ready"
        # Wait for Grafana
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana -n eoapi --timeout=300s || echo "Grafana not ready"
        # Wait for prometheus-adapter
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus-adapter -n eoapi --timeout=300s || echo "prometheus-adapter not ready"
        # Give time for HPA to be created and configured
        echo "=== Waiting for HPA creation ==="
        sleep 60
        echo "=== Final monitoring stack status ==="
        kubectl get pods -n eoapi | grep -E "(prometheus|grafana|metrics-server)" || true
        kubectl get hpa -n eoapi || echo "No HPA resources found yet"

    # Unlike the monitoring waits above, these waits fail the step on timeout.
    - name: Validate core eoAPI services
      run: |
        echo "=== Validating core eoAPI services ==="
        # Wait for core application pods to be ready
        kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-stac -n eoapi --timeout=300s
        kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-raster -n eoapi --timeout=300s
        kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-vector -n eoapi --timeout=300s
        echo "✅ Core eoAPI services are ready"

    - name: Run observability tests
      run: |
        echo "=== Running observability test suite ==="
        export RELEASE_NAME="$RELEASE_NAME"
        export NAMESPACE="eoapi"
        # Install python dependencies for testing
        python -m pip install --upgrade pip
        pip install pytest requests psycopg2-binary
        # Run observability tests
        python -m pytest .github/workflows/tests/test_observability.py -v --tb=short
        # Run autoscaling tests, excluding those marked "slow"
        python -m pytest .github/workflows/tests/test_autoscaling.py -v --tb=short -m "not slow"

    # Runs only when an earlier step failed; every command tolerates failure
    # so that as much diagnostic output as possible is collected.
    - name: Debug observability stack on failure
      if: failure()
      run: |
        echo "=== Observability Debug Information ==="
        export RELEASE_NAME="$RELEASE_NAME"
        export NAMESPACE="eoapi"
        echo "=== All Pods in namespace ==="
        kubectl get pods -n eoapi -o wide || true
        echo "=== Monitoring Pods Status ==="
        kubectl get pods -n eoapi | grep -E "(prometheus|grafana|metrics-server|adapter)" || true
        echo "=== Core eoAPI Pods Status ==="
        kubectl get pods -n eoapi | grep -E "(stac|raster|vector|postgres)" || true
        echo "=== HPA Status ==="
        kubectl get hpa -n eoapi -o wide || true
        kubectl describe hpa -n eoapi || true
        echo "=== Custom Metrics API ==="
        kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" || echo "Custom metrics API not available"
        echo "=== Metrics Server API ==="
        kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" || echo "Metrics server API not available"
        echo "=== Pod Metrics ==="
        kubectl top pods -n eoapi || echo "Pod metrics not available"
        echo "=== Services ==="
        kubectl get svc -n eoapi || true
        echo "=== Recent Events ==="
        kubectl get events -n eoapi --sort-by='.lastTimestamp' | tail -30 || true
        echo "=== Failed Pod Logs ==="
        # Dump the last log lines of every pod whose phase is not Running
        for pod in $(kubectl get pods -n eoapi --field-selector=status.phase!=Running --no-headers -o custom-columns=":metadata.name" 2>/dev/null || echo ""); do
          if [ -n "$pod" ]; then
            echo "=== Logs for failed pod: $pod ==="
            kubectl logs "$pod" -n eoapi --tail=50 || true
          fi
        done
        echo "=== Component Logs ==="
        kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi --tail=50 || echo "No prometheus-adapter logs"
        kubectl logs -l app.kubernetes.io/name=grafana -n eoapi --tail=30 || echo "No grafana logs"
        kubectl logs -l app.kubernetes.io/name=metrics-server -n eoapi --tail=30 || echo "No metrics-server logs"

    # Runs regardless of job outcome. The uninstall targets the eoapi
    # namespace, the same namespace every other command in this job uses;
    # without -n the lookup happens in the current default namespace and the
    # failure is hidden by `|| true`.
    - name: Cleanup observability test
      if: always()
      run: |
        helm uninstall "$RELEASE_NAME" -n eoapi || true
        kubectl delete namespace eoapi || true