Skip to content

More stats

More stats #4580

name: System tests and deploy
# Workflow-wide default token scope; each job below declares its own
# `permissions` block as well.
permissions: read-all
on:
  # Pushes to the long-lived branches, except documentation-only changes.
  push:
    branches: [master, canary]
    paths-ignore: ['docs/**']
  # Pull requests that target master, except documentation-only changes.
  pull_request:
    branches: [master]
    paths-ignore: ['docs/**']
concurrency:
  # New commits cancel previous builds only on pull requests: `github.head_ref`
  # is only set for pull_request events, so all commits of one PR share a
  # group. On pushes the expression falls back to `github.sha`, which is unique
  # for each commit, so runs on master/canary (including an in-progress
  # deploy) are never cancelled by a newer push.
  group: ${{ github.workflow }}-${{ github.head_ref || github.sha }}
  cancel-in-progress: true
env:
  # Exported to every step of every job in this workflow.
  CARGO_TERM_COLOR: "always"
jobs:
  # Builds the images, deploys them to a local minikube cluster and runs the
  # system tests against it; on master it also pushes the images.
  build-and-system-tests:
    runs-on: ubicloud-standard-30-ubuntu-2404
    permissions:
      contents: read
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
- name: System Information Report
  # Purely informational: prints CPU, memory, disk, load, OS, process and
  # network details of the runner so failures can be correlated with the host.
  run: |
    set -euo pipefail
    RULE="────────────────────────────────────────────────────────────────────────────────────────"
    echo "╔══════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ 🖥️ SYSTEM INFORMATION REPORT ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════╝"
    echo
    # CPU Information
    echo "🔧 CPU INFORMATION"
    echo "$RULE"
    CPU_CORES=$(nproc)
    CPU_THREADS=$(nproc --all)
    CPU_MODEL=$(lscpu | grep 'Model name' | cut -d ':' -f2 | xargs)
    CPU_ARCH=$(lscpu | grep 'Architecture' | cut -d ':' -f2 | xargs)
    # The "CPU MHz" line may be missing (hence the N/A fallback). A failing
    # grep must be tolerated inside the pipeline: with pipefail the old
    # `... | xargs || echo "N/A"` form printed an empty line before "N/A".
    CPU_MHz=$(lscpu | { grep 'CPU MHz' || true; } | cut -d ':' -f2 | xargs)
    CPU_MHz=${CPU_MHz:-N/A}
    printf " %-20s %s\n" "CPU Cores:" "$CPU_CORES"
    printf " %-20s %s\n" "CPU Threads:" "$CPU_THREADS"
    printf " %-20s %s\n" "CPU Model:" "$CPU_MODEL"
    printf " %-20s %s\n" "Architecture:" "$CPU_ARCH"
    printf " %-20s %s MHz\n" "Current Speed:" "$CPU_MHz"
    echo
    # Memory Information (row 2 of `free` is the "Mem:" line)
    echo "💾 MEMORY INFORMATION"
    echo "$RULE"
    TOTAL_RAM=$(free -h | awk 'NR==2{print $2}')
    USED_RAM=$(free -h | awk 'NR==2{print $3}')
    FREE_RAM=$(free -h | awk 'NR==2{print $4}')
    AVAILABLE_RAM=$(free -h | awk 'NR==2{print $7}')
    TOTAL_RAM_KB=$(free | awk 'NR==2{print $2}')
    USED_RAM_KB=$(free | awk 'NR==2{print $3}')
    USAGE_PERCENT=$(awk "BEGIN {printf \"%.1f\", ($USED_RAM_KB/$TOTAL_RAM_KB)*100}")
    printf " %-20s %s\n" "Total RAM:" "$TOTAL_RAM"
    printf " %-20s %s (%s%%)\n" "Used RAM:" "$USED_RAM" "$USAGE_PERCENT"
    printf " %-20s %s\n" "Free RAM:" "$FREE_RAM"
    printf " %-20s %s\n" "Available RAM:" "$AVAILABLE_RAM"
    echo
    # Disk Information
    echo "💿 DISK INFORMATION"
    echo "$RULE"
    df -h --output=source,size,used,avail,pcent,target | grep -E '^(/dev/|tmpfs)' | while read -r line; do
      echo " $line"
    done
    echo
    ROOT_TOTAL=$(df -h / | awk 'NR==2{print $2}')
    ROOT_USED=$(df -h / | awk 'NR==2{print $3}')
    ROOT_AVAIL=$(df -h / | awk 'NR==2{print $4}')
    ROOT_PERCENT=$(df -h / | awk 'NR==2{print $5}')
    printf " %-20s %s\n" "Root Total:" "$ROOT_TOTAL"
    printf " %-20s %s (%s)\n" "Root Used:" "$ROOT_USED" "$ROOT_PERCENT"
    printf " %-20s %s\n" "Root Available:" "$ROOT_AVAIL"
    echo
    # System Load and Uptime
    echo "⚡ SYSTEM PERFORMANCE"
    echo "$RULE"
    LOAD_AVG=$(uptime | awk -F'load average:' '{print $2}' | xargs)
    UPTIME_INFO=$(uptime -p)
    printf " %-20s %s\n" "Load Average:" "$LOAD_AVG"
    printf " %-20s %s\n" "Uptime:" "$UPTIME_INFO"
    echo
    # Additional System Info
    echo "🔍 ADDITIONAL SYSTEM INFO"
    echo "$RULE"
    if command -v lsb_release >/dev/null 2>&1; then
      OS_NAME="$(lsb_release -d | cut -d: -f2 | xargs)"
    else
      OS_NAME="$(grep '^PRETTY_NAME=' /etc/os-release | cut -d'=' -f2- | tr -d '\"')"
    fi
    KERNEL_VERSION=$(uname -r)
    HOSTNAME=$(hostname)
    printf " %-20s %s\n" "OS:" "$OS_NAME"
    printf " %-20s %s\n" "Kernel:" "$KERNEL_VERSION"
    printf " %-20s %s\n" "Hostname:" "$HOSTNAME"
    echo
    # Process Information
    echo "📈 TOP PROCESSES BY MEMORY"
    echo "$RULE"
    # awk limits the output to five rows itself and reads ps to the end. With
    # `head -5` in between, ps could be killed by SIGPIPE once head exits,
    # which pipefail + errexit would turn into a failed step.
    ps aux --sort=-%mem --no-headers | awk 'NR<=5 {printf " %-15s %5s%% %8s %s\n", $1, $4, $6"K", $11}'
    echo
    # Network Information
    echo "🌐 NETWORK INTERFACES"
    echo "$RULE"
    # `grep -v` exits non-zero when only loopback addresses exist; this report
    # must not fail the job for that.
    ip -4 addr show | grep inet | awk '{print " " $NF ": " $2}' | grep -v 127.0.0.1 || true
- name: Worker fingerprint & fast sanity check
  # Best-effort snapshot of the runner: every probe is guarded so a missing
  # tool or file never fails the step.
  run: |
    set -euo pipefail
    # Prints a "== title ==" heading.
    heading() { echo "== $1 =="; }

    heading "Worker/Kernel"
    uname -a || true
    echo

    heading "OS Release"
    cat /etc/os-release || true
    echo

    heading "CPU detail"
    lscpu || true
    echo

    heading "GLIBC"
    (ldd --version 2>&1 | head -1) || true
    echo

    heading "Ulimits"
    ulimit -a || true
    echo

    heading "CGroup limits"
    cat /sys/fs/cgroup/memory.max 2>/dev/null || echo "no memory.max"
    cat /sys/fs/cgroup/memory.current 2>/dev/null || echo "no memory.current"
    cat /sys/fs/cgroup/cpu.max 2>/dev/null || echo "no cpu.max"
    cat /sys/fs/cgroup/pids.max 2>/dev/null || echo "no pids.max"
    echo

    heading "Docker service memory limits"
    systemctl show docker -p MemoryMax 2>/dev/null || echo "no MemoryMax"
    echo

    heading "Memory usage breakdown"
    grep -E "(MemTotal|MemAvailable|MemFree|Buffers|Cached|Shmem)" /proc/meminfo || true
    echo

    heading "PSI (pressure stall info)"
    for resource in cpu memory io; do
      echo "[$resource]"
      cat /proc/pressure/$resource || true
      echo
    done
    echo

    heading "Free/DF"
    free -h || true
    df -h || true
    echo

    heading "Docker info (brief)"
    docker --version || true
    (docker info 2>/dev/null | grep -E 'Server Version|Cgroup Driver|Cgroup Version|Docker Root Dir|Kernel Version' || true)
    echo

    heading "dmesg scan (early)"
    (dmesg -T 2>/dev/null | grep -iE 'mce|hardware error|memory error|ecc|segfault|general protection|oom' || true)
- name: Make sure we have no focused system-tests
  run: |
    # Fails the job when a focused test (`test.only`) is left in the sources.
    # -F matches the text literally: as a regular expression the dot matched
    # any character, so ordinary prose such as "this test only checks ..."
    # tripped the check. -n prints file and line of every offender.
    if grep -rnF 'test.only' ./system-tests/src/; then
      echo "::error::Focused system tests (test.only) found, see the matches above."
      exit 1
    fi
- name: Free up disk space
  # Deletes the listed directories and prints disk usage before and after.
  run: |
    echo "Space before:"
    df -h
    for unused_dir in /usr/share/dotnet /opt/ghc "/usr/local/share/boost" /usr/local/lib/android "$AGENT_TOOLSDIRECTORY"; do
      sudo rm -rf "$unused_dir"
    done
    echo "Space after:"
    df -h
- name: Configure Docker to use tmpfs for better performance
  shell: bash
  run: |
    set -euo pipefail
    echo "Configuring Docker to use tmpfs (RAM) for data storage..."
    # Docker must be stopped while its data-root is switched.
    sudo systemctl stop docker
    sudo mkdir -p /mnt/docker-tmpfs
    sudo mount -t tmpfs -o size=50g tmpfs /mnt/docker-tmpfs
    sudo mkdir -p /etc/docker
    echo '{
      "data-root": "/mnt/docker-tmpfs",
      "storage-driver": "overlay2"
    }' | sudo tee /etc/docker/daemon.json
    # Check Docker service memory limits
    echo "Checking Docker service memory limits..."
    echo "Current Docker service memory limit:"
    # Informational only: a failing query must not abort the step under `set -e`.
    MEMORY_LIMIT=$(systemctl show docker -p MemoryMax --value 2>/dev/null || true)
    if [ -z "$MEMORY_LIMIT" ] || [ "$MEMORY_LIMIT" = "infinity" ] || [ "$MEMORY_LIMIT" = "0" ]; then
      echo " No memory limit set (unlimited)"
    else
      echo " MemoryMax=$MEMORY_LIMIT"
    fi
    sudo systemctl start docker
    sudo systemctl status docker --no-pager
    docker info | grep "Docker Root Dir"
    echo "Docker configured to use tmpfs. All container operations will use RAM."
    # Create cleanup script (reverts everything this step configured)
    cat <<'EOS' | sudo tee /usr/local/bin/_cleanup_docker_tmpfs
    #!/usr/bin/env bash
    set -e
    sudo systemctl stop docker || true
    sudo rm -f /etc/docker/daemon.json || true
    sudo systemctl start docker || true
    sudo umount /mnt/docker-tmpfs || true
    sudo rmdir /mnt/docker-tmpfs || true
    EOS
    sudo chmod +x /usr/local/bin/_cleanup_docker_tmpfs
# NOTE: the cleanup script is intentionally NOT run here. Steps execute in
# file order and `if: always()` only makes a step ignore earlier failures, it
# does not defer it to the end of the job. A cleanup step at this position
# stops Docker, deletes daemon.json and unmounts the tmpfs before minikube
# ever starts, so the build and tests would run on disk. If teardown is
# required (e.g. on non-ephemeral runners), add
# `sudo /usr/local/bin/_cleanup_docker_tmpfs` with `if: always()` as the LAST
# step of this job.
# NOTE(review): assumes the runner VM is discarded after the job, and that the
# 50g tmpfs plus minikube's 60g fit into the runner's RAM. Confirm both.
# If the gcloud CLI is installed, skaffold spams the output because we're not logged in and in some cases might even fail the build.
- name: Hide gcloud cli from skaffold
  run: |
    # Only move the binary when it is actually there, so runner images that
    # ship without gcloud do not fail this step.
    if [ -e /usr/bin/gcloud ] || [ -L /usr/bin/gcloud ]; then
      sudo mv /usr/bin/gcloud /tmp/gcloud
    else
      echo "gcloud not found at /usr/bin/gcloud, nothing to hide"
    fi
- name: Install kustomize, kubectl & skaffold
  # NOTE(review): in this copy of the file the action reference had been
  # replaced by an e-mail-obfuscation placeholder ("[email protected]"), which
  # is not a valid `uses:` value. The action name follows from the inputs
  # below; the version tag is a best guess — confirm it against the
  # repository before merging.
  uses: yokawasa/action-setup-kube-tools@v0.11.2
  with:
    # Versions are quoted so YAML keeps them as strings.
    kubectl: "1.30.3"
    kustomize: "5.4.1"
    skaffold: "2.13.1"
# Local action from this repository; presumably installs Node.js and pnpm — confirm in .github/actions/setup-node-pnpm.
- uses: ./.github/actions/setup-node-pnpm
# Root-level install; per the step name it is needed so that the shared module gets copied.
- name: Run pnpm install --frozen-lockfile so that shared module gets copied
run: pnpm install --frozen-lockfile
# Repository script; presumably fetches the tmc-langs binary — see bin/tmc-langs-setup.
- name: Download tmc-langs
run: bin/tmc-langs-setup
# Repository script; the deploy job runs the same script before `skaffold deploy`.
- name: Make sure skaffold local env patch files exist
run: bin/make-sure-skaffold-local-env-patch-files-exists
- name: Workaround for kube-proxy set nf_conntrack_max permission denied
  # Works around this failure when kube-system/kube-proxy starts up:
  #   I0719 17:18:25.711233 1 conntrack.go:100] Set sysctl 'net/netfilter/nf_conntrack_max' to 131072
  #   F0719 17:18:25.711251 1 server.go:489] open /proc/sys/net/netfilter/nf_conntrack_max: permission denied
  # The value is set on the host up front, to the same number kube-proxy logs.
  run: sudo sysctl net/netfilter/nf_conntrack_max=131072
- name: Install, start & configure minikube
  # NOTE(review): in this copy of the file the action reference had been
  # replaced by an e-mail-obfuscation placeholder ("[email protected]"), which
  # is not a valid `uses:` value. The action name follows from the
  # space-separated input names below; the version tag is a best guess —
  # confirm it against the repository before merging.
  uses: manusa/actions-setup-minikube@v2.14.0
  with:
    minikube version: "v1.33.1"
    kubernetes version: "v1.30.0"
    # The ingress addon is patched by later steps; 60g is the amount the
    # "System Test Summary" step reports.
    start args: "--addons ingress --memory 60g"
    driver: "docker"
    github token: ${{ secrets.GITHUB_TOKEN }}
# Repository script; presumably builds the images with skaffold and writes ./skaffold-build-output.json, which the next step uploads — confirm in the script.
- name: Build
id: build
run: bin/skaffold-build-test-env-and-serialize-output
# The artifact is downloaded by name in the deploy job; the same file is passed to `skaffold deploy --build-artifacts` later in this job.
- name: Store skaffold output json
uses: actions/upload-artifact@v4
with:
name: skaffold-build-output
path: ./skaffold-build-output.json
retention-days: 7
# Restores the gcloud CLI that "Hide gcloud cli from skaffold" moved to /tmp: the master-only authentication and image push steps below call gcloud.
- name: Unhide gcloud cli if master
run: sudo mv /tmp/gcloud /usr/bin/gcloud
if: ${{ github.ref == 'refs/heads/master' }}
- name: Authenticate to Google Cloud if master
  if: ${{ github.ref == 'refs/heads/master' }}
  env:
    GCLOUD_SERVICE_KEY: ${{ secrets.GCLOUD_SERVICE_KEY }}
  run: |
    echo Authenticating to Google Cloud
    # The secret is base64-decoded into the key file gcloud is activated with.
    echo "$GCLOUD_SERVICE_KEY" | python -m base64 -d > /tmp/key.json
    gcloud auth activate-service-account --key-file=/tmp/key.json
    # Register gcloud as a Docker credential helper; bail out if that fails.
    if ! gcloud auth configure-docker -q; then
      echo "Authentication to Google Cloud failed. Exiting..."
      exit 1
    fi
    echo "Authenticated to Google Cloud..."
- name: Enable snippet directives
  # Merge-patches the ingress controller ConfigMap to allow snippet annotations.
  run: >-
    kubectl patch configmap ingress-nginx-controller
    --namespace ingress-nginx
    --type merge
    --patch '{"data":{"allow-snippet-annotations":"true"}}'
- name: Remove ingress nginx admission webhook
  run: kubectl delete --all-namespaces ValidatingWebhookConfiguration ingress-nginx-admission
# The next two steps list and then describe every pod of one namespace.
- name: Debug kube-system
  run: |
    kubectl get pods --namespace kube-system
    # `-o name` prints "pod/<name>"; keep only the part after the slash.
    pod_names=$(kubectl get pods --namespace kube-system -o name | awk -F/ '{print $2}')
    for pod_name in $pod_names; do
      echo "------------------"
      kubectl describe pod "$pod_name" --namespace kube-system
    done
- name: Debug ingress
  run: |
    kubectl get pods --namespace ingress-nginx
    # `-o name` prints "pod/<name>"; keep only the part after the slash.
    pod_names=$(kubectl get pods --namespace ingress-nginx -o name | awk -F/ '{print $2}')
    for pod_name in $pod_names; do
      echo "------------------"
      kubectl describe pod "$pod_name" --namespace ingress-nginx
    done
# Deploys the images recorded in skaffold-build-output.json to the local cluster.
# The id is referenced by the failure-only log collection steps via `steps.deploy_local.conclusion`.
- name: Deploy to local minikube
id: deploy_local
run: skaffold deploy --force --filename ./skaffold.production.yaml --build-artifacts=./skaffold-build-output.json
- name: Dump migration logs on failure
  if: failure()
  # Best effort: each kubectl call may fail without aborting the dump.
  run: |
    # Prints a "=== title ===" heading.
    heading() { echo "=== $1 ==="; }

    heading "Migrations job logs"
    kubectl logs job/headless-lms-run-migrations --tail=200 || true
    echo

    heading "Email-deliver pod logs (if any)"
    kubectl logs deployment/email-deliver --tail=100 || true
    echo

    heading "Job list"
    kubectl get jobs || true
    echo

    heading "Pod list"
    kubectl get pods -A || true
# Maps the hostname project-331.local to the minikube IP in /etc/hosts.
- name: Configure ingress
run: echo "$(minikube ip) project-331.local" | sudo tee --append /etc/hosts
# Installs the dependencies of the system-tests package itself (separate from the root install earlier).
- name: Setup system test environment
run: pnpm install --frozen-lockfile
working-directory: ./system-tests/
# First run of the system tests; pull requests run them a second time further below.
- name: Run system tests
run: pnpm run test
working-directory: ./system-tests/
- name: System Test Summary
  if: always()
  # Appends a short Markdown summary to the job summary page. The
  # `job.status` expression is substituted by Actions before the shell runs;
  # the quoted heredoc keeps the shell from touching the text.
  run: |
    cat >> "$GITHUB_STEP_SUMMARY" <<'SUMMARY'
    ## 🧪 System Tests
    - **Status**: ${{ job.status }}
    - **Environment**: Minikube with Docker tmpfs
    - **Memory**: 60GB allocated
    SUMMARY
# Guards against accidentally introducing flaky tests in a PR. Skipped on master so deploys are not slowed down.
- name: Run system tests again to make sure they're stable (if not master)
  if: ${{ github.ref != 'refs/heads/master' }}
  working-directory: ./system-tests/
  run: pnpm run test
# Collects host-level diagnostics into ./_failure_debug; the next step uploads that directory.
- name: Dump kernel and system diagnostics (failure only)
  if: ${{ failure() }}
  run: |
    set -euo pipefail
    out=./_failure_debug
    mkdir -p "$out"

    # Each section writes its heading with `tee` (truncating the file) and
    # appends the data with `tee -a`, so everything also shows in the job log.
    echo "== dmesg (last 1000 lines) ==" | tee "$out/dmesg.txt"
    { dmesg -T 2>/dev/null | tail -n 1000; } | tee -a "$out/dmesg.txt" || true

    echo "== journalctl -k (last 500 lines) ==" | tee "$out/journalctl-kernel.txt"
    { journalctl -k -n 500 --no-pager 2>/dev/null; } | tee -a "$out/journalctl-kernel.txt" || true

    echo "== Top memory processes ==" | tee "$out/top-mem-procs.txt"
    ps aux --sort=-%mem | head -n 30 | tee -a "$out/top-mem-procs.txt" || true

    echo "== Limits ==" | tee "$out/limits.txt"
    ulimit -a | tee -a "$out/limits.txt" || true

    echo "== CGroup limits ==" | tee "$out/cgroup.txt"
    {
      echo "memory.max: $(cat /sys/fs/cgroup/memory.max 2>/dev/null || echo n/a)"
      echo "memory.current: $(cat /sys/fs/cgroup/memory.current 2>/dev/null || echo n/a)"
      echo "cpu.max: $(cat /sys/fs/cgroup/cpu.max 2>/dev/null || echo n/a)"
      echo "pids.max: $(cat /sys/fs/cgroup/pids.max 2>/dev/null || echo n/a)"
    } | tee -a "$out/cgroup.txt"

    echo "== PSI (pressure stall info) ==" | tee "$out/psi.txt"
    for resource in cpu memory io; do
      echo "[$resource]" | tee -a "$out/psi.txt"
      { cat /proc/pressure/$resource 2>/dev/null || echo n/a; } | tee -a "$out/psi.txt"
      echo | tee -a "$out/psi.txt"
    done

    echo "== Filesystem usage ==" | tee "$out/df.txt"
    df -h | tee -a "$out/df.txt" || true

    echo "== OS/CPU detail ==" | tee "$out/os-cpu.txt"
    { cat /etc/os-release; echo; lscpu; echo; head -n 50 /proc/cpuinfo; } \
      | tee -a "$out/os-cpu.txt" || true

    echo "== Docker info (brief) ==" | tee "$out/docker-info.txt"
    { docker info 2>/dev/null | grep -E 'Server Version|Cgroup|Docker Root Dir|Logging Driver|Kernel Version' || true; } \
      | tee -a "$out/docker-info.txt"
- name: Upload failure diagnostics
  if: ${{ failure() }}
  uses: actions/upload-artifact@v4
  with:
    name: worker-failure-diagnostics
    # Files written by the diagnostics step; an empty match is not an error.
    path: ./_failure_debug/*
    if-no-files-found: ignore
    retention-days: 7
- name: Save service logs to disk
  if: ${{ failure() && steps.deploy_local.conclusion == 'success' }}
  run: |
    # -p keeps the step re-runnable if the directory already exists.
    mkdir -p ./logs
    # Collect every deployment's log independently. The step shell aborts on
    # the first failing command, so without the `||` one unavailable
    # deployment would drop the logs of all services listed after it —
    # exactly when they are needed. stderr goes into the file too, so a
    # failed fetch leaves its reason behind.
    for service in chatbot-syncer cms course-material example-exercise headless-lms mailchimp-syncer main-frontend quizzes; do
      kubectl logs "deployment/$service" > "./logs/$service.log" 2>&1 \
        || echo "::warning::Could not fetch logs for deployment/$service"
    done
- name: Upload logs
  # Uploads the per-service log files written by the previous step.
  if: ${{ failure() && steps.deploy_local.conclusion == 'success' }}
  uses: actions/upload-artifact@v4
  with:
    name: service-logs
    path: ./logs/*
    retention-days: 7
- name: Print pods information
  if: ${{ failure() && steps.deploy_local.conclusion == 'success' }}
  run: |
    divider="----------------------------------------------------------"
    echo "> kubectl get pods"
    kubectl get pods
    echo
    echo "$divider"
    # One `kubectl describe` per pod of the current namespace.
    for pod_name in $(kubectl get pods -o custom-columns=NAME:.metadata.name --no-headers=true); do
      echo
      echo "$divider"
      echo "> kubectl describe pod $pod_name"
      kubectl describe pod "$pod_name"
    done
- name: Upload test results
  if: ${{ failure() && steps.deploy_local.conclusion == 'success' }}
  uses: actions/upload-artifact@v4
  with:
    name: test-results
    # A missing directory is not an error.
    path: ./system-tests/test-results/
    if-no-files-found: ignore
    retention-days: 7
- name: Upload playwright report
  if: ${{ failure() && steps.deploy_local.conclusion == 'success' }}
  uses: actions/upload-artifact@v4
  with:
    name: playwright-html-report
    # Upload the whole report directory, not just index.html: the HTML report
    # keeps attachments (screenshots, traces, videos) in files next to
    # index.html, and they are lost when only the single file is uploaded.
    # index.html stays at the root of the artifact, as before.
    path: ./system-tests/playwright-report/
    if-no-files-found: ignore
    retention-days: 7
# These images are used for deployments and caching
- name: Push images if master
  if: ${{ github.ref == 'refs/heads/master' }}
  run: skaffold build --profile push-images --filename ./skaffold.production.yaml
- name: Push latest images if master
  if: ${{ github.ref == 'refs/heads/master' }}
  run: skaffold build --profile push-images,latest-tag --filename ./skaffold.production.yaml
- name: Push images tagged with sha if master
  if: ${{ github.ref == 'refs/heads/master' }}
  # Pushes every image tag recorded in skaffold-build-output.json from
  # minikube's Docker daemon to the registry.
  run: |
    set -euo pipefail
    echo "=== Starting image push process ==="
    # Point Docker CLI to minikube's Docker daemon since images are built there
    echo "> Setting up Docker environment for minikube..."
    eval "$(minikube -p minikube docker-env --shell bash)"
    echo "> Docker environment configured"
    echo "> Authenticating with Google Cloud Docker registry..."
    gcloud auth configure-docker -q
    echo "> Authentication completed"
    # Check if the skaffold output file exists and show its contents
    echo "> Checking skaffold-build-output.json file..."
    if [ ! -f "skaffold-build-output.json" ]; then
      echo "ERROR: skaffold-build-output.json file not found!"
      exit 1
    fi
    echo "> File size: $(wc -c < skaffold-build-output.json) bytes"
    echo "> File contents (formatted):"
    # jq is needed for the tag extraction below anyway, so use it for
    # pretty-printing too instead of also depending on a `python` binary.
    jq . skaffold-build-output.json
    echo ""
    # Extract and display all image tags
    echo "> Extracting image tags from skaffold output..."
    TAGS=$(jq --raw-output '.builds[].tag' < skaffold-build-output.json)
    echo "> Found tags:"
    echo "$TAGS"
    echo ""
    # Count the non-empty lines. `echo "$TAGS" | wc -l` reported 1 even for an
    # empty list, so the guard below could never fire. `grep -c` exits
    # non-zero on a count of 0, hence the `|| true`.
    TAG_COUNT=$(grep -c . <<< "$TAGS" || true)
    echo "> Total number of tags to push: $TAG_COUNT"
    if [ "$TAG_COUNT" -eq 0 ]; then
      echo "ERROR: No image tags found in skaffold output!"
      exit 1
    fi
    # Push each image with detailed logging. The loop reads from a here-string
    # rather than a pipe so that `exit 1` terminates the script itself and
    # not just a pipeline subshell.
    echo "> Starting to push images..."
    while IFS= read -r tag; do
      if [ -z "$tag" ]; then
        continue
      fi
      echo ""
      echo "=== Pushing image: $tag ==="
      echo "> Checking if image exists locally..."
      if ! docker image inspect "$tag" >/dev/null 2>&1; then
        echo "> ERROR: Image $tag not found locally!"
        echo "> Available images:"
        docker images
        exit 1
      fi
      echo "> Image found locally, pushing..."
      if docker push "$tag"; then
        echo "> SUCCESS: Pushed $tag"
      else
        echo "> ERROR: Failed to push $tag"
        exit 1
      fi
    done <<< "$TAGS"
    echo ""
    echo "=== Image push process completed successfully ==="
deploy:
  # Production deploy: only on master, and only after the build and system
  # tests job has finished successfully.
  if: ${{ github.ref == 'refs/heads/master' }}
  needs: build-and-system-tests
  runs-on: ubuntu-24.04
  permissions:
    contents: read
  environment:
    name: production
    url: https://courses.mooc.fi
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4
    # The artifact uploaded by the build job; it ends up in
    # ./skaffold-build-output/skaffold-build-output.json.
    - name: Download skaffold build output artifact
      uses: actions/download-artifact@v4
      with:
        name: skaffold-build-output
        path: skaffold-build-output
- name: Install kustomize, kubectl & skaffold
  # NOTE(review): in this copy of the file the action reference had been
  # replaced by an e-mail-obfuscation placeholder ("[email protected]"), which
  # is not a valid `uses:` value. The action name follows from the inputs
  # below; the version tag is a best guess — confirm it against the
  # repository before merging.
  uses: yokawasa/action-setup-kube-tools@v0.11.2
  with:
    # Same tool versions as in the build job.
    kubectl: "1.30.3"
    kustomize: "5.4.1"
    skaffold: "2.13.1"
# Authenticate with the production deploy service account, install the Cloud
# SDK and fetch kubeconfig credentials for the GKE cluster.
- id: auth
  uses: google-github-actions/auth@v2
  with:
    credentials_json: ${{ secrets.GCLOUD_SERVICE_KEY_DEPLOY_TO_PRODUCTION }}
- name: Set up Cloud SDK
  uses: google-github-actions/setup-gcloud@v3
  with:
    project_id: ${{ secrets.GKE_PROJECT }}
- uses: google-github-actions/get-gke-credentials@v2
  with:
    cluster_name: ${{ secrets.GKE_CLUSTER }}
    location: ${{ secrets.GKE_ZONE }}
- name: Masking
  # Keeps the cluster API server address(es) from the kubeconfig out of the
  # job log.
  run: |
    # Register one mask per server entry. Joining several entries into a
    # single space-separated mask would never match the log output, and an
    # empty mask is skipped instead of being emitted.
    kubectl config view | grep server | tr -s ' ' | cut -d ' ' -f 3 | while IFS= read -r server; do
      if [ -n "$server" ]; then
        echo "::add-mask::$server"
      fi
    done
# Registers gcloud as a Docker credential helper.
- run: gcloud auth configure-docker --quiet
- name: Make sure skaffold local env patch files exist
  run: bin/make-sure-skaffold-local-env-patch-files-exists
# Deploys the images recorded by the build job to the production namespace.
- name: Deploy with Skaffold
  run: >-
    skaffold deploy --force
    --filename ./skaffold.production.yaml
    --profile production
    --build-artifacts=./skaffold-build-output/skaffold-build-output.json
    --namespace courses-moocfi
typecheck:
  # Type-checks the system tests with tsc; no output files are emitted.
  runs-on: ubuntu-24.04
  permissions:
    contents: read
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4
    - uses: ./.github/actions/setup-node-pnpm
    # Dependencies are installed in the repository root and in system-tests.
    - name: Run pnpm install --frozen-lockfile in repo root
      run: pnpm install --frozen-lockfile
    - name: Run pnpm install --frozen-lockfile system-tests
      working-directory: ./system-tests/
      run: pnpm install --frozen-lockfile
    - name: Run tsc
      working-directory: ./system-tests/
      run: pnpm exec tsc --noEmit