Debug info for CI failures (#3400)

jschaul · web-flow · commit c675bf0202cd · 2023-07-06T15:08:29.000+02:00
* add bash logic to find more context of why things go wrong

* deliberately fail to schedule rabbit and fail to start brig, to see if the logs are as expected (to be reverted)

* changelog

* fixup

* PR feedback

* take bash from env

* undo one on-purpose-unschedulable bug

* no need for context before Events:

* undo brig misconfiguration
diff --git a/changelog.d/5-internal/integration-test-debug-logs b/changelog.d/5-internal/integration-test-debug-logs
@@ -0,0 +1 @@
+On CI runs, provide additional context (Kubernetes events and pod debug information) when 'helmfile sync' fails.
diff --git a/hack/bin/integration-setup-federation.sh b/hack/bin/integration-setup-federation.sh
@@ -49,7 +49,20 @@ export FEDERATION_DOMAIN_2="federation-test-helper.$FEDERATION_DOMAIN_BASE"
 
 echo "Installing charts..."
 
+set +e
 helmfile --environment "$HELMFILE_ENV" --file "${TOP_LEVEL}/hack/helmfile.yaml" sync --skip-deps --concurrency 0
+EXIT_CODE=$?
+
+if (( EXIT_CODE > 0)); then
+    echo "!! Helm install failed. Attempting to get some more information ..."
+
+    kubectl -n "$NAMESPACE_1" get events | grep -v "Normal "
+    kubectl -n "$NAMESPACE_2" get events | grep -v "Normal "
+    "${DIR}/kubectl-get-debug-info.sh" "$NAMESPACE_1"
+    "${DIR}/kubectl-get-debug-info.sh" "$NAMESPACE_2"
+    exit $EXIT_CODE
+fi
+set -e
 
 # wait for fakeSNS to create resources. TODO, cleaner: make initiate-fake-aws-sns a post hook. See cassandra-migrations chart for an example.
 resourcesReady() {
diff --git a/hack/bin/kubectl-get-debug-info.sh b/hack/bin/kubectl-get-debug-info.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+USAGE="$0 <NAMESPACE>"
+NAMESPACE=${1:?$USAGE}
+
+echo "Checking pods in namespace '${NAMESPACE}' that failed to schedule..."
+
+# Get pods that failed to schedule
+UNSCHEDULED_PODS=$(kubectl get pods --namespace "$NAMESPACE" -o json | jq -r '.items[] | select(.status.phase=="Pending") | .metadata.name')
+
+for POD in $UNSCHEDULED_PODS; do
+  echo "Pod $POD failed to schedule for the following reasons:"
+  # Get events for pod
+  kubectl describe pod "$POD" --namespace "$NAMESPACE" | grep -A 10 "Events:"
+  echo ""
+done
+
+echo "Checking pods in namespace '${NAMESPACE}' that are crashlooping..."
+
+# Get pods that are crashlooping
+CRASHLOOPING_PODS=$(kubectl get pods --namespace "$NAMESPACE" -o json | jq -r '.items[] | select(.status.containerStatuses[]?.state.waiting.reason=="CrashLoopBackOff") | .metadata.name')
+
+for POD in $CRASHLOOPING_PODS; do
+  echo "Pod $POD is crashlooping for the following reasons:"
+  # Get logs of previous run for pod
+  kubectl logs "$POD" --namespace "$NAMESPACE" --previous
+  echo ""
+done
+

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+On CI runs, provide additional context (Kubernetes events and pod debug information) when 'helmfile sync' fails.`