robusta-dev · Sheeproid · Sep 1, 2025 · Sep 1, 2025 · Sep 1, 2025 · Sep 1, 2025
diff --git a/tests/llm/fixtures/test_ask_holmes/115_checkout_errors_tracing/checkout-service.yaml b/tests/llm/fixtures/test_ask_holmes/115_checkout_errors_tracing/checkout-service.yaml
@@ -0,0 +1,183 @@
+# Simplified Checkout Service with Dummy SQL
+apiVersion: v1
+kind: Secret  # carries the service source code; mounted at /app by the checkout Deployment
+metadata:
+  name: checkout-app  # referenced by the Deployment volume's `secretName`
+type: Opaque
+stringData:  # app.py is supplied here as plain text
+  app.py: |
+    import os
+    import time
+    import random
+    from flask import Flask, request, jsonify
+    from opentelemetry import trace
+    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import BatchSpanProcessor
+    from opentelemetry.sdk.resources import Resource
+    from opentelemetry.instrumentation.flask import FlaskInstrumentor
+
+    # Configure OpenTelemetry
+    resource = Resource.create({"service.name": "checkout-service"})
+    provider = TracerProvider(resource=resource)
+    trace.set_tracer_provider(provider)
+
+    otlp_exporter = OTLPSpanExporter(
+        endpoint="tempo.app-115.svc.cluster.local:4317",
+        insecure=True
+    )
+    provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
+
+    app = Flask(__name__)
+    FlaskInstrumentor().instrument_app(app)
+
+    tracer = trace.get_tracer(__name__)
+
+    @app.route('/health')
+    def health():
+        return 'OK'
+
+    @app.route('/checkout', methods=['POST'])
+    def checkout():
+        with tracer.start_as_current_span("process_checkout") as span:
+            data = request.json or {}
+
+            # Log the incoming request (without revealing the data)
+            print(f"[CHECKOUT] Processing checkout request for user {data.get('user_id', 'guest')}", flush=True)
+
+            # Extract parameters
+            user_id = data.get('user_id', 'guest')
+            zone_id = data.get('zone_id', 'us-west-1')
+            promo_code = data.get('promo_code')
+            items = data.get('items', [])
+
+            # Add span attributes
+            span.set_attribute("user.id", user_id)
+            span.set_attribute("zone.id", zone_id)
+            span.set_attribute("items.count", len(items))
+            if promo_code:
+                span.set_attribute("promo.code", promo_code)
+
+            # Simulate database query for shipping calculation
+            with tracer.start_as_current_span("database_query") as db_span:
+                db_span.set_attribute("db.system", "postgresql")
+                db_span.set_attribute("db.operation", "SELECT")
+
+                if promo_code:
+                    # Simulate database error with promo_code
+                    query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND promo_code = ? AND active = true"
+                    db_span.set_attribute("db.statement", query)
+
+                    # Simulate a small delay before failure
+                    sleep_time = random.uniform(0.05, 0.1)
+                    time.sleep(sleep_time)
+
+                    # Record the exception in the span
+                    error_msg = f"ERROR: duplicate key value violates unique constraint 'promo_codes_pkey'"
+                    db_span.record_exception(Exception(error_msg))
+                    db_span.set_status(trace.Status(trace.StatusCode.ERROR, error_msg))
+
+                    # Also record on parent span
+                    span.record_exception(Exception(error_msg))
+                    span.set_status(trace.Status(trace.StatusCode.ERROR, "Database error"))
+
+                    print(f"[CHECKOUT] Request failed", flush=True)
+
+                    return jsonify({
+                        "error": "Internal server error",
+                        "message": "Request failed"
+                    }), 500
+                else:
+                    # Simulate successful query without promo_code
+                    query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND active = true"
+                    db_span.set_attribute("db.statement", query)
+                    sleep_time = random.uniform(0.05, 0.1)
+                    time.sleep(sleep_time)
+                    shipping_rate = 5.0
+                    discount = 0.0
+
+                # Calculate shipping cost (only reached for non-promo requests)
+                total_weight = sum(item.get('weight', 1.0) for item in items)
+                shipping_cost = total_weight * shipping_rate * (1 - discount/100)
+
+            # Calculate total
+            subtotal = sum(item.get('price', 0) for item in items)
+            total = subtotal + shipping_cost
+
+            response = {
+                "order_id": f"ord-{random.randint(1000, 9999)}",
+                "subtotal": subtotal,
+                "shipping": round(shipping_cost, 2),
+                "total": round(total, 2)
+            }
+
+            print(f"[CHECKOUT] Completed checkout request", flush=True)
+            return jsonify(response)
+
+    if __name__ == '__main__':
+        print("[CHECKOUT] Starting checkout service on port 8080", flush=True)
+        app.run(host='0.0.0.0', port=8080)
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: checkout
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: checkout
+  template:
+    metadata:
+      labels:
+        app: checkout  # matched by spec.selector above and by the checkout Service
+    spec:
+      volumes:
+      - name: app  # app.py from the checkout-app Secret
+        secret:
+          secretName: checkout-app
+      containers:
+      - name: checkout
+        image: python:3.11-slim
+        env:
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        ports:
+        - containerPort: 8080
+        volumeMounts:
+        - name: app
+          mountPath: /app
+        resources:
+          limits:
+            cpu: "200m"
+            memory: "256Mi"
+          requests:
+            cpu: "50m"
+            memory: "128Mi"
+        command: ["/bin/bash", "-c"]  # dependencies are installed at container start, then the app runs
+        args:
+        - |
+          pip install flask opentelemetry-api opentelemetry-sdk \
+            opentelemetry-instrumentation-flask \
+            opentelemetry-exporter-otlp-proto-grpc && \
+          python /app/app.py
+        startupProbe:  # start-up budget: 10s initial delay + 24 failures x 5s period
+          httpGet:
+            port: 8080
+            path: /health
+          failureThreshold: 24
+          successThreshold: 1
+          initialDelaySeconds: 10
+          periodSeconds: 5
+          timeoutSeconds: 3
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: checkout
+spec:
+  ports:
+  - targetPort: 8080  # containerPort declared by the checkout Deployment
+    port: 8080
+  selector:
+    app: checkout  # pod label set in the checkout Deployment template
diff --git a/tests/llm/fixtures/test_ask_holmes/115_checkout_errors_tracing/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/115_checkout_errors_tracing/test_case.yaml
@@ -0,0 +1,125 @@
+user_prompt:
+- "The checkout service in namespace app-115 is experiencing errors. Investigate why."
+
+expected_output:  # note: the second expectation in this list is commented out (disabled)
+  - The answer must explicitly state that requests with promo codes are failing or all errors/failures occur when promo code is included.
+#  - The answer must mention or imply that there are other requests without promo codes that succeed.
+  - Including a query that mentions promo code is not sufficient.
+
+tags:
+  - kubernetes
+  - hard
+  - chain-of-causation
+  - traces
+
+port_forwards:  # exposes the tempo service on localhost:3200, the url used in toolsets.yaml
+  - namespace: app-115
+    service: tempo
+    local_port: 3200
+    remote_port: 3200
+
+before_test: |
+  echo "🚀 Setting up test 115 - Creating namespace app-115"
+  kubectl create namespace app-115 || true
+  echo "✅ Namespace app-115 created successfully!"
+
+  echo "📦 Deploying Tempo from shared config"
+  kubectl apply -f ../../shared/tempo.yaml -n app-115
+
+  echo "⏳ Waiting for Tempo pod to be ready"
+  kubectl wait --for=condition=ready pod -l app=tempo -n app-115 --timeout=60s
+
+  echo "⏰ Waiting for Tempo to be fully ready (checking every 5s, timeout 60s)"
+  TEMPO_READY=false
+  for i in {1..12}; do
+    if kubectl exec -n app-115 deployment/tempo -- wget -q -O - http://localhost:3200/ready 2>/dev/null; then
+      echo "✅ Tempo is ready!"
+      TEMPO_READY=true
+      break
+    else
+      echo "⏳ Attempt $i/12: Tempo not ready yet, waiting 5s..."
+      sleep 5
+    fi
+  done
+
+  if [ "$TEMPO_READY" = false ]; then
+    echo "❌ Tempo failed to become ready after 60 seconds"
+    exit 1
+  fi
+
+  echo "✅ Tempo deployment complete!"
+
+  echo "🛒 Deploying checkout service"
+  kubectl apply -f checkout-service.yaml -n app-115
+  # The checkout startupProbe allows ~130s (10s delay + 24 x 5s) while pip installs, so wait longer than that.
+  echo "⏳ Waiting for checkout pod to be ready"
+  kubectl wait --for=condition=ready pod -l app=checkout -n app-115 --timeout=180s
+
+  echo "🔍 Checking checkout deployment status"
+  kubectl get pods -n app-115 -l app=checkout
+
+  echo "🚦 Deploying traffic generator"
+  kubectl apply -f traffic-generator.yaml -n app-115
+
+  echo "⏳ Waiting for traffic generator pod to be ready"
+  kubectl wait --for=condition=ready pod -l app=traffic-generator -n app-115 --timeout=60s
+
+  echo "🔍 Checking all pods status"
+  kubectl get pods -n app-115
+  # Let requests accumulate before the log and trace checks below; the message matches the actual sleep.
+  echo "⏰ Letting traffic generator run for 20 seconds to generate requests"
+  sleep 20
+
+  echo "🔍 Verifying traffic generator log entries"
+  if kubectl logs -n app-115 -l app=traffic-generator --tail=-1 | grep -q "WITH promo_code"; then
+    echo "✅ Found traffic generator log WITH promo_code"
+  else
+    echo "❌ Missing traffic generator log WITH promo_code"
+    exit 1
+  fi
+
+  if kubectl logs -n app-115 -l app=traffic-generator --tail=-1 | grep -q "WITHOUT promo_code"; then
+    echo "✅ Found traffic generator log WITHOUT promo_code"
+  else
+    echo "❌ Missing traffic generator log WITHOUT promo_code"
+    exit 1
+  fi
+
+  if kubectl logs -n app-115 -l app=checkout --tail=100 | grep -q "Processing checkout request"; then
+    echo "✅ Found checkout request log"
+  else
+    echo "❌ Missing checkout request log"
+    exit 1
+  fi
+
+  # Commented out traffic generator trace checks as it no longer sends traces
+  # echo "🔍 Querying Tempo for traces from traffic generator"
+  # TRAFFIC_GEN_TRACES=$(curl -s "http://localhost:3200/api/search?tags=service.name%3Dtraffic-generator&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0")
+  # echo "Found $TRAFFIC_GEN_TRACES traces from traffic-generator"
+
+  echo "🔍 Querying Tempo for traces from checkout service"
+  CHECKOUT_TRACES=$(kubectl run -n app-115 tempo-query --rm -i --restart=Never --image=curlimages/curl -- -s "http://tempo:3200/api/search?tags=service.name%3Dcheckout-service&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0")
+  echo "Found $CHECKOUT_TRACES traces from checkout-service"
+
+  # Commented out traffic generator trace check
+  # if [ "$TRAFFIC_GEN_TRACES" -gt "0" ]; then
+  #   echo "✅ Found traces from traffic-generator"
+  # else
+  #   echo "❌ No traces found from traffic-generator"
+  #   exit 1
+  # fi
+
+  if [ "$CHECKOUT_TRACES" -gt "0" ]; then
+    echo "✅ Found traces from checkout-service"
+  else
+    echo "❌ No traces found from checkout-service"
+    exit 1
+  fi
+
+  # Delete the traffic generator before the investigation starts so the AI can't cheat
+  kubectl delete -f traffic-generator.yaml -n app-115
+
+  echo "✅ Test setup complete!"
+
+after_test: |
+  kubectl delete namespace app-115 || true  # `|| true` keeps the exit status zero even if the delete fails
diff --git a/tests/llm/fixtures/test_ask_holmes/115_checkout_errors_tracing/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/115_checkout_errors_tracing/toolsets.yaml
@@ -0,0 +1,10 @@
+toolsets:
+  kubernetes/core:
+    enabled: true
+  kubernetes/logs:
+    enabled: true
+  grafana/tempo:
+    enabled: true
+    config:
+      url: http://localhost:3200  # Tempo as exposed by the port_forwards entry in test_case.yaml
+      healthcheck: "ready"  # presumably the path probed on the url above (cf. /ready in before_test) - TODO confirm