 import docker
 import schedule
 from opentelemetry import metrics
+from opentelemetry import trace

 from opik_backend.executor import CodeExecutorBase, ExecutionResult
 from opik_backend.scoring_commands import PYTHON_SCORING_COMMAND
@@ -82,6 +83,8 @@ def __init__(self):
         # Start the pool monitor
         self._start_pool_monitor()

+        self.tracer = trace.get_tracer(__name__)
+
         atexit.register(self.cleanup)

     def _start_pool_monitor(self):
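Note: this change only requests a tracer via trace.get_tracer(__name__); the tracer provider and exporter are not configured anywhere in this diff, so that wiring presumably lives elsewhere in the service. A minimal sketch of what such a bootstrap could look like with the standard OpenTelemetry SDK (the ConsoleSpanExporter here is a placeholder assumption, not part of this PR):

# Hypothetical bootstrap, not part of this PR: without a configured provider,
# spans created via trace.get_tracer(__name__) are non-recording no-ops.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter

provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)
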
@@ -160,36 +163,38 @@ def _calculate_latency_ms(self, start_time):
     def create_container(self):
         # Record the start time for detailed container creation metrics
         start_time = time.time()
+        with self.tracer.start_as_current_span("docker.create_container"):
+
+            new_container = self.client.containers.run(
+                image=f"{self.docker_registry}/{self.docker_image}:{self.docker_tag}",
+                command=["tail", "-f", "/dev/null"],  # a never ending process so Docker won't kill the container
+                mem_limit="256mb",
+                cpu_shares=2,
+                detach=True,
+                network_disabled=self.network_disabled,
+                security_opt=["no-new-privileges"],
+                labels=self.container_labels
+            )

-        new_container = self.client.containers.run(
-            image=f"{self.docker_registry}/{self.docker_image}:{self.docker_tag}",
-            command=["tail", "-f", "/dev/null"],  # a never ending process so Docker won't kill the container
-            mem_limit="256mb",
-            cpu_shares=2,
-            detach=True,
-            network_disabled=self.network_disabled,
-            security_opt=["no-new-privileges"],
-            labels=self.container_labels
-        )
-
-        # Add the container to the pool
-        self.container_pool.put(new_container)
-
-        # Calculate and record the latency for the direct container creation
-        latency = self._calculate_latency_ms(start_time)
-        container_creation_histogram.record(latency, attributes={"method": "create_container"})
+            # Add the container to the pool
+            self.container_pool.put(new_container)

-        logger.info(f"Created container, id '{new_container.id}' in {latency:.3f} milliseconds")
+            # Calculate and record the latency for the direct container creation
+            latency = self._calculate_latency_ms(start_time)
+            container_creation_histogram.record(latency, attributes={"method": "create_container"})
+
+            logger.info(f"Created container, id '{new_container.id}' in {latency:.3f} milliseconds")

     def release_container(self, container):
         self.releaser_executor.submit(self.async_release, container)

     def async_release(self, container):
-        # First, ensure we have enough containers in the pool
-        self._create_new_container()
+        with self.tracer.start_as_current_span("async_release"):
+            # First, ensure we have enough containers in the pool
+            self._create_new_container()

-        # Now stop and remove the old container
-        self._stopContainer(container)
+            # Now stop and remove the old container
+            self._stopContainer(container)

     def _create_new_container(self):
         try:
@@ -199,35 +204,37 @@ def _create_new_container(self):
             logger.error(f"Error replacing container: {e}")

     def _stopContainer(self, container):
-        try:
-            logger.info(f"Stopping container {container.id}. Will create a new one.")
+        with self.tracer.start_as_current_span("docker.stop_container"):
+            try:
+                logger.info(f"Stopping container {container.id}. Will create a new one.")

-            # Record the start time
-            start_time = time.time()
+                # Record the start time
+                start_time = time.time()

-            # Remove the container
-            container.remove(force=True)
+                # Remove the container
+                container.remove(force=True)

-            # Calculate and record the latency
-            latency = self._calculate_latency_ms(start_time)
-            container_stop_histogram.record(latency, attributes={"method": "stop_container"})
+                # Calculate and record the latency
+                latency = self._calculate_latency_ms(start_time)
+                container_stop_histogram.record(latency, attributes={"method": "stop_container"})

-            logger.info(f"Stopped container {container.id} in {latency:.3f} milliseconds")
-        except Exception as e:
-            logger.error(f"Failed to stop container: {e}")
+                logger.info(f"Stopped container {container.id} in {latency:.3f} milliseconds")
+            except Exception as e:
+                logger.error(f"Failed to stop container: {e}")

     def get_container(self):
-        if self.stop_event.is_set():
-            raise RuntimeError("Executor is shutting down, no containers available")
-
-        while not self.stop_event.is_set():
-            try:
-                return self.container_pool.get(timeout=self.exec_timeout)
-            except Exception as e:
-                if self.stop_event.is_set():
-                    raise RuntimeError("Executor is shutting down, no containers available")
-
-                logger.warning(f"Couldn't get a container to execute after waiting for {self.exec_timeout}s. Will retry: {e}")
+        with self.tracer.start_as_current_span("get_container"):
+            if self.stop_event.is_set():
+                raise RuntimeError("Executor is shutting down, no containers available")
+
+            while not self.stop_event.is_set():
+                try:
+                    return self.container_pool.get(timeout=self.exec_timeout)
+                except Exception as e:
+                    if self.stop_event.is_set():
+                        raise RuntimeError("Executor is shutting down, no containers available")
+
+                    logger.warning(f"Couldn't get a container to execute after waiting for {self.exec_timeout}s. Will retry: {e}")

     def run_scoring(self, code: str, data: dict, payload_type: str | None = None) -> dict:
         if self.stop_event.is_set():
@@ -239,38 +246,39 @@ def run_scoring(self, code: str, data: dict, payload_type: str | None = None) -> dict:
         logger.info(f"Scoring executor latency: {latency:.3f} milliseconds")
         get_container_histogram.record(latency, attributes={"method": "get_container"})

-        try:
-            future = self.scoring_executor.submit(
-                container.exec_run,
-                cmd=["python", "-c", PYTHON_SCORING_COMMAND, code, json.dumps(data), payload_type or ""],
-                detach=False,
-                stdin=False,
-                tty=False
-            )
-
-            result = future.result(timeout=self.exec_timeout)
-            exec_result = ExecutionResult(
-                exit_code=result.exit_code,
-                output=result.output
-            )
-            latency = self._calculate_latency_ms(start_time)
-            logger.info(f"Scoring executor latency: {latency:.3f} milliseconds")
-
-            # Access ThreadPoolExecutor's internal work queue (private attribute)
-            queue_size = self.scoring_executor._work_queue.qsize()
-            scoring_executor_queue_size_gauge.set(queue_size)
-
-            scoring_executor_histogram.record(latency, attributes={"method": "run_scoring"})
-            return self.parse_execution_result(exec_result)
-        except concurrent.futures.TimeoutError:
-            logger.error(f"Execution timed out in container {container.id}")
-            return {"code": 504, "error": "Server processing exceeded timeout limit."}
-        except Exception as e:
-            logger.error(f"An unexpected error occurred: {e}")
-            return {"code": 500, "error": "An unexpected error occurred"}
-        finally:
-            # Stop and remove the container, then create a new one asynchronously
-            self.release_container(container)
+        with self.tracer.start_as_current_span("docker.run_scoring"):
+            try:
+                future = self.scoring_executor.submit(
+                    container.exec_run,
+                    cmd=["python", "-c", PYTHON_SCORING_COMMAND, code, json.dumps(data), payload_type or ""],
+                    detach=False,
+                    stdin=False,
+                    tty=False
+                )
+
+                result = future.result(timeout=self.exec_timeout)
+                exec_result = ExecutionResult(
+                    exit_code=result.exit_code,
+                    output=result.output
+                )
+                latency = self._calculate_latency_ms(start_time)
+                logger.info(f"Scoring executor latency: {latency:.3f} milliseconds")
+
+                # Access ThreadPoolExecutor's internal work queue (private attribute)
+                queue_size = self.scoring_executor._work_queue.qsize()
+                scoring_executor_queue_size_gauge.set(queue_size)
+
+                scoring_executor_histogram.record(latency, attributes={"method": "run_scoring"})
+                return self.parse_execution_result(exec_result)
+            except concurrent.futures.TimeoutError:
+                logger.error(f"Execution timed out in container {container.id}")
+                return {"code": 504, "error": "Server processing exceeded timeout limit."}
+            except Exception as e:
+                logger.error(f"An unexpected error occurred: {e}")
+                return {"code": 500, "error": "An unexpected error occurred"}
+            finally:
+                # Stop and remove the container, then create a new one asynchronously
+                self.release_container(container)

     def cleanup(self):
         """Clean up all containers managed by this executor."""