Skip to content

NA: Use push metrics, trace and logs ON Python Backend via Otel #2821

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Aug 13, 2025
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apps/opik-python-backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM docker:27.5.1

RUN apk add --no-cache tini python3 py3-pip rust cargo
RUN apk add --no-cache tini python3 py3-pip python3-dev musl-dev gcc libffi-dev rust cargo

WORKDIR /opt/opik-python-backend

Expand Down Expand Up @@ -31,5 +31,7 @@ ENV PYTHON_CODE_EXECUTOR_EXEC_TIMEOUT_IN_SECS=3
ENV PYTHON_CODE_EXECUTOR_STRATEGY="docker"
ENV PYTHON_CODE_EXECUTOR_ALLOW_NETWORK=false

ENV OPIK_VERSION=${OPIK_VERSION}

ENTRYPOINT ["tini", "--"]
CMD ./entrypoint.sh
25 changes: 24 additions & 1 deletion apps/opik-python-backend/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,33 @@ echo "Starting the Opik Python Backend server"
# Use same number of threads as container pool size
NUM_THREADS=${PYTHON_CODE_EXECUTOR_PARALLEL_NUM:-5}

opentelemetry-instrument gunicorn --access-logfile '-' \
echo "OPIK_VERSION=$OPIK_VERSION"
echo "OPIK_OTEL_SDK_ENABLED=$OPIK_OTEL_SDK_ENABLED"

# Decide whether gunicorn is wrapped by the OpenTelemetry auto-instrumentation
# launcher. The gunicorn command line itself is identical in both modes, so it
# is written once below and only the launcher prefix varies.
otel_launcher=""

if [ "$OPIK_OTEL_SDK_ENABLED" = "true" ]; then
  echo "Starting the Opik Python Backend server with Open Telemetry instrumentation"

  # Only provide default resource attributes when the operator has not set any.
  if [ -z "$OTEL_RESOURCE_ATTRIBUTES" ]; then
    export OTEL_RESOURCE_ATTRIBUTES="service.name=opik-python-backend,service.version=${OPIK_VERSION}"
  fi

  export OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED=true
  export OTEL_PYTHON_LOG_CORRELATION=true

  otel_launcher="opentelemetry-instrument"
else
  echo "Starting the Opik Python Backend server without Open Telemetry instrumentation"
fi

# $otel_launcher is intentionally unquoted: when it is empty it must expand to
# nothing so that gunicorn becomes the command being run.
$otel_launcher gunicorn --access-logfile '-' \
  --access-logformat '{"body_bytes_sent": %(B)s, "http_referer": "%(f)s", "http_user_agent": "%(a)s", "remote_addr": "%(h)s", "remote_user": "%(u)s", "request_length": 0, "request_time": %(L)s, "request": "%(r)s", "source": "gunicorn", "status": %(s)s, "time_local": "%(t)s", "time": %(T)s, "x_forwarded_for": "%(h)s"}' \
  --workers 1 \
  --threads "$NUM_THREADS" \
  --worker-class gthread \
  --bind=0.0.0.0:8000 \
  --chdir ./src 'opik_backend:create_app()'
13 changes: 7 additions & 6 deletions apps/opik-python-backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@ pydantic-settings>=2.0.0,<3.0.0,!=2.9.0
requests==2.32.3
urllib3==2.3.0
Werkzeug==3.1.3
opentelemetry-api==1.31.1
opentelemetry-sdk==1.31.1
opentelemetry-exporter-otlp-proto-http==1.31.1
opentelemetry-exporter-prometheus==0.52b1
opentelemetry-instrumentation-flask==0.52b1
prometheus-client==0.21.1
opentelemetry-api==1.36.0
opentelemetry-sdk==1.36.0
opentelemetry-exporter-otlp-proto-http==1.36.0
opentelemetry-instrumentation-flask==0.57b0
opentelemetry-instrumentation-requests==0.57b0
opentelemetry-instrumentation-system-metrics==0.57b0
opentelemetry-instrumentation-logging==0.57b0
schedule==1.2.1
134 changes: 98 additions & 36 deletions apps/opik-python-backend/src/opik_backend/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,25 @@
import os
import sys

from flask import Flask, make_response
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from opentelemetry import metrics
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from flask import Flask
from opentelemetry import metrics, trace

from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace.export import BatchSpanProcessor

from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter

from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
from opentelemetry._logs import set_logger_provider

# Note: All auto-instrumentation is handled by 'opentelemetry-instrument' command in entrypoint.sh

def create_app(test_config=None, should_init_executor=True):
app = Flask(__name__, instance_relative_config=True)
Expand Down Expand Up @@ -42,7 +51,8 @@ def create_app(test_config=None, should_init_executor=True):
app.config.from_mapping(test_config)

# Setup OpenTelemetry before registering blueprints
setup_telemetry(app)
if os.environ.get("OPIK_OTEL_SDK_ENABLED", "").lower() == "true":
setup_telemetry(app)

from opik_backend.evaluator import evaluator, init_executor
from opik_backend.post_user_signup import post_user_signup
Expand All @@ -58,39 +68,91 @@ def create_app(test_config=None, should_init_executor=True):

return app

def setup_telemetry(app):
"""Configure OpenTelemetry metrics for the application."""
# Create metric readers based on environment
metric_readers = []
def setup_otel_metrics(app, resource):
    """Install the global MeterProvider that exports metrics through OTLP.

    Args:
        app: Flask application; only its logger is used here.
        resource: OpenTelemetry resource attached to the meter provider.
    """
    # The exporter is created without arguments, so its endpoint and other
    # settings are left to the exporter's own configuration handling.
    exporter = OTLPMetricExporter()
    periodic_reader = PeriodicExportingMetricReader(exporter)

    metrics.set_meter_provider(
        MeterProvider(resource=resource, metric_readers=[periodic_reader])
    )
    app.logger.debug("OpenTelemetry metrics configured")

def setup_otel_traces(app, resource):
    """Install the global TracerProvider that exports spans through OTLP.

    Args:
        app: Flask application; only its logger is used here.
        resource: OpenTelemetry resource attached to the tracer provider.
    """
    # Spans go through a batching processor in front of the OTLP exporter.
    span_processor = BatchSpanProcessor(OTLPSpanExporter())

    tracer_provider = TracerProvider(resource=resource)
    tracer_provider.add_span_processor(span_processor)

    trace.set_tracer_provider(tracer_provider)
    app.logger.debug("OpenTelemetry traces configured")

# Always add Prometheus reader for k8s scraping
prometheus_reader = PrometheusMetricReader()
metric_readers.append(prometheus_reader)
def setup_otel_logs(app, resource):
    """Configure OpenTelemetry logs export and return the logging handler.

    Installs a global ``LoggerProvider`` with a batching OTLP log exporter and
    builds a ``LoggingHandler`` bound to that provider. The handler is only
    created and levelled here; attaching it to loggers is left to the caller.

    The handler level is read from ``OTEL_PYTHON_LOG_LEVEL`` (matched
    case-insensitively). When the variable is unset the default is ``DEBUG``
    if ``app.debug`` is true, otherwise ``INFO``. A value that does not name a
    numeric ``logging`` level falls back to ``INFO`` and logs a warning.

    Args:
        app: Flask application; ``app.debug`` and ``app.logger`` are used.
        resource: OpenTelemetry resource attached to the logger provider.

    Returns:
        The configured ``LoggingHandler``.
    """
    logger_provider = LoggerProvider(resource=resource)
    logger_provider.add_log_record_processor(BatchLogRecordProcessor(OTLPLogExporter()))
    set_logger_provider(logger_provider)

    # Create and configure OpenTelemetry logging handler
    handler = LoggingHandler(logger_provider=logger_provider)

    # Set OpenTelemetry handler level based on environment or app debug mode
    otel_log_level_str = os.getenv('OTEL_PYTHON_LOG_LEVEL', 'DEBUG' if app.debug else 'INFO')
    otel_log_level = getattr(logging, otel_log_level_str.upper(), None)

    # The logging module also exposes upper-case attributes that are not
    # levels (e.g. BASIC_FORMAT is a format string), so an attribute lookup
    # alone is not enough: only integer values are accepted as levels.
    if not isinstance(otel_log_level, int):
        otel_log_level = logging.INFO
        app.logger.warning(f"Invalid OTEL_PYTHON_LOG_LEVEL '{otel_log_level_str}', defaulting to INFO")

    handler.setLevel(otel_log_level)
    app.logger.info(f"OpenTelemetry log level set to: {logging.getLevelName(otel_log_level)}")

    return handler

def setup_console_logging(app):
    """Configure console logging levels to control verbosity.

    Reads ``OPIK_CONSOLE_LOG_LEVEL`` (matched case-insensitively, default
    ``INFO``) and applies it to the root logger, but only when that makes the
    root logger stricter or when the root logger has no level set (NOTSET).
    A value that does not name a numeric ``logging`` level falls back to
    ``INFO`` and logs a warning.

    Args:
        app: Flask application; only its logger is used here.
    """
    console_log_level_str = os.getenv('OPIK_CONSOLE_LOG_LEVEL', 'INFO')
    console_log_level = getattr(logging, console_log_level_str.upper(), None)

    # The logging module also exposes upper-case attributes that are not
    # levels (e.g. BASIC_FORMAT is a format string). Comparing such a value
    # with the root logger's integer level would raise, so only integers are
    # accepted as levels.
    if not isinstance(console_log_level, int):
        console_log_level = logging.INFO
        app.logger.warning(f"Invalid OPIK_CONSOLE_LOG_LEVEL '{console_log_level_str}', defaulting to INFO")

    # Set root logger level to control all third-party library logging
    # This is cleaner than maintaining hardcoded logger lists
    root_logger = logging.getLogger()
    if console_log_level > root_logger.level or root_logger.level == logging.NOTSET:
        root_logger.setLevel(console_log_level)
        app.logger.info(f"Console log level set to: {logging.getLevelName(console_log_level)}")



def setup_telemetry(app):
    """Configure OpenTelemetry metrics, traces, and logs using OTLP export.

    When ``OTEL_EXPORTER_OTLP_ENDPOINT`` is unset or empty, a warning is
    logged and nothing is configured. Otherwise one shared resource is
    created, each signal is set up through its helper, the OpenTelemetry log
    handler is attached, and the console log level is applied.

    Args:
        app: Flask application whose logger is used and instrumented.
    """
    # Check if OTLP endpoint is configured
    otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
    if not otlp_endpoint:
        app.logger.warning("No OTLP endpoint configured. Metrics will not be exported.")
        return

    app.logger.info(f"Configured OTLP endpoint: {otlp_endpoint}. Will push metrics to this endpoint.")

    # Create shared resource for all telemetry signals
    resource = Resource.create()

    # Set up each telemetry signal
    setup_otel_metrics(app, resource)
    setup_otel_traces(app, resource)
    otel_handler = setup_otel_logs(app, resource)

    # Configure logging handlers
    # Set root logger to NOTSET to allow all messages through to handlers
    # This ensures OpenTelemetry can receive all log levels, as recommended by:
    # https://markandruth.co.uk/2025/05/30/getting-opentelemetry-logging-working-in-python
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.NOTSET)
    root_logger.addHandler(otel_handler)

    # Records logged through app.logger already reach the root logger's
    # handlers while propagation is on, so attaching the same handler to both
    # loggers would export every application record twice. Attach it directly
    # only when propagation has been turned off.
    if not app.logger.propagate:
        app.logger.addHandler(otel_handler)

    # Configure console logging levels
    setup_console_logging(app)

    # Note: Auto-instrumentation is handled by 'opentelemetry-instrument' command in entrypoint.sh
    # No manual instrumentation needed here
Loading
Loading