30 changes: 30 additions & 0 deletions .github/workflows/python-CI.yml
@@ -36,6 +36,7 @@ jobs:
ipynb: ${{ steps.filter.outputs.ipynb }}
ipynb_files: ${{ steps.filter.outputs.ipynb_files }}
proto: ${{ steps.filter.outputs.proto }}
prompts: ${{ steps.filter.outputs.prompts }}
phoenix: ${{ steps.filter.outputs.phoenix }}
phoenix_client: ${{ steps.filter.outputs.phoenix_client }}
phoenix_evals: ${{ steps.filter.outputs.phoenix_evals }}
@@ -53,6 +54,9 @@ jobs:
proto:
- "src/phoenix/proto/**"
- "src/phoenix/trace/v1/**"
prompts:
- "prompts/**"
- "scripts/prompts/compile_python_prompts.py"
phoenix:
- "src/**"
- "tests/**"
@@ -75,6 +79,7 @@ jobs:
echo "ipynb: ${{ steps.filter.outputs.ipynb }}"
echo "ipynb_files: ${{ steps.filter.outputs.ipynb_files }}"
echo "proto: ${{ steps.filter.outputs.proto }}"
echo "prompts: ${{ steps.filter.outputs.prompts }}"
echo "phoenix: ${{ steps.filter.outputs.phoenix }}"
echo "phoenix_client: ${{ steps.filter.outputs.phoenix_client }}"
echo "phoenix_evals: ${{ steps.filter.outputs.phoenix_evals }}"
@@ -266,6 +271,31 @@ jobs:
run: uvx --with tox-uv==1.27.0 --with uv==0.8.6 tox run -e compile_protobuf
- run: git diff --exit-code

compile-prompts:
name: Compile Prompts
needs: changes
if: ${{ needs.changes.outputs.prompts == 'true' }}
strategy:
matrix:
py: ["3.10"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python ${{ matrix.py }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.py }}
- name: Set up `uv`
uses: astral-sh/setup-uv@v6
with:
version: 0.8.6
- name: Compile Prompts
run: uvx --with tox-uv==1.27.0 --with uv==0.8.6 tox run -e compile_prompts
- name: Check for changes
run: git diff --exit-code

ruff:
name: Ruff
runs-on: ${{ matrix.os }}
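
The new compile-prompts job mirrors the compile-protobuf job above: it regenerates the prompt modules and then relies on git diff --exit-code to fail the build if the committed generated code is stale. Below is a minimal sketch of that check for running locally; the direct script invocation and the output path are assumptions, since the actual command is defined by the compile_prompts tox environment, which is not part of this diff.

# freshness_check.py -- illustrative sketch; run from the repository root.
import subprocess
import sys

# Assumed output location, inferred from the import path used in hallucination.py below;
# the real target is whatever the compile_prompts tox environment passes to the script.
OUTPUT_MODULE = "packages/phoenix-evals/src/phoenix/evals/__generated__/classification_evaluator_configs"

# Regenerate the Python prompt modules from the YAML sources.
subprocess.run(
    [sys.executable, "scripts/prompts/compile_python_prompts.py", OUTPUT_MODULE],
    check=True,
)

# Exit non-zero (as the CI job does) if regeneration left uncommitted changes behind.
sys.exit(subprocess.run(["git", "diff", "--exit-code"]).returncode)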

Some generated files are not rendered by default.

42 changes: 10 additions & 32 deletions packages/phoenix-evals/src/phoenix/evals/metrics/hallucination.py
@@ -1,36 +1,11 @@
from pydantic import BaseModel, Field

from ..__generated__.classification_evaluator_configs import (
HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG,
)
from ..evaluators import ClassificationEvaluator
from ..llm import LLM
from ..templating import Template

_DEFAULT_HALLUCINATION_TEMPLATE = """\
In this task, you will be presented with a query, some context and a response. The response
is generated to the question based on the context. The response may contain false
information. You must use the context to determine if the response to the question
contains false information, if the response is a hallucination of facts. Your objective is
to determine whether the response text contains factual information and is not a
hallucination. A 'hallucination' refers to a response that is not based on the context or
assumes information that is not available in the context. Your response should be a single
word: either 'factual' or 'hallucinated', and it should not include any other text or
characters. 'hallucinated' indicates that the response provides factually inaccurate
information to the query based on the context. 'factual' indicates that the response to
the question is correct relative to the context, and does not contain made up
information. Please read the query and context carefully before determining your
response.

[BEGIN DATA]
************
[Query]: {input}
************
[Context]: {context}
************
[Response]: {output}
************
[END DATA]

Is the response above factual or hallucinated based on the query and context?
"""
from ..templating import Template, TemplateFormat


class HallucinationEvaluator(ClassificationEvaluator):
Expand Down Expand Up @@ -65,9 +40,12 @@ class HallucinationEvaluator(ClassificationEvaluator):

"""

NAME = "hallucination"
PROMPT = Template(template=_DEFAULT_HALLUCINATION_TEMPLATE)
CHOICES = {"hallucinated": 0.0, "factual": 1.0}
NAME = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.name
PROMPT = Template(
template=HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.messages[0].content,
template_format=TemplateFormat.MUSTACHE,
)
CHOICES = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.choices

class HallucinationInputSchema(BaseModel):
input: str = Field(description="The input query.")
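
Since NAME, PROMPT, and CHOICES are now read from the generated config rather than hard-coded, one way to confirm the refactor is behavior-preserving is to compare the class constants against the YAML source added below. A small sanity-check sketch, assuming phoenix-evals is installed with the generated module in place:

# check_hallucination_config.py -- sketch; verifies the constants mirror the YAML.
from phoenix.evals.__generated__.classification_evaluator_configs import (
    HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG,
)
from phoenix.evals.metrics.hallucination import HallucinationEvaluator

assert HallucinationEvaluator.NAME == "hallucination"
assert HallucinationEvaluator.CHOICES == {"hallucinated": 0.0, "factual": 1.0}

# The mustache placeholders the evaluator fills in at evaluation time.
content = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.messages[0].content
for placeholder in ("{{input}}", "{{context}}", "{{output}}"):
    assert placeholder in content
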
@@ -0,0 +1,33 @@
name: hallucination
description: A specialized evaluator for detecting hallucinations in grounded LLM responses.
messages:
- role: user
content: |
In this task, you will be presented with a query, some context and a response. The response
is generated to the question based on the context. The response may contain false
information. You must use the context to determine if the response to the question
contains false information, if the response is a hallucination of facts. Your objective is
to determine whether the response text contains factual information and is not a
hallucination. A 'hallucination' refers to a response that is not based on the context or
assumes information that is not available in the context. Your response should be a single
word: either 'factual' or 'hallucinated', and it should not include any other text or
characters. 'hallucinated' indicates that the response provides factually inaccurate
information to the query based on the context. 'factual' indicates that the response to
the question is correct relative to the context, and does not contain made up
information. Please read the query and context carefully before determining your
response.

[BEGIN DATA]
************
[Query]: {{input}}
************
[Context]: {{context}}
************
[Response]: {{output}}
************
[END DATA]

Is the response above factual or hallucinated based on the query and context?
choices:
hallucinated: 0.0
factual: 1.0
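
Note that the placeholders here use mustache-style double braces ({{input}}) rather than the single braces ({input}) of the deleted inline template, which is why the Python class now passes template_format=TemplateFormat.MUSTACHE. A tiny stand-alone illustration of why the format flag matters; phoenix-evals' own Template class does the real rendering, this is only a naive stand-in:

# placeholder_styles.py -- illustrative only.
data = {"input": "What is 2 + 2?"}

old_style = "[Query]: {input}"    # single braces, str.format-compatible
new_style = "[Query]: {{input}}"  # mustache-style double braces

print(old_style.format(**data))   # "[Query]: What is 2 + 2?"
# str.format treats doubled braces as escapes and leaves a literal "{input}",
# so the mustache template needs a mustache-aware renderer instead.
print(new_style.format(**data))   # "[Query]: {input}"  (not substituted)
print(new_style.replace("{{input}}", data["input"]))  # naive mustache stand-in
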
147 changes: 147 additions & 0 deletions scripts/prompts/compile_python_prompts.py
@@ -0,0 +1,147 @@
"""
Compiles YAML prompts into Python code.
"""

import argparse
import inspect
from pathlib import Path
from typing import Literal

import yaml
from jinja2 import Template
from pydantic import BaseModel


# Base message class copied into the compiled module.
class PromptMessage(BaseModel):
role: Literal["user"]
content: str


# Base classification evaluator config class copied into the compiled module.
class ClassificationEvaluatorConfig(BaseModel):
name: str
description: str
messages: list[PromptMessage]
choices: dict[str, float]


MODELS_TEMPLATE = """\
# This file is generated. Do not edit by hand.

from typing import Literal

from pydantic import BaseModel


{{ prompt_message_source }}

{{ classification_evaluator_config_source }}
"""

CLASSIFICATION_EVALUATOR_CONFIG_TEMPLATE = """\
# This file is generated. Do not edit by hand.
# ruff: noqa: E501

from ._models import ClassificationEvaluatorConfig, PromptMessage

{{ classification_evaluator_config_name }} = {{ classification_evaluator_config_definition }}
"""

INIT_TEMPLATE = """\
# This file is generated. Do not edit by hand.

from ._models import ClassificationEvaluatorConfig, PromptMessage
{% for name in prompt_names -%}
from ._{{ name.lower() }} import {{ name }}
{% endfor %}

__all__ = [
"ClassificationEvaluatorConfig",
"PromptMessage",
{{ prompt_names|map('tojson')|join(', ') }}
]
"""


def get_models_file_contents() -> str:
"""
Gets the contents of _models.py containing Pydantic model definitions.
"""
template = Template(MODELS_TEMPLATE)
prompt_message_source = inspect.getsource(PromptMessage).strip()
classification_evaluator_config_source = inspect.getsource(
ClassificationEvaluatorConfig
).strip()
content = template.render(
prompt_message_source=prompt_message_source,
classification_evaluator_config_source=classification_evaluator_config_source,
)
return content


def get_prompt_file_contents(config: ClassificationEvaluatorConfig, name: str) -> str:
"""
Gets the Python code contents for a ClassificationEvaluatorConfig.
"""
template = Template(CLASSIFICATION_EVALUATOR_CONFIG_TEMPLATE)
content = template.render(
classification_evaluator_config_name=name,
classification_evaluator_config_definition=repr(config),
)
return content


def get_init_file_contents(prompt_names: list[str]) -> str:
"""
Gets the __init__.py file contents with exports for all prompts.
"""
template = Template(INIT_TEMPLATE)
content = template.render(prompt_names=prompt_names)
return content


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compile YAML prompts to Python code")
parser.add_argument(
"compiled_module_path",
type=Path,
help="Path to the compiled module",
)

args = parser.parse_args()

output_dir = args.compiled_module_path
prompts_dir = Path("prompts/classification_evaluator_configs")

# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Generate _models.py containing Pydantic model definitions
models_content = get_models_file_contents()
models_path = output_dir / "_models.py"
models_path.write_text(models_content, encoding="utf-8")

# Compile all YAML prompts to Python
yaml_files = list(prompts_dir.glob("*.yaml"))
prompt_names = []

for yaml_file in sorted(yaml_files):
# Read and validate YAML
with open(yaml_file, "r", encoding="utf-8") as f:
raw_config = yaml.safe_load(f)
config = ClassificationEvaluatorConfig.model_validate(raw_config)

# Generate Python code using YAML filename as the module/variable name
name = yaml_file.stem
content = get_prompt_file_contents(config, name)
prompt_names.append(name)

# Write to file
output_path = output_dir / f"_{name.lower()}.py"
output_path.write_text(content, encoding="utf-8")

# Generate the __init__.py file
init_content = get_init_file_contents(prompt_names)
init_path = output_dir / "__init__.py"
init_path.write_text(init_content, encoding="utf-8")
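
For orientation, since the generated files themselves are collapsed in this diff: given the repr-based templates above, the compiled module for the hallucination YAML would look roughly like the sketch below. The constant name matches the import used in hallucination.py; the module filename is derived from the YAML filename, which does not appear in this diff, so it is omitted here.

# Illustrative sketch of a compiled prompt module; the real file is written by
# compile_python_prompts.py and named after the YAML source file.
# This file is generated. Do not edit by hand.
# ruff: noqa: E501

from ._models import ClassificationEvaluatorConfig, PromptMessage

HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG = ClassificationEvaluatorConfig(
    name="hallucination",
    description="A specialized evaluator for detecting hallucinations in grounded LLM responses.",
    messages=[
        PromptMessage(role="user", content="In this task, you will be presented with a query, some context and a response. ..."),
    ],
    choices={"hallucinated": 0.0, "factual": 1.0},
)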