30 changes: 30 additions & 0 deletions .github/workflows/python-CI.yml
@@ -36,6 +36,7 @@ jobs:
ipynb: ${{ steps.filter.outputs.ipynb }}
ipynb_files: ${{ steps.filter.outputs.ipynb_files }}
proto: ${{ steps.filter.outputs.proto }}
prompts: ${{ steps.filter.outputs.prompts }}
phoenix: ${{ steps.filter.outputs.phoenix }}
phoenix_client: ${{ steps.filter.outputs.phoenix_client }}
phoenix_evals: ${{ steps.filter.outputs.phoenix_evals }}
@@ -53,6 +54,9 @@ jobs:
proto:
- "src/phoenix/proto/**"
- "src/phoenix/trace/v1/**"
prompts:
- "prompts/**"
- "scripts/prompts/compile_python_prompts.py"
phoenix:
- "src/**"
- "tests/**"
@@ -75,6 +79,7 @@ jobs:
echo "ipynb: ${{ steps.filter.outputs.ipynb }}"
echo "ipynb_files: ${{ steps.filter.outputs.ipynb_files }}"
echo "proto: ${{ steps.filter.outputs.proto }}"
echo "prompts: ${{ steps.filter.outputs.prompts }}"
echo "phoenix: ${{ steps.filter.outputs.phoenix }}"
echo "phoenix_client: ${{ steps.filter.outputs.phoenix_client }}"
echo "phoenix_evals: ${{ steps.filter.outputs.phoenix_evals }}"
@@ -266,6 +271,31 @@ jobs:
run: uvx --with tox-uv==1.27.0 --with uv==0.8.6 tox run -e compile_protobuf
- run: git diff --exit-code

compile-prompts:
name: Compile Prompts
needs: changes
if: ${{ needs.changes.outputs.prompts == 'true' }}
strategy:
matrix:
py: ["3.10"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python ${{ matrix.py }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.py }}
- name: Set up `uv`
uses: astral-sh/setup-uv@v6
with:
version: 0.8.6
- name: Compile Prompts
run: uvx --with tox-uv==1.27.0 --with uv==0.8.6 tox run -e compile_prompts
- name: Check for changes
run: git diff --exit-code

ruff:
name: Ruff
runs-on: ${{ matrix.os }}
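
The new compile-prompts job mirrors the compile-protobuf job above: it regenerates the prompt modules and then relies on git diff --exit-code to fail the build if the committed generated code is stale. Below is a minimal sketch of that check for running locally; the direct script invocation and the output path are assumptions, since the actual command is defined by the compile_prompts tox environment, which is not part of this diff.

# freshness_check.py -- illustrative sketch; run from the repository root.
import subprocess
import sys

# Assumed output location, inferred from the import path used in hallucination.py below;
# the real target is whatever the compile_prompts tox environment passes to the script.
OUTPUT_MODULE = "packages/phoenix-evals/src/phoenix/evals/__generated__/classification_evaluator_configs"

# Regenerate the Python prompt modules from the YAML sources.
subprocess.run(
    [sys.executable, "scripts/prompts/compile_python_prompts.py", OUTPUT_MODULE],
    check=True,
)

# Exit non-zero (as the CI job does) if regeneration left uncommitted changes behind.
sys.exit(subprocess.run(["git", "diff", "--exit-code"]).returncode)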

Some generated files are not rendered by default.

42 changes: 10 additions & 32 deletions packages/phoenix-evals/src/phoenix/evals/metrics/hallucination.py
@@ -1,36 +1,11 @@
from pydantic import BaseModel, Field

from ..__generated__.classification_evaluator_configs import (
HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG,
)
from ..evaluators import ClassificationEvaluator
from ..llm import LLM
from ..templating import Template

_DEFAULT_HALLUCINATION_TEMPLATE = """\
In this task, you will be presented with a query, some context and a response. The response
is generated to the question based on the context. The response may contain false
information. You must use the context to determine if the response to the question
contains false information, if the response is a hallucination of facts. Your objective is
to determine whether the response text contains factual information and is not a
hallucination. A 'hallucination' refers to a response that is not based on the context or
assumes information that is not available in the context. Your response should be a single
word: either 'factual' or 'hallucinated', and it should not include any other text or
characters. 'hallucinated' indicates that the response provides factually inaccurate
information to the query based on the context. 'factual' indicates that the response to
the question is correct relative to the context, and does not contain made up
information. Please read the query and context carefully before determining your
response.

[BEGIN DATA]
************
[Query]: {input}
************
[Context]: {context}
************
[Response]: {output}
************
[END DATA]

Is the response above factual or hallucinated based on the query and context?
"""
from ..templating import Template, TemplateFormat


class HallucinationEvaluator(ClassificationEvaluator):
Expand Down Expand Up @@ -65,9 +40,12 @@ class HallucinationEvaluator(ClassificationEvaluator):

"""

NAME = "hallucination"
PROMPT = Template(template=_DEFAULT_HALLUCINATION_TEMPLATE)
CHOICES = {"hallucinated": 0.0, "factual": 1.0}
NAME = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.name
PROMPT = Template(
template=HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.messages[0].content,
template_format=TemplateFormat.MUSTACHE,
)
CHOICES = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.choices

class HallucinationInputSchema(BaseModel):
input: str = Field(description="The input query.")
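
Since NAME, PROMPT, and CHOICES are now read from the generated config rather than hard-coded, one way to confirm the refactor is behavior-preserving is to compare the class constants against the YAML source added below. A small sanity-check sketch, assuming phoenix-evals is installed with the generated module in place:

# check_hallucination_config.py -- sketch; verifies the constants mirror the YAML.
from phoenix.evals.__generated__.classification_evaluator_configs import (
    HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG,
)
from phoenix.evals.metrics.hallucination import HallucinationEvaluator

assert HallucinationEvaluator.NAME == "hallucination"
assert HallucinationEvaluator.CHOICES == {"hallucinated": 0.0, "factual": 1.0}

# The mustache placeholders the evaluator fills in at evaluation time.
content = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.messages[0].content
for placeholder in ("{{input}}", "{{context}}", "{{output}}"):
    assert placeholder in content
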
@@ -0,0 +1,33 @@
name: hallucination
description: A specialized evaluator for detecting hallucinations in grounded LLM responses.
messages:
- role: user
content: |
In this task, you will be presented with a query, some context and a response. The response
is generated to the question based on the context. The response may contain false
information. You must use the context to determine if the response to the question
contains false information, if the response is a hallucination of facts. Your objective is
to determine whether the response text contains factual information and is not a
hallucination. A 'hallucination' refers to a response that is not based on the context or
assumes information that is not available in the context. Your response should be a single
word: either 'factual' or 'hallucinated', and it should not include any other text or
characters. 'hallucinated' indicates that the response provides factually inaccurate
information to the query based on the context. 'factual' indicates that the response to
the question is correct relative to the context, and does not contain made up
information. Please read the query and context carefully before determining your
response.

[BEGIN DATA]
************
[Query]: {{input}}
************
[Context]: {{context}}
************
[Response]: {{output}}
************
[END DATA]

Is the response above factual or hallucinated based on the query and context?
choices:
hallucinated: 0.0
factual: 1.0
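
Note that the placeholders here use mustache-style double braces ({{input}}) rather than the single braces ({input}) of the deleted inline template, which is why the Python class now passes template_format=TemplateFormat.MUSTACHE. A tiny stand-alone illustration of why the format flag matters; phoenix-evals' own Template class does the real rendering, this is only a naive stand-in:

# placeholder_styles.py -- illustrative only.
data = {"input": "What is 2 + 2?"}

old_style = "[Query]: {input}"    # single braces, str.format-compatible
new_style = "[Query]: {{input}}"  # mustache-style double braces

print(old_style.format(**data))   # "[Query]: What is 2 + 2?"
# str.format treats doubled braces as escapes and leaves a literal "{input}",
# so the mustache template needs a mustache-aware renderer instead.
print(new_style.format(**data))   # "[Query]: {input}"  (not substituted)
print(new_style.replace("{{input}}", data["input"]))  # naive mustache stand-in
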
147 changes: 147 additions & 0 deletions scripts/prompts/compile_python_prompts.py
@@ -0,0 +1,147 @@
"""
Compiles YAML prompts into Python code.
"""

import argparse
import inspect
from pathlib import Path
from typing import Literal

import yaml
from jinja2 import Template
from pydantic import BaseModel


# Base message class copied into the compiled module.
class PromptMessage(BaseModel):
role: Literal["user"]
content: str


# Base classification evaluator config class copied into the compiled module.
class ClassificationEvaluatorConfig(BaseModel):
name: str
description: str
messages: list[PromptMessage]
choices: dict[str, float]


MODELS_TEMPLATE = """\
# This file is generated. Do not edit by hand.

from typing import Literal

from pydantic import BaseModel


{{ prompt_message_source }}

{{ classification_evaluator_config_source }}
"""

CLASSIFICATION_EVALUATOR_CONFIG_TEMPLATE = """\
# This file is generated. Do not edit by hand.
# ruff: noqa: E501

from ._models import ClassificationEvaluatorConfig, PromptMessage

{{ classification_evaluator_config_name }} = {{ classification_evaluator_config_definition }}
"""

INIT_TEMPLATE = """\
# This file is generated. Do not edit by hand.

from ._models import ClassificationEvaluatorConfig, PromptMessage
{% for name in prompt_names -%}
from ._{{ name.lower() }} import {{ name }}
{% endfor %}

__all__ = [
"ClassificationEvaluatorConfig",
"PromptMessage",
{{ prompt_names|map('tojson')|join(', ') }}
]
"""


def get_models_file_contents() -> str:
"""
Gets the contents of _models.py containing Pydantic model definitions.
"""
template = Template(MODELS_TEMPLATE)
prompt_message_source = inspect.getsource(PromptMessage).strip()
classification_evaluator_config_source = inspect.getsource(
ClassificationEvaluatorConfig
).strip()
content = template.render(
prompt_message_source=prompt_message_source,
classification_evaluator_config_source=classification_evaluator_config_source,
)
return content


def get_prompt_file_contents(config: ClassificationEvaluatorConfig, name: str) -> str:
"""
Gets the Python code contents for a ClassificationEvaluatorConfig.
"""
template = Template(CLASSIFICATION_EVALUATOR_CONFIG_TEMPLATE)
content = template.render(
classification_evaluator_config_name=name,
classification_evaluator_config_definition=repr(config),
)
return content


def get_init_file_contents(prompt_names: list[str]) -> str:
"""
Gets the __init__.py file contents with exports for all prompts.
"""
template = Template(INIT_TEMPLATE)
content = template.render(prompt_names=prompt_names)
return content


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compile YAML prompts to Python code")
parser.add_argument(
"compiled_module_path",
type=Path,
help="Path to the compiled module",
)

args = parser.parse_args()

output_dir = args.compiled_module_path
prompts_dir = Path("prompts/classification_evaluator_configs")

# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Generate _models.py containing Pydantic model definitions
models_content = get_models_file_contents()
models_path = output_dir / "_models.py"
models_path.write_text(models_content, encoding="utf-8")

# Compile all YAML prompts to Python
yaml_files = list(prompts_dir.glob("*.yaml"))
prompt_names = []

for yaml_file in sorted(yaml_files):
# Read and validate YAML
with open(yaml_file, "r", encoding="utf-8") as f:
raw_config = yaml.safe_load(f)
config = ClassificationEvaluatorConfig.model_validate(raw_config)

# Generate Python code using YAML filename as the module/variable name
name = yaml_file.stem
content = get_prompt_file_contents(config, name)
prompt_names.append(name)

# Write to file
output_path = output_dir / f"_{name.lower()}.py"
output_path.write_text(content, encoding="utf-8")

# Generate the __init__.py file
init_content = get_init_file_contents(prompt_names)
init_path = output_dir / "__init__.py"
init_path.write_text(init_content, encoding="utf-8")
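
For orientation, since the generated files themselves are collapsed in this diff: given the repr-based templates above, the compiled module for the hallucination YAML would look roughly like the sketch below. The constant name matches the import used in hallucination.py; the module filename is derived from the YAML filename, which does not appear in this diff, so it is omitted here.

# Illustrative sketch of a compiled prompt module; the real file is written by
# compile_python_prompts.py and named after the YAML source file.
# This file is generated. Do not edit by hand.
# ruff: noqa: E501

from ._models import ClassificationEvaluatorConfig, PromptMessage

HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG = ClassificationEvaluatorConfig(
    name="hallucination",
    description="A specialized evaluator for detecting hallucinations in grounded LLM responses.",
    messages=[
        PromptMessage(role="user", content="In this task, you will be presented with a query, some context and a response. ..."),
    ],
    choices={"hallucinated": 0.0, "factual": 1.0},
)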