Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions spacy_llm/tasks/util/span.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type

import jinja2
Expand Down Expand Up @@ -36,6 +37,48 @@ def __init__(
self._case_sensitive_matching = case_sensitive_matching
self._single_match = single_match

if self._examples:
self._examples = self._check_label_consistency()

def _check_label_consistency(self) -> List[SpanExample]:
    """Checks consistency of labels between examples and defined labels. Emits warning on inconsistency.
    Labels are compared in their normalized form (as produced by `self._normalizer`), while the warning reports
    the example labels as they are spelled in the examples.
    RETURNS (List[SpanExample]): Examples restricted to the labels declared in the task configuration. Examples
        left without any entities after this filtering are discarded.
    """
    # Internal invariant (also narrows the Optional for type checkers): the caller only invokes this with examples.
    assert self._examples

    # Normalized label -> label as written in the examples, so the warning can show the original spelling.
    example_labels = {
        self._normalizer(label): label
        for example in self._examples
        for label in example.entities
    }
    unspecified_labels = {
        label
        for normalized_label, label in example_labels.items()
        if normalized_label not in self._label_dict
    }

    # A non-empty set of unspecified labels is exactly the "examples are not a subset of the config" condition.
    if unspecified_labels:
        declared_labels = sorted(set(self._label_dict.values()))
        missing_labels = sorted(unspecified_labels)
        warnings.warn(
            f"Examples contain labels that are not specified in the task configuration. The latter contains the "
            f"following labels: {declared_labels}. Labels in examples missing from "
            f"the task configuration: {missing_labels}. Please ensure your label specification "
            f"and example labels are consistent."
        )

    # Return examples without non-declared labels. If an example only has undeclared labels, it is discarded.
    filtered_examples = (
        SpanExample(
            text=example.text,
            entities={
                label: entities
                for label, entities in example.entities.items()
                if self._normalizer(label) in self._label_dict
            },
        )
        for example in self._examples
    )
    return [example for example in filtered_examples if example.entities]

@property
def labels(self) -> Tuple[str, ...]:
    """Labels declared for this task.
    RETURNS (Tuple[str, ...]): The values of the task's label dictionary, in the dictionary's iteration order.
    """
    declared_labels = self._label_dict.values()
    return tuple(declared_labels)
Expand Down
20 changes: 20 additions & 0 deletions spacy_llm/tests/tasks/examples/ner_inconsistent.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
- text: Jack and Jill went up the hill.
  entities:
    PERSON:
      - Jack
      - Jill
    LOCATION:
      - hill
- text: spaCy is a great tool
  entities:
    TECH:
      - spaCy
- text: Jack and Jill went up the hill and spaCy is a great tool.
  entities:
    PERSON:
      - Jack
      - Jill
    LOCATION:
      - hill
    TECH:
      - spaCy
46 changes: 46 additions & 0 deletions spacy_llm/tests/tasks/test_ner.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import re
from pathlib import Path

import pytest
Expand Down Expand Up @@ -815,3 +816,48 @@ def test_ner_to_disk(noop_config, tmp_path: Path):
nlp2.from_disk(path)

assert task1._label_dict == task2._label_dict == labels


def test_label_inconsistency():
    """Verify that example labels missing from the task's declared labels trigger a warning and are filtered out."""
    examples_path = Path(__file__).parent / "examples" / "ner_inconsistent.yml"
    cfg = f"""
[nlp]
lang = "en"
pipeline = ["llm"]

[components]

[components.llm]
factory = "llm"

[components.llm.task]
@llm_tasks = "spacy.NER.v2"
labels = ["PERSON", "LOCATION"]

[components.llm.task.examples]
@misc = "spacy.FewShotReader.v1"
path = {str(examples_path)}

[components.llm.model]
@llm_models = "test.NoOpModel.v1"
"""

    # The fixture uses PERSON, LOCATION and TECH; only the first two are declared in the config above.
    expected_warning = (
        "Examples contain labels that are not specified in the task configuration. The latter contains the "
        "following labels: ['LOCATION', 'PERSON']. Labels in examples missing from the task configuration: "
        "['TECH']. Please ensure your label specification and example labels are consistent."
    )
    config = Config().from_str(cfg)
    with pytest.warns(UserWarning, match=re.escape(expected_warning)):
        nlp = assemble_from_config(config)

    # The TECH-only example must be gone; the remaining two keep only their declared labels.
    expected_texts = (
        "Jack and Jill went up the hill.",
        "Jack and Jill went up the hill and spaCy is a great tool.",
    )
    expected_entities = {"LOCATION": ["hill"], "PERSON": ["Jack", "Jill"]}

    examples = nlp.get_pipe("llm")._task._examples
    assert len(examples) == 2
    for example, expected_text in zip(examples, expected_texts):
        assert example.text == expected_text
        assert example.entities == expected_entities