Commit dfd43fe

rmitsch and svlandeg authored
Detect label inconsistency in SpanTask (#183)
* Check for label consistency in span tasks.
* Relax label comparison to sub-/superset one.
* Fix set comparison.
* Update spacy_llm/tests/tasks/examples/ner_inconsistent.yml
  Co-authored-by: Sofie Van Landeghem <[email protected]>
* Expand error message.
* Fix tests.
* Change exception to warning. Only discard invalid labels and examples containing only invalid labels.
* Fix test config error.
* Fix test config error.
* Incorporate feedback.
* Rename self._examples to self._prompt_examples.
* Fix non-REL test failures.
* Fix REL issue.

Co-authored-by: Sofie Van Landeghem <[email protected]>
1 parent 514db77 commit dfd43fe
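Note: the upshot of the change is that an inconsistent label in a few-shot example no longer raises an exception. Undeclared labels are stripped from the prompt examples, any example left without valid labels is discarded, and a warning lists the offending labels. A minimal sketch of that filtering rule, using plain dicts and a lowercasing normalizer as stand-ins for spacy-llm's SpanExample class and configured normalizer (filter_prompt_examples is a hypothetical name, not the library's API):

# Illustrative sketch only; the dict-based example format and
# filter_prompt_examples are hypothetical stand-ins, not spacy-llm's API.
import warnings
from typing import Dict, List

def filter_prompt_examples(examples: List[Dict], labels: List[str]) -> List[Dict]:
    declared = {label.lower() for label in labels}  # stand-in for the normalizer
    seen = {key for ex in examples for key in ex["entities"]}
    unspecified = sorted(key for key in seen if key.lower() not in declared)
    if unspecified:
        warnings.warn(
            f"Examples contain labels that are not specified in the task "
            f"configuration: {unspecified}."
        )
    # Strip undeclared labels from each example ...
    filtered = [
        {
            "text": ex["text"],
            "entities": {
                label: ents
                for label, ents in ex["entities"].items()
                if label.lower() in declared
            },
        }
        for ex in examples
    ]
    # ... and discard any example with no valid labels left.
    return [ex for ex in filtered if ex["entities"]]

examples = [
    {"text": "spaCy is a great tool", "entities": {"TECH": ["spaCy"]}},
    {
        "text": "Jack and Jill went up the hill and spaCy is a great tool.",
        "entities": {"PERSON": ["Jack", "Jill"], "TECH": ["spaCy"]},
    },
]
# The first example is discarded (only the undeclared TECH label);
# the second survives with TECH stripped.
print(filter_prompt_examples(examples, ["PERSON", "LOCATION"]))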

File tree

7 files changed (+121, -6 lines)


spacy_llm/tasks/rel.py

Lines changed: 1 addition & 2 deletions

@@ -3,7 +3,7 @@
 import jinja2
 from pydantic import BaseModel, Field, ValidationError, validator
 from spacy.language import Language
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc
 from spacy.training import Example
 from wasabi import msg

@@ -45,7 +45,6 @@ def _preannotate(doc: Union[Doc, RELExample]) -> str:
     text = doc.text

     for i, ent in enumerate(doc.ents):
-        assert isinstance(ent, Span)
         end = ent.end_char
         before, after = text[: end + offset], text[end + offset :]
spacy_llm/tasks/span.py

Lines changed: 43 additions & 0 deletions

@@ -1,3 +1,4 @@
+import warnings
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type

 import jinja2
@@ -43,6 +44,48 @@ def __init__(
         self._case_sensitive_matching = case_sensitive_matching
         self._single_match = single_match

+        if self._prompt_examples:
+            self._prompt_examples = self._check_label_consistency()
+
+    def _check_label_consistency(self) -> List[SpanExample]:
+        """Checks consistency of labels between examples and defined labels. Emits warning on inconsistency.
+        RETURNS (List[SpanExample]): List of SpanExamples with valid labels.
+        """
+        assert self._prompt_examples
+        example_labels = {
+            self._normalizer(key): key
+            for example in self._prompt_examples
+            for key in example.entities
+        }
+        unspecified_labels = {
+            example_labels[key]
+            for key in (set(example_labels.keys()) - set(self._label_dict.keys()))
+        }
+        if not set(example_labels.keys()) <= set(self._label_dict.keys()):
+            warnings.warn(
+                f"Examples contain labels that are not specified in the task configuration. The latter contains the "
+                f"following labels: {sorted(list(set(self._label_dict.values())))}. Labels in examples missing from "
+                f"the task configuration: {sorted(list(unspecified_labels))}. Please ensure your label specification "
+                f"and example labels are consistent."
+            )
+
+        # Return examples without non-declared labels. If an example only has undeclared labels, it is discarded.
+        return [
+            example
+            for example in [
+                SpanExample(
+                    text=example.text,
+                    entities={
+                        label: entities
+                        for label, entities in example.entities.items()
+                        if self._normalizer(label) in self._label_dict
+                    },
+                )
+                for example in self._prompt_examples
+            ]
+            if len(example.entities)
+        ]
+
     @property
     def labels(self) -> Tuple[str, ...]:
         return tuple(self._label_dict.values())
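The consistency check relies on Python's set subset operator rather than strict equality (the "relax label comparison to sub-/superset one" item in the commit message): A <= B holds when every element of A also occurs in B, so examples may legitimately use only a subset of the configured labels. A quick illustration:

# Subset semantics of the check in _check_label_consistency: only labels
# that appear in examples but not in the configuration trigger the warning.
assert {"person"} <= {"person", "location"}               # consistent, no warning
assert not {"person", "tech"} <= {"person", "location"}   # inconsistent, warns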
spacy_llm/tests/tasks/examples/ner_inconsistent.yml

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+- text: Jack and Jill went up the hill.
+  entities:
+    PERSON:
+      - Jack
+      - Jill
+    LOCATION:
+      - hill
+- text: spaCy is a great tool
+  entities:
+    TECH:
+      - spaCy
+- text: Jack and Jill went up the hill and spaCy is a great tool.
+  entities:
+    PERSON:
+      - Jack
+      - Jill
+    LOCATION:
+      - hill
+    TECH:
+      - spaCy
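With the task configured for PERSON and LOCATION only, as in the test below, the second example carries nothing but the undeclared TECH label and is discarded outright, while the third survives with its TECH entries stripped; the new test asserts exactly this outcome.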

spacy_llm/tests/tasks/test_ner.py

Lines changed: 54 additions & 1 deletion

@@ -1,4 +1,5 @@
 import json
+import re
 from pathlib import Path

 import pytest
@@ -200,7 +201,7 @@ def test_ner_config(cfg_string, request):
     labels = split_labels(labels)
     task = pipe.task
     assert isinstance(task, Labeled)
-    assert task.labels == tuple(labels)
+    assert sorted(task.labels) == sorted(tuple(labels))
     assert pipe.labels == task.labels
     assert nlp.pipe_labels["llm"] == list(task.labels)

@@ -827,3 +828,55 @@ def test_ner_to_disk(noop_config, tmp_path: Path):
     nlp2.from_disk(path)

     assert task1._label_dict == task2._label_dict == labels
+
+
+def test_label_inconsistency():
+    """Test whether inconsistency between specified labels and labels in examples is detected."""
+    cfg = f"""
+[nlp]
+lang = "en"
+pipeline = ["llm"]
+
+[components]
+
+[components.llm]
+factory = "llm"
+
+[components.llm.task]
+@llm_tasks = "spacy.NER.v2"
+labels = ["PERSON", "LOCATION"]
+
+[components.llm.task.examples]
+@misc = "spacy.FewShotReader.v1"
+path = {str((Path(__file__).parent / "examples" / "ner_inconsistent.yml"))}
+
+[components.llm.model]
+@llm_models = "test.NoOpModel.v1"
+"""
+
+    config = Config().from_str(cfg)
+    with pytest.warns(
+        UserWarning,
+        match=re.escape(
+            "Examples contain labels that are not specified in the task configuration. The latter contains the "
+            "following labels: ['LOCATION', 'PERSON']. Labels in examples missing from the task configuration: "
+            "['TECH']. Please ensure your label specification and example labels are consistent."
+        ),
+    ):
+        nlp = assemble_from_config(config)
+
+    prompt_examples = nlp.get_pipe("llm")._task._prompt_examples
+    assert len(prompt_examples) == 2
+    assert prompt_examples[0].text == "Jack and Jill went up the hill."
+    assert prompt_examples[0].entities == {
+        "LOCATION": ["hill"],
+        "PERSON": ["Jack", "Jill"],
+    }
+    assert (
+        prompt_examples[1].text
+        == "Jack and Jill went up the hill and spaCy is a great tool."
+    )
+    assert prompt_examples[1].entities == {
+        "LOCATION": ["hill"],
+        "PERSON": ["Jack", "Jill"],
+    }
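The test drives the check through the assemble_from_config helper; in regular use the same UserWarning is emitted when a pipeline is assembled from a config file. A sketch, assuming a config on disk equivalent to the one embedded in the test (the file name is hypothetical):

# Assembling a pipeline whose few-shot examples use labels missing from the
# task config emits the warning above; "config.cfg" is a hypothetical path.
from spacy_llm.util import assemble

nlp = assemble("config.cfg")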

spacy_llm/tests/tasks/test_rel.py

Lines changed: 1 addition & 1 deletion

@@ -133,7 +133,7 @@ def test_rel_config(cfg_string, request: FixtureRequest):

 @pytest.mark.external
 @pytest.mark.skipif(has_openai_key is False, reason="OpenAI API key not available")
-@pytest.mark.parametrize("cfg_string", ["zeroshot_cfg_string", "fewshot_cfg_string"])
+@pytest.mark.parametrize("cfg_string", ["fewshot_cfg_string"])  # "zeroshot_cfg_string",
 def test_rel_predict(task, cfg_string, request):
     """Use OpenAI to get REL results.
     Note that this test may fail randomly, as the LLM's output is unguaranteed to be consistent/predictable

spacy_llm/tests/tasks/test_spancat.py

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ def test_spancat_config(cfg_string, request):
     labels = split_labels(labels)
     task = pipe.task
     assert isinstance(task, Labeled)
-    assert task.labels == tuple(labels)
+    assert sorted(task.labels) == sorted(tuple(labels))
     assert pipe.labels == task.labels
     assert nlp.pipe_labels["llm"] == list(task.labels)

spacy_llm/tests/tasks/test_textcat.py

Lines changed: 1 addition & 1 deletion

@@ -209,7 +209,7 @@ def test_textcat_config(task, cfg_string, request):
     labels = split_labels(labels)
     task = pipe.task
     assert isinstance(task, Labeled)
-    assert task.labels == tuple(labels)
+    assert sorted(task.labels) == sorted(tuple(labels))
     assert pipe.labels == task.labels
     assert nlp.pipe_labels["llm"] == list(task.labels)
