Change exception to warning. Only discard invalid labels and examples containing only invalid labels.

rmitsch · rmitsch · commit d2aadb365856 · 2023-06-29T16:15:42.000+02:00
diff --git a/spacy_llm/tasks/util/span.py b/spacy_llm/tasks/util/span.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type
 
 import jinja2
@@ -36,12 +37,14 @@ def __init__(
         self._case_sensitive_matching = case_sensitive_matching
         self._single_match = single_match
 
-        self._check_label_consistency()
+        if self._examples:
+            self._examples = self._check_label_consistency()
 
-    def _check_label_consistency(self) -> None:
-        """Checks consistency of labels between examples and defined labels."""
-        if not self._examples:
-            return
+    def _check_label_consistency(self) -> List[SpanExample]:
+        """Checks consistency of labels between examples and defined labels. Emits warning on inconsistency.
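+        RETURNS (List[SpanExample]): Examples with undeclared labels removed; examples left with no labels are dropped.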
+        """
+        assert self._examples
         example_labels = {
             self._normalizer(key): key
             for example in self._examples
@@ -52,13 +55,30 @@ def _check_label_consistency(self) -> None:
             for key in (set(example_labels.keys()) - set(self._label_dict.keys()))
         }
         if not set(example_labels.keys()) <= set(self._label_dict.keys()):
-            raise ValueError(
+            warnings.warn(
                 f"Examples contain labels that are not specified in the task configuration. The latter contains the "
                 f"following labels: {sorted(list(set(self._label_dict.values())))}. Labels in examples missing from "
                 f"the task configuration: {sorted(list(unspecified_labels))}. Please ensure your label specification "
                 f"and example labels are consistent."
             )
 
+        # Return examples without non-declared labels. If an example only has undeclared labels, it is discarded.
+        return [
+            example
+            for example in [
+                SpanExample(
+                    text=example.text,
+                    entities={
+                        label: entities
+                        for label, entities in example.entities.items()
+                        if self._normalizer(label) in self._label_dict
+                    },
+                )
+                for example in self._examples
+            ]
+            if len(example.entities)
+        ]
+
     @property
     def labels(self) -> Tuple[str, ...]:
         return tuple(self._label_dict.values())
diff --git a/spacy_llm/tests/tasks/examples/ner_inconsistent.yml b/spacy_llm/tests/tasks/examples/ner_inconsistent.yml
@@ -7,5 +7,14 @@
       - hill
 - text: spaCy is a great tool
   entities:
+    TECH:
+      - spaCy
+- text: Jack and Jill went up the hill and spaCy is a great tool.
+  entities:
+    PERSON:
+      - Jack
+      - Jill
+    LOCATION:
+      - hill
     TECH:
       - spaCy
diff --git a/spacy_llm/tests/tasks/test_ner.py b/spacy_llm/tests/tasks/test_ner.py
@@ -851,12 +851,21 @@ def test_label_inconsistency():
     """
 
     config = Config().from_str(cfg)
-    with pytest.raises(
-        ValueError,
+    with pytest.warns(
+        UserWarning,
         match=re.escape(
             "Examples contain labels that are not specified in the task configuration. The latter contains the "
             "following labels: ['LOCATION', 'PERSON']. Labels in examples missing from the task configuration: "
             "['TECH']. Please ensure your label specification and example labels are consistent."
         ),
     ):
-        assemble_from_config(config)
+        nlp = assemble_from_config(config)
+
+    examples = nlp.get_pipe("llm")._task._examples
+    assert len(examples) == 2
+    assert examples[0].text == "Jack and Jill went up the hill."
+    assert examples[0].entities == {"LOCATION": ["hill"], "PERSON": ["Jack", "Jill"]}
+    assert (
+        examples[1].text == "Jack and Jill went up the hill and spaCy is a great tool."
+    )
+    assert examples[1].entities == {"LOCATION": ["hill"], "PERSON": ["Jack", "Jill"]}