Commit 514db77

bduras, svlandeg, rmitsch authored

feat: create examples on init (#163)

* feat: add NER example gathering from initialization
* feat: add example gathering from init
* feat: add examples from init for REL task
* feat: add examples from init for textcat
* style: consistent naming, updated docstrings
* move example types within files
* rename gather_examples to add_prompt_examples
* black formatting
* rename to prompt_examples to avoid confusion
* refactor initialize loop to avoid having to put all examples in a list
* type fixes
* import fixes
* allow -1 to infer prompt examples from all examples
* black
* fix template rendering
* use set of labels to avoid duplicates internally
* update readme
* sort labels to obtain the same prompts across runs
* fixes
* add functionality to LemmaTask
* fixes
* Update spacy_llm/tasks/lemma.py (Co-authored-by: Sofie Van Landeghem <[email protected]>)
* Fix test config error.
* Fix test config error.
* Update docstring (to restart tests).
* Update readme.
* Rename infer_prompt_examples to n_prompt_examples.

Co-authored-by: svlandeg <[email protected]>
Co-authored-by: Raphael Mitsch <[email protected]>
Co-authored-by: Sofie Van Landeghem <[email protected]>
1 parent 6e1a7f7 commit 514db77

File tree

16 files changed (+382, -147 lines)


README.md

Lines changed: 42 additions & 6 deletions
@@ -305,6 +305,39 @@ Moreover, the task may define an optional [`scorer` method](https://spacy.io/api
It should accept an iterable of `Example`s as input and return a score dictionary.
If the `scorer` method is defined, `spacy-llm` will call it to evaluate the component.

#### Providing examples for few-shot prompts

All built-in tasks support few-shot prompts, i.e. including examples in a prompt. Examples can be supplied in two ways:
(1) as a separate file containing only examples, or (2) by initializing `llm` with a `get_examples()` callback (like any
other spaCy pipeline component).

##### (1) Few-shot example file

A file containing examples for few-shot prompting can be configured like this:

```ini
[components.llm.task]
@llm_tasks = "spacy.NER.v2"
labels = PERSON,ORGANISATION,LOCATION
[components.llm.task.examples]
@misc = "spacy.FewShotReader.v1"
path = "ner_examples.yml"
```

The supplied file has to conform to the format expected by the required task (see the task documentation further down).
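For the NER task configured above, such a file could look roughly like this. This is a hedged sketch inferred from the `SpanExample` fields introduced in this commit (`text` plus an `entities` mapping from label to entity texts), not a file from the repo; consult the task documentation for the authoritative format:

```yaml
- text: "Jack and Jill went up the hill."
  entities:
    PERSON:
      - Jack
      - Jill
- text: "Jill fetched a pail of water in Boston."
  entities:
    PERSON:
      - Jill
    LOCATION:
      - Boston
```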

##### (2) Initializing the `llm` component with a `get_examples()` callback

Alternatively, you can initialize your `nlp` pipeline by providing a `get_examples` callback for
[`nlp.initialize`](https://spacy.io/api/language#initialize) and setting `n_prompt_examples` to a positive number to
automatically fetch a few examples for few-shot learning. Set `n_prompt_examples` to `-1` to use all examples as
part of the few-shot learning prompt.

```ini
[initialize.components.llm]
n_prompt_examples = 3
```
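The effect of `n_prompt_examples` can be sketched in plain Python. This is an illustrative stand-alone sketch of the selection logic added to the tasks' `initialize` methods in this commit; `collect_prompt_examples` is a hypothetical name, not part of the spacy-llm API:

```python
def collect_prompt_examples(examples, n_prompt_examples=0):
    """Keep up to n_prompt_examples items for the few-shot prompt.

    0 keeps none (zero-shot), a positive number caps the count,
    and -1 keeps every example -- mirroring the initialize() loop.
    """
    prompt_examples = []
    for eg in examples:
        if n_prompt_examples < 0 or len(prompt_examples) < n_prompt_examples:
            prompt_examples.append(eg)
    return prompt_examples
```

With `n_prompt_examples = 3`, only the first three examples yielded by `get_examples()` end up in the prompt.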
#### <kbd>function</kbd> `task.generate_prompts`

Takes a collection of documents, and returns a collection of "prompts", which can be of type `Any`.
@@ -389,7 +422,10 @@ labels = PERSON,ORGANISATION,LOCATION
 path = "ner_examples.yml"
 ```
 
-If you don't have specific examples to provide to the LLM, you can write definitions for each label and provide them via the `label_definitions` argument. This lets you tell the LLM exactly what you're looking for rather than relying on the LLM to interpret its task given just the label name. Label descriptions are freeform so you can write whatever you want here, but through some experiments a brief description along with some examples and counter examples seems to work quite well.
+You can also write definitions for each label and provide them via the `label_definitions` argument. This lets you tell the LLM exactly what you're looking for rather than relying on the LLM to interpret its task given just the label name. Label descriptions are freeform so you can write whatever you want here, but through some experiments a brief description along with some examples and counter examples seems to work quite well.
 
 ```ini
 [components.llm.task]
@@ -627,15 +663,11 @@ labels = ["LivesIn", "Visits"]
 To perform few-shot learning, you can write down a few examples in a separate file, and provide these to be injected into the prompt to the LLM.
 The default reader `spacy.FewShotReader.v1` supports `.yml`, `.yaml`, `.json` and `.jsonl`.
 
-```json
+```jsonl
 {"text": "Laura bought a house in Boston with her husband Mark.", "ents": [{"start_char": 0, "end_char": 5, "label": "PERSON"}, {"start_char": 24, "end_char": 30, "label": "GPE"}, {"start_char": 48, "end_char": 52, "label": "PERSON"}], "relations": [{"dep": 0, "dest": 1, "relation": "LivesIn"}, {"dep": 2, "dest": 1, "relation": "LivesIn"}]}
 {"text": "Michael travelled through South America by bike.", "ents": [{"start_char": 0, "end_char": 7, "label": "PERSON"}, {"start_char": 26, "end_char": 39, "label": "LOC"}], "relations": [{"dep": 0, "dest": 1, "relation": "Visits"}]}
 ```
 
-Note: the REL task relies on pre-extracted entities to make its prediction.
-Hence, you'll need to add a component that populates `doc.ents` with recognized
-spans to your spaCy pipeline and put it _before_ the REL component.
-
 ```ini
 [components.llm.task]
 @llm_tasks = "spacy.REL.v1"
@@ -645,6 +677,10 @@ labels = ["LivesIn", "Visits"]
 path = "rel_examples.jsonl"
 ```
 
+Note: the REL task relies on pre-extracted entities to make its prediction.
+Hence, you'll need to add a component that populates `doc.ents` with recognized
+spans to your spaCy pipeline and put it _before_ the REL component.
+
 #### spacy.Lemma.v1
 
 The `Lemma.v1` task lemmatizes the provided text and updates the `lemma_` attribute in the doc's tokens accordingly.

spacy_llm/pipeline/llm.py

Lines changed: 1 addition & 1 deletion
@@ -287,7 +287,7 @@ def to_disk(
         if isinstance(self._model, Serializable):
             serialize["model"] = lambda p: self._model.to_disk(p, exclude=exclude)  # type: ignore[attr-defined]
 
-        return util.to_disk(path, serialize, exclude)
+        util.to_disk(path, serialize, exclude)
 
     def from_disk(
         self, path: Path, *, exclude: Tuple[str] = cast(Tuple[str], tuple())

spacy_llm/tasks/lemma.py

Lines changed: 22 additions & 8 deletions
@@ -1,6 +1,7 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Type
 
 import jinja2
+from pydantic import BaseModel
 from spacy.language import Language
 from spacy.scorer import Scorer
 from spacy.tokens import Doc
@@ -10,12 +11,15 @@
 from ..ty import ExamplesConfigType
 from .templates import read_template
 from .util import SerializableTask
-from .util.examples import LemmaExample
-from .util.serialization import ExampleType
 
 _DEFAULT_LEMMA_TEMPLATE_V1 = read_template("lemma")
 
 
+class LemmaExample(BaseModel):
+    text: str
+    lemmas: List[Dict[str, str]]
+
+
 @registry.llm_tasks("spacy.Lemma.v1")
 def make_lemma_task(
     template: str = _DEFAULT_LEMMA_TEMPLATE_V1,
@@ -29,11 +33,11 @@ def make_lemma_task(
     passed, then zero-shot learning will be used.
     """
     raw_examples = examples() if callable(examples) else examples
-    span_examples = (
+    lemma_examples = (
         [LemmaExample(**eg) for eg in raw_examples] if raw_examples else None
     )
 
-    return LemmaTask(template=template, examples=span_examples)
+    return LemmaTask(template=template, examples=lemma_examples)
 
 
 class LemmaTask(SerializableTask[LemmaExample]):
@@ -50,28 +54,33 @@ def __init__(
         passed, then zero-shot learning will be used.
         """
         self._template = template
-        self._examples = examples
+        self._prompt_examples = examples or []
 
     def initialize(
         self,
         get_examples: Callable[[], Iterable["Example"]],
         nlp: Language,
+        n_prompt_examples: int = 0,
         **kwargs: Any,
     ) -> None:
         """Nothing to initialize for the LEMMA task.
         get_examples (Callable[[], Iterable["Example"]]): Callable that provides examples
             for initialization.
         nlp (Language): Language instance.
-        labels (List[str]): Optional list of labels.
+        n_prompt_examples (int): How many prompt examples to infer from the provided Example objects.
+            0 by default. Takes all examples if set to -1.
         """
+        for eg in get_examples():
+            if n_prompt_examples < 0 or len(self._prompt_examples) < n_prompt_examples:
+                self._prompt_examples.append(self._create_prompt_example(eg))
 
     def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[str]:
         environment = jinja2.Environment()
         _template = environment.from_string(self._template)
         for doc in docs:
             prompt = _template.render(
                 text=doc.text,
-                examples=self._examples,
+                examples=self._prompt_examples,
             )
             yield prompt
 
@@ -114,5 +123,10 @@ def _cfg_keys(self) -> List[str]:
         return ["_template"]
 
     @property
-    def _Example(self) -> Type[ExampleType]:
+    def _Example(self) -> Type[LemmaExample]:
         return LemmaExample
+
+    def _create_prompt_example(self, example: Example) -> LemmaExample:
+        """Create a lemma prompt example from a spaCy example."""
+        lemma_dict = [{t.text: t.lemma_} for t in example.reference]
+        return LemmaExample(text=example.reference.text, lemmas=lemma_dict)
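The payload built by `_create_prompt_example` can be sketched without spaCy. Here a plain dict stands in for the pydantic `LemmaExample`, and `make_lemma_prompt_example` is a hypothetical helper that takes `(token, lemma)` pairs instead of a spaCy `Example`:

```python
def make_lemma_prompt_example(token_lemma_pairs):
    """Build a LemmaExample-shaped payload: the text plus one
    single-entry {token: lemma} dict per token."""
    return {
        "text": " ".join(text for text, _ in token_lemma_pairs),
        "lemmas": [{text: lemma} for text, lemma in token_lemma_pairs],
    }
```

Joining tokens with spaces only approximates `example.reference.text`, which preserves the document's original whitespace.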

spacy_llm/tasks/ner.py

Lines changed: 28 additions & 14 deletions
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 from spacy.language import Language
@@ -10,8 +11,8 @@
 from ..registry import registry
 from ..ty import ExamplesConfigType
 from ..util import split_labels
+from .span import SpanExample, SpanTask
 from .templates import read_template
-from .util import SpanExample, SpanTask
 
 _DEFAULT_NER_TEMPLATE_V1 = read_template("ner")
 _DEFAULT_NER_TEMPLATE_V2 = read_template("ner.v2")
@@ -52,7 +53,7 @@ def make_ner_task(
     return NERTask(
         labels=labels_list,
         template=_DEFAULT_NER_TEMPLATE_V1,
-        examples=span_examples,
+        prompt_examples=span_examples,
         normalizer=normalizer,
         alignment_mode=alignment_mode,
         case_sensitive_matching=case_sensitive_matching,
@@ -98,7 +99,7 @@ def make_ner_task_v2(
         labels=labels_list,
         template=template,
         label_definitions=label_definitions,
-        examples=span_examples,
+        prompt_examples=span_examples,
         normalizer=normalizer,
         alignment_mode=alignment_mode,
         case_sensitive_matching=case_sensitive_matching,
@@ -112,7 +113,7 @@ def __init__(
         labels: List[str] = [],
         template: str = _DEFAULT_NER_TEMPLATE_V2,
         label_definitions: Optional[Dict[str, str]] = None,
-        examples: Optional[List[SpanExample]] = None,
+        prompt_examples: Optional[List[SpanExample]] = None,
         normalizer: Optional[Callable[[str], str]] = None,
         alignment_mode: Literal["strict", "contract", "expand"] = "contract",
         case_sensitive_matching: bool = False,
@@ -140,7 +141,7 @@ def __init__(
             labels=labels,
             template=template,
             label_definitions=label_definitions,
-            examples=examples,
+            prompt_examples=prompt_examples,
             normalizer=normalizer,
             alignment_mode=alignment_mode,
             case_sensitive_matching=case_sensitive_matching,
@@ -152,6 +153,7 @@ def initialize(
         get_examples: Callable[[], Iterable["Example"]],
         nlp: Language,
         labels: List[str] = [],
+        n_prompt_examples: int = 0,
         **kwargs: Any,
     ) -> None:
         """Initialize the NER task, by auto-discovering labels.
@@ -166,22 +168,26 @@
             for initialization.
         nlp (Language): Language instance.
         labels (List[str]): Optional list of labels.
+        n_prompt_examples (int): How many prompt examples to infer from the Example objects.
+            0 by default. Takes all examples if set to -1.
         """
-
-        examples = get_examples()
-
         if not labels:
             labels = list(self._label_dict.values())
+        infer_labels = not labels
 
-        if not labels:
-            label_set = set()
+        if infer_labels:
+            labels = []
 
-            for eg in examples:
+        for eg in get_examples():
+            if infer_labels:
                 for ent in eg.reference.ents:
-                    label_set.add(ent.label_)
-            labels = list(label_set)
+                    labels.append(ent.label_)
+            if n_prompt_examples < 0 or len(self._prompt_examples) < n_prompt_examples:
+                self._prompt_examples.append(self._create_prompt_example(eg))
 
-        self._label_dict = {self._normalizer(label): label for label in labels}
+        self._label_dict = {
+            self._normalizer(label): label for label in sorted(set(labels))
+        }
 
     def assign_spans(
         self,
@@ -196,3 +202,11 @@ def scorer(
         examples: Iterable[Example],
     ) -> Dict[str, Any]:
         return get_ner_prf(examples)
+
+    def _create_prompt_example(self, example: Example) -> SpanExample:
+        """Create an NER prompt example from a spaCy example."""
+        entities = defaultdict(list)
+        for ent in example.reference.ents:
+            entities[ent.label_].append(ent.text)
+
+        return SpanExample(text=example.reference.text, entities=entities)
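Two ideas in the NER changes, grouping entity texts by label (as in `_create_prompt_example`) and sorting the deduplicated labels so the same prompt is produced across runs, can be sketched in isolation. `group_entities` is a hypothetical helper operating on plain `(text, label)` pairs rather than spaCy objects:

```python
from collections import defaultdict


def group_entities(ents):
    """Group entity texts by label and collect a deterministic label list.

    ents: iterable of (text, label) pairs.
    Returns ({label: [texts]}, sorted unique labels).
    """
    entities = defaultdict(list)
    labels = []
    for text, label in ents:
        entities[label].append(text)
        labels.append(label)
    # sorted(set(...)) deduplicates and fixes the order, mirroring the
    # _label_dict comprehension in NERTask.initialize.
    return dict(entities), sorted(set(labels))
```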
