explosion · bdura · May 25, 2023 · May 23, 2023 · May 23, 2023 · May 23, 2023
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -84,6 +84,7 @@ jobs:
         run: |
           python -m pip install -U -r requirements.txt
           python -m pip install -U -r requirements-dev.txt
+          python -m spacy download en_core_web_md
 
       - name: Run spacy_llm tests
         shell: bash

diff --git a/.gitignore b/.gitignore
@@ -88,6 +88,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+*.ipynb
 
 # IPython
 profile_default/

diff --git a/README.md b/README.md
@@ -364,6 +364,45 @@ labels = COMPLIMENT,INSULT
 path = "textcat_examples.json"
 ```
 
+#### spacy.REL.v1
+
+The built-in REL task supports both zero-shot and few-shot prompting.
+It relies on an upstream NER component for entity extraction.
+
+```ini
+[components.llm.task]
+@llm_tasks = "spacy.REL.v1"
+labels = LivesIn,Visits
+```
+
+| Argument            | Type                                    | Default | Description                                                                                                          |
+| ------------------- | --------------------------------------- | ------- | -------------------------------------------------------------------------------------------------------------------- |
+| `labels`            | `str`                                   |         | Comma-separated list of relation labels.                                                                             |
+| `label_definitions` | `Optional[Dict[str, str]]`              | `None`  | Dictionary providing a description for each relation label.                                                          |
+| `examples`          | `Optional[Callable[[], Iterable[Any]]]` | `None`  | Optional function that generates examples for few-shot learning.                                                     |
+| `normalizer`        | `Optional[Callable[[str], str]]`        | `None`  | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. |
+| `verbose`           | `bool`                                  | `False` | If set to `True`, warnings will be generated when the LLM returns invalid responses.                                 |
+
+To perform few-shot learning, you can write down a few examples in a separate file, and provide these to be injected into the prompt to the LLM.
+The default reader `spacy.FewShotReader.v1` supports `.yml`, `.yaml`, `.json` and `.jsonl`.
+
+```json
+{"text": "Laura bought a house in Boston with her husband Mark.", "ents": [{"start_char": 0, "end_char": 5, "label": "PERSON"}, {"start_char": 24, "end_char": 30, "label": "GPE"}, {"start_char": 48, "end_char": 52, "label": "PERSON"}], "relations": [{"dep": 0, "dest": 1, "relation": "LivesIn"}, {"dep": 2, "dest": 1, "relation": "LivesIn"}]}
+{"text": "Michael travelled through South America by bike.", "ents": [{"start_char": 0, "end_char": 7, "label": "PERSON"}, {"start_char": 26, "end_char": 39, "label": "LOC"}], "relations": [{"dep": 0, "dest": 1, "relation": "Visits"}]}
+```
+
+Note: you'll need to add a NER-capable component to your spaCy pipeline
+and put it _before_ the REL component.
+
+```ini
+[components.llm.task]
+@llm_tasks = "spacy.REL.v1"
+labels = LivesIn,Visits
+[components.llm.task.examples]
+@misc = "spacy.FewShotReader.v1"
+path = "rel_examples.jsonl"
+```
+
 #### spacy.NoOp.v1
 
 This task is only useful for testing - it tells the LLM to do nothing, and does not set any fields on the `docs`.

diff --git a/spacy_llm/tasks/__init__.py b/spacy_llm/tasks/__init__.py
@@ -1,6 +1,13 @@
 from .ner import NERTask
 from .noop import NoopTask
+from .rel import RELTask
 from .spancat import SpanCatTask
 from .textcat import TextCatTask
 
-__all__ = ["NoopTask", "NERTask", "TextCatTask", "SpanCatTask"]
+__all__ = [
+    "NoopTask",
+    "NERTask",
+    "TextCatTask",
+    "SpanCatTask",
+    "RELTask",
+]
diff --git a/spacy_llm/tasks/rel.py b/spacy_llm/tasks/rel.py
@@ -0,0 +1,114 @@
+from typing import Callable, Dict, Iterable, List, Optional, Union
+
+import jinja2
+from pydantic import BaseModel, Field, ValidationError, validator
+from spacy.tokens import Doc
+from wasabi import msg
+
+from ..registry import lowercase_normalizer, registry
+from .templates import read_template
+
+
+class RelationItem(BaseModel):
+    dep: int
+    dest: int
+    relation: str
+
+    @validator("dep", "dest", pre=True)
+    def clean_ent(cls, value):
+        if isinstance(value, str):
+            value = value.strip("ENT")
+        return value
+
+
+class EntityItem(BaseModel):
+    """Character span and label of one entity in a few-shot example."""
+
+    start_char: int
+    end_char: int
+    # Populated from the "label" key of the input data. Presumably named
+    # `label_` so that `_preannotate` can read it the same way as on the
+    # entities of a `Doc`.
+    label_: str = Field(alias="label")
+
+
+class RELExample(BaseModel):
+    """A few-shot example: a text, its entities and the relations between them."""
+
+    text: str
+    ents: List[EntityItem]
+    relations: List[RelationItem]
+
+
+def _preannotate(doc: Union[Doc, RELExample]) -> str:
+    """Creates a text version of the document with annotated entities."""
+    offset = 0
+
+    text = doc.text
+
+    for i, ent in enumerate(doc.ents):
+        end = ent.end_char
+        before, after = text[: end + offset], text[end + offset :]
+
+        annotation = f"[ENT{i}:{ent.label_}]"
+        offset += len(annotation)
+
+        text = f"{before}{annotation}{after}"
+
+    return text
+
+
+@registry.llm_tasks("spacy.REL.v1")
+class RELTask:
+    """Simple REL task. Populates a `Doc._.rel` custom attribute."""
+
+    _TEMPLATE_STR = read_template("rel")
+
+    def __init__(
+        self,
+        labels: str,
+        label_definitions: Optional[Dict[str, str]] = None,
+        examples: Optional[Callable[[], Iterable[Dict]]] = None,
+        normalizer: Optional[Callable[[str], str]] = None,
+        verbose: bool = False,
+    ):
+
+        if not Doc.has_extension("rel"):
+            Doc.set_extension("rel", default=[])
+
+        self._normalizer = normalizer if normalizer else lowercase_normalizer()
+        self._label_dict = {
+            self._normalizer(label): label for label in labels.split(",")
+        }
+        self._label_definitions = label_definitions
+        self._examples = examples and [RELExample.parse_obj(eg) for eg in examples()]
+
+        self._verbose = verbose
+
+    def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[str]:
+        environment = jinja2.Environment()
+        _template = environment.from_string(self._TEMPLATE_STR)
+        for doc in docs:
+            prompt = _template.render(
+                text=_preannotate(doc),
+                labels=list(self._label_dict.values()),
+                label_definitions=self._label_definitions,
+                examples=self._examples,
+                preannotate=_preannotate,
+            )
+            yield prompt
+
+    def _format_response(self, response: str) -> Iterable[RelationItem]:
+        """Parse raw string response into a structured format"""
+        relations = []
+        for line in response.strip().split("\n"):
+            try:
+                relations.append(RelationItem.parse_raw(line))
+            except ValidationError:
+                msg.warn(
+                    "Validation issue",
+                    line,
+                    show=self._verbose,
+                )
+        return relations
+
+    def parse_responses(
+        self, docs: Iterable[Doc], responses: Iterable[str]
+    ) -> Iterable[Doc]:
+        for doc, prompt_response in zip(docs, responses):
+            rels = self._format_response(prompt_response)
+            doc._.rel = rels
+            yield doc
diff --git a/spacy_llm/tasks/templates/rel.jinja b/spacy_llm/tasks/templates/rel.jinja
@@ -0,0 +1,57 @@
+The text below contains pre-extracted entities, denoted in the following format within the text:
+{# whitespace #}
+<entity text>[ENT<entity id>:<entity label>]
+{# whitespace #}
+From the text below, extract the following relations between entities:
+{# whitespace #}
+{# whitespace #}
+{%- for label in labels -%}
+{{ label }}
+{# whitespace #}
+{%- endfor -%}
+{# whitespace #}
+The extraction has to use the following format, with one line for each detected relation:
+{# whitespace #}
+{"dep": <entity id>, "dest": <entity id>, "relation": <relation label>}
+{# whitespace #}
+Make sure that only relevant relations are listed, and that each line is a valid JSON object.
+{# whitespace #}
+{%- if label_definitions -%}
+Below are definitions of each label to help aid you in what kinds of relationship to extract for each label.
+Assume these definitions are written by an expert and follow them closely.
+{# whitespace #}
+{# whitespace #}
+{%- for label, definition in label_definitions.items() -%}
+{{ label }}: {{ definition }}
+{# whitespace #}
+{%- endfor -%}
+{# whitespace #}
+{# whitespace #}
+{%- endif -%}
+{%- if examples -%}
+Below are some examples (only use these as a guide):
+{# whitespace #}
+{# whitespace #}
+{%- for example in examples -%}
+Text:
+'''
+{{ preannotate(example) }}
+'''
+{# whitespace #}
+{%- for item in example.relations -%}
+{# whitespace #}
+{{ item.json() }}
+{%- endfor -%}
+{# whitespace #}
+{# whitespace #}
+{# whitespace #}
+{%- endfor -%}
+{# whitespace #}
+{# whitespace #}
+{%- endif -%}
+Here is the text that needs labeling:
+{# whitespace #}
+Text:
+'''
+{{ text }}
+'''
diff --git a/spacy_llm/tests/tasks/examples/rel_examples.jsonl b/spacy_llm/tests/tasks/examples/rel_examples.jsonl
@@ -0,0 +1,2 @@
+{"text": "Laura bought a house in Boston with her husband Mark.", "ents": [{"start_char": 0, "end_char": 5, "label": "PERSON"}, {"start_char": 24, "end_char": 30, "label": "GPE"}, {"start_char": 48, "end_char": 52, "label": "PERSON"}], "relations": [{"dep": 0, "dest": 1, "relation": "LivesIn"}, {"dep": 2, "dest": 1, "relation": "LivesIn"}]}
+{"text": "Michael travelled through South America by bike.", "ents": [{"start_char": 0, "end_char": 7, "label": "PERSON"}, {"start_char": 26, "end_char": 39, "label": "LOC"}], "relations": [{"dep": 0, "dest": 1, "relation": "Visits"}]}
diff --git a/spacy_llm/tests/tasks/test_rel.py b/spacy_llm/tests/tasks/test_rel.py
@@ -0,0 +1,104 @@
+from pathlib import Path
+
+import pytest
+import spacy
+from confection import Config
+from pytest import FixtureRequest
+
+from spacy_llm.tasks.rel import RelationItem
+
+from ..compat import has_openai_key
+
+EXAMPLES_DIR = Path(__file__).parent / "examples"
+
+
+@pytest.fixture
+def zeroshot_cfg_string():
+    # Config for a pipeline with an "ner" component sourced from
+    # `en_core_web_md`, followed by an "llm" component running the
+    # `spacy.REL.v1` task without few-shot examples.
+    return """
+    [nlp]
+    lang = "en"
+    pipeline = ["ner", "llm"]
+    batch_size = 128
+
+    [components]
+
+    [components.ner]
+    source = "en_core_web_md"
+
+    [components.llm]
+    factory = "llm"
+
+    [components.llm.task]
+    @llm_tasks = "spacy.REL.v1"
+    labels = "LivesIn,Visits"
+
+    [components.llm.backend]
+    @llm_backends = "spacy.REST.v1"
+    api = "OpenAI"
+    """
+
+
+@pytest.fixture
+def fewshot_cfg_string():
+    # Same pipeline as the zero-shot config, with few-shot examples read from
+    # `rel_examples.jsonl` in EXAMPLES_DIR through `spacy.FewShotReader.v1`.
+    return f"""
+    [nlp]
+    lang = "en"
+    pipeline = ["ner", "llm"]
+    batch_size = 128
+
+    [components]
+
+    [components.ner]
+    source = "en_core_web_md"
+
+    [components.llm]
+    factory = "llm"
+
+    [components.llm.task]
+    @llm_tasks = "spacy.REL.v1"
+    labels = "LivesIn,Visits"
+
+    [components.llm.task.examples]
+    @misc = "spacy.FewShotReader.v1"
+    path = {str(EXAMPLES_DIR / "rel_examples.jsonl")}
+
+    [components.llm.backend]
+    @llm_backends = "spacy.REST.v1"
+    api = "OpenAI"
+    """
+
+
+@pytest.fixture
+def task():
+    # Sample text together with the relations expected for it, returned as a
+    # `(text, gold_relations)` tuple.
+    text = "Joey rents a place in New York City."
+    gold_relations = [RelationItem(dep=0, dest=1, relation="LivesIn")]
+    return text, gold_relations
+
+
+@pytest.mark.skipif(has_openai_key is False, reason="OpenAI API key not available")
+@pytest.mark.parametrize("cfg_string", ["zeroshot_cfg_string", "fewshot_cfg_string"])
+def test_rel_config(cfg_string, request: FixtureRequest):
+    """Simple test to check if the config loads properly given different settings"""
+
+    # `cfg_string` is parametrized with fixture names; resolve the name to the
+    # actual config string.
+    cfg_string = request.getfixturevalue(cfg_string)
+    orig_config = Config().from_str(cfg_string)
+    nlp = spacy.util.load_model_from_config(orig_config, auto_fill=True)
+    assert nlp.pipe_names == ["ner", "llm"]
+
+
+@pytest.mark.external
+@pytest.mark.skipif(has_openai_key is False, reason="OpenAI API key not available")
+@pytest.mark.parametrize("cfg_string", ["zeroshot_cfg_string", "fewshot_cfg_string"])
+def test_rel_predict(task, cfg_string, request):
+    """Use OpenAI to get REL results.
+    Note that this test may fail randomly, as the LLM's output is not guaranteed
+    to be consistent/predictable.
+    """
+    # `cfg_string` is parametrized with fixture names; resolve the name to the
+    # actual config string.
+    cfg_string = request.getfixturevalue(cfg_string)
+    orig_config = Config().from_str(cfg_string)
+    nlp = spacy.util.load_model_from_config(orig_config, auto_fill=True)
+
+    # The gold relations are not compared against the prediction: only the
+    # presence of entities and relations is asserted.
+    text, _ = task
+    doc = nlp(text)
+
+    assert doc.ents
+    assert doc._.rel
diff --git a/usage_examples/rel_openai/README.md b/usage_examples/rel_openai/README.md
@@ -0,0 +1,38 @@
+# Relation extraction using LLMs
+
+This example shows how you can use a model from OpenAI for relation extraction in
+zero- and few-shot settings.
+
+Here, we use the pretrained [`en_core_web_md` model](https://spacy.io/models/en#en_core_web_md)
+to perform Named Entity Recognition (NER) using a fast and properly evaluated pipeline.
+Then, we leverage the OpenAI API to detect the relations between the extracted entities.
+In this example, we focus on two simple relations: `LivesIn` and `Visits`.
+
+First, create a new API key from
+[openai.com](https://platform.openai.com/account/api-keys) or fetch an existing
+one. Record the secret key and make sure this is available as an environment
+variable:
+
+```sh
+export OPENAI_API_KEY="sk-..."
+export OPENAI_API_ORG="org-..."
+```
+
+Then, you can run the pipeline on a sample text via:
+
+```sh
+python run_rel_openai_pipeline.py [TEXT] [PATH TO CONFIG]
+```
+
+For example:
+
+```sh
+python run_rel_openai_pipeline.py \
+    "Laura just bought an apartment in Boston." \
+    ./openai_rel_zeroshot.cfg
+```
+
+You can also include examples to perform few-shot annotation. To do so, use the
+`openai_rel_fewshot.cfg` file instead. You can find the few-shot examples in
+the `examples.jsonl` file. Feel free to change and update it to your liking.
+We also support other file formats, including `.json`, `.yml` and `.yaml`.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"text": "Laura bought a house in Boston with her husband Mark.", "ents": [{"start_char": 0, "end_char": 5, "label": "PERSON"}, {"start_char": 24, "end_char": 30, "label": "GPE"}, {"start_char": 48, "end_char": 52, "label": "PERSON"}], "relations": [{"dep": 0, "dest": 1, "relation": "LivesIn"}, {"dep": 2, "dest": 1, "relation": "LivesIn"}]}
		{"text": "Michael travelled through South America by bike.", "ents": [{"start_char": 0, "end_char": 7, "label": "PERSON"}, {"start_char": 26, "end_char": 39, "label": "LOC"}], "relations": [{"dep": 0, "dest": 1, "relation": "Visits"}]}