Skip to content

Commit 242b085

Browse files
authored
Merge pull request #24315
* community: Add Hierarchy link extractor * add example * lint
1 parent c3308f3 commit 242b085

File tree

3 files changed

+196
-0
lines changed

3 files changed

+196
-0
lines changed

libs/community/langchain_community/graph_vectorstores/extractors/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
2+
HierarchyInput,
3+
HierarchyLinkExtractor,
4+
)
15
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
26
HtmlInput,
37
HtmlLinkExtractor,
@@ -12,6 +16,8 @@
1216
__all__ = [
1317
"LinkExtractor",
1418
"LinkExtractorAdapter",
19+
"HierarchyInput",
20+
"HierarchyLinkExtractor",
1521
"HtmlInput",
1622
"HtmlLinkExtractor",
1723
]
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
from typing import Callable, List, Set
2+
3+
from langchain_core.documents import Document
4+
from langchain_core.graph_vectorstores.links import Link
5+
6+
from langchain_community.graph_vectorstores.extractors.link_extractor import (
7+
LinkExtractor,
8+
)
9+
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
10+
LinkExtractorAdapter,
11+
)
12+
13+
# A hierarchy input is the path to a node, one string per level (root first).
# `typing.TypeAlias` is not available in Python 3.9, and the newer `type`
# statement is newer still, so a plain assignment is used for the alias.
14+
HierarchyInput = List[str]
15+
16+
# Tag prefixes that keep parent, child, and sibling links of the same `kind`
# from matching each other. Each tag is the prefix followed by a "/"-joined path.
_PARENT: str = "p:"
17+
_CHILD: str = "c:"
18+
_SIBLING: str = "s:"
19+
20+
21+
class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    """Link extractor that connects nodes based on their position in a hierarchy.

    Each input is a path (a list of strings, root first). Paths are joined with
    "/" and combined with a per-relationship prefix to form link tags.
    """

    def __init__(
        self,
        *,
        kind: str = "hierarchy",
        parent_links: bool = True,
        child_links: bool = False,
        sibling_links: bool = False,
    ):
        """Extract links from a document hierarchy.

        Example:

            .. code-block:: python

                # Given three paths (in this case, within the "Root" document):
                h1 = ["Root", "H1"]
                h1a = ["Root", "H1", "a"]
                h1b = ["Root", "H1", "b"]

                # Parent links `h1a` and `h1b` to `h1`.
                # Child links `h1` to `h1a` and `h1b`.
                # Sibling links `h1a` and `h1b` together (both directions).

        Example use with documents:

            .. code-block:: python

                transformer = LinkExtractorTransformer([
                    HierarchyLinkExtractor().as_document_extractor(
                        # Assumes the "path" to each document is in the metadata.
                        # Could split strings, etc.
                        lambda doc: doc.metadata.get("path", [])
                    )
                ])
                linked = transformer.transform_documents(docs)

        Args:
            kind: Kind of links to produce with this extractor.
            parent_links: Link from a section to its parent.
            child_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the same
                parent.
        """
        self._kind = kind
        self._parent_links = parent_links
        self._child_links = child_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Create a LinkExtractor from `Document`.

        Args:
            hierarchy: Function that returns the path for the given document.

        Returns:
            A `LinkExtractor[Document]` suitable for application to `Documents`
            directly or with `LinkExtractorTransformer`.
        """
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,
    ) -> Set[Link]:
        """Build the set of links for a single path.

        Args:
            input: Path to the node, one string per level (root first).

        Returns:
            The links enabled by the constructor flags. Links that refer to the
            node's own path are always produced for the enabled parent/child
            flags; links that refer to the parent path are only produced when
            the path has more than one element. Sibling links are produced for
            any non-empty path (top-level nodes share the empty parent path).
        """
        links: Set[Link] = set()

        this_path = "/".join(input)
        if self._parent_links:
            # This is linked from everything with this parent path.
            links.add(Link.incoming(kind=self._kind, tag=_PARENT + this_path))
        if self._child_links:
            # This is linked to every child with this as its "parent" path.
            links.add(Link.outgoing(kind=self._kind, tag=_CHILD + this_path))

        if not input:
            # An empty path has neither a parent nor siblings.
            return links

        parent_path = "/".join(input[:-1])
        has_parent = len(input) > 1
        if has_parent and self._parent_links:
            # This is linked to the nodes with the given parent path.
            links.add(Link.outgoing(kind=self._kind, tag=_PARENT + parent_path))
        if has_parent and self._child_links:
            # This is linked from every node with the given parent path.
            links.add(Link.incoming(kind=self._kind, tag=_CHILD + parent_path))
        if self._sibling_links:
            # This is a sibling of everything with the same parent.
            links.add(Link.bidir(kind=self._kind, tag=_SIBLING + parent_path))

        return links
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from langchain_core.graph_vectorstores.links import Link
2+
3+
from langchain_community.graph_vectorstores.extractors import HierarchyLinkExtractor
4+
5+
# Fixture paths shared by the tests below: a three-level path, its parent,
# and the single-element root path.
PATH_1 = ["Root", "H1", "h2"]
6+
7+
PATH_2 = ["Root", "H1"]
8+
9+
PATH_3 = ["Root"]
10+
11+
12+
def test_up_only() -> None:
    """The default extractor produces only parent links (tag prefix "p:")."""
    extractor = HierarchyLinkExtractor()

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 has no parent, so it is only linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
    }
33+
34+
35+
def test_up_and_down() -> None:
    """With `child_links=True`, both parent ("p:") and child ("c:") links appear."""
    extractor = HierarchyLinkExtractor(child_links=True)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
        # Path1 links down to things under Root/H1/h2.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1/h2"),
        # Path1 is linked down to by Root/H1
        Link.incoming(kind="hierarchy", tag="c:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
        # Path2 links down to things under Root/H1.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1"),
        # Path2 is linked down to by Root
        Link.incoming(kind="hierarchy", tag="c:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 is linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
        # Path3 links down to things under Root.
        Link.outgoing(kind="hierarchy", tag="c:Root"),
    }
66+
67+
68+
def test_sibling() -> None:
    """With only `sibling_links=True`, one bidirectional "s:" link is produced."""
    extractor = HierarchyLinkExtractor(sibling_links=True, parent_links=False)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links with anything else in Root/H1
        Link.bidir(kind="hierarchy", tag="s:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links with anything else in Root
        Link.bidir(kind="hierarchy", tag="s:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 links with anything else at the top level (empty parent path)
        Link.bidir(kind="hierarchy", tag="s:"),
    }

0 commit comments

Comments
 (0)