Skip to content

Commit 2b6bcbe

Browse files
authored
feat: Add Tavily extract function for URL content extraction (#20038)
1 parent 50f3926 commit 2b6bcbe

File tree

5 files changed

+322
-2
lines changed

5 files changed

+322
-2
lines changed

llama-index-integrations/tools/llama-index-tools-tavily-research/README.md

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,31 @@ agent = FunctionAgent(
3434
await agent.run("What happened in the latest Burning Man festival?")
3535
```
3636

37-
`search`: Search for relevant dynamic data based on a query. Returns a list of urls and their relevant content.
37+
## Available Functions
38+
39+
`search`: Search for relevant dynamic data based on a query. Returns a list of Document objects with urls and their relevant content.
40+
41+
`extract`: Extract raw content from specific URLs using the Tavily Extract API. Returns a list of Document objects containing the extracted content and metadata.
42+
43+
### Extract Function Example
44+
45+
```python
46+
from llama_index.tools.tavily_research import TavilyToolSpec
47+
48+
tavily_tool = TavilyToolSpec(api_key="your-key")
49+
50+
# Extract content from specific URLs
51+
documents = tavily_tool.extract(
52+
urls=["https://example.com/article1", "https://example.com/article2"],
53+
include_images=True,
54+
include_favicon=True,
55+
extract_depth="advanced", # "basic" or "advanced"
56+
format="markdown", # "markdown" or "text"
57+
)
58+
59+
for doc in documents:
60+
print(f"URL: {doc.extra_info['url']}")
61+
print(f"Content: {doc.text[:200]}...")
62+
```
3863

3964
This loader is designed to be used as a way to load data as a Tool in an Agent.

llama-index-integrations/tools/llama-index-tools-tavily-research/examples/tavily.ipynb

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,45 @@
100100
"tavily_tool.search(\"What happened in the latest Burning Man festival?\", max_results=3)"
101101
]
102102
},
103+
{
104+
"cell_type": "markdown",
105+
"id": "e61abb6d",
106+
"metadata": {},
107+
"source": [
108+
"## Testing the Tavily extract tool\n",
109+
"\n",
110+
"The extract function allows you to extract raw content from specific URLs. This is useful when you have specific URLs you want to extract content from, rather than searching for content."
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": null,
116+
"id": "360386ef",
117+
"metadata": {},
118+
"outputs": [],
119+
"source": [
120+
"# Extract content from specific URLs\n",
121+
"urls_to_extract = [\n",
122+
" \"https://en.wikipedia.org/wiki/Burning_Man\",\n",
123+
" \"https://burningman.org/about/\",\n",
124+
"]\n",
125+
"\n",
126+
"extracted_docs = tavily_tool.extract(\n",
127+
" urls=urls_to_extract,\n",
128+
" include_images=False,\n",
129+
" include_favicon=True,\n",
130+
" extract_depth=\"basic\",\n",
131+
" format=\"markdown\",\n",
132+
")\n",
133+
"\n",
134+
"print(f\"Extracted {len(extracted_docs)} documents:\")\n",
135+
"for i, doc in enumerate(extracted_docs):\n",
136+
" print(f\"\\nDocument {i+1}:\")\n",
137+
" print(f\"URL: {doc.extra_info.get('url', 'N/A')}\")\n",
138+
" print(f\"Content preview: {doc.text[:300]}...\")\n",
139+
" print(f\"Has favicon: {doc.extra_info.get('favicon') is not None}\")"
140+
]
141+
},
103142
{
104143
"cell_type": "markdown",
105144
"id": "1210906d-87a7-466a-9712-1d17dba2c2ec",

llama-index-integrations/tools/llama-index-tools-tavily-research/llama_index/tools/tavily_research/base.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ class TavilyToolSpec(BaseToolSpec):
1111

1212
spec_functions = [
1313
"search",
14+
"extract",
1415
]
1516

1617
def __init__(self, api_key: str) -> None:
@@ -40,3 +41,51 @@ def search(self, query: str, max_results: Optional[int] = 6) -> List[Document]:
4041
Document(text=result["content"], extra_info={"url": result["url"]})
4142
for result in response["results"]
4243
]
44+
45+
def extract(
    self,
    urls: List[str],
    include_images: bool = False,
    include_favicon: bool = False,
    extract_depth: str = "basic",
    format: str = "markdown",
) -> List[Document]:
    """
    Extract raw content from one or more URLs using the Tavily Extract API.

    Args:
        urls: The URL(s) to extract content from.
        include_images: Whether to include images in the response.
        include_favicon: Whether to include the favicon in the response.
        extract_depth: 'basic' or 'advanced' (default: 'basic').
        format: 'markdown' or 'text' (default: 'markdown'). The name shadows
            the builtin, but it is part of this method's public signature
            and is the keyword forwarded to the client, so it is kept.

    Returns:
        A list of Document objects, one per entry in the response's
        "results". Each Document's text is the entry's "raw_content"
        (an empty string when that field is missing or null) and its
        extra_info carries "url", "favicon" and "images" (None when the
        response omits them). An empty list is returned if the response
        has no results.

    """
    response = self.client.extract(
        urls,
        include_images=include_images,
        include_favicon=include_favicon,
        extract_depth=extract_depth,
        format=format,
    )

    # `or []` covers both a missing "results" key and an explicit null value,
    # so the comprehension below simply yields [] in either case.
    results = response.get("results") or []

    return [
        Document(
            # `.get(key, "")` only helps when the key is absent; a present
            # but null "raw_content" must also fall back to an empty string.
            text=result.get("raw_content") or "",
            extra_info={
                "url": result.get("url"),
                "favicon": result.get("favicon"),
                "images": result.get("images"),
            },
        )
        for result in results
    ]

llama-index-integrations/tools/llama-index-tools-tavily-research/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dev = [
2626

2727
[project]
2828
name = "llama-index-tools-tavily-research"
29-
version = "0.4.1"
29+
version = "0.4.2"
3030
description = "llama-index tools tavily_research integration"
3131
authors = [{name = "Your Name", email = "[email protected]"}]
3232
requires-python = ">=3.9,<4.0"
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,214 @@
1+
from unittest.mock import Mock, patch
12
from llama_index.core.tools.tool_spec.base import BaseToolSpec
3+
from llama_index.core.schema import Document
24
from llama_index.tools.tavily_research import TavilyToolSpec
35

46

57
def test_class():
    """TavilyToolSpec must list BaseToolSpec (matched by class name) in its MRO."""
    mro_names = [klass.__name__ for klass in TavilyToolSpec.__mro__]
    expected_base = BaseToolSpec.__name__
    assert expected_base in mro_names
10+
11+
12+
def test_spec_functions():
    """Both the search and extract tools must be advertised in spec_functions."""
    advertised = TavilyToolSpec.spec_functions
    for tool_name in ("search", "extract"):
        assert tool_name in advertised
16+
17+
18+
@patch("tavily.TavilyClient")
def test_init(mock_tavily_client):
    """Constructing the tool spec builds one client from the given API key."""
    key = "test_api_key"

    spec = TavilyToolSpec(api_key=key)

    # The patched class must have been instantiated exactly once with the key,
    # and the resulting instance must be what the spec stores as `client`.
    mock_tavily_client.assert_called_once_with(api_key=key)
    built_client = mock_tavily_client.return_value
    assert spec.client == built_client
26+
27+
28+
@patch("tavily.TavilyClient")
def test_search(mock_tavily_client):
    """search() wraps each client result in a Document carrying text and url."""
    # (content, url) pairs the fake client will report.
    expected_pairs = [
        ("Test content 1", "https://example1.com"),
        ("Test content 2", "https://example2.com"),
    ]

    fake_client = Mock()
    fake_client.search.return_value = {
        "results": [
            {"content": content, "url": url} for content, url in expected_pairs
        ]
    }
    mock_tavily_client.return_value = fake_client

    spec = TavilyToolSpec(api_key="test_key")
    docs = spec.search("test query", max_results=5)

    # The query and max_results are forwarded; search_depth is fixed by the tool.
    fake_client.search.assert_called_once_with(
        "test query", max_results=5, search_depth="advanced"
    )

    assert len(docs) == 2
    assert all(isinstance(doc, Document) for doc in docs)

    for doc, (content, url) in zip(docs, expected_pairs):
        assert doc.text == content
        assert doc.extra_info["url"] == url
61+
62+
63+
@patch("tavily.TavilyClient")
def test_search_with_default_max_results(mock_tavily_client):
    """When max_results is omitted, search() forwards the default of 6."""
    fake_client = Mock()
    fake_client.search.return_value = {"results": []}
    mock_tavily_client.return_value = fake_client

    TavilyToolSpec(api_key="test_key").search("test query")

    fake_client.search.assert_called_once_with(
        "test query", max_results=6, search_depth="advanced"
    )
78+
79+
80+
@patch("tavily.TavilyClient")
def test_extract(mock_tavily_client):
    """extract() forwards its options and maps each result to a Document."""
    first_hit = {
        "raw_content": "Extracted content 1",
        "url": "https://example1.com",
        "favicon": "https://example1.com/favicon.ico",
        "images": ["https://example1.com/image1.jpg"],
    }
    second_hit = {
        "raw_content": "Extracted content 2",
        "url": "https://example2.com",
        "favicon": "https://example2.com/favicon.ico",
        "images": ["https://example2.com/image2.jpg"],
    }

    fake_client = Mock()
    fake_client.extract.return_value = {"results": [first_hit, second_hit]}
    mock_tavily_client.return_value = fake_client

    spec = TavilyToolSpec(api_key="test_key")
    target_urls = ["https://example1.com", "https://example2.com"]
    # Non-default values for every option, so forwarding is actually exercised.
    options = {
        "include_images": True,
        "include_favicon": True,
        "extract_depth": "advanced",
        "format": "text",
    }
    docs = spec.extract(urls=target_urls, **options)

    # urls is passed positionally to the client; the options go as keywords.
    fake_client.extract.assert_called_once_with(target_urls, **options)

    assert len(docs) == 2
    assert all(isinstance(doc, Document) for doc in docs)

    doc_one, doc_two = docs

    assert doc_one.text == "Extracted content 1"
    assert doc_one.extra_info["url"] == "https://example1.com"
    assert doc_one.extra_info["favicon"] == "https://example1.com/favicon.ico"
    assert doc_one.extra_info["images"] == ["https://example1.com/image1.jpg"]

    assert doc_two.text == "Extracted content 2"
    assert doc_two.extra_info["url"] == "https://example2.com"
136+
137+
138+
@patch("tavily.TavilyClient")
def test_extract_with_defaults(mock_tavily_client):
    """Calling extract() with only urls forwards the documented defaults."""
    fake_client = Mock()
    fake_client.extract.return_value = {"results": []}
    mock_tavily_client.return_value = fake_client

    target_urls = ["https://example.com"]
    TavilyToolSpec(api_key="test_key").extract(target_urls)

    expected_defaults = {
        "include_images": False,
        "include_favicon": False,
        "extract_depth": "basic",
        "format": "markdown",
    }
    fake_client.extract.assert_called_once_with(target_urls, **expected_defaults)
158+
159+
160+
@patch("tavily.TavilyClient")
def test_extract_empty_results(mock_tavily_client):
    """An empty "results" list from the client yields an empty list."""
    fake_client = Mock()
    fake_client.extract.return_value = {"results": []}
    mock_tavily_client.return_value = fake_client

    spec = TavilyToolSpec(api_key="test_key")
    docs = spec.extract(urls=["https://example.com"])

    assert docs == []
173+
174+
175+
@patch("tavily.TavilyClient")
def test_extract_missing_fields(mock_tavily_client):
    """A result lacking optional fields still produces a Document with fallbacks."""
    # Only "url" is present; raw_content, favicon and images are all absent.
    sparse_hit = {"url": "https://example.com"}

    fake_client = Mock()
    fake_client.extract.return_value = {"results": [sparse_hit]}
    mock_tavily_client.return_value = fake_client

    spec = TavilyToolSpec(api_key="test_key")
    docs = spec.extract(urls=["https://example.com"])

    assert len(docs) == 1
    only_doc = docs[0]
    assert only_doc.text == ""  # missing raw_content falls back to empty text
    assert only_doc.extra_info["url"] == "https://example.com"
    assert only_doc.extra_info["favicon"] is None
    assert only_doc.extra_info["images"] is None
200+
201+
202+
@patch("tavily.TavilyClient")
def test_extract_no_results_key(mock_tavily_client):
    """A response with no "results" key at all yields an empty list."""
    fake_client = Mock()
    fake_client.extract.return_value = {}  # deliberately lacks "results"
    mock_tavily_client.return_value = fake_client

    spec = TavilyToolSpec(api_key="test_key")
    docs = spec.extract(urls=["https://example.com"])

    assert docs == []

0 commit comments

Comments
 (0)