|
10 | 10 | "---"
|
11 | 11 | ]
|
12 | 12 | },
|
13 |
| - { |
14 |
| - "cell_type": "raw", |
15 |
| - "id": "f725a8a2", |
16 |
| - "metadata": { |
17 |
| - "vscode": { |
18 |
| - "languageId": "raw" |
19 |
| - } |
20 |
| - }, |
21 |
| - "source": [ |
22 |
| - "**Note**: This notebook has been updated to include `SmartCrawlerTool` and remove `LocalScraperTool`. The SmartCrawlerTool provides advanced crawling capabilities for multi-page data extraction.\n", |
23 |
| - "\n", |
24 |
| - "### Updated Integration Details\n", |
25 |
| - "\n", |
26 |
| - "| Class | Package | Serializable | JS support | Package latest |\n", |
27 |
| - "| :--- | :--- | :---: | :---: | :---: |\n", |
28 |
| - "| [SmartScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", |
29 |
| - "| [SmartCrawlerTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", |
30 |
| - "| [MarkdownifyTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", |
31 |
| - "| [GetCreditsTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", |
32 |
| - "\n", |
33 |
| - "### Updated Tool Features\n", |
34 |
| - "\n", |
35 |
| - "| Tool | Purpose | Input | Output |\n", |
36 |
| - "| :--- | :--- | :--- | :--- |\n", |
37 |
| - "| SmartScraperTool | Extract structured data from websites | URL + prompt | JSON |\n", |
38 |
| - "| SmartCrawlerTool | Extract data from multiple pages with crawling | URL + prompt + crawl options | JSON |\n", |
39 |
| - "| MarkdownifyTool | Convert webpages to markdown | URL | Markdown text |\n", |
40 |
| - "| GetCreditsTool | Check API credits | None | Credit info |\n" |
41 |
| - ] |
42 |
| - }, |
43 | 13 | {
|
44 | 14 | "cell_type": "markdown",
|
45 | 15 | "id": "a6f91f20",
|
|
188 | 158 | ]
|
189 | 159 | },
|
190 | 160 | {
|
191 |
| - "cell_type": "raw", |
| 161 | + "cell_type": "markdown", |
| 162 | + "id": "d5a88cf2", |
192 | 163 | "metadata": {
|
193 | 164 | "vscode": {
|
194 | 165 | "languageId": "raw"
|
|
239 | 210 | "\n",
|
240 | 211 | "# SmartCrawler\n",
|
241 | 212 | "url = \"https://scrapegraphai.com/\"\n",
|
242 |
| - "prompt = \"What does the company do? and I need text content from their privacy and terms\"\n", |
| 213 | + "prompt = (\n", |
| 214 | + " \"What does the company do? and I need text content from their privacy and terms\"\n", |
| 215 | + ")\n", |
243 | 216 | "\n",
|
244 | 217 | "# Use the tool with crawling parameters\n",
|
245 |
| - "result_crawler = smartcrawler.invoke({\n", |
246 |
| - " \"url\": url,\n", |
247 |
| - " \"prompt\": prompt,\n", |
248 |
| - " \"cache_website\": True,\n", |
249 |
| - " \"depth\": 2,\n", |
250 |
| - " \"max_pages\": 2,\n", |
251 |
| - " \"same_domain_only\": True\n", |
252 |
| - "})\n", |
| 218 | + "result_crawler = smartcrawler.invoke(\n", |
| 219 | + " {\n", |
| 220 | + " \"url\": url,\n", |
| 221 | + " \"prompt\": prompt,\n", |
| 222 | + " \"cache_website\": True,\n", |
| 223 | + " \"depth\": 2,\n", |
| 224 | + " \"max_pages\": 2,\n", |
| 225 | + " \"same_domain_only\": True,\n", |
| 226 | + " }\n", |
| 227 | + ")\n", |
253 | 228 | "\n",
|
254 | 229 | "print(\"\\nSmartCrawler Result:\")\n",
|
255 | 230 | "print(json.dumps(result_crawler, indent=2))\n",
|
|
279 | 254 | "\n",
|
280 | 255 | "# Example based on the provided code snippet\n",
|
281 | 256 | "url = \"https://scrapegraphai.com/\"\n",
|
282 |
| - "prompt = \"What does the company do? and I need text content from their privacy and terms\"\n", |
| 257 | + "prompt = (\n", |
| 258 | + " \"What does the company do? and I need text content from their privacy and terms\"\n", |
| 259 | + ")\n", |
283 | 260 | "\n",
|
284 | 261 | "# Use the tool with crawling parameters\n",
|
285 |
| - "result = tool.invoke({\n", |
286 |
| - " \"url\": url,\n", |
287 |
| - " \"prompt\": prompt,\n", |
288 |
| - " \"cache_website\": True,\n", |
289 |
| - " \"depth\": 2,\n", |
290 |
| - " \"max_pages\": 2,\n", |
291 |
| - " \"same_domain_only\": True\n", |
292 |
| - "})\n", |
293 |
| - "\n", |
294 |
| - "print(json.dumps(result, indent=2))\n" |
| 262 | + "result = tool.invoke(\n", |
| 263 | + " {\n", |
| 264 | + " \"url\": url,\n", |
| 265 | + " \"prompt\": prompt,\n", |
| 266 | + " \"cache_website\": True,\n", |
| 267 | + " \"depth\": 2,\n", |
| 268 | + " \"max_pages\": 2,\n", |
| 269 | + " \"same_domain_only\": True,\n", |
| 270 | + " }\n", |
| 271 | + ")\n", |
| 272 | + "\n", |
| 273 | + "print(json.dumps(result, indent=2))" |
295 | 274 | ]
|
296 | 275 | },
|
297 | 276 | {
|
|
428 | 407 | "source": [
|
429 | 408 | "## API reference\n",
|
430 | 409 | "\n",
|
431 |
| - "For detailed documentation of all ScrapeGraph features and configurations head to the Langchain API reference: https://python.langchain.com/docs/integrations/tools/scrapegraph\n", |
| 410 | + "For detailed documentation of all ScrapeGraph features and configurations, head to [the LangChain API reference](https://python.langchain.com/docs/integrations/tools/scrapegraph).\n", |
432 | 411 | "\n",
|
433 |
| - "Or to the official SDK repo: https://github.com/ScrapeGraphAI/langchain-scrapegraph" |
| 412 | + "Or to [the official SDK repo](https://github.com/ScrapeGraphAI/langchain-scrapegraph)." |
434 | 413 | ]
|
| 414 | + }, |
| 415 | + { |
| 416 | + "cell_type": "markdown", |
| 417 | + "id": "d710dad8", |
| 418 | + "metadata": {}, |
| 419 | + "source": [] |
435 | 420 | }
|
436 | 421 | ],
|
437 | 422 | "metadata": {
|
438 | 423 | "kernelspec": {
|
439 |
| - "display_name": "Python 3", |
| 424 | + "display_name": "langchain", |
440 | 425 | "language": "python",
|
441 | 426 | "name": "python3"
|
442 | 427 | },
|
|
450 | 435 | "name": "python",
|
451 | 436 | "nbconvert_exporter": "python",
|
452 | 437 | "pygments_lexer": "ipython3",
|
453 |
| - "version": "3.11.9" |
| 438 | + "version": "3.10.16" |
454 | 439 | }
|
455 | 440 | },
|
456 | 441 | "nbformat": 4,
|
|
0 commit comments