Skip to content

Commit a982bd9

Browse files
authored
Merge pull request #728 from hughestech/main
feat(document_loader): Add optional selector
2 parents 64a59b6 + 8051a09 commit a982bd9

File tree

2 files changed

+21
-8
lines changed

2 files changed

+21
-8
lines changed
Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
import { test } from "@jest/globals";
1+
import { expect, test } from "@jest/globals";
22
import { CheerioWebBaseLoader } from "../web/cheerio.js";
33

44
test("Test cheerio web scraper loader", async () => {
@@ -7,3 +7,13 @@ test("Test cheerio web scraper loader", async () => {
77
);
88
await loader.load();
99
});
10+
11+
/**
 * Checks the optional selector argument of CheerioWebBaseLoader: constructs a
 * loader for https://about.google/commitments/ with an empty fields object and
 * the selector "h1", calls load(), and asserts that the trimmed pageContent of
 * the first returned document equals the expected heading sentence.
 */
// NOTE(review): the expected string is hard-coded against an external page;
// presumably load() fetches it live, so the test would depend on network
// access and on that page's current heading — confirm.
test("Test cheerio web scraper loader with selector", async () => {
12+
const selectH1 = "h1";
13+
const loader = new CheerioWebBaseLoader("https://about.google/commitments/", {}, selectH1);
14+
15+
const doc = await loader.load();
16+
expect(doc[0].pageContent.trim()).toBe(
17+
"Committed to significantly improving the lives of as many people as possible."
18+
);
19+
});

langchain/src/document_loaders/web/cheerio.ts

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
1-
import type { CheerioAPI, load as LoadT } from "cheerio";
2-
import { Document } from "../../document.js";
3-
import { BaseDocumentLoader } from "../base.js";
4-
import type { DocumentLoader } from "../base.js";
5-
import { AsyncCaller, AsyncCallerParams } from "../../util/async_caller.js";
1+
import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
2+
import { Document } from "../document.js";
3+
import { BaseDocumentLoader } from "./base.js";
4+
import type { DocumentLoader } from "./base.js";
5+
import { AsyncCaller, AsyncCallerParams } from "../util/async_caller.js";
66

77
export interface WebBaseLoaderParams extends AsyncCallerParams {
88
/**
@@ -19,11 +19,14 @@ export class CheerioWebBaseLoader
1919

2020
caller: AsyncCaller;
2121

22-
constructor(public webPath: string, fields?: WebBaseLoaderParams) {
22+
selector?: SelectorType;
23+
24+
/**
 * Creates a loader for a single web page.
 *
 * @param webPath - Stored as the public `webPath` property; `load()` later
 *   records it as the document's `source` metadata.
 * @param fields - Optional settings. `timeout` is split off and stored on
 *   `this.timeout`, defaulting to 10000 when absent (units are not shown
 *   here — presumably milliseconds, TODO confirm). All remaining fields are
 *   passed to the `AsyncCaller` constructor.
 * @param selector - Optional cheerio selector stored on `this.selector`;
 *   stays `undefined` when omitted.
 */
constructor(public webPath: string, fields?: WebBaseLoaderParams, selector?: SelectorType) {
2325
super();
2426
const { timeout, ...rest } = fields ?? {};
2527
this.timeout = timeout ?? 10000;
2628
this.caller = new AsyncCaller(rest);
29+
this.selector = selector;
2730
}
2831

2932
static async _scrape(
@@ -49,7 +52,7 @@ export class CheerioWebBaseLoader
4952

5053
/**
 * Scrapes the page via `this.scrape()` and returns a one-element array
 * holding a single Document.
 *
 * In the added (`+`) version of the line below, the text is taken from the
 * elements matched by `this.selector`, falling back to "body" when the
 * selector is null or undefined; the removed (`-`) version always used "body".
 *
 * @returns An array with one Document whose `pageContent` is the extracted
 *   text and whose metadata is `{ source: this.webPath }`.
 */
async load(): Promise<Document[]> {
5154
const $ = await this.scrape();
52-
const text = $("body").text();
55+
const text = $(this.selector ?? "body").text();
5356
const metadata = { source: this.webPath };
5457
return [new Document({ pageContent: text, metadata })];
5558
}

0 commit comments

Comments
 (0)