Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion langchain/src/document_loaders/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export interface DocumentLoader {
}

export abstract class BaseDocumentLoader implements DocumentLoader {
abstract load(): Promise<Document[]>;
abstract load(selector?: unknown): Promise<Document[]>;

async loadAndSplit(
splitter: TextSplitter = new RecursiveCharacterTextSplitter()
Expand Down
6 changes: 3 additions & 3 deletions langchain/src/document_loaders/cheerio_web_base.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { CheerioAPI, load as LoadT } from "cheerio";
import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
import { Document } from "../document.js";
import { BaseDocumentLoader } from "./base.js";
import type { DocumentLoader } from "./base.js";
Expand Down Expand Up @@ -47,9 +47,9 @@ export class CheerioWebBaseLoader
);
}

/**
 * Scrapes the page and returns its text as a single document.
 *
 * @param selector - Optional Cheerio selector restricting which elements'
 *   text is extracted. When omitted, the text of `body` is used.
 * @returns A one-element array whose document carries the extracted text as
 *   `pageContent` and `{ source: this.webPath }` as metadata. A selector that
 *   matches nothing produces an empty `pageContent`.
 */
async load(selector?: SelectorType): Promise<Document[]> {
  const $ = await this.scrape();
  // `.text()` always yields a string (empty when nothing matches), so a `??`
  // fallback placed after it can never fire; default the selector instead.
  const text = $(selector ?? "body").text();
  const metadata = { source: this.webPath };
  return [new Document({ pageContent: text, metadata })];
}
Expand Down
11 changes: 10 additions & 1 deletion langchain/src/document_loaders/tests/cheerio_web.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { test } from "@jest/globals";
import { expect, test } from "@jest/globals";
import { CheerioWebBaseLoader } from "../cheerio_web_base.js";

test("Test cheerio web scraper loader", async () => {
Expand All @@ -7,3 +7,12 @@ test("Test cheerio web scraper loader", async () => {
);
await loader.load();
});

// Loads the page through an "h1" selector and checks that the first returned
// document's trimmed content equals the expected heading text.
// NOTE(review): presumably this fetches the live URL, so it would need network
// access and would depend on the page's current copy; confirm.
test("Test cheerio web scraper loader with selector", async () => {
  const expectedHeading =
    "Committed to significantly improving the lives of as many people as possible.";
  const webLoader = new CheerioWebBaseLoader(
    "https://about.google/commitments/"
  );

  const docs = await webLoader.load("h1");
  const headingText = docs[0].pageContent.trim();

  expect(headingText).toBe(expectedHeading);
});