1- import type { CheerioAPI , load as LoadT } from "cheerio" ;
2- import { Document } from "../../ document.js" ;
3- import { BaseDocumentLoader } from ".. /base.js" ;
4- import type { DocumentLoader } from ".. /base.js" ;
5- import { AsyncCaller , AsyncCallerParams } from "../../ util/async_caller.js" ;
1+ import type { CheerioAPI , load as LoadT , SelectorType } from "cheerio" ;
2+ import { Document } from "../document.js" ;
3+ import { BaseDocumentLoader } from "./base.js" ;
4+ import type { DocumentLoader } from "./base.js" ;
5+ import { AsyncCaller , AsyncCallerParams } from "../util/async_caller.js" ;
66
77export interface WebBaseLoaderParams extends AsyncCallerParams {
88 /**
@@ -19,11 +19,14 @@ export class CheerioWebBaseLoader
1919
2020 caller : AsyncCaller ;
2121
22- constructor ( public webPath : string , fields ?: WebBaseLoaderParams ) {
22+ selector ?: SelectorType ;
23+
24+ constructor ( public webPath : string , fields ?: WebBaseLoaderParams , selector ?: SelectorType ) {
2325 super ( ) ;
2426 const { timeout, ...rest } = fields ?? { } ;
2527 this . timeout = timeout ?? 10000 ;
2628 this . caller = new AsyncCaller ( rest ) ;
29+ this . selector = selector ;
2730 }
2831
2932 static async _scrape (
@@ -49,7 +52,7 @@ export class CheerioWebBaseLoader
4952
5053 async load ( ) : Promise < Document [ ] > {
5154 const $ = await this . scrape ( ) ;
52- const text = $ ( "body" ) . text ( ) ;
55+ const text = $ ( this . selector ?? "body" ) . text ( ) ;
5356 const metadata = { source : this . webPath } ;
5457 return [ new Document ( { pageContent : text , metadata } ) ] ;
5558 }
0 commit comments