Commit 7ce9690

feat: add search1api crawler implementation for Weixin Sogou links
1 parent adfadff commit 7ce9690

7 files changed: 111 additions & 8 deletions

packages/web-crawler/README.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -9,7 +9,7 @@ LobeChat's built-in web crawling module for intelligent extraction of web conten
 ## 🛠️ Core Features
 
 - **Intelligent Content Extraction**: Identifies main content based on Mozilla Readability algorithm
-- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, and Browserless rendering
+- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, Search1API, and Browserless rendering
 - **Custom URL Rules**: Handles specific website crawling logic through a flexible rule system
 
 ## 🤝 Contribution
@@ -32,8 +32,8 @@ const url = [
   // Optional: URL transformation, redirects to an easier-to-crawl version
   urlTransform: 'https://example.com/print/$1',
 
-  // Optional: specify crawling implementation, supports 'naive', 'jina', and 'browserless'
-  impls: ['naive', 'jina', 'browserless'],
+  // Optional: specify crawling implementation, supports 'naive', 'jina', 'search1api', and 'browserless'
+  impls: ['naive', 'jina', 'search1api', 'browserless'],
 
   // Optional: content filtering configuration
   filterOptions: {
```
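The fields documented above map directly onto the CrawlUrlRule entries in src/urlRules.ts. As a minimal sketch, here is what a complete rule using the new implementation could look like (the example.com pattern is illustrative, and the relative import path assumes the package's src layout):

```ts
import { CrawlUrlRule } from './type';

// Hypothetical rule: rewrite article URLs to a print-friendly version and
// try search1api first, falling back to browserless rendering.
const exampleRule: CrawlUrlRule = {
  // strip markup and keep plain text only (see FilterOptions in type.ts)
  filterOptions: { pureText: true },
  impls: ['search1api', 'browserless'],
  urlPattern: 'https://example.com/articles/(.*)',
  urlTransform: 'https://example.com/print/$1',
};
```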

packages/web-crawler/README.zh-CN.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -9,7 +9,7 @@ LobeChat's built-in web crawling module, used to intelligently extract web content and convert
 ## 🛠️ Core Features
 
 - **Intelligent Content Extraction**: Identifies main content based on the Mozilla Readability algorithm
-- **Multi-level Crawling Strategy**: Supports multiple crawling implementations, including basic crawling, Jina, and Browserless rendering
+- **Multi-level Crawling Strategy**: Supports multiple crawling implementations, including basic crawling, Jina, Search1API, and Browserless rendering
 - **Custom URL Rules**: Handles site-specific crawling logic through a flexible rule system
 
 ## 🤝 Contribution
@@ -32,8 +32,8 @@ const url = [
   // Optional: URL transformation, redirects to an easier-to-crawl version
   urlTransform: 'https://example.com/print/$1',
 
-  // Optional: specify the crawling implementation; supports the three options 'naive', 'jina', and 'browserless'
-  impls: ['naive', 'jina', 'browserless'],
+  // Optional: specify the crawling implementation; supports the four options 'naive', 'jina', 'search1api', and 'browserless'
+  impls: ['naive', 'jina', 'search1api', 'browserless'],
 
   // Optional: content filtering configuration
   filterOptions: {
```
packages/web-crawler/src/crawlImpl/index.ts

Lines changed: 2 additions & 0 deletions

```diff
@@ -1,11 +1,13 @@
 import { browserless } from './browserless';
 import { jina } from './jina';
 import { naive } from './naive';
+import { search1api } from './search1api';
 
 export const crawlImpls = {
   browserless,
   jina,
   naive,
+  search1api,
 };
 
 export type CrawlImplType = keyof typeof crawlImpls;
```
packages/web-crawler/src/crawlImpl/search1api.ts

Lines changed: 96 additions & 0 deletions

```diff
@@ -0,0 +1,96 @@
+import { CrawlImpl, CrawlSuccessResult } from '../type';
+import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
+
+interface Search1ApiResponse {
+  crawlParameters: {
+    url: string;
+  };
+  results: {
+    title?: string;
+    link?: string;
+    content?: string;
+  };
+}
+
+const TIMEOUT_CONTROL = 10_000;
+
+const withTimeout = <T>(promise: Promise<T>, ms: number): Promise<T> => {
+  const controller = new AbortController();
+  const timeoutPromise = new Promise<T>((_, reject) => {
+    setTimeout(() => {
+      controller.abort();
+      reject(new TimeoutError(`Request timeout after ${ms}ms`));
+    }, ms);
+  });
+
+  return Promise.race([promise, timeoutPromise]);
+};
+
+export const search1api: CrawlImpl = async (url, { filterOptions }) => {
+  // Get API key from environment variable
+  const apiKey = process.env.SEARCH1API_API_KEY;
+
+  if (!apiKey) {
+    throw new Error('SEARCH1API_API_KEY environment variable is not set');
+  }
+
+  let res: Response;
+
+  try {
+    res = await withTimeout(
+      fetch('https://api.search1api.com/crawl', {
+        method: 'POST',
+        headers: {
+          'Authorization': `Bearer ${apiKey}`,
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({
+          url,
+        }),
+      }),
+      TIMEOUT_CONTROL,
+    );
+  } catch (e) {
+    const error = e as Error;
+    if (error.message === 'fetch failed') {
+      throw new NetworkConnectionError();
+    }
+
+    if (error instanceof TimeoutError) {
+      throw error;
+    }
+
+    throw e;
+  }
+
+  if (!res.ok) {
+    if (res.status === 404) {
+      throw new PageNotFoundError(res.statusText);
+    }
+
+    throw new Error(`Search1API request failed with status ${res.status}: ${res.statusText}`);
+  }
+
+  try {
+    const data = await res.json() as Search1ApiResponse;
+
+    // Check if content is empty or too short
+    if (!data.results.content || data.results.content.length < 100) {
+      return;
+    }
+
+    return {
+      content: data.results.content,
+      contentType: 'text',
+      title: data.results.title,
+      description: data.results.title, // Using title as description since API doesn't provide a separate description
+      length: data.results.content.length,
+      siteName: new URL(url).hostname,
+      url: data.results.link || url,
+    } satisfies CrawlSuccessResult;
+  } catch (error) {
+    console.error(error);
+  }
+
+  return;
+};
```
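To exercise the new implementation in isolation, it can be invoked directly; a minimal sketch, assuming it runs as a sibling module inside crawlImpl with a valid SEARCH1API_API_KEY exported in the environment (the sample URL is illustrative):

```ts
import { search1api } from './search1api';

const main = async () => {
  // filterOptions is part of the CrawlImpl signature, although this
  // implementation does not use it.
  const result = await search1api('https://weixin.sogou.com/link', {
    filterOptions: {},
  });

  // An undefined result means the API returned empty or sub-100-character
  // content, so the caller should fall through to the next implementation.
  console.log(result?.title, result?.length);
};

main();
```

One quirk worth noting in the committed code: withTimeout creates an AbortController and calls abort() on timeout, but controller.signal is never passed to fetch, so a timed-out request is rejected by Promise.race while the underlying HTTP request keeps running in the background.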

packages/web-crawler/src/crawler.ts

Lines changed: 1 addition & 1 deletion

```diff
@@ -13,7 +13,7 @@ export class Crawler {
   constructor(options: CrawlOptions = {}) {
     this.impls = !!options.impls?.length
      ? (options.impls.filter((impl) => Object.keys(crawlImpls).includes(impl)) as CrawlImplType[])
-      : (['naive', 'jina', 'browserless'] as const);
+      : (['naive', 'jina', 'search1api','browserless'] as const);
   }
 
   /**
```
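The practical effect of this one-line change is that the fallback chain used when no impls are supplied now includes search1api; a small sketch (the import path is assumed):

```ts
import { Crawler } from './crawler';

// No impls supplied: the default chain applies, now including search1api.
const withDefaults = new Crawler();
// impls -> ['naive', 'jina', 'search1api', 'browserless']

// Explicit impls are validated against the crawlImpls registry, so only
// known implementation names survive the filter.
const searchOnly = new Crawler({ impls: ['search1api'] });
// impls -> ['search1api']
```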

packages/web-crawler/src/type.ts

Lines changed: 1 addition & 1 deletion

```diff
@@ -22,7 +22,7 @@ export interface FilterOptions {
   pureText?: boolean;
 }
 
-type CrawlImplType = 'naive' | 'jina' | 'browserless';
+type CrawlImplType = 'naive' | 'jina' | 'browserless' | 'search1api';
 
 type CrawlImplParams<T> = T & {
   filterOptions: FilterOptions;
```

packages/web-crawler/src/urlRules.ts

Lines changed: 5 additions & 0 deletions

```diff
@@ -1,6 +1,11 @@
 import { CrawlUrlRule } from './type';
 
 export const crawUrlRules: CrawlUrlRule[] = [
+  // Sogou Weixin links: use search1api
+  {
+    impls: ['search1api'],
+    urlPattern: 'https://weixin.sogou.com/link(.*)',
+  },
   // github source code parsing
   {
     filterOptions: {
```
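Because the new rule pins impls to ['search1api'], every Sogou Weixin redirect link bypasses the other implementations. Below is a sketch of what the urlPattern captures; converting the pattern to a RegExp here is an assumption about how the rule engine matches, not code from this commit:

```ts
// The '(.*)' suffix matches any path tail or query string under
// https://weixin.sogou.com/link.
const pattern = /^https:\/\/weixin\.sogou\.com\/link(.*)$/;

console.log(pattern.test('https://weixin.sogou.com/link?url=abc123')); // true
console.log(pattern.test('https://mp.weixin.qq.com/s/abc123')); // false
```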
