Skip to content

Commit 7327138

Browse files
authored
✨ feat: add search1api crawler implementation for WeChat Sogou links (lobehub#7036)
* feat: add search1api crawler implementation for Weixin Sogou links * feat(web-crawler): add withTimeout utility and refactor crawler implementations * feat(web-crawler): add support for sogou.com links * docs(web-crawler): enhance error message for missing search1api key
1 parent 2a4e2ed commit 7327138

File tree

10 files changed

+272
-23
lines changed

10 files changed

+272
-23
lines changed

packages/web-crawler/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ LobeChat's built-in web crawling module for intelligent extraction of web conten
99
## 🛠️ Core Features
1010

1111
- **Intelligent Content Extraction**: Identifies main content based on Mozilla Readability algorithm
12-
- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, and Browserless rendering
12+
- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, Search1API, and Browserless rendering
1313
- **Custom URL Rules**: Handles specific website crawling logic through a flexible rule system
1414

1515
## 🤝 Contribution
@@ -32,8 +32,8 @@ const url = [
3232
// Optional: URL transformation, redirects to an easier-to-crawl version
3333
urlTransform: 'https://example.com/print/$1',
3434

35-
// Optional: specify crawling implementation, supports 'naive', 'jina', and 'browserless'
36-
impls: ['naive', 'jina', 'browserless'],
35+
// Optional: specify crawling implementation, supports 'naive', 'jina', 'search1api', and 'browserless'
36+
impls: ['naive', 'jina', 'search1api', 'browserless'],
3737

3838
// Optional: content filtering configuration
3939
filterOptions: {

packages/web-crawler/README.zh-CN.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ LobeChat 内置的网页抓取模块,用于智能提取网页内容并转换
99
## 🛠️ 核心功能
1010

1111
- **智能内容提取**:基于 Mozilla Readability 算法识别主要内容
12-
- **多级抓取策略**:支持多种抓取实现,包括基础抓取、Jina 和 Browserless 渲染抓取
12+
- **多级抓取策略**:支持多种抓取实现,包括基础抓取、Jina、Search1API 和 Browserless 渲染抓取
1313
- **自定义 URL 规则**:通过灵活的规则系统处理特定网站的抓取逻辑
1414

1515
## 🤝 参与共建
@@ -32,8 +32,8 @@ const url = [
3232
// 可选:URL 转换,用于重定向到更易抓取的版本
3333
urlTransform: 'https://example.com/print/$1',
3434

35-
// 可选:指定抓取实现方式,支持 'naive'、'jina' 和 'browserless' 三种
36-
impls: ['naive', 'jina', 'browserless'],
35+
// 可选:指定抓取实现方式,支持 'naive'、'jina'、'search1api' 和 'browserless' 四种
36+
impls: ['naive', 'jina', 'search1api', 'browserless'],
3737

3838
// 可选:内容过滤配置
3939
filterOptions: {
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import { describe, expect, it, vi } from 'vitest';
2+
3+
import * as withTimeoutModule from '../../utils/withTimeout';
4+
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
5+
import { search1api } from '../search1api';
6+
7+
describe('search1api crawler', () => {
8+
// Mock fetch function
9+
const mockFetch = vi.fn();
10+
global.fetch = mockFetch;
11+
12+
// Original env
13+
let originalEnv: NodeJS.ProcessEnv;
14+
15+
beforeEach(() => {
16+
vi.resetAllMocks();
17+
originalEnv = { ...process.env };
18+
process.env.SEARCH1API_API_KEY = 'test-api-key';
19+
20+
// Mock withTimeout to directly return the promise
21+
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((promise) => promise);
22+
});
23+
24+
afterEach(() => {
25+
process.env = originalEnv;
26+
});
27+
28+
it('should throw error when API key is not set', async () => {
29+
delete process.env.SEARCH1API_API_KEY;
30+
31+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
32+
'SEARCH1API_API_KEY environment variable is not set',
33+
);
34+
});
35+
36+
it('should throw NetworkConnectionError when fetch fails', async () => {
37+
mockFetch.mockRejectedValue(new Error('fetch failed'));
38+
39+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
40+
NetworkConnectionError,
41+
);
42+
});
43+
44+
it('should throw TimeoutError when request times out', async () => {
45+
// Restore original withTimeout implementation for this test
46+
vi.spyOn(withTimeoutModule, 'withTimeout').mockRestore();
47+
48+
// Mock withTimeout to throw TimeoutError
49+
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation(() => {
50+
throw new TimeoutError('Request timeout after 10000ms');
51+
});
52+
53+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
54+
TimeoutError,
55+
);
56+
});
57+
58+
it('should throw PageNotFoundError when status is 404', async () => {
59+
mockFetch.mockResolvedValue({
60+
ok: false,
61+
status: 404,
62+
statusText: 'Not Found',
63+
});
64+
65+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
66+
PageNotFoundError,
67+
);
68+
});
69+
70+
it('should throw error for other failed responses', async () => {
71+
mockFetch.mockResolvedValue({
72+
ok: false,
73+
status: 500,
74+
statusText: 'Internal Server Error',
75+
});
76+
77+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
78+
'Search1API request failed with status 500: Internal Server Error',
79+
);
80+
});
81+
82+
it('should return undefined when content is too short', async () => {
83+
mockFetch.mockResolvedValue({
84+
ok: true,
85+
json: () => Promise.resolve({
86+
crawlParameters: { url: 'https://example.com' },
87+
results: {
88+
title: 'Test Title',
89+
link: 'https://example.com',
90+
content: 'Short', // Less than 100 characters
91+
},
92+
}),
93+
});
94+
95+
const result = await search1api('https://example.com', { filterOptions: {} });
96+
expect(result).toBeUndefined();
97+
});
98+
99+
it('should return crawl result on successful fetch', async () => {
100+
const mockContent = 'This is a test content that is longer than 100 characters. '.repeat(3);
101+
102+
mockFetch.mockResolvedValue({
103+
ok: true,
104+
json: () => Promise.resolve({
105+
crawlParameters: { url: 'https://example.com' },
106+
results: {
107+
title: 'Test Title',
108+
link: 'https://example.com',
109+
content: mockContent,
110+
},
111+
}),
112+
});
113+
114+
const result = await search1api('https://example.com', { filterOptions: {} });
115+
116+
expect(mockFetch).toHaveBeenCalledWith('https://api.search1api.com/crawl', {
117+
method: 'POST',
118+
headers: {
119+
'Authorization': 'Bearer test-api-key',
120+
'Content-Type': 'application/json',
121+
},
122+
body: JSON.stringify({
123+
url: 'https://example.com',
124+
}),
125+
});
126+
127+
expect(result).toEqual({
128+
content: mockContent,
129+
contentType: 'text',
130+
title: 'Test Title',
131+
description: 'Test Title',
132+
length: mockContent.length,
133+
siteName: 'example.com',
134+
url: 'https://example.com',
135+
});
136+
});
137+
138+
it('should handle JSON parse errors', async () => {
139+
mockFetch.mockResolvedValue({
140+
ok: true,
141+
json: () => Promise.reject(new Error('Invalid JSON')),
142+
});
143+
144+
const result = await search1api('https://example.com', { filterOptions: {} });
145+
expect(result).toBeUndefined();
146+
});
147+
});
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import { browserless } from './browserless';
import { jina } from './jina';
import { naive } from './naive';
import { search1api } from './search1api';

// Registry of every available crawler implementation, keyed by its name.
// The Crawler constructor validates user-supplied impl names against these keys.
export const crawlImpls = {
  browserless,
  jina,
  naive,
  search1api,
};

// Union of valid implementation names, derived from the registry keys
// ('browserless' | 'jina' | 'naive' | 'search1api').
export type CrawlImplType = keyof typeof crawlImpls;

packages/web-crawler/src/crawImpl/naive.ts

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { CrawlImpl, CrawlSuccessResult } from '../type';
22
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
33
import { htmlToMarkdown } from '../utils/htmlToMarkdown';
4+
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
45

56
const mixinHeaders = {
67
// 接受的内容类型
@@ -31,20 +32,6 @@ const mixinHeaders = {
3132
'sec-fetch-user': '?1',
3233
};
3334

34-
const TIMEOUT_CONTROL = 10_000;
35-
36-
const withTimeout = <T>(promise: Promise<T>, ms: number): Promise<T> => {
37-
const controller = new AbortController();
38-
const timeoutPromise = new Promise<T>((_, reject) => {
39-
setTimeout(() => {
40-
controller.abort();
41-
reject(new TimeoutError(`Request timeout after ${ms}ms`));
42-
}, ms);
43-
});
44-
45-
return Promise.race([promise, timeoutPromise]);
46-
};
47-
4835
export const naive: CrawlImpl = async (url, { filterOptions }) => {
4936
let res: Response;
5037

@@ -54,7 +41,7 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
5441
headers: mixinHeaders,
5542
signal: new AbortController().signal,
5643
}),
57-
TIMEOUT_CONTROL,
44+
DEFAULT_TIMEOUT,
5845
);
5946
} catch (e) {
6047
const error = e as Error;
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import { CrawlImpl, CrawlSuccessResult } from '../type';
2+
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
3+
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
4+
5+
interface Search1ApiResponse {
6+
crawlParameters: {
7+
url: string;
8+
};
9+
results: {
10+
title?: string;
11+
link?: string;
12+
content?: string;
13+
};
14+
}
15+
16+
export const search1api: CrawlImpl = async (url, { filterOptions }) => {
17+
// Get API key from environment variable
18+
const apiKey = process.env.SEARCH1API_API_KEY;
19+
20+
if (!apiKey) {
21+
throw new Error('SEARCH1API_API_KEY environment variable is not set. Visit https://www.search1api.com to get free quota.');
22+
}
23+
24+
let res: Response;
25+
26+
try {
27+
res = await withTimeout(
28+
fetch('https://api.search1api.com/crawl', {
29+
method: 'POST',
30+
headers: {
31+
'Authorization': `Bearer ${apiKey}`,
32+
'Content-Type': 'application/json',
33+
},
34+
body: JSON.stringify({
35+
url,
36+
}),
37+
}),
38+
DEFAULT_TIMEOUT,
39+
);
40+
} catch (e) {
41+
const error = e as Error;
42+
if (error.message === 'fetch failed') {
43+
throw new NetworkConnectionError();
44+
}
45+
46+
if (error instanceof TimeoutError) {
47+
throw error;
48+
}
49+
50+
throw e;
51+
}
52+
53+
if (!res.ok) {
54+
if (res.status === 404) {
55+
throw new PageNotFoundError(res.statusText);
56+
}
57+
58+
throw new Error(`Search1API request failed with status ${res.status}: ${res.statusText}`);
59+
}
60+
61+
try {
62+
const data = await res.json() as Search1ApiResponse;
63+
64+
// Check if content is empty or too short
65+
if (!data.results.content || data.results.content.length < 100) {
66+
return;
67+
}
68+
69+
return {
70+
content: data.results.content,
71+
contentType: 'text',
72+
title: data.results.title,
73+
description: data.results.title, // Using title as description since API doesn't provide a separate description
74+
length: data.results.content.length,
75+
siteName: new URL(url).hostname,
76+
url: data.results.link || url,
77+
} satisfies CrawlSuccessResult;
78+
} catch (error) {
79+
console.error(error);
80+
}
81+
82+
return;
83+
};

packages/web-crawler/src/crawler.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export class Crawler {
1313
constructor(options: CrawlOptions = {}) {
1414
this.impls = !!options.impls?.length
1515
? (options.impls.filter((impl) => Object.keys(crawlImpls).includes(impl)) as CrawlImplType[])
16-
: (['naive', 'jina', 'browserless'] as const);
16+
: (['naive', 'jina', 'search1api','browserless'] as const);
1717
}
1818

1919
/**

packages/web-crawler/src/type.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ export interface FilterOptions {
2222
pureText?: boolean;
2323
}
2424

25-
type CrawlImplType = 'naive' | 'jina' | 'browserless';
25+
type CrawlImplType = 'naive' | 'jina' | 'browserless' | 'search1api';
2626

2727
type CrawlImplParams<T> = T & {
2828
filterOptions: FilterOptions;

packages/web-crawler/src/urlRules.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
import { CrawlUrlRule } from './type';
22

33
export const crawUrlRules: CrawlUrlRule[] = [
4+
// 搜狗微信链接,使用 search1api
5+
{
6+
impls: ['search1api'],
7+
urlPattern: 'https://weixin.sogou.com/link(.*)',
8+
},
9+
// 搜狗链接,使用 search1api
10+
{
11+
impls: ['search1api'],
12+
urlPattern: 'https://sogou.com/link(.*)',
13+
},
414
// github 源码解析
515
{
616
filterOptions: {
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import { TimeoutError } from "./errorType";
2+
export const DEFAULT_TIMEOUT = 10_000;
3+
4+
/**
5+
* Wraps a promise with a timeout
6+
* @param promise Promise to wrap
7+
* @param ms Timeout in milliseconds
8+
* @returns Promise that will be rejected if it takes longer than ms to resolve
9+
*/
10+
export const withTimeout = <T>(promise: Promise<T>, ms: number = DEFAULT_TIMEOUT): Promise<T> => {
11+
const controller = new AbortController();
12+
const timeoutPromise = new Promise<T>((_, reject) => {
13+
setTimeout(() => {
14+
controller.abort();
15+
reject(new TimeoutError(`Request timeout after ${ms}ms`));
16+
}, ms);
17+
});
18+
19+
return Promise.race([promise, timeoutPromise]);
20+
};

0 commit comments

Comments
 (0)