Skip to content

Commit 7327138

Browse files
authored
✨ feat: add search1api crawler implementation for WeChat Sogou links (lobehub#7036)
* feat: add search1api crawler implementation for Weixin Sogou links * feat(web-crawler): add withTimeout utility and refactor crawler implementations * feat(web-crawler): add support for sogou.com links * docs(web-crawler): enhance error message for missing search1api key
1 parent 2a4e2ed commit 7327138

File tree

10 files changed

+272
-23
lines changed

10 files changed

+272
-23
lines changed

packages/web-crawler/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ LobeChat's built-in web crawling module for intelligent extraction of web conten
99
## 🛠️ Core Features
1010

1111
- **Intelligent Content Extraction**: Identifies main content based on Mozilla Readability algorithm
12-
- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, and Browserless rendering
12+
- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, Search1API, and Browserless rendering
1313
- **Custom URL Rules**: Handles specific website crawling logic through a flexible rule system
1414

1515
## 🤝 Contribution
@@ -32,8 +32,8 @@ const url = [
3232
// Optional: URL transformation, redirects to an easier-to-crawl version
3333
urlTransform: 'https://example.com/print/$1',
3434

35-
// Optional: specify crawling implementation, supports 'naive', 'jina', and 'browserless'
36-
impls: ['naive', 'jina', 'browserless'],
35+
// Optional: specify crawling implementation, supports 'naive', 'jina', 'search1api', and 'browserless'
36+
impls: ['naive', 'jina', 'search1api', 'browserless'],
3737

3838
// Optional: content filtering configuration
3939
filterOptions: {

packages/web-crawler/README.zh-CN.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ LobeChat 内置的网页抓取模块,用于智能提取网页内容并转换
99
## 🛠️ 核心功能
1010

1111
- **智能内容提取**:基于 Mozilla Readability 算法识别主要内容
12-
- **多级抓取策略**:支持多种抓取实现,包括基础抓取、Jina 和 Browserless 渲染抓取
12+
- **多级抓取策略**:支持多种抓取实现,包括基础抓取、Jina、Search1API 和 Browserless 渲染抓取
1313
- **自定义 URL 规则**:通过灵活的规则系统处理特定网站的抓取逻辑
1414

1515
## 🤝 参与共建
@@ -32,8 +32,8 @@ const url = [
3232
// 可选:URL 转换,用于重定向到更易抓取的版本
3333
urlTransform: 'https://example.com/print/$1',
3434

35-
// 可选:指定抓取实现方式,支持 'naive'、'jina' 和 'browserless' 三种
36-
impls: ['naive', 'jina', 'browserless'],
35+
// 可选:指定抓取实现方式,支持 'naive'、'jina'、'search1api' 和 'browserless' 四种
36+
impls: ['naive', 'jina', 'search1api', 'browserless'],
3737

3838
// 可选:内容过滤配置
3939
filterOptions: {
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import { describe, expect, it, vi } from 'vitest';
2+
3+
import * as withTimeoutModule from '../../utils/withTimeout';
4+
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
5+
import { search1api } from '../search1api';
6+
7+
describe('search1api crawler', () => {
8+
// Mock fetch function
9+
const mockFetch = vi.fn();
10+
global.fetch = mockFetch;
11+
12+
// Original env
13+
let originalEnv: NodeJS.ProcessEnv;
14+
15+
beforeEach(() => {
16+
vi.resetAllMocks();
17+
originalEnv = { ...process.env };
18+
process.env.SEARCH1API_API_KEY = 'test-api-key';
19+
20+
// Mock withTimeout to directly return the promise
21+
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((promise) => promise);
22+
});
23+
24+
afterEach(() => {
25+
process.env = originalEnv;
26+
});
27+
28+
it('should throw error when API key is not set', async () => {
29+
delete process.env.SEARCH1API_API_KEY;
30+
31+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
32+
'SEARCH1API_API_KEY environment variable is not set',
33+
);
34+
});
35+
36+
it('should throw NetworkConnectionError when fetch fails', async () => {
37+
mockFetch.mockRejectedValue(new Error('fetch failed'));
38+
39+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
40+
NetworkConnectionError,
41+
);
42+
});
43+
44+
it('should throw TimeoutError when request times out', async () => {
45+
// Restore original withTimeout implementation for this test
46+
vi.spyOn(withTimeoutModule, 'withTimeout').mockRestore();
47+
48+
// Mock withTimeout to throw TimeoutError
49+
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation(() => {
50+
throw new TimeoutError('Request timeout after 10000ms');
51+
});
52+
53+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
54+
TimeoutError,
55+
);
56+
});
57+
58+
it('should throw PageNotFoundError when status is 404', async () => {
59+
mockFetch.mockResolvedValue({
60+
ok: false,
61+
status: 404,
62+
statusText: 'Not Found',
63+
});
64+
65+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
66+
PageNotFoundError,
67+
);
68+
});
69+
70+
it('should throw error for other failed responses', async () => {
71+
mockFetch.mockResolvedValue({
72+
ok: false,
73+
status: 500,
74+
statusText: 'Internal Server Error',
75+
});
76+
77+
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
78+
'Search1API request failed with status 500: Internal Server Error',
79+
);
80+
});
81+
82+
it('should return undefined when content is too short', async () => {
83+
mockFetch.mockResolvedValue({
84+
ok: true,
85+
json: () => Promise.resolve({
86+
crawlParameters: { url: 'https://example.com' },
87+
results: {
88+
title: 'Test Title',
89+
link: 'https://example.com',
90+
content: 'Short', // Less than 100 characters
91+
},
92+
}),
93+
});
94+
95+
const result = await search1api('https://example.com', { filterOptions: {} });
96+
expect(result).toBeUndefined();
97+
});
98+
99+
it('should return crawl result on successful fetch', async () => {
100+
const mockContent = 'This is a test content that is longer than 100 characters. '.repeat(3);
101+
102+
mockFetch.mockResolvedValue({
103+
ok: true,
104+
json: () => Promise.resolve({
105+
crawlParameters: { url: 'https://example.com' },
106+
results: {
107+
title: 'Test Title',
108+
link: 'https://example.com',
109+
content: mockContent,
110+
},
111+
}),
112+
});
113+
114+
const result = await search1api('https://example.com', { filterOptions: {} });
115+
116+
expect(mockFetch).toHaveBeenCalledWith('https://api.search1api.com/crawl', {
117+
method: 'POST',
118+
headers: {
119+
'Authorization': 'Bearer test-api-key',
120+
'Content-Type': 'application/json',
121+
},
122+
body: JSON.stringify({
123+
url: 'https://example.com',
124+
}),
125+
});
126+
127+
expect(result).toEqual({
128+
content: mockContent,
129+
contentType: 'text',
130+
title: 'Test Title',
131+
description: 'Test Title',
132+
length: mockContent.length,
133+
siteName: 'example.com',
134+
url: 'https://example.com',
135+
});
136+
});
137+
138+
it('should handle JSON parse errors', async () => {
139+
mockFetch.mockResolvedValue({
140+
ok: true,
141+
json: () => Promise.reject(new Error('Invalid JSON')),
142+
});
143+
144+
const result = await search1api('https://example.com', { filterOptions: {} });
145+
expect(result).toBeUndefined();
146+
});
147+
});
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import { browserless } from './browserless';
import { jina } from './jina';
import { naive } from './naive';
import { search1api } from './search1api';

// Registry of every available crawler implementation, keyed by its name.
// The Crawler constructor validates user-supplied impl names against these keys.
export const crawlImpls = {
  browserless,
  jina,
  naive,
  search1api,
};

// Union of valid implementation names, derived from the registry keys
// ('browserless' | 'jina' | 'naive' | 'search1api').
export type CrawlImplType = keyof typeof crawlImpls;

packages/web-crawler/src/crawImpl/naive.ts

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { CrawlImpl, CrawlSuccessResult } from '../type';
22
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
33
import { htmlToMarkdown } from '../utils/htmlToMarkdown';
4+
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
45

56
const mixinHeaders = {
67
// 接受的内容类型
@@ -31,20 +32,6 @@ const mixinHeaders = {
3132
'sec-fetch-user': '?1',
3233
};
3334

34-
const TIMEOUT_CONTROL = 10_000;
35-
36-
const withTimeout = <T>(promise: Promise<T>, ms: number): Promise<T> => {
37-
const controller = new AbortController();
38-
const timeoutPromise = new Promise<T>((_, reject) => {
39-
setTimeout(() => {
40-
controller.abort();
41-
reject(new TimeoutError(`Request timeout after ${ms}ms`));
42-
}, ms);
43-
});
44-
45-
return Promise.race([promise, timeoutPromise]);
46-
};
47-
4835
export const naive: CrawlImpl = async (url, { filterOptions }) => {
4936
let res: Response;
5037

@@ -54,7 +41,7 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
5441
headers: mixinHeaders,
5542
signal: new AbortController().signal,
5643
}),
57-
TIMEOUT_CONTROL,
44+
DEFAULT_TIMEOUT,
5845
);
5946
} catch (e) {
6047
const error = e as Error;
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import { CrawlImpl, CrawlSuccessResult } from '../type';
2+
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
3+
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
4+
5+
interface Search1ApiResponse {
6+
crawlParameters: {
7+
url: string;
8+
};
9+
results: {
10+
title?: string;
11+
link?: string;
12+
content?: string;
13+
};
14+
}
15+
16+
export const search1api: CrawlImpl = async (url, { filterOptions }) => {
17+
// Get API key from environment variable
18+
const apiKey = process.env.SEARCH1API_API_KEY;
19+
20+
if (!apiKey) {
21+
throw new Error('SEARCH1API_API_KEY environment variable is not set. Visit https://www.search1api.com to get free quota.');
22+
}
23+
24+
let res: Response;
25+
26+
try {
27+
res = await withTimeout(
28+
fetch('https://api.search1api.com/crawl', {
29+
method: 'POST',
30+
headers: {
31+
'Authorization': `Bearer ${apiKey}`,
32+
'Content-Type': 'application/json',
33+
},
34+
body: JSON.stringify({
35+
url,
36+
}),
37+
}),
38+
DEFAULT_TIMEOUT,
39+
);
40+
} catch (e) {
41+
const error = e as Error;
42+
if (error.message === 'fetch failed') {
43+
throw new NetworkConnectionError();
44+
}
45+
46+
if (error instanceof TimeoutError) {
47+
throw error;
48+
}
49+
50+
throw e;
51+
}
52+
53+
if (!res.ok) {
54+
if (res.status === 404) {
55+
throw new PageNotFoundError(res.statusText);
56+
}
57+
58+
throw new Error(`Search1API request failed with status ${res.status}: ${res.statusText}`);
59+
}
60+
61+
try {
62+
const data = await res.json() as Search1ApiResponse;
63+
64+
// Check if content is empty or too short
65+
if (!data.results.content || data.results.content.length < 100) {
66+
return;
67+
}
68+
69+
return {
70+
content: data.results.content,
71+
contentType: 'text',
72+
title: data.results.title,
73+
description: data.results.title, // Using title as description since API doesn't provide a separate description
74+
length: data.results.content.length,
75+
siteName: new URL(url).hostname,
76+
url: data.results.link || url,
77+
} satisfies CrawlSuccessResult;
78+
} catch (error) {
79+
console.error(error);
80+
}
81+
82+
return;
83+
};

packages/web-crawler/src/crawler.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export class Crawler {
1313
constructor(options: CrawlOptions = {}) {
1414
this.impls = !!options.impls?.length
1515
? (options.impls.filter((impl) => Object.keys(crawlImpls).includes(impl)) as CrawlImplType[])
16-
: (['naive', 'jina', 'browserless'] as const);
16+
: (['naive', 'jina', 'search1api','browserless'] as const);
1717
}
1818

1919
/**

packages/web-crawler/src/type.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ export interface FilterOptions {
2222
pureText?: boolean;
2323
}
2424

25-
type CrawlImplType = 'naive' | 'jina' | 'browserless';
25+
type CrawlImplType = 'naive' | 'jina' | 'browserless' | 'search1api';
2626

2727
type CrawlImplParams<T> = T & {
2828
filterOptions: FilterOptions;

packages/web-crawler/src/urlRules.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
import { CrawlUrlRule } from './type';
22

33
export const crawUrlRules: CrawlUrlRule[] = [
4+
// 搜狗微信链接,使用 search1api
5+
{
6+
impls: ['search1api'],
7+
urlPattern: 'https://weixin.sogou.com/link(.*)',
8+
},
9+
// 搜狗链接,使用 search1api
10+
{
11+
impls: ['search1api'],
12+
urlPattern: 'https://sogou.com/link(.*)',
13+
},
414
// github 源码解析
515
{
616
filterOptions: {
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import { TimeoutError } from "./errorType";
2+
export const DEFAULT_TIMEOUT = 10_000;
3+
4+
/**
5+
* Wraps a promise with a timeout
6+
* @param promise Promise to wrap
7+
* @param ms Timeout in milliseconds
8+
* @returns Promise that will be rejected if it takes longer than ms to resolve
9+
*/
10+
export const withTimeout = <T>(promise: Promise<T>, ms: number = DEFAULT_TIMEOUT): Promise<T> => {
11+
const controller = new AbortController();
12+
const timeoutPromise = new Promise<T>((_, reject) => {
13+
setTimeout(() => {
14+
controller.abort();
15+
reject(new TimeoutError(`Request timeout after ${ms}ms`));
16+
}, ms);
17+
});
18+
19+
return Promise.race([promise, timeoutPromise]);
20+
};

0 commit comments

Comments
 (0)