Commit 7ce9690

feat: add search1api crawler implementation for Weixin Sogou links
1 parent adfadff commit 7ce9690

7 files changed: 111 additions & 8 deletions

packages/web-crawler/README.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -9,7 +9,7 @@ LobeChat's built-in web crawling module for intelligent extraction of web conten
 ## 🛠️ Core Features
 
 - **Intelligent Content Extraction**: Identifies main content based on Mozilla Readability algorithm
-- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, and Browserless rendering
+- **Multi-level Crawling Strategy**: Supports multiple crawling implementations including basic crawling, Jina, Search1API, and Browserless rendering
 - **Custom URL Rules**: Handles specific website crawling logic through a flexible rule system
 
 ## 🤝 Contribution
@@ -32,8 +32,8 @@ const url = [
   // Optional: URL transformation, redirects to an easier-to-crawl version
   urlTransform: 'https://example.com/print/$1',
 
-  // Optional: specify crawling implementation, supports 'naive', 'jina', and 'browserless'
-  impls: ['naive', 'jina', 'browserless'],
+  // Optional: specify crawling implementation, supports 'naive', 'jina', 'search1api', and 'browserless'
+  impls: ['naive', 'jina', 'search1api', 'browserless'],
 
   // Optional: content filtering configuration
   filterOptions: {
```
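The fields documented above map directly onto the CrawlUrlRule entries in src/urlRules.ts. As a minimal sketch, here is what a complete rule using the new implementation could look like (the example.com pattern is illustrative, and the relative import path assumes the package's src layout):

```ts
import { CrawlUrlRule } from './type';

// Hypothetical rule: rewrite article URLs to a print-friendly version and
// try search1api first, falling back to browserless rendering.
const exampleRule: CrawlUrlRule = {
  // strip markup and keep plain text only (see FilterOptions in type.ts)
  filterOptions: { pureText: true },
  impls: ['search1api', 'browserless'],
  urlPattern: 'https://example.com/articles/(.*)',
  urlTransform: 'https://example.com/print/$1',
};
```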

packages/web-crawler/README.zh-CN.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -9,7 +9,7 @@ LobeChat's built-in web crawling module, used to intelligently extract web content and convert
 ## 🛠️ Core Features
 
 - **Intelligent Content Extraction**: Identifies main content based on the Mozilla Readability algorithm
-- **Multi-level Crawling Strategy**: Supports multiple crawling implementations, including basic crawling, Jina, and Browserless rendering
+- **Multi-level Crawling Strategy**: Supports multiple crawling implementations, including basic crawling, Jina, Search1API, and Browserless rendering
 - **Custom URL Rules**: Handles site-specific crawling logic through a flexible rule system
 
 ## 🤝 Contribution
@@ -32,8 +32,8 @@ const url = [
   // Optional: URL transformation, redirects to an easier-to-crawl version
   urlTransform: 'https://example.com/print/$1',
 
-  // Optional: specify the crawling implementation; supports the three options 'naive', 'jina', and 'browserless'
-  impls: ['naive', 'jina', 'browserless'],
+  // Optional: specify the crawling implementation; supports the four options 'naive', 'jina', 'search1api', and 'browserless'
+  impls: ['naive', 'jina', 'search1api', 'browserless'],
 
   // Optional: content filtering configuration
   filterOptions: {
```
packages/web-crawler/src/crawlImpl/index.ts

Lines changed: 2 additions & 0 deletions

```diff
@@ -1,11 +1,13 @@
 import { browserless } from './browserless';
 import { jina } from './jina';
 import { naive } from './naive';
+import { search1api } from './search1api';
 
 export const crawlImpls = {
   browserless,
   jina,
   naive,
+  search1api,
 };
 
 export type CrawlImplType = keyof typeof crawlImpls;
```
packages/web-crawler/src/crawlImpl/search1api.ts

Lines changed: 96 additions & 0 deletions

```diff
@@ -0,0 +1,96 @@
+import { CrawlImpl, CrawlSuccessResult } from '../type';
+import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
+
+interface Search1ApiResponse {
+  crawlParameters: {
+    url: string;
+  };
+  results: {
+    title?: string;
+    link?: string;
+    content?: string;
+  };
+}
+
+const TIMEOUT_CONTROL = 10_000;
+
+const withTimeout = <T>(promise: Promise<T>, ms: number): Promise<T> => {
+  const controller = new AbortController();
+  const timeoutPromise = new Promise<T>((_, reject) => {
+    setTimeout(() => {
+      controller.abort();
+      reject(new TimeoutError(`Request timeout after ${ms}ms`));
+    }, ms);
+  });
+
+  return Promise.race([promise, timeoutPromise]);
+};
+
+export const search1api: CrawlImpl = async (url, { filterOptions }) => {
+  // Get API key from environment variable
+  const apiKey = process.env.SEARCH1API_API_KEY;
+
+  if (!apiKey) {
+    throw new Error('SEARCH1API_API_KEY environment variable is not set');
+  }
+
+  let res: Response;
+
+  try {
+    res = await withTimeout(
+      fetch('https://api.search1api.com/crawl', {
+        method: 'POST',
+        headers: {
+          'Authorization': `Bearer ${apiKey}`,
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({
+          url,
+        }),
+      }),
+      TIMEOUT_CONTROL,
+    );
+  } catch (e) {
+    const error = e as Error;
+    if (error.message === 'fetch failed') {
+      throw new NetworkConnectionError();
+    }
+
+    if (error instanceof TimeoutError) {
+      throw error;
+    }
+
+    throw e;
+  }
+
+  if (!res.ok) {
+    if (res.status === 404) {
+      throw new PageNotFoundError(res.statusText);
+    }
+
+    throw new Error(`Search1API request failed with status ${res.status}: ${res.statusText}`);
+  }
+
+  try {
+    const data = await res.json() as Search1ApiResponse;
+
+    // Check if content is empty or too short
+    if (!data.results.content || data.results.content.length < 100) {
+      return;
+    }
+
+    return {
+      content: data.results.content,
+      contentType: 'text',
+      title: data.results.title,
+      description: data.results.title, // Using title as description since API doesn't provide a separate description
+      length: data.results.content.length,
+      siteName: new URL(url).hostname,
+      url: data.results.link || url,
+    } satisfies CrawlSuccessResult;
+  } catch (error) {
+    console.error(error);
+  }
+
+  return;
+};
```
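To exercise the new implementation in isolation, it can be invoked directly; a minimal sketch, assuming it runs as a sibling module inside crawlImpl with a valid SEARCH1API_API_KEY exported in the environment (the sample URL is illustrative):

```ts
import { search1api } from './search1api';

const main = async () => {
  // filterOptions is part of the CrawlImpl signature, although this
  // implementation does not use it.
  const result = await search1api('https://weixin.sogou.com/link', {
    filterOptions: {},
  });

  // An undefined result means the API returned empty or sub-100-character
  // content, so the caller should fall through to the next implementation.
  console.log(result?.title, result?.length);
};

main();
```

One quirk worth noting in the committed code: withTimeout creates an AbortController and calls abort() on timeout, but controller.signal is never passed to fetch, so a timed-out request is rejected by Promise.race while the underlying HTTP request keeps running in the background.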

packages/web-crawler/src/crawler.ts

Lines changed: 1 addition & 1 deletion

```diff
@@ -13,7 +13,7 @@ export class Crawler {
   constructor(options: CrawlOptions = {}) {
     this.impls = !!options.impls?.length
      ? (options.impls.filter((impl) => Object.keys(crawlImpls).includes(impl)) as CrawlImplType[])
-      : (['naive', 'jina', 'browserless'] as const);
+      : (['naive', 'jina', 'search1api','browserless'] as const);
   }
 
   /**
```
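The practical effect of this one-line change is that the fallback chain used when no impls are supplied now includes search1api; a small sketch (the import path is assumed):

```ts
import { Crawler } from './crawler';

// No impls supplied: the default chain applies, now including search1api.
const withDefaults = new Crawler();
// impls -> ['naive', 'jina', 'search1api', 'browserless']

// Explicit impls are validated against the crawlImpls registry, so only
// known implementation names survive the filter.
const searchOnly = new Crawler({ impls: ['search1api'] });
// impls -> ['search1api']
```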

packages/web-crawler/src/type.ts

Lines changed: 1 addition & 1 deletion

```diff
@@ -22,7 +22,7 @@ export interface FilterOptions {
   pureText?: boolean;
 }
 
-type CrawlImplType = 'naive' | 'jina' | 'browserless';
+type CrawlImplType = 'naive' | 'jina' | 'browserless' | 'search1api';
 
 type CrawlImplParams<T> = T & {
   filterOptions: FilterOptions;
```

packages/web-crawler/src/urlRules.ts

Lines changed: 5 additions & 0 deletions

```diff
@@ -1,6 +1,11 @@
 import { CrawlUrlRule } from './type';
 
 export const crawUrlRules: CrawlUrlRule[] = [
+  // Sogou Weixin links: use search1api
+  {
+    impls: ['search1api'],
+    urlPattern: 'https://weixin.sogou.com/link(.*)',
+  },
   // github source code parsing
   {
     filterOptions: {
```
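Because the new rule pins impls to ['search1api'], every Sogou Weixin redirect link bypasses the other implementations. Below is a sketch of what the urlPattern captures; converting the pattern to a RegExp here is an assumption about how the rule engine matches, not code from this commit:

```ts
// The '(.*)' suffix matches any path tail or query string under
// https://weixin.sogou.com/link.
const pattern = /^https:\/\/weixin\.sogou\.com\/link(.*)$/;

console.log(pattern.test('https://weixin.sogou.com/link?url=abc123')); // true
console.log(pattern.test('https://mp.weixin.qq.com/s/abc123')); // false
```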
