Skip to content

Commit e2a9059

Browse files
committed
feat: clear meta in chunk
1 parent 640c87c commit e2a9059

File tree

5 files changed

+43
-34
lines changed

5 files changed

+43
-34
lines changed

packages/backend/server/src/models/common/copilot.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,36 @@ export type IgnoredDoc = {
148148
createdByAvatar: string | undefined;
149149
updatedBy: string | undefined;
150150
};
151+
152+
/**
 * Shared embedding dimension constant, exported from this common module so
 * that model files (e.g. `copilot-context.ts`) import it from here.
 * NOTE(review): presumably the length of stored embedding vectors — confirm
 * against the embedding provider / vector column definition.
 */
export const EMBEDDING_DIMENSIONS = 1024;
153+
154+
/**
 * Line prefixes that mark a metadata header row at the top of embedded
 * content. A line counts as metadata only if it starts with one of these
 * strings exactly (including the trailing space).
 */
const FILTER_PREFIX: readonly string[] = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];

/**
 * Upper bound on how many leading lines may be stripped. Tied to the number
 * of known prefixes (currently 5) so the cap follows the prefix list.
 */
const MAX_METADATA_LINES = FILTER_PREFIX.length;

/** Returns true when the text at `offset` begins with a metadata prefix. */
function startsWithMetadataPrefix(content: string, offset: number): boolean {
  // No prefix contains a newline, so a match at `offset` always lies within
  // the line that starts there.
  return FILTER_PREFIX.some(prefix => content.startsWith(prefix, offset));
}

/**
 * Strips the leading metadata header rows from embedded content.
 *
 * Only consecutive metadata lines at the very start are removed, and at most
 * `MAX_METADATA_LINES` of them; a metadata-looking line that appears after
 * the first non-metadata line is left untouched.
 *
 * The content is scanned in place instead of being split into lines and
 * re-joined, so the cost is bounded by the length of the header, not by the
 * length of the whole content.
 *
 * @param content - Raw content, lines separated by `\n`.
 * @returns The content without its leading metadata rows; an empty string
 * when the content consists solely of metadata rows.
 */
export function clearEmbeddingContent(content: string): string {
  let offset = 0;
  for (let removed = 0; removed < MAX_METADATA_LINES; removed++) {
    if (!startsWithMetadataPrefix(content, offset)) {
      // only process consecutive metadata rows
      break;
    }
    const newline = content.indexOf('\n', offset);
    if (newline === -1) {
      // The last remaining line was metadata: nothing is left.
      return '';
    }
    offset = newline + 1;
  }
  return content.slice(offset);
}
176+
177+
/**
 * Returns a chunk whose `content` has had its leading metadata rows removed
 * via `clearEmbeddingContent`.
 *
 * The input is never mutated: a shallow copy carrying the cleaned content is
 * returned. A chunk with falsy `content` is handed back as-is.
 *
 * @param chunk - Similarity search result to clean.
 * @returns The same chunk when it has no content, otherwise a copy with
 * cleaned content.
 */
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
  // Nothing to strip when the content is missing or empty.
  if (!chunk.content) {
    return chunk;
  }
  return { ...chunk, content: clearEmbeddingContent(chunk.content) };
}

packages/backend/server/src/models/copilot-context.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,21 @@ import { Prisma } from '@prisma/client';
66
import { CopilotSessionNotFound } from '../base';
77
import { BaseModel } from './base';
88
import {
9+
clearEmbeddingContent,
910
ContextBlob,
1011
ContextConfigSchema,
1112
ContextDoc,
1213
ContextEmbedStatus,
1314
CopilotContext,
1415
DocChunkSimilarity,
1516
Embedding,
17+
EMBEDDING_DIMENSIONS,
1618
FileChunkSimilarity,
1719
MinimalContextConfigSchema,
1820
} from './common/copilot';
1921

2022
type UpdateCopilotContextInput = Pick<CopilotContext, 'config'>;
2123

22-
export const EMBEDDING_DIMENSIONS = 1024;
23-
2424
/**
2525
* Copilot Job Model
2626
*/
@@ -215,7 +215,7 @@ export class CopilotContextModel extends BaseModel {
215215
select: { content: true },
216216
orderBy: { chunk: 'asc' },
217217
});
218-
return file?.map(f => f.content).join('\n');
218+
return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
219219
}
220220

221221
async insertFileEmbedding(
@@ -274,7 +274,7 @@ export class CopilotContextModel extends BaseModel {
274274
select: { content: true },
275275
orderBy: { chunk: 'asc' },
276276
});
277-
return file?.map(f => f.content).join('\n');
277+
return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
278278
}
279279

280280
async insertWorkspaceEmbedding(

packages/backend/server/src/plugins/copilot/mcp/provider.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ import z from 'zod';
66

77
import { DocReader } from '../../../core/doc';
88
import { AccessController } from '../../../core/permission';
9+
import { clearEmbeddingChunk } from '../../../models';
910
import { IndexerService } from '../../indexer';
1011
import { CopilotContextService } from '../context';
11-
import { clearEmbeddingChunk } from '../utils';
1212

1313
@Injectable()
1414
export class WorkspaceMcpProvider {

packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@ import { omit } from 'lodash-es';
33
import { z } from 'zod';
44

55
import type { AccessController } from '../../../core/permission';
6-
import type { ChunkSimilarity, Models } from '../../../models';
6+
import {
7+
type ChunkSimilarity,
8+
clearEmbeddingChunk,
9+
type Models,
10+
} from '../../../models';
711
import type { CopilotContextService } from '../context';
812
import type { ContextSession } from '../context/session';
913
import type { CopilotChatOptions } from '../providers';
10-
import { clearEmbeddingChunk } from '../utils';
1114
import { toolError } from './error';
1215

1316
export const buildDocSearchGetter = (

packages/backend/server/src/plugins/copilot/utils.ts

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ import { Readable } from 'node:stream';
33
import type { Request } from 'express';
44

55
import { OneMB, readBufferWithLimit } from '../../base';
6-
import type { ChunkSimilarity } from '../../models';
76
import type { PromptTools } from './providers';
87
import type { ToolsConfig } from './types';
98

@@ -83,29 +82,3 @@ export function getTools(
8382
});
8483
return result;
8584
}
86-
87-
const FILTER_PREFIX = [
88-
'Title: ',
89-
'Created at: ',
90-
'Updated at: ',
91-
'Created by: ',
92-
'Updated by: ',
93-
];
94-
95-
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
96-
if (chunk.content) {
97-
const lines = chunk.content.split('\n');
98-
let maxLines = 5;
99-
while (maxLines > 0 && lines.length > 0) {
100-
if (FILTER_PREFIX.some(prefix => lines[0].startsWith(prefix))) {
101-
lines.shift();
102-
maxLines--;
103-
} else {
104-
// only process consecutive metadata rows
105-
break;
106-
}
107-
}
108-
return { ...chunk, content: lines.join('\n') };
109-
}
110-
return chunk;
111-
}

0 commit comments

Comments
 (0)