Skip to content

Commit 640c87c

Browse files
committed
fix: frequent embedding
1 parent aa20e7b commit 640c87c

File tree

6 files changed

+132
-10
lines changed

6 files changed

+132
-10
lines changed

packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,28 @@ Generated by [AVA](https://avajs.dev).
101101
102102
0
103103

104+
## should check need to be embedded
105+
106+
> document with no embedding should need embedding
107+
108+
true
109+
110+
> document with recent embedding should not need embedding
111+
112+
false
113+
114+
> document updated after embedding and older-than-10m should need embedding
115+
116+
true
117+
118+
> should not need embedding when only 10-minute window passed without updates
119+
120+
false
121+
122+
> should need embedding when doc updated and last embedding older than 10 minutes
123+
124+
true
125+
104126
## should filter outdated doc id style in embedding status
105127

106128
> should include modern doc format

packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,10 @@ test('should check need to be embedded', async t => {
293293
workspace.id,
294294
docId
295295
);
296-
t.true(needsEmbedding, 'document with no embedding should need embedding');
296+
t.snapshot(
297+
needsEmbedding,
298+
'document with no embedding should need embedding'
299+
);
297300
}
298301

299302
{
@@ -313,7 +316,7 @@ test('should check need to be embedded', async t => {
313316
workspace.id,
314317
docId
315318
);
316-
t.false(
319+
t.snapshot(
317320
needsEmbedding,
318321
'document with recent embedding should not need embedding'
319322
);
@@ -328,15 +331,83 @@ test('should check need to be embedded', async t => {
328331
editorId: user.id,
329332
});
330333

334+
// simulate an old embedding
335+
const oldEmbeddingTime = new Date(Date.now() - 25 * 60 * 1000);
336+
await t.context.db.aiWorkspaceEmbedding.updateMany({
337+
where: { workspaceId: workspace.id, docId },
338+
data: { updatedAt: oldEmbeddingTime },
339+
});
340+
331341
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
332342
workspace.id,
333343
docId
334344
);
335-
t.true(
345+
t.snapshot(
346+
needsEmbedding,
347+
'document updated after embedding and older-than-10m should need embedding'
348+
);
349+
}
350+
351+
{
352+
// only time passed (>10m since last embedding) but no doc updates => should NOT re-embed
353+
const baseNow = Date.now();
354+
const docId2 = randomUUID();
355+
const t0 = baseNow - 30 * 60 * 1000; // snapshot updated 30 minutes ago
356+
const t1 = baseNow - 25 * 60 * 1000; // embedding updated 25 minutes ago
357+
358+
await t.context.doc.upsert({
359+
spaceId: workspace.id,
360+
docId: docId2,
361+
blob: Uint8Array.from([1, 2, 3]),
362+
timestamp: t0,
363+
editorId: user.id,
364+
});
365+
366+
await t.context.copilotContext.insertWorkspaceEmbedding(
367+
workspace.id,
368+
docId2,
369+
[
370+
{
371+
index: 0,
372+
content: 'content2',
373+
embedding: Array.from({ length: 1024 }, () => 1),
374+
},
375+
]
376+
);
377+
378+
await t.context.db.aiWorkspaceEmbedding.updateMany({
379+
where: { workspaceId: workspace.id, docId: docId2 },
380+
data: { updatedAt: new Date(t1) },
381+
});
382+
383+
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
384+
workspace.id,
385+
docId2
386+
);
387+
t.snapshot(
388+
needsEmbedding,
389+
'should not need embedding when only 10-minute window passed without updates'
390+
);
391+
392+
const t2 = baseNow - 5 * 60 * 1000; // doc updated 5 minutes ago
393+
await t.context.doc.upsert({
394+
spaceId: workspace.id,
395+
docId: docId2,
396+
blob: Uint8Array.from([7, 8, 9]),
397+
timestamp: t2,
398+
editorId: user.id,
399+
});
400+
401+
needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
402+
workspace.id,
403+
docId2
404+
);
405+
t.snapshot(
336406
needsEmbedding,
337-
'document updated after embedding should need embedding'
407+
'should need embedding when doc updated and last embedding older than 10 minutes'
338408
);
339409
}
410+
// --- new cases end ---
340411
});
341412

342413
test('should check embedding table', async t => {

packages/backend/server/src/models/copilot-context.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ export class CopilotContextModel extends BaseModel {
217217
});
218218
return file?.map(f => f.content).join('\n');
219219
}
220+
220221
async insertFileEmbedding(
221222
contextId: string,
222223
fileId: string,
@@ -263,6 +264,19 @@ export class CopilotContextModel extends BaseModel {
263264
return similarityChunks.filter(c => Number(c.distance) <= threshold);
264265
}
265266

267+
async getWorkspaceContent(
268+
workspaceId: string,
269+
docId: string,
270+
chunk?: number
271+
): Promise<string | undefined> {
272+
const file = await this.db.aiWorkspaceEmbedding.findMany({
273+
where: { workspaceId, docId, chunk },
274+
select: { content: true },
275+
orderBy: { chunk: 'asc' },
276+
});
277+
return file?.map(f => f.content).join('\n');
278+
}
279+
266280
async insertWorkspaceEmbedding(
267281
workspaceId: string,
268282
docId: string,
@@ -287,6 +301,7 @@ export class CopilotContextModel extends BaseModel {
287301
VALUES ${values}
288302
ON CONFLICT (workspace_id, doc_id, chunk)
289303
DO UPDATE SET
304+
content = EXCLUDED.content,
290305
embedding = EXCLUDED.embedding,
291306
updated_at = excluded.updated_at;
292307
`;

packages/backend/server/src/models/copilot-workspace.ts

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -206,10 +206,9 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
206206
@Transactional()
207207
async checkDocNeedEmbedded(workspaceId: string, docId: string) {
208208
// NOTE: check if the document needs re-embedding.
209-
// 1. check if there have been any recent updates to the document snapshot and update
210-
// 2. check if the embedding is older than the snapshot and update
211-
// 3. check if the embedding is older than 10 minutes (avoid frequent updates)
212-
// if all conditions are met, re-embedding is required.
209+
// 1. first-time embedding when no embedding exists
210+
// 2. re-embedding only when the doc has updates newer than the last embedding
211+
// AND the last embedding is older than 10 minutes (avoid frequent updates)
213212
const result = await this.db.$queryRaw<{ needs_embedding: boolean }[]>`
214213
SELECT
215214
EXISTS (
@@ -244,8 +243,7 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
244243
AND e.doc_id = docs.doc_id
245244
WHERE
246245
e.updated_at IS NULL
247-
OR docs.updated_at > e.updated_at
248-
OR e.updated_at < NOW() - INTERVAL '10 minutes'
246+
OR (docs.updated_at > e.updated_at AND e.updated_at < NOW() - INTERVAL '10 minutes')
249247
) AS needs_embedding;
250248
`;
251249

packages/backend/server/src/plugins/copilot/embedding/job.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,22 @@ export class CopilotEmbeddingJob {
429429
if (!hasNewDoc && fragment) {
430430
// fast fail for empty doc, since a journal can easily create an empty doc
431431
if (fragment.summary.trim()) {
432+
const existsContent =
433+
await this.models.copilotContext.getWorkspaceContent(
434+
workspaceId,
435+
docId
436+
);
437+
if (
438+
existsContent &&
439+
existsContent.replaceAll('\n', '') ===
440+
fragment.summary.replaceAll('\n', '')
441+
) {
442+
this.logger.log(
443+
`Doc ${docId} in workspace ${workspaceId} has no content change, skipping embedding.`
444+
);
445+
return;
446+
}
447+
432448
const embeddings = await this.embeddingClient.getFileEmbeddings(
433449
new File(
434450
[fragment.summary],

0 commit comments

Comments
 (0)