Skip to content

Commit d754acc

Browse files
authored
Merge pull request #1446 from mfts/marc/pm-72-migrate-to-trigger-v3-for-the-document-upload
feat: add trigger v3 convert pdf
2 parents ac45116 + 1e5587f commit d754acc

File tree

3 files changed

+249
-10
lines changed

3 files changed

+249
-10
lines changed

lib/trigger/pdf-to-image.ts

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
import { logger, task } from "@trigger.dev/sdk/v3";
2+
import { execSync } from "child_process";
3+
import { createReadStream, createWriteStream } from "fs";
4+
import fs from "fs/promises";
5+
import fetch from "node-fetch";
6+
import os from "os";
7+
import path from "path";
8+
import { pipeline } from "stream/promises";
9+
10+
import { getFile } from "@/lib/files/get-file";
11+
import { streamFileServer } from "@/lib/files/stream-file-server";
12+
import prisma from "@/lib/prisma";
13+
14+
/**
 * Trigger.dev v3 task that converts every page of a stored PDF into an image.
 *
 * Steps, in order:
 *  1. Load the document version and resolve a download URL via `getFile`.
 *  2. Download the PDF into a fresh temporary directory.
 *  3. Use `mutool show` to read the first page's MediaBox and the page count.
 *  4. For each page, render both a PNG and a JPEG with `mutool convert`,
 *     keep whichever file is smaller, upload it with `streamFileServer`, and
 *     create a `documentPage` row.
 *  5. Mark the version as `hasPages` / `isPrimary`; when `versionNumber` is
 *     given, clear `isPrimary` on the document's other versions.
 *
 * The temporary directory is removed on every exit path (success or failure).
 *
 * @param payload.documentVersionId - id of the `documentVersion` row to process
 * @param payload.teamId - forwarded to `streamFileServer` for the upload
 * @param payload.docId - forwarded to `streamFileServer`; also used as the
 *   `documentId` filter when demoting other versions
 * @param payload.versionNumber - optional; when truthy, all other versions of
 *   `docId` are set to `isPrimary: false`
 * @returns `{ success: true, message, totalPages }` on success
 * @throws Re-throws any error after logging it, so the task run is marked failed
 */
export const convertPdfToImage = task({
  id: "convert-pdf-to-image",
  machine: {
    preset: "small-2x",
  },
  run: async (payload: {
    documentVersionId: string;
    teamId: string;
    docId: string;
    versionNumber?: number;
  }) => {
    const { documentVersionId, teamId, docId, versionNumber } = payload;

    // Declared outside the try block so `finally` can clean it up even when
    // the conversion fails half-way through.
    let tempDirectory: string | undefined;

    try {
      // Get document version
      const documentVersion = await prisma.documentVersion.findUnique({
        where: { id: documentVersionId },
        select: {
          file: true,
          storageType: true,
          numPages: true,
        },
      });

      if (!documentVersion) {
        throw new Error("Document version not found");
      }

      // Get signed URL for the PDF
      const pdfUrl = await getFile({
        type: documentVersion.storageType,
        data: documentVersion.file,
      });

      if (!pdfUrl) {
        throw new Error("Failed to get signed URL");
      }

      // The URL is signed (it grants access to the file), so it is
      // deliberately kept out of the logs.
      logger.info("Starting PDF conversion", { documentVersionId });

      // Create a unique temp directory; mkdtemp appends random characters so
      // two runs starting in the same millisecond cannot share a directory.
      const workDirectory = await fs.mkdtemp(path.join(os.tmpdir(), "pdf_"));
      tempDirectory = workDirectory;
      const pdfPath = path.join(workDirectory, "input.pdf");

      // Stream PDF to temporary file
      const response = await fetch(pdfUrl);
      if (!response.ok) {
        // Without this check an error page would be saved as input.pdf and
        // surface later as an unrelated mutool failure.
        throw new Error(
          `Failed to fetch PDF: ${response.status} ${response.statusText}`,
        );
      }
      if (!response.body) {
        throw new Error("Failed to fetch PDF stream");
      }

      logger.info("Streaming PDF to temporary file");
      await pipeline(response.body, createWriteStream(pdfPath));

      // Get first page dimensions. `pdfPath` is built only from os.tmpdir()
      // and fixed names, so interpolating it into the shell command is safe.
      const mediaBoxOutput = execSync(
        `mutool show "${pdfPath}" "pages/1/MediaBox"`,
        { encoding: "utf8" },
      );
      // Parse dimensions, removing brackets and splitting on whitespace
      const dimensions = mediaBoxOutput
        .replace(/[\[\]]/g, "")
        .trim()
        .split(/\s+/)
        .map((value) => parseFloat(value));
      const [ulx, uly, lrx, lry] = dimensions;
      if (![ulx, uly, lrx, lry].every((value) => Number.isFinite(value))) {
        // Best effort: conversion still works without dimensions (the 3x
        // resolution branch is taken), but the stored metadata will be NaN.
        logger.warn("Could not parse first page MediaBox", {
          output: mediaBoxOutput.trim(),
        });
      }
      const widthInPoints = Math.abs(lrx - ulx);
      const heightInPoints = Math.abs(lry - uly);
      const resolution = widthInPoints >= 1600 ? 288 : 432; // 2x or 3x of 144 DPI
      const scaleFactor = resolution / 144;

      // Get total pages
      const pageCountOutput = execSync(
        `mutool show "${pdfPath}" trailer/Root/Pages/Count`,
        { encoding: "utf8" },
      );
      const totalPages = parseInt(pageCountOutput.trim(), 10);
      if (!Number.isInteger(totalPages) || totalPages < 1) {
        // Fail loudly instead of writing NaN to numPages or flagging a
        // version with zero rendered pages as `hasPages`.
        throw new Error(
          `Could not determine page count from mutool output: "${pageCountOutput.trim()}"`,
        );
      }

      logger.info("PDF metadata", {
        totalPages,
        widthInPoints,
        heightInPoints,
        resolution,
        scaleFactor,
      });

      // Update document version with total pages
      await prisma.documentVersion.update({
        where: { id: documentVersionId },
        data: { numPages: totalPages },
      });

      // Process each page
      for (let pageNumber = 1; pageNumber <= totalPages; pageNumber++) {
        // Both formats share the same base name and differ only in extension.
        const outputBasePath = path.join(workDirectory, `page-${pageNumber}`);

        // The actual files will have "1" appended by mutool
        const pngPath = `${outputBasePath}1.png`;
        const jpegPath = `${outputBasePath}1.jpg`;

        // Convert to PNG
        execSync(
          `mutool convert -o "${outputBasePath}.png" -F png -O "resolution=${resolution}" "${pdfPath}" ${pageNumber}`,
        );
        // Convert to JPEG
        execSync(
          `mutool convert -o "${outputBasePath}.jpg" -F jpeg -O "resolution=${resolution},quality=80" "${pdfPath}" ${pageNumber}`,
        );

        // Get file sizes
        const pngStats = await fs.stat(pngPath);
        const jpegStats = await fs.stat(jpegPath);

        // Choose smaller file
        const useJpeg = jpegStats.size < pngStats.size;
        const finalPath = useJpeg ? jpegPath : pngPath;
        const mimeType = useJpeg ? "image/jpeg" : "image/png";
        const extension = useJpeg ? "jpeg" : "png";

        logger.info(`Page ${pageNumber} format selection`, {
          pngSize: pngStats.size,
          jpegSize: jpegStats.size,
          chosen: useJpeg ? "jpeg" : "png",
        });

        // Clean up unused file
        await fs.unlink(useJpeg ? pngPath : jpegPath);

        // Stream to storage
        const fileStream = createReadStream(finalPath);
        const { type, data } = await streamFileServer({
          file: {
            name: `page-${pageNumber}.${extension}`,
            type: mimeType,
            stream: fileStream,
          },
          teamId,
          docId,
        });

        if (!data) {
          throw new Error(`Failed to upload page ${pageNumber}`);
        }

        // Create document page. The first page's dimensions are recorded for
        // every page; per-page MediaBoxes are not read.
        await prisma.documentPage.create({
          data: {
            versionId: documentVersionId,
            pageNumber,
            file: data,
            storageType: type,
            metadata: {
              originalWidth: widthInPoints,
              originalHeight: heightInPoints,
              width: widthInPoints * scaleFactor,
              height: heightInPoints * scaleFactor,
              scaleFactor,
            },
          },
        });

        logger.info(`Uploaded page ${pageNumber}`, { type, data });
      }

      // Update document version
      await prisma.documentVersion.update({
        where: { id: documentVersionId },
        data: {
          hasPages: true,
          isPrimary: true,
        },
      });

      // If versionNumber is provided, update other versions to not be primary
      if (versionNumber) {
        await prisma.documentVersion.updateMany({
          where: {
            documentId: docId,
            versionNumber: {
              not: versionNumber,
            },
          },
          data: {
            isPrimary: false,
          },
        });
      }

      return {
        success: true,
        message: "Successfully converted PDF to images",
        totalPages,
      };
    } catch (error) {
      logger.error("Failed to convert PDF:", {
        error: error instanceof Error ? error.message : String(error),
        stack: error instanceof Error ? error.stack : undefined,
      });
      throw error;
    } finally {
      // Clean up temporary directory on success AND failure. A cleanup error
      // is only logged so it can never mask the real outcome of the run.
      if (tempDirectory) {
        try {
          await fs.rm(tempDirectory, { recursive: true, force: true });
          logger.info("Temporary directory cleaned up", { tempDirectory });
        } catch (cleanupError) {
          logger.warn("Failed to clean up temporary directory", {
            tempDirectory,
            error:
              cleanupError instanceof Error
                ? cleanupError.message
                : String(cleanupError),
          });
        }
      }
    }
  },
});

pages/api/teams/[teamId]/documents/index.ts

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import {
1616
convertFilesToPdfTask,
1717
} from "@/lib/trigger/convert-files";
1818
import { processVideo } from "@/lib/trigger/optimize-video-files";
19+
import { convertPdfToImage } from "@/lib/trigger/pdf-to-image";
1920
import { CustomUser } from "@/lib/types";
2021
import { getExtension, log } from "@/lib/utils";
2122

@@ -314,15 +315,33 @@ export default async function handle(
314315
// skip triggering convert-pdf-to-image job for "notion" / "excel" documents
315316
if (type === "pdf") {
316317
// trigger document uploaded event to trigger convert-pdf-to-image job
317-
await client.sendEvent({
318-
id: document.versions[0].id, // unique eventId for the run
319-
name: "document.uploaded",
320-
payload: {
321-
documentVersionId: document.versions[0].id,
322-
teamId: teamId,
323-
documentId: document.id,
324-
},
325-
});
318+
if (teamId === "cluqtfmcr0001zkza4xcgqatw") {
319+
await convertPdfToImage.trigger(
320+
{
321+
documentVersionId: document.versions[0].id,
322+
teamId,
323+
docId: fileUrl.split("/")[1],
324+
},
325+
{
326+
idempotencyKey: `${teamId}-${document.versions[0].id}`,
327+
tags: [
328+
`team_${teamId}`,
329+
`document_${document.id}`,
330+
`version_${document.versions[0].id}`,
331+
],
332+
},
333+
);
334+
} else {
335+
await client.sendEvent({
336+
id: document.versions[0].id, // unique eventId for the run
337+
name: "document.uploaded",
338+
payload: {
339+
documentVersionId: document.versions[0].id,
340+
teamId: teamId,
341+
documentId: document.id,
342+
},
343+
});
344+
}
326345
}
327346

328347
return res.status(201).json(document);

trigger.config.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { ffmpeg } from "@trigger.dev/build/extensions/core";
1+
import { aptGet, ffmpeg } from "@trigger.dev/build/extensions/core";
22
import { prismaExtension } from "@trigger.dev/build/extensions/prisma";
33
import { defineConfig } from "@trigger.dev/sdk/v3";
44

@@ -21,6 +21,7 @@ export default defineConfig({
2121
schema: "prisma/schema.prisma",
2222
}),
2323
ffmpeg(),
24+
aptGet({ packages: ["mupdf-tools", "curl"] }),
2425
],
2526
},
2627
});

0 commit comments

Comments
 (0)