Skip to content

Commit 367242a

Browse files
fix: fixing pdf parsing (#3349)
Our default PDF parser is Unstructured, for which we were using the 'fast' strategy, which fails to parse some PDFs. We instead use the 'auto' strategy, which is more flexible and powerful. Closes TICK-86 # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --------- Co-authored-by: chloedia <[email protected]>
1 parent 90848eb commit 367242a

File tree

6 files changed

+63
-16
lines changed

6 files changed

+63
-16
lines changed

backend/api/quivr_api/modules/sync/utils/normalize.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import re
23
import unicodedata
34

@@ -15,3 +16,35 @@ def remove_special_characters(input):
1516
except Exception as e:
1617
logger.error(f"Error removing special characters: {e}")
1718
return input
19+
20+
21+
def sanitize_filename(filename: str) -> str:
    """
    Return a filesystem-safe version of *filename*.

    The transformation:
      1. Strips every character that is not a word character, hyphen,
         underscore, dot, or space.
      2. Collapses any dots left in the stem into underscores, so only
         the final extension survives (handles double extensions).
      3. Substitutes "unnamed" when nothing usable remains in the stem.
      4. Truncates the stem so the full result fits within 255 characters.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Drop every character outside the allowed set.
    cleaned = re.sub(r"[^\w\-_\. ]", "", filename)

    stem, extension = os.path.splitext(cleaned)

    # Only the final extension is kept; interior dots become underscores.
    # An empty stem (e.g. input was all invalid characters) falls back
    # to a placeholder so the result is never extension-only or empty.
    stem = stem.replace(".", "_") or "unnamed"

    # Keep the total length within the common 255-char filesystem limit.
    limit = 255 - len(extension)
    return f"{stem[:limit]}{extension}"

backend/api/quivr_api/modules/sync/utils/syncutils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ISyncService,
3030
ISyncUserService,
3131
)
32+
from quivr_api.modules.sync.utils.normalize import sanitize_filename
3233
from quivr_api.modules.sync.utils.sync import BaseSync
3334
from quivr_api.modules.upload.service.upload_file import (
3435
check_file_exists,
@@ -168,6 +169,8 @@ async def process_sync_file(
168169
]:
169170
raise ValueError(f"Incompatible file extension for {downloaded_file}")
170171

172+
storage_path = sanitize_filename(storage_path)
173+
171174
response = await upload_file_storage(
172175
downloaded_file.file_data,
173176
storage_path,

backend/api/quivr_api/modules/upload/controller/upload_routes.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from quivr_api.modules.notification.service.notification_service import (
3131
NotificationService,
3232
)
33+
from quivr_api.modules.sync.utils.normalize import sanitize_filename
3334
from quivr_api.modules.upload.service.upload_file import (
3435
upload_file_storage,
3536
)
@@ -85,12 +86,14 @@ async def upload_file(
8586
brain_id=str(brain_id),
8687
)
8788
)
89+
file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"
8890

8991
background_tasks.add_task(
90-
maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
92+
maybe_send_telemetry, "upload_file", {"file_name": file_name}
9193
)
9294

93-
filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
95+
filename_with_brain_id = str(brain_id) + "/" + file_name
96+
filename_with_brain_id = sanitize_filename(filename_with_brain_id)
9497

9598
buff_reader = io.BufferedReader(uploadFile.file) # type: ignore
9699
try:
@@ -110,9 +113,9 @@ async def upload_file(
110113

111114
knowledge_to_add = CreateKnowledgeProperties(
112115
brain_id=brain_id,
113-
file_name=uploadFile.filename,
116+
file_name=file_name,
114117
extension=os.path.splitext(
115-
uploadFile.filename # pyright: ignore reportPrivateUsage=none
118+
file_name # pyright: ignore reportPrivateUsage=none
116119
)[-1].lower(),
117120
source=integration if integration else "local",
118121
source_link=integration_link, # FIXME: Should return the s3 link @chloedia
@@ -127,7 +130,7 @@ async def upload_file(
127130
"process_file_task",
128131
kwargs={
129132
"file_name": filename_with_brain_id,
130-
"file_original_name": uploadFile.filename,
133+
"file_original_name": file_name,
131134
"brain_id": brain_id,
132135
"notification_id": upload_notification.id,
133136
"knowledge_id": knowledge.id,

backend/core/MegaParse/megaparse/Converter.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -320,22 +320,28 @@ async def convert(
320320
else:
321321
raise ValueError(f"Method {self.method} not supported")
322322

323-
if not gpt4o_cleaner:
324-
return LangChainDocument(
325-
page_content=parsed_md,
326-
metadata={"filename": file_path.name, "type": "pdf"},
327-
)
328-
else:
323+
if gpt4o_cleaner:
329324
md_processor = MarkdownProcessor(
330325
parsed_md,
331326
strict=True,
332327
remove_pagination=True,
333328
)
334329
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
335-
return LangChainDocument(
336-
page_content=md_cleaned,
337-
metadata={"filename": file_path.name, "type": "pdf"},
338-
)
330+
parsed_md = md_cleaned
331+
332+
if (
333+
len(parsed_md) < 5
334+
and file_path.stat().st_size > 100
335+
and self.strategy == "fast"
336+
):
337+
logger.info(f"Switching to auto strategy for {file_path.name}")
338+
self.strategy = "auto"
339+
return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)
340+
341+
return LangChainDocument(
342+
page_content=parsed_md,
343+
metadata={"filename": file_path.name, "type": "pdf"},
344+
)
339345

340346
def save_md(self, md_content: str, file_path: Path | str) -> None:
341347
with open(file_path, "w") as f:

backend/core/quivr_core/processor/implementations/megaparse_processor.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ def processor_metadata(self):
5959
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
6060
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
6161
document: Document = await mega_parse.aload()
62-
print("\n\n document: ", document.page_content)
6362
if len(document.page_content) > self.splitter_config.chunk_size:
6463
docs = self.text_splitter.split_documents([document])
6564
for doc in docs:

backend/worker/quivr_worker/celery_monitor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
178178
running currently.
179179
"""
180180
active_tasks = celery.control.inspect().active()
181+
if not active_tasks:
182+
return False
183+
181184
for worker, running_tasks in active_tasks.items():
182185
for task in running_tasks:
183186
if task["name"] == task_name: # type: ignore

0 commit comments

Comments
 (0)