Skip to content

Commit 367242a

Browse files
fix: fixing pdf parsing (#3349)
Our default PDF parser is Unstructured, for which we were using the 'fast' strategy, which fails to parse some PDFs. We instead use the 'auto' strategy, which is more flexible and powerful. Closes TICK-86 # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --------- Co-authored-by: chloedia <[email protected]>
1 parent 90848eb commit 367242a

File tree

6 files changed

+63
-16
lines changed

6 files changed

+63
-16
lines changed

backend/api/quivr_api/modules/sync/utils/normalize.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import re
23
import unicodedata
34

@@ -15,3 +16,35 @@ def remove_special_characters(input):
1516
except Exception as e:
1617
logger.error(f"Error removing special characters: {e}")
1718
return input
19+
20+
21+
def sanitize_filename(filename: str) -> str:
    """
    Return a filesystem-safe version of *filename*.

    The transformation:
      1. Strips every character that is not a word character, hyphen,
         underscore, dot, or space.
      2. Collapses any dots left in the stem into underscores, so only
         the final extension survives (handles double extensions).
      3. Substitutes "unnamed" when nothing usable remains in the stem.
      4. Truncates the stem so the full result fits within 255 characters.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Drop every character outside the allowed set.
    cleaned = re.sub(r"[^\w\-_\. ]", "", filename)

    stem, extension = os.path.splitext(cleaned)

    # Only the final extension is kept; interior dots become underscores.
    # An empty stem (e.g. input was all invalid characters) falls back
    # to a placeholder so the result is never extension-only or empty.
    stem = stem.replace(".", "_") or "unnamed"

    # Keep the total length within the common 255-char filesystem limit.
    limit = 255 - len(extension)
    return f"{stem[:limit]}{extension}"

backend/api/quivr_api/modules/sync/utils/syncutils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ISyncService,
3030
ISyncUserService,
3131
)
32+
from quivr_api.modules.sync.utils.normalize import sanitize_filename
3233
from quivr_api.modules.sync.utils.sync import BaseSync
3334
from quivr_api.modules.upload.service.upload_file import (
3435
check_file_exists,
@@ -168,6 +169,8 @@ async def process_sync_file(
168169
]:
169170
raise ValueError(f"Incompatible file extension for {downloaded_file}")
170171

172+
storage_path = sanitize_filename(storage_path)
173+
171174
response = await upload_file_storage(
172175
downloaded_file.file_data,
173176
storage_path,

backend/api/quivr_api/modules/upload/controller/upload_routes.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from quivr_api.modules.notification.service.notification_service import (
3131
NotificationService,
3232
)
33+
from quivr_api.modules.sync.utils.normalize import sanitize_filename
3334
from quivr_api.modules.upload.service.upload_file import (
3435
upload_file_storage,
3536
)
@@ -85,12 +86,14 @@ async def upload_file(
8586
brain_id=str(brain_id),
8687
)
8788
)
89+
file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"
8890

8991
background_tasks.add_task(
90-
maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
92+
maybe_send_telemetry, "upload_file", {"file_name": file_name}
9193
)
9294

93-
filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
95+
filename_with_brain_id = str(brain_id) + "/" + file_name
96+
filename_with_brain_id = sanitize_filename(filename_with_brain_id)
9497

9598
buff_reader = io.BufferedReader(uploadFile.file) # type: ignore
9699
try:
@@ -110,9 +113,9 @@ async def upload_file(
110113

111114
knowledge_to_add = CreateKnowledgeProperties(
112115
brain_id=brain_id,
113-
file_name=uploadFile.filename,
116+
file_name=file_name,
114117
extension=os.path.splitext(
115-
uploadFile.filename # pyright: ignore reportPrivateUsage=none
118+
file_name # pyright: ignore reportPrivateUsage=none
116119
)[-1].lower(),
117120
source=integration if integration else "local",
118121
source_link=integration_link, # FIXME: Should return the s3 link @chloedia
@@ -127,7 +130,7 @@ async def upload_file(
127130
"process_file_task",
128131
kwargs={
129132
"file_name": filename_with_brain_id,
130-
"file_original_name": uploadFile.filename,
133+
"file_original_name": file_name,
131134
"brain_id": brain_id,
132135
"notification_id": upload_notification.id,
133136
"knowledge_id": knowledge.id,

backend/core/MegaParse/megaparse/Converter.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -320,22 +320,28 @@ async def convert(
320320
else:
321321
raise ValueError(f"Method {self.method} not supported")
322322

323-
if not gpt4o_cleaner:
324-
return LangChainDocument(
325-
page_content=parsed_md,
326-
metadata={"filename": file_path.name, "type": "pdf"},
327-
)
328-
else:
323+
if gpt4o_cleaner:
329324
md_processor = MarkdownProcessor(
330325
parsed_md,
331326
strict=True,
332327
remove_pagination=True,
333328
)
334329
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
335-
return LangChainDocument(
336-
page_content=md_cleaned,
337-
metadata={"filename": file_path.name, "type": "pdf"},
338-
)
330+
parsed_md = md_cleaned
331+
332+
if (
333+
len(parsed_md) < 5
334+
and file_path.stat().st_size > 100
335+
and self.strategy == "fast"
336+
):
337+
logger.info(f"Switching to auto strategy for {file_path.name}")
338+
self.strategy = "auto"
339+
return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)
340+
341+
return LangChainDocument(
342+
page_content=parsed_md,
343+
metadata={"filename": file_path.name, "type": "pdf"},
344+
)
339345

340346
def save_md(self, md_content: str, file_path: Path | str) -> None:
341347
with open(file_path, "w") as f:

backend/core/quivr_core/processor/implementations/megaparse_processor.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ def processor_metadata(self):
5959
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
6060
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
6161
document: Document = await mega_parse.aload()
62-
print("\n\n document: ", document.page_content)
6362
if len(document.page_content) > self.splitter_config.chunk_size:
6463
docs = self.text_splitter.split_documents([document])
6564
for doc in docs:

backend/worker/quivr_worker/celery_monitor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
178178
running currently.
179179
"""
180180
active_tasks = celery.control.inspect().active()
181+
if not active_tasks:
182+
return False
183+
181184
for worker, running_tasks in active_tasks.items():
182185
for task in running_tasks:
183186
if task["name"] == task_name: # type: ignore

0 commit comments

Comments
 (0)