🔃 refactor: Improve Document Loaders, add langchain-ollama to Lite Build (#170)

gafda · web-flow · commit 15e31da56cf2 · 2025-08-17T13:59:49.000-04:00
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -0,0 +1,40 @@
+# --------------------------------------------------------
+# Development Dockerfile for RAG API
+# --------------------------------------------------------
+ARG PYTHON_VERSION=3.10
+FROM python:${PYTHON_VERSION}-bookworm
+
+# --------------------------------------------------------
+# Install system dependencies
+# --------------------------------------------------------
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    # Essential for development
+    git \
+    sudo \
+    # Document processing (for RAG API)
+    pandoc \
+    libmagic1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# --------------------------------------------------------
+# Create non-root development user
+# --------------------------------------------------------
+RUN useradd -m -s /bin/bash vscode \
+    && usermod -aG sudo vscode \
+    && echo "vscode ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+
+# --------------------------------------------------------
+# Configure Python environment
+# --------------------------------------------------------
+WORKDIR /workspace
+RUN pip install --no-cache-dir --upgrade pip
+
+# --------------------------------------------------------
+# Configure environment
+# --------------------------------------------------------
+ENV PYTHONPATH=/workspace \
+    PYTHONUNBUFFERED=1 \
+    SCARF_NO_ANALYTICS=true
+
+# Switch to non-root user
+USER vscode
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,102 @@
+{
+  // --------------------------------------------------------
+  // [name] - Name of the development container
+  // --------------------------------------------------------
+  "name": "rag-api",
+
+  // --------------------------------------------------------
+  // [build] - Configure the build process
+  // --------------------------------------------------------
+  "build": {
+      "dockerfile": "Dockerfile",
+      "context": "..",
+      "args": {
+          "PYTHON_VERSION": "3.10"
+      }
+  },
+
+  // --------------------------------------------------------
+  // [features] - Install additional tools and runtimes
+  // --------------------------------------------------------
+  "features": {
+    "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {
+      "version": "latest",
+      "moby": "false",
+      "dockerDashComposeVersion": "v2",
+      "installDockerBuildx": "true",
+      "installDockerComposeSwitch": "true"
+    }
+
+    // "ghcr.io/devcontainers/features/nvidia-cuda:1": {
+    //     "installCudnn": "true"
+    // }
+  },
+
+  // --------------------------------------------------------
+  // [customizations] - Configure the development container
+  // --------------------------------------------------------
+  "customizations": {
+      "vscode": {
+          "extensions": [
+              "bierner.markdown-mermaid",
+              "dotenv.dotenv-vscode",
+              "EditorConfig.EditorConfig",
+              "gabdug.pdm",
+              "mechatroner.rainbow-csv",
+              "ms-azuretools.vscode-docker",
+              "ms-python.debugpy",
+              "ms-python.python",
+              "njpwerner.autodocstring",
+              "shardulm94.trailing-spaces",
+              "tamasfe.even-better-toml"
+          ],
+          "settings": {
+              "python.linting.enabled": true,
+              "python.linting.pylintEnabled": true
+          }
+      }
+  },
+
+  // --------------------------------------------------------
+  // [forwardPorts] - Expose container ports to local machine
+  // --------------------------------------------------------
+  "forwardPorts": [8000],
+
+  // --------------------------------------------------------
+  // [portsAttributes] - Configure port forwarding behavior
+  // --------------------------------------------------------
+  "portsAttributes": {
+    "8000": {
+      "label": "Backend",
+      "onAutoForward": "notify"
+    }
+  },
+
+  // --------------------------------------------------------
+  // [containerEnv] - Set environment variables for the container
+  // --------------------------------------------------------
+  "containerEnv": {},
+
+  // --------------------------------------------------------------
+  // [postCreateCommand] - Run setup tasks after container creation
+  // --------------------------------------------------------------
+  "postCreateCommand": "apt-get moo",
+
+  // --------------------------------------------------------
+  // [runArgs] - Additional arguments for docker run command
+  // --------------------------------------------------------
+  "runArgs": [
+      "--name=devcontainer-rag-api",
+      "--privileged",
+      "--network=host"
+      // "--gpus=all"
+  ],
+
+  // --------------------------------------------------------
+  // [mounts] - Extra host mounts (none enabled; optional example below)
+  // --------------------------------------------------------
+  "mounts": [
+    // Add volume mount for NVIDIA runtime (if using nvidia-docker approach)
+    // "source=/dev/video0,target=/dev/video0,type=bind"
+  ]
+}
diff --git a/app/utils/document_loader.py b/app/utils/document_loader.py
@@ -2,6 +2,7 @@
 import os
 import codecs
 import tempfile
+
 from typing import List, Optional
 
 from langchain_core.documents import Document
@@ -61,12 +62,15 @@ def cleanup_temp_encoding_file(loader) -> None:
 
 
 def get_loader(filename: str, file_content_type: str, filepath: str):
+    """Get the appropriate document loader based on file extension and/or content type."""
     file_ext = filename.split(".")[-1].lower()
     known_type = True
 
-    if file_ext == "pdf":
-        loader = PyPDFLoader(filepath, extract_images=PDF_EXTRACT_IMAGES)
-    elif file_ext == "csv":
+    # File Content Type reference:
+    # ref.: https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types
+    if file_ext == "pdf" or file_content_type == "application/pdf":
+        loader = SafePyPDFLoader(filepath, extract_images=PDF_EXTRACT_IMAGES)
+    elif file_ext == "csv" or file_content_type == "text/csv":
         # Detect encoding for CSV files
         encoding = detect_file_encoding(filepath)
 
@@ -99,30 +103,41 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
             loader = CSVLoader(filepath)
     elif file_ext == "rst":
         loader = UnstructuredRSTLoader(filepath, mode="elements")
-    elif file_ext == "xml":
+    elif file_ext == "xml" or file_content_type in [
+            "application/xml",
+            "text/xml",
+            "application/xhtml+xml",
+        ]:
         loader = UnstructuredXMLLoader(filepath)
-    elif file_ext == "pptx":
+    elif file_ext in ["ppt", "pptx"] or file_content_type in [
+            "application/vnd.ms-powerpoint",
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        ]:
         loader = UnstructuredPowerPointLoader(filepath)
-    elif file_ext == "md":
+    elif file_ext == "md" or file_content_type in [
+            "text/markdown",
+            "text/x-markdown",
+            "application/markdown",
+            "application/x-markdown",
+        ]:
         loader = UnstructuredMarkdownLoader(filepath)
-    elif file_content_type == "application/epub+zip":
+    elif file_ext == "epub" or file_content_type == "application/epub+zip":
         loader = UnstructuredEPubLoader(filepath)
-    elif (
-        file_content_type
-        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        or file_ext in ["doc", "docx"]
-    ):
+    elif file_ext in ["doc", "docx"] or file_content_type in [
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        ]:
         loader = Docx2txtLoader(filepath)
-    elif file_content_type in [
-        "application/vnd.ms-excel",
-        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-    ] or file_ext in ["xls", "xlsx"]:
+    elif file_ext in ["xls", "xlsx"] or file_content_type in [
+            "application/vnd.ms-excel",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ]:
         loader = UnstructuredExcelLoader(filepath)
-    elif file_content_type == "application/json" or file_ext == "json":
+    elif file_ext == "json" or file_content_type == "application/json":
         loader = TextLoader(filepath, autodetect_encoding=True)
     elif file_ext in known_source_ext or (
-        file_content_type and file_content_type.find("text/") >= 0
-    ):
+            file_content_type and file_content_type.find("text/") >= 0
+        ):
         loader = TextLoader(filepath, autodetect_encoding=True)
     else:
         loader = TextLoader(filepath, autodetect_encoding=True)
@@ -166,3 +181,39 @@ def process_documents(documents: List[Document]) -> str:
             processed_text += new_content
 
     return processed_text.strip()
+
+
+class SafePyPDFLoader:
+    """
+    A wrapper around PyPDFLoader that handles image extraction failures gracefully.
+    Falls back to text-only extraction when image extraction fails.
+
+    This is a workaround for issues with PyPDFLoader that can occur when extracting images
+    from PDFs, which can lead to KeyError exceptions if the PDF is malformed or has unsupported
+    image formats. This class attempts to load the PDF with image extraction enabled, and if it
+    fails due to a KeyError related to image filters, it falls back to loading the PDF
+    without image extraction.
+    ref.: https://github.com/langchain-ai/langchain/issues/26652
+    """
+
+    def __init__(self, filepath: str, extract_images: bool = False):
+        self.filepath = filepath
+        self.extract_images = extract_images
+        self._temp_filepath = None  # Never set to a path here; kept for compatibility with the cleanup function
+
+    def load(self) -> List[Document]:
+        """Load PDF documents with automatic fallback on image extraction errors."""
+        loader = PyPDFLoader(self.filepath, extract_images=self.extract_images)  # first attempt uses the caller's setting
+
+        try:
+            return loader.load()
+        except KeyError as e:  # only KeyError is intercepted; other exception types propagate
+            if "/Filter" in str(e) and self.extract_images:  # retry only if images were requested and the key names "/Filter"
+                logger.warning(
+                    f"PDF image extraction failed for {self.filepath}, falling back to text-only: {e}"
+                )
+                fallback_loader = PyPDFLoader(self.filepath, extract_images=False)  # second pass over the same file, images off
+                return fallback_loader.load()
+            else:
+                # Unrelated KeyError, or image extraction was already off: re-raise unchanged
+                raise
diff --git a/requirements.lite.txt b/requirements.lite.txt
@@ -1,33 +1,34 @@
-langchain==0.3.12
-langchain-community==0.3.12
-langchain-openai==0.2.11
-langchain-core==0.3.27
-langchain-google-vertexai==2.0.11
-sqlalchemy==2.0.28
-python-dotenv==1.0.1
+langchain==0.3.26
+langchain-community==0.3.27
+langchain-openai==0.3.27
+langchain-core==0.3.68
+langchain-google-vertexai==2.0.27
+sqlalchemy==2.0.41
+python-dotenv==1.1.1
 fastapi==0.115.12
 psycopg2-binary==2.9.9
 pgvector==0.2.5
 uvicorn==0.28.0
-pypdf==4.1.0
+pypdf==5.7.0
 unstructured==0.16.11
-markdown==3.6
+markdown==3.8.2
 networkx==3.2.1
 pandas==2.2.1
-openpyxl==3.1.2
-docx2txt==0.8
-pypandoc==1.13
+openpyxl==3.1.5
+docx2txt==0.9
+pypandoc==1.15
 PyJWT==2.8.0
 asyncpg==0.29.0
 python-multipart==0.0.19
-aiofiles==23.2.1
-rapidocr-onnxruntime==1.3.24
+aiofiles==24.1.0
+rapidocr-onnxruntime==1.4.4
 opencv-python-headless==4.9.0.80
 pymongo==4.6.3
 langchain-mongodb==0.2.0
-cryptography==44.0.1
+cryptography==45.0.5
 python-magic==0.4.27
-python-pptx==0.6.23
-xlrd==2.0.1
+python-pptx==1.0.2
+xlrd==2.0.2
 langchain-aws==0.2.1
 boto3==1.34.144
+langchain-ollama==0.3.3
diff --git a/requirements.txt b/requirements.txt
@@ -1,38 +1,38 @@
-langchain==0.3.12
-langchain-community==0.3.12
-langchain-openai==0.2.11
-langchain-core==0.3.27
+langchain==0.3.26
+langchain-community==0.3.27
+langchain-openai==0.3.27
+langchain-core==0.3.68
 langchain-aws==0.2.1
-langchain-google-vertexai==2.0.0
-langchain_text_splitters==0.3.3
+langchain-google-vertexai==2.0.27
+langchain_text_splitters==0.3.8  # previously 0.3.3
 boto3==1.34.144
-sqlalchemy==2.0.28
-python-dotenv==1.0.1
+sqlalchemy==2.0.41
+python-dotenv==1.1.1
 fastapi==0.115.12
 psycopg2-binary==2.9.9
 pgvector==0.2.5
 uvicorn==0.28.0
-pypdf==4.1.0
+pypdf==5.7.0
 unstructured==0.16.11
-markdown==3.6
+markdown==3.8.2
 networkx==3.2.1
 pandas==2.2.1
-openpyxl==3.1.2
-docx2txt==0.8
-pypandoc==1.13
+openpyxl==3.1.5
+docx2txt==0.9
+pypandoc==1.15
 PyJWT==2.8.0
 asyncpg==0.29.0
 python-multipart==0.0.19
 sentence_transformers==3.1.1
-aiofiles==23.2.1
-rapidocr-onnxruntime==1.3.24
+aiofiles==24.1.0
+rapidocr-onnxruntime==1.4.4
 opencv-python-headless==4.9.0.80
 pymongo==4.6.3
 langchain-mongodb==0.2.0
-langchain-ollama==0.2.0
+langchain-ollama==0.3.3
 langchain-huggingface==0.1.0
-cryptography==44.0.1
+cryptography==45.0.5
 python-magic==0.4.27
-python-pptx==0.6.23
-xlrd==2.0.1
+python-pptx==1.0.2
+xlrd==2.0.2
 pydantic==2.9.2
diff --git a/tests/utils/test_document_loader.py b/tests/utils/test_document_loader.py