Skip to content

Commit 15e31da

Browse files
authored
🔃 refactor: Improve Document Loaders, add langchain-ollama to Lite Build (#170)
1 parent 8acca63 commit 15e31da

File tree

6 files changed

+274
-56
lines changed

6 files changed

+274
-56
lines changed

‎.devcontainer/Dockerfile‎

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# --------------------------------------------------------
2+
# Development Dockerfile for RAG API
3+
# --------------------------------------------------------
4+
ARG PYTHON_VERSION=3.10
5+
FROM python:${PYTHON_VERSION}-bookworm
6+
7+
# --------------------------------------------------------
8+
# Install system dependencies
9+
# --------------------------------------------------------
10+
RUN apt-get update && apt-get install -y --no-install-recommends \
11+
# Essential for development
12+
git \
13+
sudo \
14+
# Document processing (for RAG API)
15+
pandoc \
16+
libmagic1 \
17+
&& rm -rf /var/lib/apt/lists/*
18+
19+
# --------------------------------------------------------
20+
# Create non-root development user
21+
# --------------------------------------------------------
22+
RUN useradd -m -s /bin/bash vscode \
23+
&& usermod -aG sudo vscode \
24+
&& echo "vscode ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
25+
26+
# --------------------------------------------------------
27+
# Configure Python environment
28+
# --------------------------------------------------------
29+
WORKDIR /workspace
30+
RUN pip install --no-cache-dir --upgrade pip
31+
32+
# --------------------------------------------------------
33+
# Configure environment
34+
# --------------------------------------------------------
35+
ENV PYTHONPATH=/workspace \
36+
PYTHONUNBUFFERED=1 \
37+
SCARF_NO_ANALYTICS=true
38+
39+
# Switch to non-root user
40+
USER vscode
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
{
2+
// --------------------------------------------------------
3+
// [name] - Name of the development container
4+
// --------------------------------------------------------
5+
"name": "rag-api",
6+
7+
// --------------------------------------------------------
8+
// [build] - Configure the build process
9+
// --------------------------------------------------------
10+
"build": {
11+
"dockerfile": "Dockerfile",
12+
"context": "..",
13+
"args": {
14+
"PYTHON_VERSION": "3.10"
15+
}
16+
},
17+
18+
// --------------------------------------------------------
19+
// [features] - Install additional tools and runtimes
20+
// --------------------------------------------------------
21+
"features": {
22+
"ghcr.io/devcontainers/features/docker-outside-of-docker:1": {
23+
"version": "latest",
24+
"moby": "false",
25+
"dockerDashComposeVersion": "v2",
26+
"installDockerBuildx": "true",
27+
"installDockerComposeSwitch": "true"
28+
}
29+
30+
// "ghcr.io/devcontainers/features/nvidia-cuda:1": {
31+
// "installCudnn": "true"
32+
// }
33+
},
34+
35+
// --------------------------------------------------------
36+
// [customizations] - Configure the development container
37+
// --------------------------------------------------------
38+
"customizations": {
39+
"vscode": {
40+
"extensions": [
41+
"bierner.markdown-mermaid",
42+
"dotenv.dotenv-vscode",
43+
"EditorConfig.EditorConfig",
44+
"gabdug.pdm",
45+
"mechatroner.rainbow-csv",
46+
"ms-azuretools.vscode-docker",
47+
"ms-python.debugpy",
48+
"ms-python.python",
49+
"njpwerner.autodocstring",
50+
"shardulm94.trailing-spaces",
51+
"tamasfe.even-better-toml"
52+
],
53+
"settings": {
54+
"python.linting.enabled": true,
55+
"python.linting.pylintEnabled": true
56+
}
57+
}
58+
},
59+
60+
// --------------------------------------------------------
61+
// [forwardPorts] - Expose container ports to local machine
62+
// --------------------------------------------------------
63+
"forwardPorts": [8000],
64+
65+
// --------------------------------------------------------
66+
// [portsAttributes] - Configure port forwarding behavior
67+
// --------------------------------------------------------
68+
"portsAttributes": {
69+
"8000": {
70+
"label": "Backend",
71+
"onAutoForward": "notify"
72+
}
73+
},
74+
75+
// --------------------------------------------------------
76+
// [remoteEnv] - Set environment variables for the container
77+
// --------------------------------------------------------
78+
"containerEnv": {},
79+
80+
// --------------------------------------------------------------
81+
// [postCreateCommand] - Run setup tasks after container creation
82+
// --------------------------------------------------------------
83+
"postCreateCommand": "apt-get moo",
84+
85+
// --------------------------------------------------------
86+
// [runArgs] - Additional arguments for docker run command
87+
// --------------------------------------------------------
88+
"runArgs": [
89+
"--name=devcontainer-rag-api",
90+
"--privileged",
91+
"--network=host"
92+
// "--gpus=all"
93+
],
94+
95+
// --------------------------------------------------------
96+
// [mounts] - Additional bind mounts for the container (none active; see commented example)
97+
// --------------------------------------------------------
98+
"mounts": [
99+
// Add volume mount for NVIDIA runtime (if using nvidia-docker approach)
100+
// "source=/dev/video0,target=/dev/video0,type=bind"
101+
]
102+
}

‎app/utils/document_loader.py‎

Lines changed: 70 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import codecs
44
import tempfile
5+
56
from typing import List, Optional
67

78
from langchain_core.documents import Document
@@ -61,12 +62,15 @@ def cleanup_temp_encoding_file(loader) -> None:
6162

6263

6364
def get_loader(filename: str, file_content_type: str, filepath: str):
65+
"""Get the appropriate document loader based on file type and/or content type."""
6466
file_ext = filename.split(".")[-1].lower()
6567
known_type = True
6668

67-
if file_ext == "pdf":
68-
loader = PyPDFLoader(filepath, extract_images=PDF_EXTRACT_IMAGES)
69-
elif file_ext == "csv":
69+
# File Content Type reference:
70+
# ref.: https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types
71+
if file_ext == "pdf" or file_content_type == "application/pdf":
72+
loader = SafePyPDFLoader(filepath, extract_images=PDF_EXTRACT_IMAGES)
73+
elif file_ext == "csv" or file_content_type == "text/csv":
7074
# Detect encoding for CSV files
7175
encoding = detect_file_encoding(filepath)
7276

@@ -99,30 +103,41 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
99103
loader = CSVLoader(filepath)
100104
elif file_ext == "rst":
101105
loader = UnstructuredRSTLoader(filepath, mode="elements")
102-
elif file_ext == "xml":
106+
elif file_ext == "xml" or file_content_type in [
107+
"application/xml",
108+
"text/xml",
109+
"application/xhtml+xml",
110+
]:
103111
loader = UnstructuredXMLLoader(filepath)
104-
elif file_ext == "pptx":
112+
elif file_ext in ["ppt", "pptx"] or file_content_type in [
113+
"application/vnd.ms-powerpoint",
114+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
115+
]:
105116
loader = UnstructuredPowerPointLoader(filepath)
106-
elif file_ext == "md":
117+
elif file_ext == "md" or file_content_type in [
118+
"text/markdown",
119+
"text/x-markdown",
120+
"application/markdown",
121+
"application/x-markdown",
122+
]:
107123
loader = UnstructuredMarkdownLoader(filepath)
108-
elif file_content_type == "application/epub+zip":
124+
elif file_ext == "epub" or file_content_type == "application/epub+zip":
109125
loader = UnstructuredEPubLoader(filepath)
110-
elif (
111-
file_content_type
112-
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
113-
or file_ext in ["doc", "docx"]
114-
):
126+
elif file_ext in ["doc", "docx"] or file_content_type in [
127+
"application/msword",
128+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
129+
]:
115130
loader = Docx2txtLoader(filepath)
116-
elif file_content_type in [
117-
"application/vnd.ms-excel",
118-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
119-
] or file_ext in ["xls", "xlsx"]:
131+
elif file_ext in ["xls", "xlsx"] or file_content_type in [
132+
"application/vnd.ms-excel",
133+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
134+
]:
120135
loader = UnstructuredExcelLoader(filepath)
121-
elif file_content_type == "application/json" or file_ext == "json":
136+
elif file_ext == "json" or file_content_type == "application/json":
122137
loader = TextLoader(filepath, autodetect_encoding=True)
123138
elif file_ext in known_source_ext or (
124-
file_content_type and file_content_type.find("text/") >= 0
125-
):
139+
file_content_type and file_content_type.find("text/") >= 0
140+
):
126141
loader = TextLoader(filepath, autodetect_encoding=True)
127142
else:
128143
loader = TextLoader(filepath, autodetect_encoding=True)
@@ -166,3 +181,39 @@ def process_documents(documents: List[Document]) -> str:
166181
processed_text += new_content
167182

168183
return processed_text.strip()
184+
185+
186+
class SafePyPDFLoader:
    """
    PDF loader that tolerates image-extraction failures.

    Delegates to PyPDFLoader using the configured ``extract_images`` setting.
    If that load raises a KeyError whose message contains "/Filter" while
    image extraction was requested, the PDF is loaded a second time with
    image extraction disabled, so only text is extracted. Every other
    KeyError is propagated unchanged.

    Background on the upstream failure this works around:
    https://github.com/langchain-ai/langchain/issues/26652
    """

    def __init__(self, filepath: str, extract_images: bool = False):
        self.filepath = filepath
        self.extract_images = extract_images
        # Always None for this loader; kept so code that inspects a loader's
        # temp-file attribute (presumably cleanup_temp_encoding_file - verify)
        # can treat this wrapper like the other loaders.
        self._temp_filepath = None

    def load(self) -> List[Document]:
        """Return the PDF's documents, retrying text-only if image extraction breaks."""
        primary = PyPDFLoader(self.filepath, extract_images=self.extract_images)

        try:
            return primary.load()
        except KeyError as e:
            image_filter_failure = "/Filter" in str(e) and self.extract_images
            if not image_filter_failure:
                # Not the known image-filter problem: let the caller see it.
                raise

            logger.warning(
                f"PDF image extraction failed for {self.filepath}, falling back to text-only: {e}"
            )
            return PyPDFLoader(self.filepath, extract_images=False).load()

‎requirements.lite.txt‎

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,34 @@
1-
langchain==0.3.12
2-
langchain-community==0.3.12
3-
langchain-openai==0.2.11
4-
langchain-core==0.3.27
5-
langchain-google-vertexai==2.0.11
6-
sqlalchemy==2.0.28
7-
python-dotenv==1.0.1
1+
langchain==0.3.26
2+
langchain-community==0.3.27
3+
langchain-openai==0.3.27
4+
langchain-core==0.3.68
5+
langchain-google-vertexai==2.0.27
6+
sqlalchemy==2.0.41
7+
python-dotenv==1.1.1
88
fastapi==0.115.12
99
psycopg2-binary==2.9.9
1010
pgvector==0.2.5
1111
uvicorn==0.28.0
12-
pypdf==4.1.0
12+
pypdf==5.7.0
1313
unstructured==0.16.11
14-
markdown==3.6
14+
markdown==3.8.2
1515
networkx==3.2.1
1616
pandas==2.2.1
17-
openpyxl==3.1.2
18-
docx2txt==0.8
19-
pypandoc==1.13
17+
openpyxl==3.1.5
18+
docx2txt==0.9
19+
pypandoc==1.15
2020
PyJWT==2.8.0
2121
asyncpg==0.29.0
2222
python-multipart==0.0.19
23-
aiofiles==23.2.1
24-
rapidocr-onnxruntime==1.3.24
23+
aiofiles==24.1.0
24+
rapidocr-onnxruntime==1.4.4
2525
opencv-python-headless==4.9.0.80
2626
pymongo==4.6.3
2727
langchain-mongodb==0.2.0
28-
cryptography==44.0.1
28+
cryptography==45.0.5
2929
python-magic==0.4.27
30-
python-pptx==0.6.23
31-
xlrd==2.0.1
30+
python-pptx==1.0.2
31+
xlrd==2.0.2
3232
langchain-aws==0.2.1
3333
boto3==1.34.144
34+
langchain-ollama==0.3.3

‎requirements.txt‎

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,38 @@
1-
langchain==0.3.12
2-
langchain-community==0.3.12
3-
langchain-openai==0.2.11
4-
langchain-core==0.3.27
1+
langchain==0.3.26
2+
langchain-community==0.3.27
3+
langchain-openai==0.3.27
4+
langchain-core==0.3.68
55
langchain-aws==0.2.1
6-
langchain-google-vertexai==2.0.0
7-
langchain_text_splitters==0.3.3
6+
langchain-google-vertexai==2.0.27
7+
langchain_text_splitters==0.3.8 # 0.3.3
88
boto3==1.34.144
9-
sqlalchemy==2.0.28
10-
python-dotenv==1.0.1
9+
sqlalchemy==2.0.41
10+
python-dotenv==1.1.1
1111
fastapi==0.115.12
1212
psycopg2-binary==2.9.9
1313
pgvector==0.2.5
1414
uvicorn==0.28.0
15-
pypdf==4.1.0
15+
pypdf==5.7.0
1616
unstructured==0.16.11
17-
markdown==3.6
17+
markdown==3.8.2
1818
networkx==3.2.1
1919
pandas==2.2.1
20-
openpyxl==3.1.2
21-
docx2txt==0.8
22-
pypandoc==1.13
20+
openpyxl==3.1.5
21+
docx2txt==0.9
22+
pypandoc==1.15
2323
PyJWT==2.8.0
2424
asyncpg==0.29.0
2525
python-multipart==0.0.19
2626
sentence_transformers==3.1.1
27-
aiofiles==23.2.1
28-
rapidocr-onnxruntime==1.3.24
27+
aiofiles==24.1.0
28+
rapidocr-onnxruntime==1.4.4
2929
opencv-python-headless==4.9.0.80
3030
pymongo==4.6.3
3131
langchain-mongodb==0.2.0
32-
langchain-ollama==0.2.0
32+
langchain-ollama==0.3.3
3333
langchain-huggingface==0.1.0
34-
cryptography==44.0.1
34+
cryptography==45.0.5
3535
python-magic==0.4.27
36-
python-pptx==0.6.23
37-
xlrd==2.0.1
36+
python-pptx==1.0.2
37+
xlrd==2.0.2
3838
pydantic==2.9.2

0 commit comments

Comments
 (0)