|
2 | 2 | import os |
3 | 3 | import codecs |
4 | 4 | import tempfile |
| 5 | + |
5 | 6 | from typing import List, Optional |
6 | 7 |
|
7 | 8 | from langchain_core.documents import Document |
@@ -61,12 +62,15 @@ def cleanup_temp_encoding_file(loader) -> None: |
61 | 62 |
|
62 | 63 |
|
63 | 64 | def get_loader(filename: str, file_content_type: str, filepath: str): |
| 65 | + """Get the appropriate document loader based on file type and/or content type.""" |
64 | 66 | file_ext = filename.split(".")[-1].lower() |
65 | 67 | known_type = True |
66 | 68 |
|
67 | | - if file_ext == "pdf": |
68 | | - loader = PyPDFLoader(filepath, extract_images=PDF_EXTRACT_IMAGES) |
69 | | - elif file_ext == "csv": |
| 69 | + # File Content Type reference: |
| 70 | + # ref.: https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types |
| 71 | + if file_ext == "pdf" or file_content_type == "application/pdf": |
| 72 | + loader = SafePyPDFLoader(filepath, extract_images=PDF_EXTRACT_IMAGES) |
| 73 | + elif file_ext == "csv" or file_content_type == "text/csv": |
70 | 74 | # Detect encoding for CSV files |
71 | 75 | encoding = detect_file_encoding(filepath) |
72 | 76 |
|
@@ -99,30 +103,41 @@ def get_loader(filename: str, file_content_type: str, filepath: str): |
99 | 103 | loader = CSVLoader(filepath) |
100 | 104 | elif file_ext == "rst": |
101 | 105 | loader = UnstructuredRSTLoader(filepath, mode="elements") |
102 | | - elif file_ext == "xml": |
| 106 | + elif file_ext == "xml" or file_content_type in [ |
| 107 | + "application/xml", |
| 108 | + "text/xml", |
| 109 | + "application/xhtml+xml", |
| 110 | + ]: |
103 | 111 | loader = UnstructuredXMLLoader(filepath) |
104 | | - elif file_ext == "pptx": |
| 112 | + elif file_ext in ["ppt", "pptx"] or file_content_type in [ |
| 113 | + "application/vnd.ms-powerpoint", |
| 114 | + "application/vnd.openxmlformats-officedocument.presentationml.presentation", |
| 115 | + ]: |
105 | 116 | loader = UnstructuredPowerPointLoader(filepath) |
106 | | - elif file_ext == "md": |
| 117 | + elif file_ext == "md" or file_content_type in [ |
| 118 | + "text/markdown", |
| 119 | + "text/x-markdown", |
| 120 | + "application/markdown", |
| 121 | + "application/x-markdown", |
| 122 | + ]: |
107 | 123 | loader = UnstructuredMarkdownLoader(filepath) |
108 | | - elif file_content_type == "application/epub+zip": |
| 124 | + elif file_ext == "epub" or file_content_type == "application/epub+zip": |
109 | 125 | loader = UnstructuredEPubLoader(filepath) |
110 | | - elif ( |
111 | | - file_content_type |
112 | | - == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
113 | | - or file_ext in ["doc", "docx"] |
114 | | - ): |
| 126 | + elif file_ext in ["doc", "docx"] or file_content_type in [ |
| 127 | + "application/msword", |
| 128 | + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
| 129 | + ]: |
115 | 130 | loader = Docx2txtLoader(filepath) |
116 | | - elif file_content_type in [ |
117 | | - "application/vnd.ms-excel", |
118 | | - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", |
119 | | - ] or file_ext in ["xls", "xlsx"]: |
| 131 | + elif file_ext in ["xls", "xlsx"] or file_content_type in [ |
| 132 | + "application/vnd.ms-excel", |
| 133 | + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", |
| 134 | + ]: |
120 | 135 | loader = UnstructuredExcelLoader(filepath) |
121 | | - elif file_content_type == "application/json" or file_ext == "json": |
| 136 | + elif file_ext == "json" or file_content_type == "application/json": |
122 | 137 | loader = TextLoader(filepath, autodetect_encoding=True) |
123 | 138 | elif file_ext in known_source_ext or ( |
124 | | - file_content_type and file_content_type.find("text/") >= 0 |
125 | | - ): |
| 139 | + file_content_type and file_content_type.find("text/") >= 0 |
| 140 | + ): |
126 | 141 | loader = TextLoader(filepath, autodetect_encoding=True) |
127 | 142 | else: |
128 | 143 | loader = TextLoader(filepath, autodetect_encoding=True) |
@@ -166,3 +181,39 @@ def process_documents(documents: List[Document]) -> str: |
166 | 181 | processed_text += new_content |
167 | 182 |
|
168 | 183 | return processed_text.strip() |
| 184 | + |
| 185 | + |
class SafePyPDFLoader:
    """
    PyPDFLoader wrapper that degrades to text-only extraction when image extraction fails.

    PyPDFLoader can raise a KeyError while extracting images from a PDF that is malformed
    or that uses an unsupported image format. This wrapper first loads the PDF with the
    requested ``extract_images`` setting; if that attempt raises a KeyError mentioning
    "/Filter" while image extraction was enabled, the PDF is loaded again with image
    extraction disabled. Any other KeyError is propagated unchanged.

    ref.: https://github.com/langchain-ai/langchain/issues/26652
    """

    def __init__(self, filepath: str, extract_images: bool = False):
        # Always None here; kept for compatibility with the cleanup function.
        self._temp_filepath = None
        self.extract_images = extract_images
        self.filepath = filepath

    def load(self) -> List[Document]:
        """Load the PDF, retrying without image extraction on a "/Filter" KeyError."""
        primary_loader = PyPDFLoader(self.filepath, extract_images=self.extract_images)

        try:
            documents = primary_loader.load()
        except KeyError as e:
            # Only a "/Filter" KeyError raised while images were requested is recoverable.
            image_filter_failure = "/Filter" in str(e) and self.extract_images
            if not image_filter_failure:
                # Different error: let the caller see it unchanged.
                raise

            logger.warning(
                f"PDF image extraction failed for {self.filepath}, falling back to text-only: {e}"
            )
            return PyPDFLoader(self.filepath, extract_images=False).load()

        return documents
0 commit comments