fix: use defusedxml instead of xml.etree (#18362)

masci · web-flow · commit 4f6ee062b192 · 2025-04-03T13:59:13.000-06:00
diff --git a/llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/pubmed/base.py b/llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/pubmed/base.py
@@ -2,12 +2,14 @@
 
 from typing import List, Optional
 
+from defusedxml import ElementTree as safe_xml
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.schema import Document
 
 
 class PubmedReader(BaseReader):
-    """Pubmed Reader.
+    """
+    Pubmed Reader.
 
     Gets a search query, return a list of Documents of the top corresponding scientific papers on Pubmed.
     """
@@ -17,7 +19,8 @@ def load_data_bioc(
         search_query: str,
         max_results: Optional[int] = 10,
     ) -> List[Document]:
-        """Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
+        """
+        Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
         Uses the BoiC API, which has been down a lot.
 
         Args:
@@ -27,10 +30,10 @@ def load_data_bioc(
         Returns:
             List[Document]: A list of Document objects.
         """
-        import xml.etree.ElementTree as xml
         from datetime import datetime
 
         import requests
+        from defusedxml import ElementTree as safe_xml
 
         pubmed_search = []
         parameters = {"tool": "tool", "email": "email", "db": "pmc"}
@@ -40,7 +43,7 @@ def load_data_bioc(
             "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
             params=parameters,
         )
-        root = xml.fromstring(resp.content)
+        root = safe_xml.fromstring(resp.content)
 
         for elem in root.iter():
             if elem.tag == "Id":
@@ -99,7 +102,8 @@ def load_data(
         search_query: str,
         max_results: Optional[int] = 10,
     ) -> List[Document]:
-        """Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
+        """
+        Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
 
         Args:
             search_query (str): A topic to search for (e.g. "Alzheimers").
@@ -110,7 +114,6 @@ def load_data(
             List[Document]: A list of Document objects.
         """
         import time
-        import xml.etree.ElementTree as xml
 
         import requests
 
@@ -122,7 +125,7 @@ def load_data(
             "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
             params=parameters,
         )
-        root = xml.fromstring(resp.content)
+        root = safe_xml.fromstring(resp.content)
 
         for elem in root.iter():
             if elem.tag == "Id":
@@ -131,7 +134,7 @@ def load_data(
                 print(url)
                 try:
                     resp = requests.get(url)
-                    info = xml.fromstring(resp.content)
+                    info = safe_xml.fromstring(resp.content)
 
                     raw_text = ""
                     title = ""
diff --git a/llama-index-integrations/readers/llama-index-readers-papers/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-papers/pyproject.toml
@@ -29,12 +29,13 @@ license = "MIT"
 maintainers = ["thejessezhang"]
 name = "llama-index-readers-papers"
 readme = "README.md"
-version = "0.3.1"
+version = "0.3.2"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
 arxiv = "^2.1.0"
 llama-index-core = "^0.12.0"
+defusedxml = "^0.7.1"
 
 [tool.poetry.group.dev.dependencies]
 ipython = "8.10.0"
diff --git a/llama-index-integrations/readers/llama-index-readers-stripe-docs/llama_index/readers/stripe_docs/base.py b/llama-index-integrations/readers/llama-index-readers-stripe-docs/llama_index/readers/stripe_docs/base.py
@@ -1,7 +1,7 @@
 import urllib.request
-import xml.etree.ElementTree as ET
 from typing import List
 
+from defusedxml.ElementTree import fromstring
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.schema import Document
 from llama_index.readers.web import AsyncWebPageReader
@@ -13,7 +13,8 @@
 
 
 class StripeDocsReader(BaseReader):
-    """Asynchronous Stripe documentation reader.
+    """
+    Asynchronous Stripe documentation reader.
 
     Reads pages from the Stripe documentation based on the sitemap.xml.
 
@@ -36,7 +37,7 @@ def _load_sitemap(self) -> str:
     def _parse_sitemap(
         self, raw_sitemap: str, filters: List[str] = DEFAULT_FILTERS
     ) -> List:
-        root_sitemap = ET.fromstring(raw_sitemap)
+        root_sitemap = fromstring(raw_sitemap)
         sitemap_partition_urls = []
         sitemap_urls = []
 
@@ -45,7 +46,7 @@ def _parse_sitemap(
             sitemap_partition_urls.append(loc)
 
         for sitemap_partition_url in sitemap_partition_urls:
-            sitemap_partition = ET.fromstring(self._load_url(sitemap_partition_url))
+            sitemap_partition = fromstring(self._load_url(sitemap_partition_url))
 
             # Find all <url /> and iterate through them
             for url in sitemap_partition.findall(f"{{{XML_SITEMAP_SCHEMA}}}url"):
diff --git a/llama-index-integrations/readers/llama-index-readers-stripe-docs/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-stripe-docs/pyproject.toml
@@ -29,14 +29,15 @@ license = "GPL-3.0-or-later"
 maintainers = ["amorriscode"]
 name = "llama-index-readers-stripe-docs"
 readme = "README.md"
-version = "0.3.0"
+version = "0.3.1"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
 html2text = "^2024.2.26"
 urllib3 = "^2.1.0"
 llama-index-readers-web = "^0.3.0"
 llama-index-core = "^0.12.0"
+defusedxml = "^0.7.1"
 
 [tool.poetry.group.dev.dependencies]
 ipython = "8.10.0"
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/sitemap/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/sitemap/base.py
@@ -1,14 +1,15 @@
 import urllib.request
-import xml.etree.ElementTree as ET
 from typing import List
 
+from defusedxml.ElementTree import fromstring
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.schema import Document
 from llama_index.readers.web.async_web.base import AsyncWebPageReader
 
 
 class SitemapReader(BaseReader):
-    """Asynchronous sitemap reader for web.
+    """
+    Asynchronous sitemap reader for web.
 
     Reads pages from the web based on their sitemap.xml.
 
@@ -34,7 +35,7 @@ def _load_sitemap(self, sitemap_url: str) -> str:
         return sitemap_url_request.read()
 
     def _parse_sitemap(self, raw_sitemap: str, filter_locs: str = None) -> list:
-        sitemap = ET.fromstring(raw_sitemap)
+        sitemap = fromstring(raw_sitemap)
         sitemap_urls = []
 
         for url in sitemap.findall(f"{{{self.xml_schema_sitemap}}}url"):
diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -47,7 +47,7 @@ license = "GPL-3.0-or-later"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.3.8"
+version = "0.3.9"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
@@ -62,6 +62,7 @@ playwright = ">=1.30,<2.0"
 newspaper3k = "^0.2.8"
 spider-client = "^0.0.27"
 llama-index-core = "^0.12.0"
+defusedxml = "^0.7.1"
 
 [tool.poetry.group.dev.dependencies]
 ipython = "8.10.0"