Skip to content

Commit 4f6ee06

Browse files
authored
fix: use defusedxml instead of xml.etree (#18362)
1 parent 0819982 commit 4f6ee06

File tree

6 files changed

+26
-18
lines changed

6 files changed

+26
-18
lines changed

llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/pubmed/base.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22

33
from typing import List, Optional
44

5+
from defusedxml import ElementTree as safe_xml
56
from llama_index.core.readers.base import BaseReader
67
from llama_index.core.schema import Document
78

89

910
class PubmedReader(BaseReader):
10-
"""Pubmed Reader.
11+
"""
12+
Pubmed Reader.
1113
1214
Gets a search query, returns a list of Documents of the top corresponding scientific papers on Pubmed.
1315
"""
@@ -17,7 +19,8 @@ def load_data_bioc(
1719
search_query: str,
1820
max_results: Optional[int] = 10,
1921
) -> List[Document]:
20-
"""Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
22+
"""
23+
Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
2124
Uses the BioC API, which has been down a lot.
2225
2326
Args:
@@ -27,10 +30,10 @@ def load_data_bioc(
2730
Returns:
2831
List[Document]: A list of Document objects.
2932
"""
30-
import xml.etree.ElementTree as xml
3133
from datetime import datetime
3234

3335
import requests
36+
from defusedxml import ElementTree as safe_xml
3437

3538
pubmed_search = []
3639
parameters = {"tool": "tool", "email": "email", "db": "pmc"}
@@ -40,7 +43,7 @@ def load_data_bioc(
4043
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
4144
params=parameters,
4245
)
43-
root = xml.fromstring(resp.content)
46+
root = safe_xml.fromstring(resp.content)
4447

4548
for elem in root.iter():
4649
if elem.tag == "Id":
@@ -99,7 +102,8 @@ def load_data(
99102
search_query: str,
100103
max_results: Optional[int] = 10,
101104
) -> List[Document]:
102-
"""Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
105+
"""
106+
Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
103107
104108
Args:
105109
search_query (str): A topic to search for (e.g. "Alzheimers").
@@ -110,7 +114,6 @@ def load_data(
110114
List[Document]: A list of Document objects.
111115
"""
112116
import time
113-
import xml.etree.ElementTree as xml
114117

115118
import requests
116119

@@ -122,7 +125,7 @@ def load_data(
122125
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
123126
params=parameters,
124127
)
125-
root = xml.fromstring(resp.content)
128+
root = safe_xml.fromstring(resp.content)
126129

127130
for elem in root.iter():
128131
if elem.tag == "Id":
@@ -131,7 +134,7 @@ def load_data(
131134
print(url)
132135
try:
133136
resp = requests.get(url)
134-
info = xml.fromstring(resp.content)
137+
info = safe_xml.fromstring(resp.content)
135138

136139
raw_text = ""
137140
title = ""

llama-index-integrations/readers/llama-index-readers-papers/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,13 @@ license = "MIT"
2929
maintainers = ["thejessezhang"]
3030
name = "llama-index-readers-papers"
3131
readme = "README.md"
32-
version = "0.3.1"
32+
version = "0.3.2"
3333

3434
[tool.poetry.dependencies]
3535
python = ">=3.9,<4.0"
3636
arxiv = "^2.1.0"
3737
llama-index-core = "^0.12.0"
38+
defusedxml = "^0.7.1"
3839

3940
[tool.poetry.group.dev.dependencies]
4041
ipython = "8.10.0"

llama-index-integrations/readers/llama-index-readers-stripe-docs/llama_index/readers/stripe_docs/base.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import urllib.request
2-
import xml.etree.ElementTree as ET
32
from typing import List
43

4+
from defusedxml.ElementTree import fromstring
55
from llama_index.core.readers.base import BaseReader
66
from llama_index.core.schema import Document
77
from llama_index.readers.web import AsyncWebPageReader
@@ -13,7 +13,8 @@
1313

1414

1515
class StripeDocsReader(BaseReader):
16-
"""Asynchronous Stripe documentation reader.
16+
"""
17+
Asynchronous Stripe documentation reader.
1718
1819
Reads pages from the Stripe documentation based on the sitemap.xml.
1920
@@ -36,7 +37,7 @@ def _load_sitemap(self) -> str:
3637
def _parse_sitemap(
3738
self, raw_sitemap: str, filters: List[str] = DEFAULT_FILTERS
3839
) -> List:
39-
root_sitemap = ET.fromstring(raw_sitemap)
40+
root_sitemap = fromstring(raw_sitemap)
4041
sitemap_partition_urls = []
4142
sitemap_urls = []
4243

@@ -45,7 +46,7 @@ def _parse_sitemap(
4546
sitemap_partition_urls.append(loc)
4647

4748
for sitemap_partition_url in sitemap_partition_urls:
48-
sitemap_partition = ET.fromstring(self._load_url(sitemap_partition_url))
49+
sitemap_partition = fromstring(self._load_url(sitemap_partition_url))
4950

5051
# Find all <url /> and iterate through them
5152
for url in sitemap_partition.findall(f"{{{XML_SITEMAP_SCHEMA}}}url"):

llama-index-integrations/readers/llama-index-readers-stripe-docs/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,15 @@ license = "GPL-3.0-or-later"
2929
maintainers = ["amorriscode"]
3030
name = "llama-index-readers-stripe-docs"
3131
readme = "README.md"
32-
version = "0.3.0"
32+
version = "0.3.1"
3333

3434
[tool.poetry.dependencies]
3535
python = ">=3.9,<4.0"
3636
html2text = "^2024.2.26"
3737
urllib3 = "^2.1.0"
3838
llama-index-readers-web = "^0.3.0"
3939
llama-index-core = "^0.12.0"
40+
defusedxml = "^0.7.1"
4041

4142
[tool.poetry.group.dev.dependencies]
4243
ipython = "8.10.0"

llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/sitemap/base.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import urllib.request
2-
import xml.etree.ElementTree as ET
32
from typing import List
43

4+
from defusedxml.ElementTree import fromstring
55
from llama_index.core.readers.base import BaseReader
66
from llama_index.core.schema import Document
77
from llama_index.readers.web.async_web.base import AsyncWebPageReader
88

99

1010
class SitemapReader(BaseReader):
11-
"""Asynchronous sitemap reader for web.
11+
"""
12+
Asynchronous sitemap reader for web.
1213
1314
Reads pages from the web based on their sitemap.xml.
1415
@@ -34,7 +35,7 @@ def _load_sitemap(self, sitemap_url: str) -> str:
3435
return sitemap_url_request.read()
3536

3637
def _parse_sitemap(self, raw_sitemap: str, filter_locs: str = None) -> list:
37-
sitemap = ET.fromstring(raw_sitemap)
38+
sitemap = fromstring(raw_sitemap)
3839
sitemap_urls = []
3940

4041
for url in sitemap.findall(f"{{{self.xml_schema_sitemap}}}url"):

llama-index-integrations/readers/llama-index-readers-web/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ license = "GPL-3.0-or-later"
4747
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
4848
name = "llama-index-readers-web"
4949
readme = "README.md"
50-
version = "0.3.8"
50+
version = "0.3.9"
5151

5252
[tool.poetry.dependencies]
5353
python = ">=3.9,<4.0"
@@ -62,6 +62,7 @@ playwright = ">=1.30,<2.0"
6262
newspaper3k = "^0.2.8"
6363
spider-client = "^0.0.27"
6464
llama-index-core = "^0.12.0"
65+
defusedxml = "^0.7.1"
6566

6667
[tool.poetry.group.dev.dependencies]
6768
ipython = "8.10.0"

0 commit comments

Comments
 (0)