Skip to content

Commit fdaf96c

Browse files
eyurtsev authored and hinthornw committed
core[patch]: Patch XML vulnerability in XMLOutputParser (CVE-2024-1455) (#19653)
Patch potential XML vulnerability CVE-2024-1455. This patches a potential XML vulnerability in the XMLOutputParser in langchain-core. In some situations the vulnerability could lead to a denial-of-service attack. At risk are users who: 1) run older distributions of Python that ship an older version of libexpat, 2) use XMLOutputParser with an agent, and 3) accept inputs from untrusted sources with this agent (e.g., an endpoint on the web that allows an untrusted user to interact with the parser).
1 parent d6b8be6 commit fdaf96c

File tree

2 files changed

+212
-140
lines changed

2 files changed

+212
-140
lines changed

libs/core/langchain_core/output_parsers/xml.py

Lines changed: 144 additions & 121 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
import re
22
import xml
33
import xml.etree.ElementTree as ET
4-
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union
4+
from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional, Union
5+
from xml.etree.ElementTree import TreeBuilder
56

67
from langchain_core.exceptions import OutputParserException
78
from langchain_core.messages import BaseMessage
@@ -24,19 +25,154 @@
2425
```""" # noqa: E501
2526

2627

28+
class _StreamingParser:
    """Streaming parser for XML.

    This implementation is pulled into a class to avoid implementation
    drift between transform and atransform of the XMLOutputParser.

    The instance is stateful: it keeps the pull parser, the not-yet-fed text
    buffer and the path of currently open tags between calls to ``parse``.
    """

    def __init__(self, parser: Literal["defusedxml", "xml"]) -> None:
        """Initialize the streaming parser.

        Args:
            parser: Parser to use for XML parsing. Can be either 'defusedxml' or 'xml'.
                See documentation in XMLOutputParser for more information.

        Raises:
            ImportError: If 'defusedxml' is requested but is not installed.
        """
        if parser == "defusedxml":
            try:
                from defusedxml import ElementTree as DET  # type: ignore
            except ImportError as e:
                # Adjacent literals are concatenated verbatim, so each sentence
                # must carry its own trailing space.
                raise ImportError(
                    "defusedxml is not installed. "
                    "Please install it to use the defusedxml parser. "
                    "You can install it with `pip install defusedxml`."
                ) from e
            _parser = DET.DefusedXMLParser(target=TreeBuilder())
        else:
            # None is passed through to XMLPullParser as its `_parser` argument.
            _parser = None
        self.pull_parser = ET.XMLPullParser(["start", "end"], _parser=_parser)
        # Matches the first character pair that can open an XML tag.
        self.xml_start_re = re.compile(r"<[a-zA-Z:_]")
        # Tags of the elements that are currently open, outermost first.
        self.current_path: List[str] = []
        # True once a child of the innermost open element has been closed.
        self.current_path_has_children = False
        # Text received but not yet handed to the pull parser.
        self.buffer = ""
        self.xml_started = False

    def parse(self, chunk: Union[str, BaseMessage]) -> Iterator[AddableDict]:
        """Parse a chunk of text.

        Args:
            chunk: A chunk of text to parse. This can be a string or a BaseMessage.

        Yields:
            AddableDict: A dictionary representing the parsed XML element.

        Raises:
            xml.etree.ElementTree.ParseError: If the XML is malformed while at
                least one element is still open.
        """
        if isinstance(chunk, BaseMessage):
            # extract text
            chunk_content = chunk.content
            if not isinstance(chunk_content, str):
                # ignore non-string messages (e.g., function calls)
                return
            chunk = chunk_content
        # add chunk to buffer of unprocessed text
        self.buffer += chunk
        # if xml string hasn't started yet, continue to next chunk
        if not self.xml_started:
            if match := self.xml_start_re.search(self.buffer):
                # if xml string has started, remove all text before it
                self.buffer = self.buffer[match.start() :]
                self.xml_started = True
            else:
                return
        # feed buffer to parser
        self.pull_parser.feed(self.buffer)
        self.buffer = ""
        # yield all events
        try:
            for event, elem in self.pull_parser.read_events():
                if event == "start":
                    # update current path
                    self.current_path.append(elem.tag)
                    self.current_path_has_children = False
                elif event == "end":
                    # remove last element from current path
                    self.current_path.pop()
                    # yield element
                    if not self.current_path_has_children:
                        yield nested_element(self.current_path, elem)
                    # prevent yielding of parent element
                    if self.current_path:
                        self.current_path_has_children = True
                    else:
                        # Outermost element closed: look for a new XML start
                        # in whatever text arrives next.
                        self.xml_started = False
        except xml.etree.ElementTree.ParseError:
            # This might be junk at the end of the XML input.
            # Let's check whether the current path is empty.
            if not self.current_path:
                # If it is empty, we can ignore this error.
                return
            else:
                raise

    def close(self) -> None:
        """Close the parser."""
        try:
            self.pull_parser.close()
        except xml.etree.ElementTree.ParseError:
            # Ignore. This will ignore any incomplete XML at the end of the input
            pass
125+
126+
27127
class XMLOutputParser(BaseTransformOutputParser):
28128
"""Parse an output using xml format."""
29129

30130
tags: Optional[List[str]] = None
31131
encoding_matcher: re.Pattern = re.compile(
32132
r"<([^>]*encoding[^>]*)>\n(.*)", re.MULTILINE | re.DOTALL
33133
)
134+
parser: Literal["defusedxml", "xml"] = "defusedxml"
135+
"""Parser to use for XML parsing. Can be either 'defusedxml' or 'xml'.
136+
137+
* 'defusedxml' is the default parser and is used to prevent XML vulnerabilities
138+
present in some distributions of Python's standard library xml.
139+
`defusedxml` is a wrapper around the standard library parser that
140+
sets up the parser with secure defaults.
141+
* 'xml' is the standard library parser.
142+
143+
Use `xml` only if you are sure that your distribution of the standard library
144+
is not vulnerable to XML vulnerabilities.
145+
146+
Please review the following resources for more information:
147+
148+
* https://docs.python.org/3/library/xml.html#xml-vulnerabilities
149+
* https://github.com/tiran/defusedxml
150+
151+
The standard library relies on libexpat for parsing XML:
152+
https://github.com/libexpat/libexpat
153+
"""
34154

35155
def get_format_instructions(self) -> str:
36156
return XML_FORMAT_INSTRUCTIONS.format(tags=self.tags)
37157

38158
def parse(self, text: str) -> Dict[str, List[Any]]:
39159
# Try to find XML string within triple backticks
160+
# Imports are temporarily placed here to avoid issue with caching on CI
161+
# likely if you're reading this you can move them to the top of the file
162+
if self.parser == "defusedxml":
163+
try:
164+
from defusedxml import ElementTree as DET # type: ignore
165+
except ImportError:
166+
raise ImportError(
167+
"defusedxml is not installed. "
168+
"Please install it to use the defusedxml parser."
169+
"You can install it with `pip install defusedxml`"
170+
"See https://github.com/tiran/defusedxml for more details"
171+
)
172+
_ET = DET # Use the defusedxml parser
173+
else:
174+
_ET = ET # Use the standard library parser
175+
40176
match = re.search(r"```(xml)?(.*)```", text, re.DOTALL)
41177
if match is not None:
42178
# If match found, use the content within the backticks
@@ -57,132 +193,19 @@ def parse(self, text: str) -> Dict[str, List[Any]]:
57193
def _transform(
58194
self, input: Iterator[Union[str, BaseMessage]]
59195
) -> Iterator[AddableDict]:
60-
xml_start_re = re.compile(r"<[a-zA-Z:_]")
61-
parser = ET.XMLPullParser(["start", "end"])
62-
xml_started = False
63-
current_path: List[str] = []
64-
current_path_has_children = False
65-
buffer = ""
196+
streaming_parser = _StreamingParser(self.parser)
66197
for chunk in input:
67-
if isinstance(chunk, BaseMessage):
68-
# extract text
69-
chunk_content = chunk.content
70-
if not isinstance(chunk_content, str):
71-
continue
72-
chunk = chunk_content
73-
# add chunk to buffer of unprocessed text
74-
buffer += chunk
75-
# if xml string hasn't started yet, continue to next chunk
76-
if not xml_started:
77-
if match := xml_start_re.search(buffer):
78-
# if xml string has started, remove all text before it
79-
buffer = buffer[match.start() :]
80-
xml_started = True
81-
else:
82-
continue
83-
# feed buffer to parser
84-
parser.feed(buffer)
85-
86-
buffer = ""
87-
# yield all events
88-
try:
89-
for event, elem in parser.read_events():
90-
if event == "start":
91-
# update current path
92-
current_path.append(elem.tag)
93-
current_path_has_children = False
94-
elif event == "end":
95-
# remove last element from current path
96-
#
97-
current_path.pop()
98-
# yield element
99-
if not current_path_has_children:
100-
yield nested_element(current_path, elem)
101-
# prevent yielding of parent element
102-
if current_path:
103-
current_path_has_children = True
104-
else:
105-
xml_started = False
106-
except xml.etree.ElementTree.ParseError:
107-
# This might be junk at the end of the XML input.
108-
# Let's check whether the current path is empty.
109-
if not current_path:
110-
# If it is empty, we can ignore this error.
111-
break
112-
else:
113-
raise
114-
115-
# close parser
116-
try:
117-
parser.close()
118-
except xml.etree.ElementTree.ParseError:
119-
# Ignore. This will ignore any incomplete XML at the end of the input
120-
pass
198+
yield from streaming_parser.parse(chunk)
199+
streaming_parser.close()
121200

122201
async def _atransform(
123202
self, input: AsyncIterator[Union[str, BaseMessage]]
124203
) -> AsyncIterator[AddableDict]:
125-
xml_start_re = re.compile(r"<[a-zA-Z:_]")
126-
parser = ET.XMLPullParser(["start", "end"])
127-
xml_started = False
128-
current_path: List[str] = []
129-
current_path_has_children = False
130-
buffer = ""
204+
streaming_parser = _StreamingParser(self.parser)
131205
async for chunk in input:
132-
if isinstance(chunk, BaseMessage):
133-
# extract text
134-
chunk_content = chunk.content
135-
if not isinstance(chunk_content, str):
136-
continue
137-
chunk = chunk_content
138-
# add chunk to buffer of unprocessed text
139-
buffer += chunk
140-
# if xml string hasn't started yet, continue to next chunk
141-
if not xml_started:
142-
if match := xml_start_re.search(buffer):
143-
# if xml string has started, remove all text before it
144-
buffer = buffer[match.start() :]
145-
xml_started = True
146-
else:
147-
continue
148-
# feed buffer to parser
149-
parser.feed(buffer)
150-
151-
buffer = ""
152-
# yield all events
153-
try:
154-
for event, elem in parser.read_events():
155-
if event == "start":
156-
# update current path
157-
current_path.append(elem.tag)
158-
current_path_has_children = False
159-
elif event == "end":
160-
# remove last element from current path
161-
#
162-
current_path.pop()
163-
# yield element
164-
if not current_path_has_children:
165-
yield nested_element(current_path, elem)
166-
# prevent yielding of parent element
167-
if current_path:
168-
current_path_has_children = True
169-
else:
170-
xml_started = False
171-
except xml.etree.ElementTree.ParseError:
172-
# This might be junk at the end of the XML input.
173-
# Let's check whether the current path is empty.
174-
if not current_path:
175-
# If it is empty, we can ignore this error.
176-
break
177-
else:
178-
raise
179-
180-
# close parser
181-
try:
182-
parser.close()
183-
except xml.etree.ElementTree.ParseError:
184-
# Ignore. This will ignore any incomplete XML at the end of the input
185-
pass
206+
for output in streaming_parser.parse(chunk):
207+
yield output
208+
streaming_parser.close()
186209

187210
def _root_to_dict(self, root: ET.Element) -> Dict[str, List[Any]]:
188211
"""Converts xml tree to python dictionary."""

0 commit comments

Comments
 (0)