11import re
22import xml
33import xml .etree .ElementTree as ET
4- from typing import Any , AsyncIterator , Dict , Iterator , List , Optional , Union
4+ from typing import Any , AsyncIterator , Dict , Iterator , List , Literal , Optional , Union
5+ from xml .etree .ElementTree import TreeBuilder
56
67from langchain_core .exceptions import OutputParserException
78from langchain_core .messages import BaseMessage
2425```""" # noqa: E501
2526
2627
class _StreamingParser:
    """Streaming parser for XML.

    The parsing state is kept in a class so that the synchronous and
    asynchronous transform paths of XMLOutputParser share one implementation
    instead of drifting apart.
    """

    def __init__(self, parser: Literal["defusedxml", "xml"]) -> None:
        """Initialize the streaming parser.

        Args:
            parser: Parser to use for XML parsing. Can be either 'defusedxml' or 'xml'.
                See documentation in XMLOutputParser for more information.

        Raises:
            ImportError: If 'defusedxml' is requested but is not installed.
        """
        if parser == "defusedxml":
            try:
                from defusedxml import ElementTree as DET  # type: ignore
            except ImportError as e:
                # Chain the original error so the root cause stays visible.
                raise ImportError(
                    "defusedxml is not installed. "
                    "Please install it to use the defusedxml parser. "
                    "You can install it with `pip install defusedxml`"
                ) from e
            _parser = DET.DefusedXMLParser(target=TreeBuilder())
        else:
            # None lets XMLPullParser fall back to its default parser.
            _parser = None
        self.pull_parser = ET.XMLPullParser(["start", "end"], _parser=_parser)
        # Matches the first character sequence that can open an XML tag.
        self.xml_start_re = re.compile(r"<[a-zA-Z:_]")
        # Tags of the elements that are currently open, outermost first.
        self.current_path: List[str] = []
        # True once a child of the innermost open element has been closed.
        self.current_path_has_children = False
        # Text received so far that has not been fed to the parser.
        self.buffer = ""
        # Whether the beginning of an XML document has been seen.
        self.xml_started = False

    def parse(self, chunk: Union[str, BaseMessage]) -> Iterator[AddableDict]:
        """Parse a chunk of text.

        Args:
            chunk: A chunk of text to parse. This can be a string or a BaseMessage.

        Yields:
            AddableDict: A dictionary representing the parsed XML element.

        Raises:
            xml.etree.ElementTree.ParseError: If the XML is malformed while
                at least one element is still open.
        """
        if isinstance(chunk, BaseMessage):
            # extract text
            chunk_content = chunk.content
            if not isinstance(chunk_content, str):
                # ignore non-string messages (e.g., function calls)
                return
            chunk = chunk_content
        # add chunk to buffer of unprocessed text
        self.buffer += chunk
        # if xml string hasn't started yet, wait for the next chunk
        if not self.xml_started:
            if match := self.xml_start_re.search(self.buffer):
                # xml string has started: drop all text before it
                self.buffer = self.buffer[match.start() :]
                self.xml_started = True
            else:
                return
        # feed buffer to parser
        self.pull_parser.feed(self.buffer)
        self.buffer = ""
        # yield all events
        try:
            for event, elem in self.pull_parser.read_events():
                if event == "start":
                    # update current path
                    self.current_path.append(elem.tag)
                    self.current_path_has_children = False
                elif event == "end":
                    # remove last element from current path
                    self.current_path.pop()
                    # yield element only if none of its children was yielded
                    if not self.current_path_has_children:
                        yield nested_element(self.current_path, elem)
                    # prevent yielding of parent element
                    if self.current_path:
                        self.current_path_has_children = True
                    else:
                        # root element closed: wait for a new XML start
                        self.xml_started = False
        except xml.etree.ElementTree.ParseError:
            # This might be junk at the end of the XML input.
            # It is only safe to ignore when no element is still open.
            if self.current_path:
                raise

    def close(self) -> None:
        """Close the parser."""
        try:
            self.pull_parser.close()
        except xml.etree.ElementTree.ParseError:
            # Ignore. This will ignore any incomplete XML at the end of the input
            pass
126+
27127class XMLOutputParser (BaseTransformOutputParser ):
28128 """Parse an output using xml format."""
29129
30130 tags : Optional [List [str ]] = None
31131 encoding_matcher : re .Pattern = re .compile (
32132 r"<([^>]*encoding[^>]*)>\n(.*)" , re .MULTILINE | re .DOTALL
33133 )
134+ parser : Literal ["defusedxml" , "xml" ] = "defusedxml"
135+ """Parser to use for XML parsing. Can be either 'defusedxml' or 'xml'.
136+
137+ * 'defusedxml' is the default parser and is used to prevent XML vulnerabilities
138+ present in some distributions of Python's standard library xml.
139+ `defusedxml` is a wrapper around the standard library parser that
140+ sets up the parser with secure defaults.
141+ * 'xml' is the standard library parser.
142+
143+ Use `xml` only if you are sure that your distribution of the standard library
144+ is not vulnerable to XML vulnerabilities.
145+
146+ Please review the following resources for more information:
147+
148+ * https://docs.python.org/3/library/xml.html#xml-vulnerabilities
149+ * https://github.com/tiran/defusedxml
150+
151+ The standard library relies on libexpat for parsing XML:
152+ https://github.com/libexpat/libexpat
153+ """
34154
def get_format_instructions(self) -> str:
    """Return the XML format instructions with this parser's tags filled in."""
    template = XML_FORMAT_INSTRUCTIONS
    return template.format(tags=self.tags)
37157
38158 def parse (self , text : str ) -> Dict [str , List [Any ]]:
39159 # Try to find XML string within triple backticks
160+ # Imports are temporarily placed here to avoid issue with caching on CI
161+ # likely if you're reading this you can move them to the top of the file
162+ if self .parser == "defusedxml" :
163+ try :
164+ from defusedxml import ElementTree as DET # type: ignore
165+ except ImportError :
166+ raise ImportError (
167+ "defusedxml is not installed. "
168+ "Please install it to use the defusedxml parser."
169+ "You can install it with `pip install defusedxml`"
170+ "See https://github.com/tiran/defusedxml for more details"
171+ )
172+ _ET = DET # Use the defusedxml parser
173+ else :
174+ _ET = ET # Use the standard library parser
175+
40176 match = re .search (r"```(xml)?(.*)```" , text , re .DOTALL )
41177 if match is not None :
42178 # If match found, use the content within the backticks
@@ -57,132 +193,19 @@ def parse(self, text: str) -> Dict[str, List[Any]]:
def _transform(
    self, input: Iterator[Union[str, BaseMessage]]
) -> Iterator[AddableDict]:
    """Stream parsed XML elements from an iterator of text chunks.

    Args:
        input: Iterator of chunks; each is a string or a BaseMessage.

    Yields:
        AddableDict: One dictionary per element produced by the streaming parser.
    """
    streaming_parser = _StreamingParser(self.parser)
    try:
        for chunk in input:
            yield from streaming_parser.parse(chunk)
    finally:
        # Close the parser even if the consumer stops early or a chunk raises.
        streaming_parser.close()
121200
async def _atransform(
    self, input: AsyncIterator[Union[str, BaseMessage]]
) -> AsyncIterator[AddableDict]:
    """Stream parsed XML elements from an async iterator of text chunks.

    Args:
        input: Async iterator of chunks; each is a string or a BaseMessage.

    Yields:
        AddableDict: One dictionary per element produced by the streaming parser.
    """
    streaming_parser = _StreamingParser(self.parser)
    try:
        async for chunk in input:
            for output in streaming_parser.parse(chunk):
                yield output
    finally:
        # Close the parser even if the consumer stops early or a chunk raises.
        streaming_parser.close()
186209
187210 def _root_to_dict (self , root : ET .Element ) -> Dict [str , List [Any ]]:
188211 """Converts xml tree to python dictionary."""
0 commit comments