 import tokenize
 import types, builtins
 from io import StringIO
+import ast
+from dataclasses import dataclass
+from collections import defaultdict
 
+@dataclass(frozen=True)
+class TokenSeq:
+    kind: int   # e.g. OP, NAME, STRING
+    text: str   # e.g. 'foo', '(', ')'
+
+@dataclass(frozen=True)
+class KeyPattern:
+    string: str                   # key's string representation
+    tokens: tuple[TokenSeq, ...]  # normalized token sequence
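+    # Illustrative example: the key "foo.bar()" would be stored roughly as
+    #   KeyPattern(string="foo.bar()",
+    #              tokens=(TokenSeq(tokenize.NAME, "foo"), TokenSeq(tokenize.OP, "."),
+    #                      TokenSeq(tokenize.NAME, "bar"), TokenSeq(tokenize.OP, "("),
+    #                      TokenSeq(tokenize.OP, ")")))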
 
 def check50_assert(src, msg_or_exc=None, cond_type="unknown", left=None, right=None, context=None):
     """
@@ -20,7 +32,7 @@ def check50_assert(src, msg_or_exc=None, cond_type="unknown", left=None, right=N
     Used for rewriting assertion statements in check files.
 
     Note:
-        Exceptions from the check50 library are preferred, since they will be
+        Exceptions from the `check50` library are preferred, since they will be
         handled gracefully and integrated into the check output. Native Python
         exceptions are technically supported, but check50 will immediately
         terminate on the user's end if the assertion fails.
@@ -63,121 +75,211 @@ def check50_assert(src, msg_or_exc=None, cond_type="unknown", left=None, right=N
     caller_globals = caller_frame.f_globals
     caller_locals = caller_frame.f_locals
 
-    # Evaluate all variables and functions within the context dict and generate
-    # a string of these values
-    context_str = None
-    if context or (left and right):
-        for expr_str in context:
-            try:
-                context[expr_str] = eval(expr_str, caller_globals, caller_locals)
-            except Exception as e:
-                context[expr_str] = f"[error evaluating: {e}]"
-
-        # filter out modules, functions, and built-ins, which is needed to avoid
-        # overwriting function definitions in evaluaton and avoid useless string
-        # output
-        def is_irrelevant_value(v):
-            return isinstance(v, (types.ModuleType, types.FunctionType, types.BuiltinFunctionType))
-
-        def is_builtin_name(name):
-            return name in dir(builtins)
-
-        filtered_context = {
-            k: v for k, v in context.items()
-            if not is_irrelevant_value(v) and not is_builtin_name(k.split("(")[0])
-        }
-
-        # produces a string like "var1 = ..., var2 = ..., foo() = ..."
-        context_str = ", ".join(f"{k} = {repr(v)}" for k, v in filtered_context.items())
-    else:
-        filtered_context = {}
+    # Build the list of candidate keys
+    candidate_keys = list(context.keys()) if context else []
 
-    # Since we've memoized the functions and variables once, now try and
-    # evaluate the conditional by substituting the function calls/vars with
-    # their results
-    eval_src, eval_context = substitute_expressions(src, filtered_context)
+    # Plan substitutions and learn which keys are actually used
+    eval_src, key_to_placeholder = substitute_expressions(src, candidate_keys)
 
-    # Merge globals with expression context for evaluation
-    eval_globals = caller_globals.copy()
-    eval_globals.update(eval_context)
+    # Only evaluate the keys that were actually matched
+    evaluated = {}
+    for expr_str in key_to_placeholder.keys():
+        try:
+            evaluated[expr_str] = eval(expr_str, caller_globals, caller_locals)
+        except Exception as e:
+            evaluated[expr_str] = f"[error evaluating: {e}]"
 
-    # Merge locals with expression context for evaluation
-    eval_locals = caller_locals.copy()
-    eval_locals.update(eval_context)
+    # Build the eval_context for placeholders
+    eval_context = {
+        placeholder: evaluated[key]
+        for key, placeholder in key_to_placeholder.items()
+    }
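+    # Illustrative example: if key_to_placeholder maps
+    # "check50.run('pwd').stdout()" -> "__expr0", then eval_context might be
+    # {"__expr0": "/home/user\n"} (the value depends on the caller's frame)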
 
+    # Merge globals and locals with the expression context for evaluation
+    eval_globals = {**caller_globals, **eval_context}
+    eval_locals = {**caller_locals, **eval_context}
     cond = eval(eval_src, eval_globals, eval_locals)
 
     # Finally, quit if the condition evaluated to True.
     if cond:
         return
 
-    # If `right` or `left` were evaluatable objects, their actual value will be stored in `context`.
-    # Otherwise, they're still just literals.
-    right = context.get(right) or right
-    left = context.get(left) or left
+    # Filter out modules, functions, and built-ins, so we don't overwrite
+    # function definitions during evaluation or clutter the output with
+    # useless strings
+    def is_irrelevant_value(v):
+        return isinstance(v, (
+            types.ModuleType,
+            types.FunctionType,
+            types.BuiltinFunctionType
+        ))
+
+    def is_builtin_name(name):
+        name = name.split("(")[0]  # grab `len` from `len(...)`
+        return name in dir(builtins)
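+    # For example, is_builtin_name("len(x)") is True (only "len" is checked),
+    # while is_builtin_name("actual") is False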
 
-    # Since the condition didn't evaluate to True, now, we can raise special
-    # exceptions.
+    filtered_context = {
+        k: v for k, v in evaluated.items()
+        if not is_irrelevant_value(v) and not is_builtin_name(k)
+    }
+
+    # Produces a string like "var1 = ..., var2 = ..., foo() = ..."
+    context_str = ", ".join(f"{k} = {repr(v)}" for k, v in filtered_context.items()) or None
+
+    # If `right` or `left` were evaluatable expressions, their actual
+    # values will be stored in `evaluated`.
+    if right in evaluated:
+        right = evaluated[right]
+    if left in evaluated:
+        left = evaluated[left]
+
+    # Raise check50-specific/user-passed exceptions.
     if isinstance(msg_or_exc, str):
         raise Failure(msg_or_exc)
-    elif isinstance(msg_or_exc, BaseException):
+    elif isinstance(msg_or_exc, type) and issubclass(msg_or_exc, BaseException):
         raise msg_or_exc
-    elif cond_type == 'eq' and left and right:
+    elif cond_type == 'eq' and left is not None and right is not None:
         help_msg = f"checked: {src}"
         help_msg += f"\n where {context_str}" if context_str else ""
         raise Mismatch(right, left, help=help_msg)
-    elif cond_type == 'in' and left and right:
+    elif cond_type == 'in' and left is not None and right is not None:
         help_msg = f"checked: {src}"
         help_msg += f"\n where {context_str}" if context_str else ""
         raise Missing(left, right, help=help_msg)
     else:
         help_msg = f"\n where {context_str}" if context_str else ""
         raise Failure(f"check did not pass: {src}" + help_msg)
 
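+# Rough usage sketch (hypothetical values): a rewritten `assert actual == expected`
+# in a check might become a call like
+#
+#   check50_assert("actual == expected", cond_type="eq",
+#                  left="actual", right="expected",
+#                  context={"actual": None, "expected": None})  # only the keys are used
+#
+# If the comparison fails, this raises Mismatch with a help message along the
+# lines of "checked: actual == expected\n where actual = ..., expected = ...".
+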
-def substitute_expressions(src: str, context: dict) -> tuple[str, dict]:
+def _tokenize_normalized(code: str):
     """
-    Rewrites `src` by replacing each key in `context` with a placeholder variable name,
-    and builds a new context dict where those names map to pre-evaluated values.
+    Tokenize and normalize:
+    - drop ENCODING, NL, NEWLINE, INDENT, DEDENT, ENDMARKER
+    - for STRING tokens, normalize to their Python value (so "'pwd'" and '"pwd"' compare equal)
+    - return both normalized tokens and the original raw tokens (1:1 positions)
+
+    Outputs both a normalized and a raw tokenization (raw still excludes the
+    dropped types) of the code.
 
-    For instance, given a `src`:
+    For instance, the code input "foo.bar()" produces a `norm` of `TokenSeq`s:
     ```
-    check50.run('pwd').stdout() == actual
+    [
+        TokenSeq(NAME, "foo"), TokenSeq(OP, "."), TokenSeq(NAME, "bar"),
+        TokenSeq(OP, "("), TokenSeq(OP, ")")
+    ]
     ```
-    it will create a new `eval_src` as
+    In this case there were no strings to normalize, so `raw` would be
+    the same sequence.
+    """
+    drop = {
+        tokenize.ENCODING, tokenize.NL, tokenize.NEWLINE,
+        tokenize.INDENT, tokenize.DEDENT, tokenize.ENDMARKER
+    }
+
+    norm, raw = [], []
+    for tok in tokenize.generate_tokens(StringIO(code).readline):
+        # Extract the type and string representation from the token
+        tok_type, tok_string, *_ = tok
+
+        # Skip the layout/encoding token types listed in `drop`
+        if tok_type in drop:
+            continue
+
+        raw.append(TokenSeq(tok_type, tok_string))
+
+        # Normalize STRING tokens to their Python value
+        if tok_type == tokenize.STRING:
+            try:
+                val = ast.literal_eval(tok_string)
+                norm.append(TokenSeq(tok_type, repr(val)))
+            except Exception:
+                norm.append(TokenSeq(tok_type, tok_string))
+        else:
+            norm.append(TokenSeq(tok_type, tok_string))
+
+    return norm, raw
+
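+# For example, these two calls produce the same `norm` sequence, because STRING
+# tokens are reduced to the repr() of their literal value:
+#
+#   _tokenize_normalized('x == "pwd"')[0] == _tokenize_normalized("x == 'pwd'")[0]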
+
+def substitute_expressions(src: str, keys: list[str]) -> tuple[str, dict]:
+    """
+    Rewrites `src` by replacing each known key (from `keys`) with a placeholder
+    variable name, and builds a dict mapping each matched key to its
+    placeholder.
+
+    For instance, let `src` be the string representation of
     ```
-    __expr0 == __expr1
+    assert check50.run("./foo.c").stdout() == "OK"
     ```
-    and use the given context to define these variables:
+    The `keys` might look like
     ```
-    eval_context = {
-        '__expr0': context['check50.run('pwd').stdout()'],
-        '__expr1': context['actual']
-    }
+    ['check50.run("./foo.c")', 'check50.run("./foo.c").stdout()']
+    ```
+    We would want to find the longest match from these keys and output:
+    ```
+    eval_src: assert __expr0 == "OK"
+    key_to_placeholder: {'check50.run("./foo.c").stdout()': "__expr0"}
     ```
     """
-    # Parse the src into a stream of tokens
-    tokens = tokenize.generate_tokens(StringIO(src).readline)
-
-    new_tokens = []
-    new_context = {}
-    placeholder_map = {}  # used for duplicates in src (i.e. x == x => __expr0 == __expr0)
-    counter = 0
-
-    for tok_type, tok_string, start, end, line in tokens:
-        if tok_string in context:
-            if tok_string not in placeholder_map:
-                placeholder = f"__expr{counter}"
-                placeholder_map[tok_string] = placeholder
-                new_context[placeholder] = context[tok_string]
-                counter += 1
-            else:
-                # Avoid creating a new __expr{i} variable if it has already been seen
-                placeholder = placeholder_map[tok_string]
-            new_tokens.append((tok_type, placeholder))
+    # Tokenize/normalize the source once
+    src_norm, src_raw = _tokenize_normalized(src)
+
+    # Build a KeyPattern for every key that tokenizes to something
+    patterns = []
+    for key in keys:
+        key_norm, _ = _tokenize_normalized(key)
+        if key_norm:
+            patterns.append(KeyPattern(key, tuple(key_norm)))
+
+    # Index every KeyPattern under the TokenSeq it starts with
+    patterns_by_start_token = defaultdict(list)
+    for pattern in patterns:
+        start_token = pattern.tokens[0]
+        patterns_by_start_token[start_token].append(pattern)
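+    # e.g. the keys "foo.bar" and "foo.bar.baz()" both land in the bucket for
+    # TokenSeq(tokenize.NAME, "foo"), so a scan at a "foo" token considers both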
+
+    # Prefer longest matches first (e.g. foo.bar.baz() is preferred over foo.bar)
+    for candidates in patterns_by_start_token.values():
+        candidates.sort(key=lambda p: len(p.tokens), reverse=True)
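+    # e.g. that bucket becomes ["foo.bar.baz()", "foo.bar"], so the longer key
+    # wins when both would match at the same position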
+
+    key_to_placeholder = {}
+    def get_placeholder(key_str):
+        """Return a placeholder `__expr{i}` for a given key."""
+        if key_str not in key_to_placeholder:
+            key_to_placeholder[key_str] = f"__expr{len(key_to_placeholder)}"
+        return key_to_placeholder[key_str]
+
+    def longest_match_at(i):
+        """Return the longest KeyPattern that matches `src_norm` starting at `i`."""
+        if i >= len(src_norm):
+            return None
+
+        candidates = patterns_by_start_token.get(src_norm[i], [])
+
+        # Iterate through the candidates, longest first
+        for pattern in candidates:
+            L = len(pattern.tokens)
+
+            # Match only if i + L stays within bounds and the tokens line up
+            if i + L <= len(src_norm) and tuple(src_norm[i:i + L]) == pattern.tokens:
+                return pattern
+
+        # No match
+        return None
+
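+    # Worked example (illustrative): for src "check50.run('pwd').stdout() == actual"
+    # and a single key "check50.run('pwd').stdout()", the scan below emits
+    # (NAME, "__expr0"), then copies (OP, "==") and (NAME, "actual") through unchanged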
+    output = []
+    i = 0
+    while i < len(src_norm):
+        # Find the longest matching pattern, if one exists
+        pattern = longest_match_at(i)
+        if pattern is not None:
+            # Use (or create) a placeholder var for this specific key
+            placeholder = get_placeholder(pattern.string)
+            output.append((tokenize.NAME, placeholder))
+            # Move forward by the number of tokens in this pattern
+            i += len(pattern.tokens)
         else:
-            # Anything not found in the context dictionary is placed here,
-            # including keywords, whitespace, operators, etc.
-            new_tokens.append((tok_type, tok_string))
+            # Preserve the original token text for unmatched regions
+            token = src_raw[i]
+            output.append((token.kind, token.text))
+            # Move forward by one token
+            i += 1
 
-    eval_src = tokenize.untokenize(new_tokens)
-    return eval_src, new_context
+    eval_src = tokenize.untokenize(output)
+    return eval_src, key_to_placeholder
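+
+# Rough usage sketch (hypothetical values):
+#
+#   eval_src, mapping = substitute_expressions(
+#       "check50.run('pwd').stdout() == actual",
+#       ["check50.run('pwd')", "check50.run('pwd').stdout()"],
+#   )
+#   # mapping  -> {"check50.run('pwd').stdout()": "__expr0"}
+#   # eval_src -> roughly "__expr0 == actual" (untokenize may alter spacing)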