@@ -101,7 +101,7 @@ def best_match(i):
101101
102102def replace_person_token (t ):
103103 "Used for CC12M"
104- t = re .sub ("<person>([,\s]*(and)*[,\s]*<person>)+" , " people " , t )
104+ t = re .sub (r "<person>([,\s]*(and)*[,\s]*<person>)+" , " people " , t )
105105 while "<person>" in t :
106106 t = t .replace ("<person>" ,
107107 f" { random .choices (* tuple (zip (* person_token )))[0 ]} " , 1 )
@@ -114,27 +114,27 @@ def fix_html(t):
114114
115115
def replace_punctuation_with_commas(t):
    """Replace each punctuation character matched below with a comma.

    The character class covers parentheses, square brackets, braces and
    the characters ``. , | : ; ? ! = + ~ - /``. Every single occurrence
    becomes one ``","``; runs are not collapsed here.
    """
    # Raw string: the class contains backslash escapes (\], \-, \/) that are
    # invalid escape sequences in a normal string literal.
    return re.sub(r"[()[\].,|:;?!=+~\-\/{}]", ",", t)
118118
119119
def simplify_quotes(t):
    """Replace every single quote, double quote or backtick with ' " '.

    Each quote character becomes a double quote padded with one space on
    both sides.
    """
    return re.sub("""['"`]""", ' " ', t)
122122
123123
def merge_quotes(t):
    """Collapse a run of double quotes and surrounding whitespace into ' " '.

    Any sequence made of double quotes optionally separated or padded by
    whitespace is replaced by a single space-padded double quote.
    """
    # Raw string so that \s reaches the regex engine unmodified.
    return re.sub(r'(\s*"+\s*)+', ' " ', t)
126126
127127
def remove_comma_numbers(t):
    """Remove commas that separate a digit from a following 3-digit group.

    For example ``"1,000,000"`` becomes ``"1000000"``.
    """

    def _strip_once(s):
        # One pass drops the comma in every non-overlapping "d,ddd" match.
        return re.sub(r"(\d),(\d{3})", r"\1\2", s)

    # Two passes: after a match, the digits it consumed cannot serve as the
    # leading digit of the next group, so adjacent groups need a second run.
    return _strip_once(_strip_once(t))
134134
135135
def pre_process_dot_numbers(t):
    """Replace a "." that sits between two word characters with a marker.

    The marker is the word ``dot`` wrapped in the module-level
    ``temp_token`` on both sides; the surrounding characters are kept.
    """
    # NOTE(review): the marker is assumed to contain no embedded spaces and
    # to match the pattern used by post_process_dot_numbers - confirm.
    return re.sub(r"(\w)\.(\w)", rf"\1{temp_token}dot{temp_token}\2", t)
138138
139139
140140def post_process_dot_numbers (t ):
@@ -152,15 +152,15 @@ def post_process_quotes(t):
152152
153153
def pre_process_dates(t):
    """Replace a "/" that sits between two digits with a marker.

    The marker is the word ``slash`` wrapped in the module-level
    ``temp_token`` on both sides; the surrounding digits are kept.
    """
    # NOTE(review): the marker is assumed to contain no embedded spaces and
    # to match the pattern used by post_process_dates - confirm.
    return re.sub(r"(\d)/(\d)", rf"\1{temp_token}slash{temp_token}\2", t)
156156
157157
def post_process_dates(t):
    """Turn every ``temp_token``-wrapped ``slash`` marker back into "/"."""
    # NOTE(review): the pattern is assumed to contain no embedded spaces and
    # to match the marker emitted by pre_process_dates - confirm.
    return re.sub(f"{temp_token}slash{temp_token}", "/", t)
160160
161161
def merge_commas(t):
    """Collapse a run of commas and surrounding whitespace into ", ".

    Any sequence of one or more commas, optionally separated or padded by
    whitespace, is replaced by a single comma followed by one space.
    """
    # Raw string so that \s reaches the regex engine unmodified.
    return re.sub(r"(\s*,+\s*)+", ", ", t)
164164
165165
166166def add_space_after_commas (t ):
@@ -170,14 +170,14 @@ def add_space_after_commas(t):
def handle_special_chars(t):
    """Handle special characters.

    A "-" directly between two word characters becomes a space, and each
    of the characters ``% & / $ *`` is padded with a space on both sides.
    """
    # replace "-" with a space when between words without space
    t = re.sub(r"(\w)-(\w)", r"\1 \2", t)
    # always add space around some characters
    return re.sub(r"([%&\/$*])", r" \1 ", t)
176176
177177
def expand_hashtags(t, hashtag_processor):
    """Remove "#" and replace each hashtag word with a processed version.

    For every ``#word`` match, ``hashtag_processor`` is called with the
    word (without the leading "#") and its return value replaces the whole
    hashtag in the text.
    """
    return re.sub(r"#(\w+)", lambda m: hashtag_processor(m.group(1)), t)
181181
182182
# Character class matching an underscore, a hash sign or a backslash.
_re_ignore_chars = r"[_#\\]"
@@ -190,7 +190,7 @@ def ignore_chars(t):
190190
def remove_extra_spaces(t):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is reduced to a single space, not
    stripped.
    """
    # Raw string so that \s reaches the regex engine unmodified.
    return re.sub(r"\s+", " ", t)
194194
195195
196196def remove_repeating_chars (t ):
0 commit comments