4242 *
4343 * <code>isTemplateMatchWithinText(String text)</code> will return true if the text matches the template
4444 *
45+ * <code>getQuickMatchRegex(int wordLimit)</code> will return a regular expression with limited backtracking which can be used for a quick search
4546 * <code>getCompleteRegex()</code> will return a regular expression for the entire license where
4647 * <code>getStartRegex(int wordLimit)</code> will return a regular expression to match the beginning of a license
4748 * and <code>getEndRegex(int wordLimit)</code> will return a regular expression to match the end of a license
@@ -53,7 +54,7 @@ public class TemplateRegexMatcher implements ILicenseTemplateOutputHandler {
5354
5455 static final Logger logger = LoggerFactory .getLogger (TemplateRegexMatcher .class );
5556
56- static final int WORD_LIMIT = 25 ; // number of words to search for at the beginning and end of the template
57+ static final int WORD_LIMIT = 25 ; // number of words to search for in the quick match, beginning and end of the template
5758
5859 static final String REGEX_GLOBAL_MODIFIERS = "(?im)" ; // embedded regex flags: (?i) ignore case and (?m) multi-line
5960
@@ -177,6 +178,45 @@ public String getCompleteRegex() {
177178 return REGEX_GLOBAL_MODIFIERS + regexPatternList .toString ();
178179 }
179180
181+ /**
182+ * @param wordLimit maximum number of contiguous words to match
183+ * @return a regular expression to match the template with minimum backtracking - avoiding optional and var tags
184+ */
185+ public String getQuickMatchRegex (int wordLimit ) {
186+ RegexList result = new RegexList ();
187+ int index = 0 ;
188+ int numWords = 0 ;
189+ List <RegexElement > elementList = regexPatternList .getElements ();
190+ int largestContiguousText = 0 ; // number of contiguous tokens in a regular text
191+ while (index < elementList .size () && numWords <= wordLimit ) {
192+ RegexElement element = elementList .get (index ++);
193+ result .addElement (element );
194+ if (element instanceof RegexToken ) {
195+ numWords ++;
196+ } else {
197+ if (numWords > largestContiguousText ) {
198+ largestContiguousText = numWords ;
199+ }
200+ result .getElements ().clear ();
201+ numWords = 0 ;
202+ }
203+ }
204+ if (numWords < largestContiguousText ) {
205+ // Need to retry to get as much as we can
206+ while (index < elementList .size () && numWords <= largestContiguousText ) {
207+ RegexElement element = elementList .get (index ++);
208+ result .addElement (element );
209+ if (element instanceof RegexToken ) {
210+ numWords ++;
211+ } else {
212+ result .getElements ().clear ();
213+ numWords = 0 ;
214+ }
215+ }
216+ }
217+ return REGEX_GLOBAL_MODIFIERS + result .toString ();
218+ }
219+
180220 /**
181221 * @param wordLimit number of non optional words to include in the pattern
182222 * @return a regex to match the start of the license per the template
@@ -269,15 +309,18 @@ public boolean isTemplateMatchWithinText(String text) throws SpdxCompareExceptio
269309
270310 String compareText = normalizedText .toString ();
271311
272- Pattern startPattern = Pattern .compile (getStartRegex (WORD_LIMIT ));
273- Matcher startMatcher = startPattern .matcher (compareText );
274- if (startMatcher .find ()) {
275- startIndex = startMatcher .start ();
276- Pattern endPattern = Pattern .compile (getEndRegex (WORD_LIMIT ));
277- Matcher endMatcher = endPattern .matcher (compareText );
278- if (endMatcher .find ()) {
279- endIndex = endMatcher .end ();
280- result = compareText .substring (startIndex , endIndex );
312+ Pattern quickPattern = Pattern .compile (getQuickMatchRegex (WORD_LIMIT ));
313+ if (quickPattern .matcher (compareText ).find ()) {
314+ Pattern startPattern = Pattern .compile (getStartRegex (WORD_LIMIT ));
315+ Matcher startMatcher = startPattern .matcher (compareText );
316+ if (startMatcher .find ()) {
317+ startIndex = startMatcher .start ();
318+ Pattern endPattern = Pattern .compile (getEndRegex (WORD_LIMIT ));
319+ Matcher endMatcher = endPattern .matcher (compareText );
320+ if (endMatcher .find ()) {
321+ endIndex = endMatcher .end ();
322+ result = compareText .substring (startIndex , endIndex );
323+ }
281324 }
282325 }
283326 return result ;
0 commit comments