Skip to content

Commit e7212a6

Browse files
committed
Implement a quick find regex for license matching
Potential fix for #165 Signed-off-by: Gary O'Neall <[email protected]>
1 parent b58739c commit e7212a6

File tree

1 file changed

+53
-10
lines changed

1 file changed

+53
-10
lines changed

src/main/java/org/spdx/utility/compare/TemplateRegexMatcher.java

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
*
4343
* <code>isTemplateMatchWithinText(String text)</code> will return true if the text matches the template
4444
*
45+
* <code>getQuickMatchRegex(int wordLimit)</code> will return a regular expression with limited backtracking which can be used for a quick search
4546
* <code>getCompleteRegex()</code> will return a regular expression for the entire license where
4647
* <code>getStartRegex(int wordLimit)</code> will return a regular expression to match the beginning of a license
4748
* and <code>getEndRegex(int wordLimit)</code> will return a regular expression to match the end of a license
@@ -53,7 +54,7 @@ public class TemplateRegexMatcher implements ILicenseTemplateOutputHandler {
5354

5455
static final Logger logger = LoggerFactory.getLogger(TemplateRegexMatcher.class);
5556

56-
static final int WORD_LIMIT = 25; // number of words to search for at the beginning and end of the template
57+
static final int WORD_LIMIT = 25; // number of words to search for in the quick match, beginning and end of the template
5758

5859
static final String REGEX_GLOBAL_MODIFIERS = "(?im)"; // ignore case and multi-line
5960

@@ -177,6 +178,45 @@ public String getCompleteRegex() {
177178
return REGEX_GLOBAL_MODIFIERS + regexPatternList.toString();
178179
}
179180

181+
/**
182+
* @param wordLimit maximum number of contiguous words to match
183+
* @return a regular expression to match the template with minimum backtracking - avoiding optional and var tags
184+
*/
185+
public String getQuickMatchRegex(int wordLimit) {
186+
RegexList result = new RegexList();
187+
int index = 0;
188+
int numWords = 0;
189+
List<RegexElement> elementList = regexPatternList.getElements();
190+
int largestContiguousText = 0; // number of contiguous tokens in a regular text
191+
while (index < elementList.size() && numWords <= wordLimit) {
192+
RegexElement element = elementList.get(index++);
193+
result.addElement(element);
194+
if (element instanceof RegexToken) {
195+
numWords++;
196+
} else {
197+
if (numWords > largestContiguousText) {
198+
largestContiguousText = numWords;
199+
}
200+
result.getElements().clear();
201+
numWords = 0;
202+
}
203+
}
204+
if (numWords < largestContiguousText) {
205+
// Need to retry to get as much as we can
206+
while (index < elementList.size() && numWords <= largestContiguousText) {
207+
RegexElement element = elementList.get(index++);
208+
result.addElement(element);
209+
if (element instanceof RegexToken) {
210+
numWords++;
211+
} else {
212+
result.getElements().clear();
213+
numWords = 0;
214+
}
215+
}
216+
}
217+
return REGEX_GLOBAL_MODIFIERS + result.toString();
218+
}
219+
180220
/**
181221
* @param wordLimit number of non optional words to include in the pattern
182222
* @return a regex to match the start of the license per the template
@@ -269,15 +309,18 @@ public boolean isTemplateMatchWithinText(String text) throws SpdxCompareExceptio
269309

270310
String compareText = normalizedText.toString();
271311

272-
Pattern startPattern = Pattern.compile(getStartRegex(WORD_LIMIT));
273-
Matcher startMatcher = startPattern.matcher(compareText);
274-
if(startMatcher.find()) {
275-
startIndex = startMatcher.start();
276-
Pattern endPattern = Pattern.compile(getEndRegex(WORD_LIMIT));
277-
Matcher endMatcher = endPattern.matcher(compareText);
278-
if (endMatcher.find()) {
279-
endIndex = endMatcher.end();
280-
result = compareText.substring(startIndex, endIndex);
312+
Pattern quickPattern = Pattern.compile(getQuickMatchRegex(WORD_LIMIT));
313+
if (quickPattern.matcher(compareText).find()) {
314+
Pattern startPattern = Pattern.compile(getStartRegex(WORD_LIMIT));
315+
Matcher startMatcher = startPattern.matcher(compareText);
316+
if(startMatcher.find()) {
317+
startIndex = startMatcher.start();
318+
Pattern endPattern = Pattern.compile(getEndRegex(WORD_LIMIT));
319+
Matcher endMatcher = endPattern.matcher(compareText);
320+
if (endMatcher.find()) {
321+
endIndex = endMatcher.end();
322+
result = compareText.substring(startIndex, endIndex);
323+
}
281324
}
282325
}
283326
return result;

0 commit comments

Comments
 (0)