Skip to content

Commit 4e936d2

Browse files
[Fix] Fix regex converter when the property's pattern is empty. (#409)
Currently, when we encounter a schema like this: ``` { "type": "object", "properties": { "_links": { "type": "object", "patternProperties": { "": { "type": "object", "properties": { "href": { "type": "string" } } } } } } } ``` it leads to a segmentation fault. This is caused by missing handling of an empty regex when `RegexConverter` is constructed. This PR fixes it. Signed-off-by: Yuchuan <[email protected]> --------- Signed-off-by: Yuchuan <[email protected]>
1 parent 9aa156b commit 4e936d2

File tree

3 files changed

+47
-4
lines changed

3 files changed

+47
-4
lines changed

cpp/regex_converter.cc

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@ namespace xgrammar {
2323
class RegexConverter {
2424
public:
2525
explicit RegexConverter(const std::string& regex) : regex_(regex) {
26-
regex_codepoints_ = ParseUTF8(regex_.c_str(), false);
27-
if (regex_codepoints_[0] == kInvalidUTF8) {
28-
XGRAMMAR_LOG(FATAL) << "The regex is not a valid UTF-8 string.";
29-
XGRAMMAR_UNREACHABLE();
26+
if (!regex.empty()) {
27+
regex_codepoints_ = ParseUTF8(regex_.c_str(), false);
28+
if (regex_codepoints_[0] == kInvalidUTF8) {
29+
XGRAMMAR_LOG(FATAL) << "The regex is not a valid UTF-8 string.";
30+
XGRAMMAR_UNREACHABLE();
31+
}
3032
}
3133
regex_codepoints_.push_back(0); // Add a null terminator
3234
}
@@ -291,6 +293,7 @@ std::string RegexConverter::Convert() {
291293
start_ = regex_codepoints_.data();
292294
current_ = start_;
293295
end_ = start_ + regex_codepoints_.size() - 1;
296+
bool is_empty = true;
294297
while (current_ != end_) {
295298
if (*current_ == '^') {
296299
if (current_ != start_) {
@@ -307,8 +310,10 @@ std::string RegexConverter::Convert() {
307310
}
308311
++current_;
309312
} else if (*current_ == '[') {
313+
is_empty = false;
310314
AddEBNFSegment(HandleCharacterClass());
311315
} else if (*current_ == '(') {
316+
is_empty = false;
312317
++current_;
313318
++parenthesis_level_;
314319
AddEBNFSegment("(");
@@ -317,6 +322,7 @@ std::string RegexConverter::Convert() {
317322
HandleGroupModifier();
318323
}
319324
} else if (*current_ == ')') {
325+
is_empty = false;
320326
if (parenthesis_level_ == 0) {
321327
RaiseError("Unmatched ')'");
322328
}
@@ -328,6 +334,7 @@ std::string RegexConverter::Convert() {
328334
AddEBNFSegment(")");
329335
++current_;
330336
} else if (*current_ == '*' || *current_ == '+' || *current_ == '?') {
337+
is_empty = false;
331338
result_ebnf_ += static_cast<char>(*current_);
332339
++current_;
333340
if (current_ != end_ && *current_ == '?') {
@@ -340,6 +347,7 @@ std::string RegexConverter::Convert() {
340347
RaiseError("Two consecutive repetition modifiers are not allowed.");
341348
}
342349
} else if (*current_ == '{') {
350+
is_empty = false;
343351
result_ebnf_ += HandleRepetitionRange();
344352
if (current_ != end_ && *current_ == '?') {
345353
// Still ignore the non-greedy modifier.
@@ -350,14 +358,18 @@ std::string RegexConverter::Convert() {
350358
RaiseError("Two consecutive repetition modifiers are not allowed.");
351359
}
352360
} else if (*current_ == '|') {
361+
is_empty = false;
353362
AddEBNFSegment("|");
354363
++current_;
355364
} else if (*current_ == '\\') {
365+
is_empty = false;
356366
AddEBNFSegment(HandleEscape());
357367
} else if (*current_ == '.') {
368+
is_empty = false;
358369
AddEBNFSegment(R"([\u0000-\U0010FFFF])");
359370
++current_;
360371
} else {
372+
is_empty = false;
361373
// Non-special characters are matched literally.
362374
AddEBNFSegment("\"" + EscapeString(*current_) + "\"");
363375
++current_;
@@ -366,6 +378,9 @@ std::string RegexConverter::Convert() {
366378
if (parenthesis_level_ != 0) {
367379
RaiseError("The parenthesis is not closed.");
368380
}
381+
if (is_empty) {
382+
AddEBNFSegment("\"\"");
383+
}
369384
return result_ebnf_;
370385
}
371386

tests/python/test_grammar_matcher_json_schema.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,5 +538,21 @@ def test_regression_accept_invalid_token():
538538
matcher.fill_next_token_bitmask(token_bitmask, i)
539539

540540

541+
def test_regression_empty_property_key_regex():
542+
schema = {
543+
"type": "object",
544+
"properties": {
545+
"_links": {
546+
"type": "object",
547+
"patternProperties": {
548+
"": {"type": "object", "properties": {"href": {"type": "string"}}}
549+
},
550+
}
551+
},
552+
}
553+
_ = xgr.Grammar.from_json_schema(schema)
554+
assert _ is not None
555+
556+
541557
if __name__ == "__main__":
542558
pytest.main(sys.argv)

tests/python/test_regex_converter.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,5 +469,17 @@ def test_mask_generation(tokenizer_path: str, regex: str, instance: str):
469469
assert matcher.is_terminated()
470470

471471

472+
empty_regex = ["", "^$", "(())", "()", "^", "$", "()|()"]
473+
474+
475+
@pytest.mark.parametrize("regex", empty_regex)
476+
def test_empty(regex: str):
477+
grammar = xgr.Grammar.from_regex(regex)
478+
expected_grammar = 'root ::= ("")\n'
479+
assert str(grammar) == expected_grammar
480+
assert _is_grammar_accept_string(grammar, "")
481+
assert not _is_grammar_accept_string(grammar, "a")
482+
483+
472484
if __name__ == "__main__":
473485
pytest.main(sys.argv)

0 commit comments

Comments
 (0)