[Fix] Fix regex converter when the property's pattern is empty. (#409)

Seven-Streams · web-flow · commit 4e936d2326aa · 2025-09-04T04:11:40.000-04:00
Currently, when we encounter a schema such as:
```
{
  "type": "object",
  "properties": {
    "_links": {
      "type": "object",
      "patternProperties": {
        "": {
          "type": "object",
          "properties": {
            "href": {
              "type": "string"
            }
          }
        }
      }
    }
  }
}
```
This leads to a segmentation fault. It is caused by the missing
handling of an empty regex when `RegexConverter` is constructed. This
PR fixes it.


Signed-off-by: Yuchuan <blemiade_qinchuan@sjtu.edu.cn>

---------

Signed-off-by: Yuchuan <blemiade_qinchuan@sjtu.edu.cn>
diff --git a/cpp/regex_converter.cc b/cpp/regex_converter.cc
@@ -23,10 +23,12 @@ namespace xgrammar {
 class RegexConverter {
  public:
   explicit RegexConverter(const std::string& regex) : regex_(regex) {
-    regex_codepoints_ = ParseUTF8(regex_.c_str(), false);
-    if (regex_codepoints_[0] == kInvalidUTF8) {
-      XGRAMMAR_LOG(FATAL) << "The regex is not a valid UTF-8 string.";
-      XGRAMMAR_UNREACHABLE();
+    if (!regex.empty()) {
+      regex_codepoints_ = ParseUTF8(regex_.c_str(), false);
+      if (regex_codepoints_[0] == kInvalidUTF8) {
+        XGRAMMAR_LOG(FATAL) << "The regex is not a valid UTF-8 string.";
+        XGRAMMAR_UNREACHABLE();
+      }
     }
     regex_codepoints_.push_back(0);  // Add a null terminator
   }
@@ -291,6 +293,7 @@ std::string RegexConverter::Convert() {
   start_ = regex_codepoints_.data();
   current_ = start_;
   end_ = start_ + regex_codepoints_.size() - 1;
+  bool is_empty = true;
   while (current_ != end_) {
     if (*current_ == '^') {
       if (current_ != start_) {
@@ -307,8 +310,10 @@ std::string RegexConverter::Convert() {
       }
       ++current_;
     } else if (*current_ == '[') {
+      is_empty = false;
       AddEBNFSegment(HandleCharacterClass());
     } else if (*current_ == '(') {
+      is_empty = false;
       ++current_;
       ++parenthesis_level_;
       AddEBNFSegment("(");
@@ -317,6 +322,7 @@ std::string RegexConverter::Convert() {
         HandleGroupModifier();
       }
     } else if (*current_ == ')') {
+      is_empty = false;
       if (parenthesis_level_ == 0) {
         RaiseError("Unmatched ')'");
       }
@@ -328,6 +334,7 @@ std::string RegexConverter::Convert() {
       AddEBNFSegment(")");
       ++current_;
     } else if (*current_ == '*' || *current_ == '+' || *current_ == '?') {
+      is_empty = false;
       result_ebnf_ += static_cast<char>(*current_);
       ++current_;
       if (current_ != end_ && *current_ == '?') {
@@ -340,6 +347,7 @@ std::string RegexConverter::Convert() {
         RaiseError("Two consecutive repetition modifiers are not allowed.");
       }
     } else if (*current_ == '{') {
+      is_empty = false;
       result_ebnf_ += HandleRepetitionRange();
       if (current_ != end_ && *current_ == '?') {
         // Still ignore the non-greedy modifier.
@@ -350,14 +358,18 @@ std::string RegexConverter::Convert() {
         RaiseError("Two consecutive repetition modifiers are not allowed.");
       }
     } else if (*current_ == '|') {
+      is_empty = false;
       AddEBNFSegment("|");
       ++current_;
     } else if (*current_ == '\\') {
+      is_empty = false;
       AddEBNFSegment(HandleEscape());
     } else if (*current_ == '.') {
+      is_empty = false;
       AddEBNFSegment(R"([\u0000-\U0010FFFF])");
       ++current_;
     } else {
+      is_empty = false;
       // Non-special characters are matched literally.
       AddEBNFSegment("\"" + EscapeString(*current_) + "\"");
       ++current_;
@@ -366,6 +378,9 @@ std::string RegexConverter::Convert() {
   if (parenthesis_level_ != 0) {
     RaiseError("The parenthesis is not closed.");
   }
+  if (is_empty) {
+    AddEBNFSegment("\"\"");
+  }
   return result_ebnf_;
 }
 
diff --git a/tests/python/test_grammar_matcher_json_schema.py b/tests/python/test_grammar_matcher_json_schema.py
@@ -538,5 +538,21 @@ def test_regression_accept_invalid_token():
         matcher.fill_next_token_bitmask(token_bitmask, i)
 
 
+def test_regression_empty_property_key_regex():
+    schema = {
+        "type": "object",
+        "properties": {
+            "_links": {
+                "type": "object",
+                "patternProperties": {
+                    "": {"type": "object", "properties": {"href": {"type": "string"}}}
+                },
+            }
+        },
+    }
+    _ = xgr.Grammar.from_json_schema(schema)
+    assert _ is not None
+
+
 if __name__ == "__main__":
     pytest.main(sys.argv)
diff --git a/tests/python/test_regex_converter.py b/tests/python/test_regex_converter.py
@@ -469,5 +469,17 @@ def test_mask_generation(tokenizer_path: str, regex: str, instance: str):
     assert matcher.is_terminated()
 
 
+empty_regex = ["", "^$", "(())", "()", "^", "$", "()|()"]
+
+
+@pytest.mark.parametrize("regex", empty_regex)
+def test_empty(regex: str):
+    grammar = xgr.Grammar.from_regex(regex)
+    expected_grammar = 'root ::= ("")\n'
+    assert str(grammar) == expected_grammar
+    assert _is_grammar_accept_string(grammar, "")
+    assert not _is_grammar_accept_string(grammar, "a")
+
+
 if __name__ == "__main__":
     pytest.main(sys.argv)