Skip to content

Commit 7659dc9

Browse files
committed
fix: convert bare groups to non capturing groups to ensure predictable numbering
1 parent 716dcc0 commit 7659dc9

File tree

1 file changed

+107
-2
lines changed

1 file changed

+107
-2
lines changed

src/regex/extract.rs

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,35 @@ use crate::regex::types::{
33
};
44
use fancy_regex::Regex;
55

6+
/// Converts bare capturing groups to non-capturing groups
7+
/// This preserves existing special groups like (?:...), (?=...), (?!...), (?<=...), etc.
8+
fn convert_bare_groups_to_non_capturing(pattern: &str) -> String {
9+
let mut result = String::new();
10+
let mut chars = pattern.chars().peekable();
11+
12+
while let Some(ch) = chars.next() {
13+
if ch == '(' {
14+
// Check if this is already a special group
15+
if let Some(&next) = chars.peek() {
16+
if next == '?' {
17+
// This is already a special group like (?:...), keep it as-is
18+
result.push(ch);
19+
} else {
20+
// This is a bare capturing group, convert it
21+
result.push_str("(?:");
22+
}
23+
} else {
24+
// Parenthesis at end of string (shouldn't happen in valid regex)
25+
result.push(ch);
26+
}
27+
} else {
28+
result.push(ch);
29+
}
30+
}
31+
32+
result
33+
}
34+
635
/// Extracts substring indices matching a decomposed regex pattern
736
///
837
/// This function supports two modes:
@@ -17,6 +46,19 @@ use fancy_regex::Regex;
1746
/// - Works standalone without compiler dependency
1847
/// - Suitable for simple extraction use cases
1948
///
49+
/// # Capture Group Handling
50+
///
51+
/// To ensure stable and predictable capture group numbering, this function automatically
52+
/// converts bare capturing groups `(...)` to non-capturing groups `(?:...)` within
53+
/// PublicPattern parts before wrapping them in an outer capturing group. This prevents
54+
/// nested capture groups from interfering with the expected group indices.
55+
///
56+
/// For example:
57+
/// - Input pattern: `"(\w+)@(\d+)"`
58+
/// - Converted to: `"((?:\w+)@(?:\d+))"` (only 1 capture group)
59+
///
60+
/// Existing special groups like `(?:...)`, `(?=...)`, `(?!...)`, etc. are preserved as-is.
61+
///
2062
/// # Arguments
2163
/// * `input` - The input string to search
2264
/// * `config` - The decomposed regex configuration
@@ -182,8 +224,11 @@ fn compose_pattern_for_nfa(
182224
}
183225
RegexPart::PublicPattern((p, _max_bytes)) => {
184226
public_count += 1;
227+
// Convert any bare capturing groups to non-capturing groups to prevent
228+
// nested capture groups from interfering with the expected group numbering
229+
let adjusted_pattern = convert_bare_groups_to_non_capturing(p);
185230
// Wrap in capturing group
186-
pattern.push_str(&format!("({})", p).as_str());
231+
pattern.push_str(&format!("({})", adjusted_pattern).as_str());
187232
}
188233
};
189234
}
@@ -220,8 +265,11 @@ fn compose_pattern_standalone(
220265
RegexPart::PublicPattern((p, _max_bytes)) => {
221266
public_group_indices.push(current_group);
222267
current_group += 1;
268+
// Convert any bare capturing groups to non-capturing groups to prevent
269+
// nested capture groups from interfering with the expected group numbering
270+
let adjusted_pattern = convert_bare_groups_to_non_capturing(p);
223271
// Wrap in capturing group
224-
pattern.push_str(&format!("({})", p).as_str());
272+
pattern.push_str(&format!("({})", adjusted_pattern).as_str());
225273
}
226274
};
227275
}
@@ -325,4 +373,61 @@ mod tests {
325373
let full_match = extract_substr_idxes(input, &config, None, true).unwrap();
326374
assert_eq!(&input[full_match[0].0..full_match[0].1], "prefix:hello");
327375
}
376+
377+
#[test]
378+
fn test_nested_capturing_groups_converted() {
379+
// Test that bare capturing groups inside PublicPattern are converted to non-capturing
380+
// This ensures stable capture group numbering
381+
let config = DecomposedRegexConfig {
382+
parts: vec![
383+
RegexPart::Pattern("data:".to_string()),
384+
RegexPart::PublicPattern(("(\\w+)-(\\d+)".to_string(), 50)),
385+
],
386+
};
387+
388+
let input = "data:hello-123";
389+
let result = extract_substr_idxes(input, &config, None, false).unwrap();
390+
391+
// Should extract the entire public pattern match, not individual nested groups
392+
assert_eq!(result.len(), 1, "Should have exactly one capture group");
393+
assert_eq!(&input[result[0].0..result[0].1], "hello-123");
394+
}
395+
396+
#[test]
397+
fn test_multiple_public_patterns_with_nested_groups() {
398+
// Test multiple PublicPattern parts each with nested groups
399+
let config = DecomposedRegexConfig {
400+
parts: vec![
401+
RegexPart::Pattern("from:".to_string()),
402+
RegexPart::PublicPattern(("(\\w+)@(\\w+)".to_string(), 50)),
403+
RegexPart::Pattern(" to:".to_string()),
404+
RegexPart::PublicPattern(("(\\w+)@(\\w+)".to_string(), 50)),
405+
],
406+
};
407+
408+
let input = "from:alice@example to:bob@test";
409+
let result = extract_substr_idxes(input, &config, None, false).unwrap();
410+
411+
// Should extract exactly two capture groups (one per PublicPattern)
412+
assert_eq!(result.len(), 2, "Should have exactly two capture groups");
413+
assert_eq!(&input[result[0].0..result[0].1], "alice@example");
414+
assert_eq!(&input[result[1].0..result[1].1], "bob@test");
415+
}
416+
417+
#[test]
418+
fn test_already_non_capturing_groups_preserved() {
419+
// Test that already non-capturing groups (?:...) work correctly
420+
let config = DecomposedRegexConfig {
421+
parts: vec![
422+
RegexPart::Pattern("prefix:".to_string()),
423+
RegexPart::PublicPattern(("(?:\\w+)-(?:\\d+)".to_string(), 50)),
424+
],
425+
};
426+
427+
let input = "prefix:hello-123";
428+
let result = extract_substr_idxes(input, &config, None, false).unwrap();
429+
430+
assert_eq!(result.len(), 1);
431+
assert_eq!(&input[result[0].0..result[0].1], "hello-123");
432+
}
328433
}

0 commit comments

Comments
 (0)