@@ -3,6 +3,35 @@ use crate::regex::types::{
33} ;
44use fancy_regex:: Regex ;
55
6+ /// Converts bare capturing groups to non-capturing groups
7+ /// This preserves existing special groups like (?:...), (?=...), (?!...), (?<=...), etc.
8+ fn convert_bare_groups_to_non_capturing ( pattern : & str ) -> String {
9+ let mut result = String :: new ( ) ;
10+ let mut chars = pattern. chars ( ) . peekable ( ) ;
11+
12+ while let Some ( ch) = chars. next ( ) {
13+ if ch == '(' {
14+ // Check if this is already a special group
15+ if let Some ( & next) = chars. peek ( ) {
16+ if next == '?' {
17+ // This is already a special group like (?:...), keep it as-is
18+ result. push ( ch) ;
19+ } else {
20+ // This is a bare capturing group, convert it
21+ result. push_str ( "(?:" ) ;
22+ }
23+ } else {
24+ // Parenthesis at end of string (shouldn't happen in valid regex)
25+ result. push ( ch) ;
26+ }
27+ } else {
28+ result. push ( ch) ;
29+ }
30+ }
31+
32+ result
33+ }
34+
635/// Extracts substring indices matching a decomposed regex pattern
736///
837/// This function supports two modes:
@@ -17,6 +46,19 @@ use fancy_regex::Regex;
1746/// - Works standalone without compiler dependency
1847/// - Suitable for simple extraction use cases
1948///
49+ /// # Capture Group Handling
50+ ///
51+ /// To ensure stable and predictable capture group numbering, this function automatically
52+ /// converts bare capturing groups `(...)` to non-capturing groups `(?:...)` within
53+ /// PublicPattern parts before wrapping them in an outer capturing group. This prevents
54+ /// nested capture groups from interfering with the expected group indices.
55+ ///
56+ /// For example:
57+ /// - Input pattern: `"(\w+)@(\d+)"`
58+ /// - Converted to: `"((?:\w+)@(?:\d+))"` (only 1 capture group)
59+ ///
60+ /// Existing special groups like `(?:...)`, `(?=...)`, `(?!...)`, etc. are preserved as-is.
61+ ///
2062/// # Arguments
2163/// * `input` - The input string to search
2264/// * `config` - The decomposed regex configuration
@@ -182,8 +224,11 @@ fn compose_pattern_for_nfa(
182224 }
183225 RegexPart :: PublicPattern ( ( p, _max_bytes) ) => {
184226 public_count += 1 ;
227+ // Convert any bare capturing groups to non-capturing groups to prevent
228+ // nested capture groups from interfering with the expected group numbering
229+ let adjusted_pattern = convert_bare_groups_to_non_capturing ( p) ;
185230 // Wrap in capturing group
186- pattern. push_str ( & format ! ( "({})" , p ) . as_str ( ) ) ;
231+ pattern. push_str ( & format ! ( "({})" , adjusted_pattern ) . as_str ( ) ) ;
187232 }
188233 } ;
189234 }
@@ -220,8 +265,11 @@ fn compose_pattern_standalone(
220265 RegexPart :: PublicPattern ( ( p, _max_bytes) ) => {
221266 public_group_indices. push ( current_group) ;
222267 current_group += 1 ;
268+ // Convert any bare capturing groups to non-capturing groups to prevent
269+ // nested capture groups from interfering with the expected group numbering
270+ let adjusted_pattern = convert_bare_groups_to_non_capturing ( p) ;
223271 // Wrap in capturing group
224- pattern. push_str ( & format ! ( "({})" , p ) . as_str ( ) ) ;
272+ pattern. push_str ( & format ! ( "({})" , adjusted_pattern ) . as_str ( ) ) ;
225273 }
226274 } ;
227275 }
@@ -325,4 +373,61 @@ mod tests {
325373 let full_match = extract_substr_idxes ( input, & config, None , true ) . unwrap ( ) ;
326374 assert_eq ! ( & input[ full_match[ 0 ] . 0 ..full_match[ 0 ] . 1 ] , "prefix:hello" ) ;
327375 }
376+
377+ #[ test]
378+ fn test_nested_capturing_groups_converted ( ) {
379+ // Test that bare capturing groups inside PublicPattern are converted to non-capturing
380+ // This ensures stable capture group numbering
381+ let config = DecomposedRegexConfig {
382+ parts : vec ! [
383+ RegexPart :: Pattern ( "data:" . to_string( ) ) ,
384+ RegexPart :: PublicPattern ( ( "(\\ w+)-(\\ d+)" . to_string( ) , 50 ) ) ,
385+ ] ,
386+ } ;
387+
388+ let input = "data:hello-123" ;
389+ let result = extract_substr_idxes ( input, & config, None , false ) . unwrap ( ) ;
390+
391+ // Should extract the entire public pattern match, not individual nested groups
392+ assert_eq ! ( result. len( ) , 1 , "Should have exactly one capture group" ) ;
393+ assert_eq ! ( & input[ result[ 0 ] . 0 ..result[ 0 ] . 1 ] , "hello-123" ) ;
394+ }
395+
396+ #[ test]
397+ fn test_multiple_public_patterns_with_nested_groups ( ) {
398+ // Test multiple PublicPattern parts each with nested groups
399+ let config = DecomposedRegexConfig {
400+ parts : vec ! [
401+ RegexPart :: Pattern ( "from:" . to_string( ) ) ,
402+ RegexPart :: PublicPattern ( ( "(\\ w+)@(\\ w+)" . to_string( ) , 50 ) ) ,
403+ RegexPart :: Pattern ( " to:" . to_string( ) ) ,
404+ RegexPart :: PublicPattern ( ( "(\\ w+)@(\\ w+)" . to_string( ) , 50 ) ) ,
405+ ] ,
406+ } ;
407+
408+ let input = "from:alice@example to:bob@test" ;
409+ let result = extract_substr_idxes ( input, & config, None , false ) . unwrap ( ) ;
410+
411+ // Should extract exactly two capture groups (one per PublicPattern)
412+ assert_eq ! ( result. len( ) , 2 , "Should have exactly two capture groups" ) ;
413+ assert_eq ! ( & input[ result[ 0 ] . 0 ..result[ 0 ] . 1 ] , "alice@example" ) ;
414+ assert_eq ! ( & input[ result[ 1 ] . 0 ..result[ 1 ] . 1 ] , "bob@test" ) ;
415+ }
416+
417+ #[ test]
418+ fn test_already_non_capturing_groups_preserved ( ) {
419+ // Test that already non-capturing groups (?:...) work correctly
420+ let config = DecomposedRegexConfig {
421+ parts : vec ! [
422+ RegexPart :: Pattern ( "prefix:" . to_string( ) ) ,
423+ RegexPart :: PublicPattern ( ( "(?:\\ w+)-(?:\\ d+)" . to_string( ) , 50 ) ) ,
424+ ] ,
425+ } ;
426+
427+ let input = "prefix:hello-123" ;
428+ let result = extract_substr_idxes ( input, & config, None , false ) . unwrap ( ) ;
429+
430+ assert_eq ! ( result. len( ) , 1 ) ;
431+ assert_eq ! ( & input[ result[ 0 ] . 0 ..result[ 0 ] . 1 ] , "hello-123" ) ;
432+ }
328433}
0 commit comments