Skip to content

Commit 2385800

Browse files
Merge pull request #301 from alexandernorth/fix/patterns-including-metacharacters
escape non-metacharacters (XML spec)
2 parents c1998d4 + 8f609b6 commit 2385800

File tree

2 files changed

+102
-1
lines changed

2 files changed

+102
-1
lines changed

pkg/utils/leaf_convert.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,37 @@ func ConvertInt64(value string, lst *sdcpb.SchemaLeafType) (*sdcpb.TypedValue, e
273273
return convertInt(value, lst.Range, ranges)
274274
}
275275

276+
// XMLRegexConvert rewrites an XML Schema regular expression so that Go's
// regexp package treats '^' and '$' as ordinary characters.
//
// XML Schema patterns have no anchor metacharacters, whereas Go interprets
// '^' and '$' as anchors. Every '$', and every '^' that is not a character
// class negation, is therefore prefixed with a backslash. A rune is left
// untouched when it is already escaped, i.e. preceded by an odd number of
// consecutive backslashes. A '^' counts as a negation only when it directly
// follows a '[' that is itself not escaped; after a literal `\[` the caret
// is a plain character and gets escaped like any other.
func XMLRegexConvert(s string) string {
	var b strings.Builder
	// Reserve a little headroom for the backslashes that may be inserted.
	b.Grow(len(s) + len(s)/4)

	// slashes counts the run of backslashes immediately before the current rune.
	slashes := 0
	// afterOpen reports whether the previous rune was an unescaped '['.
	afterOpen := false

	for _, r := range s {
		if r == '\\' {
			slashes++
			afterOpen = false
			b.WriteRune(r)
			continue
		}

		// An odd-length backslash run means the last backslash escapes r.
		escaped := slashes%2 == 1
		if !escaped && (r == '$' || (r == '^' && !afterOpen)) {
			b.WriteByte('\\')
		}

		afterOpen = r == '[' && !escaped
		slashes = 0
		b.WriteRune(r)
	}
	return b.String()
}
306+
276307
func ConvertString(value string, lst *sdcpb.SchemaLeafType) (*sdcpb.TypedValue, error) {
277308
// check length of the string if the length property is set
278309
// length will contain a range like string definition "5..60" or "7..10|40..45"
@@ -289,7 +320,14 @@ func ConvertString(value string, lst *sdcpb.SchemaLeafType) (*sdcpb.TypedValue,
289320
// If the type has multiple "pattern" statements, the expressions are
290321
// ANDed together, i.e., all such expressions have to match.
291322
for _, sp := range lst.Patterns {
292-
re, err := regexp.Compile(sp.Pattern)
323+
// The set of metacharacters is not the same between XML schema and perl/python/go REs
324+
// the set of metacharacters for XML is: .\?*+{}()[] (https://www.w3.org/TR/xmlschema-2/#dt-metac)
325+
// the set of metacharacters defined in go is: \.+*?()|[]{}^$ (go/libexec/src/regexp/regexp.go:714)
326+
// we need therefore to escape some values
327+
// TODO check about '^'
328+
329+
escaped := XMLRegexConvert(sp.Pattern)
330+
re, err := regexp.Compile(escaped)
293331
if err != nil {
294332
log.Errorf("unable to compile regex %q", sp.Pattern)
295333
}

pkg/utils/leaf_convert_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package utils
2+
3+
import (
4+
"reflect"
5+
"testing"
6+
)
7+
8+
func TestXMLRegexConvert(t *testing.T) {
9+
10+
tests := []struct {
11+
name string
12+
in string
13+
want string
14+
}{
15+
{
16+
name: "anchors become literals",
17+
in: `^\d+$`,
18+
want: `\^\d+\$`,
19+
},
20+
{
21+
name: "already-escaped anchors stay escaped",
22+
in: `foo\$bar`,
23+
want: `foo\$bar`,
24+
},
25+
{
26+
name: "caret in char class is left alone, dollar is escaped",
27+
in: `[^\w]+$`,
28+
want: `[^\w]+\$`,
29+
},
30+
{
31+
name: "caret later inside char class is escaped",
32+
in: `[a^b]`,
33+
want: `[a\^b]`,
34+
},
35+
{
36+
name: "caret escaped inside char class is escaped",
37+
in: `[\^]`,
38+
want: `[\^]`,
39+
},
40+
{
41+
name: "caret in char class multiple times, dollar is escaped",
42+
in: `[^a^b]`,
43+
want: `[^a\^b]`,
44+
},
45+
{
46+
name: "anchors preceded by a single back-slash stay escaped",
47+
in: `\^test\$`,
48+
want: `\^test\$`,
49+
},
50+
{
51+
name: "empty string",
52+
in: ``,
53+
want: ``,
54+
},
55+
}
56+
for _, tt := range tests {
57+
t.Run(tt.name, func(t *testing.T) {
58+
if got := XMLRegexConvert(tt.in); !reflect.DeepEqual(got, tt.want) {
59+
t.Errorf("XMLRegexConvert() = %v, want %v", got, tt.want)
60+
}
61+
})
62+
}
63+
}

0 commit comments

Comments
 (0)