Do not optimize regexps with begin/end text anchors inside (#433)

* Do not optimize regexps with being/end text anchors inside

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Explicit case for begin/end text in stringMatcherFromRegexpInternal()

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Added more test cases

Signed-off-by: Marco Pracucci <marco@pracucci.com>

---------

Signed-off-by: Marco Pracucci <marco@pracucci.com>
This commit is contained in:
Marco Pracucci 2023-03-01 14:50:26 +01:00 committed by GitHub
parent 2e0ecc013f
commit eeecfee885
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 73 additions and 16 deletions

View file

@ -49,7 +49,7 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
if parsed.Op == syntax.OpConcat {
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
}
if matches, caseSensitive := findSetMatches(parsed, ""); caseSensitive {
if matches, caseSensitive := findSetMatches(parsed); caseSensitive {
m.setMatches = matches
}
m.stringMatcher = stringMatcherFromRegexp(parsed)
@ -60,10 +60,22 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
// findSetMatches extract equality matches from a regexp.
// Returns nil if we can't replace the regexp by only equality matchers or the regexp contains
// a mix of case sensitive and case insensitive matchers.
func findSetMatches(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) {
func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) {
clearBeginEndText(re)
return findSetMatchesInternal(re, "")
}
func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) {
switch re.Op {
case syntax.OpBeginText:
// Correctly handling the begin text operator inside a regex is tricky,
// so in this case we fallback to the regex engine.
return nil, false
case syntax.OpEndText:
// Correctly handling the end text operator inside a regex is tricky,
// so in this case we fallback to the regex engine.
return nil, false
case syntax.OpLiteral:
return []string{base + string(re.Rune)}, isCaseSensitive(re)
case syntax.OpEmptyMatch:
@ -74,7 +86,7 @@ func findSetMatches(re *syntax.Regexp, base string) (matches []string, caseSensi
return findSetMatchesFromAlternate(re, base)
case syntax.OpCapture:
clearCapture(re)
return findSetMatches(re, base)
return findSetMatchesInternal(re, base)
case syntax.OpConcat:
return findSetMatchesFromConcat(re, base)
case syntax.OpCharClass:
@ -116,7 +128,7 @@ func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string,
for i := 0; i < len(re.Sub); i++ {
var newMatches []string
for j, b := range matches {
m, caseSensitive := findSetMatches(re.Sub[i], b)
m, caseSensitive := findSetMatchesInternal(re.Sub[i], b)
if m == nil {
return nil, false
}
@ -144,7 +156,7 @@ func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string,
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
for i, sub := range re.Sub {
found, caseSensitive := findSetMatches(sub, base)
found, caseSensitive := findSetMatchesInternal(sub, base)
if found == nil {
return nil, false
}
@ -179,6 +191,12 @@ func clearCapture(regs ...*syntax.Regexp) {
// clearBeginEndText removes the begin and end text from the regexp. Prometheus regexp are anchored to the beginning and end of the string.
func clearBeginEndText(re *syntax.Regexp) {
// Do not clear begin/end text from an alternate operator because it could
// change the actual regexp properties.
if re.Op == syntax.OpAlternate {
return
}
if len(re.Sub) == 0 {
return
}
@ -298,10 +316,23 @@ type StringMatcher interface {
// It returns nil if the regexp is not supported.
// For examples, it will replace `.*foo` with `foo.*` and `.*foo.*` with `(?i)foo`.
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
clearCapture(re)
clearBeginEndText(re)
return stringMatcherFromRegexpInternal(re)
}
func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
clearCapture(re)
switch re.Op {
case syntax.OpBeginText:
// Correctly handling the begin text operator inside a regex is tricky,
// so in this case we fallback to the regex engine.
return nil
case syntax.OpEndText:
// Correctly handling the end text operator inside a regex is tricky,
// so in this case we fallback to the regex engine.
return nil
case syntax.OpPlus, syntax.OpStar:
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
return nil
@ -321,7 +352,7 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
case syntax.OpAlternate:
or := make([]StringMatcher, 0, len(re.Sub))
for _, sub := range re.Sub {
m := stringMatcherFromRegexp(sub)
m := stringMatcherFromRegexpInternal(sub)
if m == nil {
return nil
}
@ -334,26 +365,26 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
return emptyStringMatcher{}
}
if len(re.Sub) == 1 {
return stringMatcherFromRegexp(re.Sub[0])
return stringMatcherFromRegexpInternal(re.Sub[0])
}
var left, right StringMatcher
// Let's try to find if there's a first and last any matchers.
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar {
left = stringMatcherFromRegexp(re.Sub[0])
left = stringMatcherFromRegexpInternal(re.Sub[0])
if left == nil {
return nil
}
re.Sub = re.Sub[1:]
}
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar {
right = stringMatcherFromRegexp(re.Sub[len(re.Sub)-1])
right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1])
if right == nil {
return nil
}
re.Sub = re.Sub[:len(re.Sub)-1]
}
matches, matchesCaseSensitive := findSetMatches(re, "")
matches, matchesCaseSensitive := findSetMatchesInternal(re, "")
if len(matches) == 0 {
return nil
}

View file

@ -129,6 +129,9 @@ func TestOptimizeConcatRegex(t *testing.T) {
{regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: "abc"},
{regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: "abc"},
{regex: "[aA]bc.*", prefix: "", suffix: "", contains: "bc"},
{regex: "^5..$", prefix: "5", suffix: "", contains: ""},
{regex: "^release.*", prefix: "release", suffix: "", contains: ""},
{regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: "laio"},
}
for _, c := range cases {
@ -162,8 +165,13 @@ func TestFindSetMatches(t *testing.T) {
// Simple sets alternate and concat and alternates with empty matches
// parsed as b(ar|(?:)|uzz) where b(?:) means literal b.
{"bar|b|buzz", []string{"bar", "b", "buzz"}, true},
// Skip anchors it's enforced anyway at the root.
{"(^bar$)|(b$)|(^buzz)", []string{"bar", "b", "buzz"}, true},
// Skip outer anchors (it's enforced anyway at the root).
{"^(bar|b|buzz)$", []string{"bar", "b", "buzz"}, true},
{"^(?:prod|production)$", []string{"prod", "production"}, true},
// Do not optimize regexp with inner anchors.
{"(bar|b|b^uz$z)", nil, false},
// Do not optimize regexp with empty string matcher.
{"^$|Running", nil, false},
// Simple sets containing escaped characters.
{"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}, true},
// using charclass
@ -208,7 +216,7 @@ func TestFindSetMatches(t *testing.T) {
t.Parallel()
parsed, err := syntax.Parse(c.pattern, syntax.Perl)
require.NoError(t, err)
matches, actualCaseSensitive := findSetMatches(parsed, "")
matches, actualCaseSensitive := findSetMatches(parsed)
require.Equal(t, c.expMatches, matches)
require.Equal(t, c.expCaseSensitive, actualCaseSensitive)
})
@ -274,8 +282,8 @@ func Test_OptimizeRegex(t *testing.T) {
{"^$", emptyStringMatcher{}},
{"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}},
{"^(?i:foo)$", &equalStringMatcher{s: "FOO", caseSensitive: false}},
{"^(?i:foo)|(bar)$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
{"^(?i:foo|oo)|(bar)$", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}}), &equalStringMatcher{s: "bar", caseSensitive: true}})},
{"^((?i:foo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
{"^((?i:foo|oo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
{"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})},
{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
@ -348,3 +356,21 @@ func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F)
}
})
}
func FuzzFastRegexMatcher_WithFuzzyRegularExpressions(f *testing.F) {
for _, re := range regexes {
for _, text := range values {
f.Add(re, text)
}
}
f.Fuzz(func(t *testing.T, re, text string) {
m, err := NewFastRegexMatcher(re)
if err != nil {
// Ignore invalid regexes.
return
}
require.Equalf(t, m.re.MatchString(text), m.MatchString(text), "regexp: %s text: %s", m.re.String(), text)
})
}