From eeecfee8858d169f4ef7676b0df0b8f0b0a5d747 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 1 Mar 2023 14:50:26 +0100 Subject: [PATCH] Do not optimize regexps with begin/end text anchors inside (#433) * Do not optimize regexps with being/end text anchors inside Signed-off-by: Marco Pracucci * Explicit case for begin/end text in stringMatcherFromRegexpInternal() Signed-off-by: Marco Pracucci * Added more test cases Signed-off-by: Marco Pracucci --------- Signed-off-by: Marco Pracucci --- model/labels/regexp.go | 53 +++++++++++++++++++++++++++++-------- model/labels/regexp_test.go | 36 +++++++++++++++++++++---- 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index 721cb247c5..7ac1c8a5ae 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -49,7 +49,7 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { if parsed.Op == syntax.OpConcat { m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed) } - if matches, caseSensitive := findSetMatches(parsed, ""); caseSensitive { + if matches, caseSensitive := findSetMatches(parsed); caseSensitive { m.setMatches = matches } m.stringMatcher = stringMatcherFromRegexp(parsed) @@ -60,10 +60,22 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { // findSetMatches extract equality matches from a regexp. // Returns nil if we can't replace the regexp by only equality matchers or the regexp contains // a mix of case sensitive and case insensitive matchers. -func findSetMatches(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) { +func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) { clearBeginEndText(re) + return findSetMatchesInternal(re, "") +} + +func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) { switch re.Op { + case syntax.OpBeginText: + // Correctly handling the begin text operator inside a regex is tricky, + // so in this case we fallback to the regex engine. + return nil, false + case syntax.OpEndText: + // Correctly handling the end text operator inside a regex is tricky, + // so in this case we fallback to the regex engine. + return nil, false case syntax.OpLiteral: return []string{base + string(re.Rune)}, isCaseSensitive(re) case syntax.OpEmptyMatch: @@ -74,7 +86,7 @@ func findSetMatches(re *syntax.Regexp, base string) (matches []string, caseSensi return findSetMatchesFromAlternate(re, base) case syntax.OpCapture: clearCapture(re) - return findSetMatches(re, base) + return findSetMatchesInternal(re, base) case syntax.OpConcat: return findSetMatchesFromConcat(re, base) case syntax.OpCharClass: @@ -116,7 +128,7 @@ func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, for i := 0; i < len(re.Sub); i++ { var newMatches []string for j, b := range matches { - m, caseSensitive := findSetMatches(re.Sub[i], b) + m, caseSensitive := findSetMatchesInternal(re.Sub[i], b) if m == nil { return nil, false } @@ -144,7 +156,7 @@ func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) { for i, sub := range re.Sub { - found, caseSensitive := findSetMatches(sub, base) + found, caseSensitive := findSetMatchesInternal(sub, base) if found == nil { return nil, false } @@ -179,6 +191,12 @@ func clearCapture(regs ...*syntax.Regexp) { // clearBeginEndText removes the begin and end text from the regexp. Prometheus regexp are anchored to the beginning and end of the string. func clearBeginEndText(re *syntax.Regexp) { + // Do not clear begin/end text from an alternate operator because it could + // change the actual regexp properties. + if re.Op == syntax.OpAlternate { + return + } + if len(re.Sub) == 0 { return } @@ -298,10 +316,23 @@ type StringMatcher interface { // It returns nil if the regexp is not supported. // For examples, it will replace `.*foo` with `foo.*` and `.*foo.*` with `(?i)foo`. func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { - clearCapture(re) clearBeginEndText(re) + return stringMatcherFromRegexpInternal(re) +} + +func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { + clearCapture(re) + switch re.Op { + case syntax.OpBeginText: + // Correctly handling the begin text operator inside a regex is tricky, + // so in this case we fallback to the regex engine. + return nil + case syntax.OpEndText: + // Correctly handling the end text operator inside a regex is tricky, + // so in this case we fallback to the regex engine. + return nil case syntax.OpPlus, syntax.OpStar: if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { return nil @@ -321,7 +352,7 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { case syntax.OpAlternate: or := make([]StringMatcher, 0, len(re.Sub)) for _, sub := range re.Sub { - m := stringMatcherFromRegexp(sub) + m := stringMatcherFromRegexpInternal(sub) if m == nil { return nil } @@ -334,26 +365,26 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { return emptyStringMatcher{} } if len(re.Sub) == 1 { - return stringMatcherFromRegexp(re.Sub[0]) + return stringMatcherFromRegexpInternal(re.Sub[0]) } var left, right StringMatcher // Let's try to find if there's a first and last any matchers. if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar { - left = stringMatcherFromRegexp(re.Sub[0]) + left = stringMatcherFromRegexpInternal(re.Sub[0]) if left == nil { return nil } re.Sub = re.Sub[1:] } if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar { - right = stringMatcherFromRegexp(re.Sub[len(re.Sub)-1]) + right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1]) if right == nil { return nil } re.Sub = re.Sub[:len(re.Sub)-1] } - matches, matchesCaseSensitive := findSetMatches(re, "") + matches, matchesCaseSensitive := findSetMatchesInternal(re, "") if len(matches) == 0 { return nil } diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index 5f8d9bea90..cc9fb938a7 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -129,6 +129,9 @@ func TestOptimizeConcatRegex(t *testing.T) { {regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: "abc"}, {regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: "abc"}, {regex: "[aA]bc.*", prefix: "", suffix: "", contains: "bc"}, + {regex: "^5..$", prefix: "5", suffix: "", contains: ""}, + {regex: "^release.*", prefix: "release", suffix: "", contains: ""}, + {regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: "laio"}, } for _, c := range cases { @@ -162,8 +165,13 @@ func TestFindSetMatches(t *testing.T) { // Simple sets alternate and concat and alternates with empty matches // parsed as b(ar|(?:)|uzz) where b(?:) means literal b. {"bar|b|buzz", []string{"bar", "b", "buzz"}, true}, - // Skip anchors it's enforced anyway at the root. - {"(^bar$)|(b$)|(^buzz)", []string{"bar", "b", "buzz"}, true}, + // Skip outer anchors (it's enforced anyway at the root). + {"^(bar|b|buzz)$", []string{"bar", "b", "buzz"}, true}, + {"^(?:prod|production)$", []string{"prod", "production"}, true}, + // Do not optimize regexp with inner anchors. + {"(bar|b|b^uz$z)", nil, false}, + // Do not optimize regexp with empty string matcher. + {"^$|Running", nil, false}, // Simple sets containing escaped characters. {"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}, true}, // using charclass @@ -208,7 +216,7 @@ func TestFindSetMatches(t *testing.T) { t.Parallel() parsed, err := syntax.Parse(c.pattern, syntax.Perl) require.NoError(t, err) - matches, actualCaseSensitive := findSetMatches(parsed, "") + matches, actualCaseSensitive := findSetMatches(parsed) require.Equal(t, c.expMatches, matches) require.Equal(t, c.expCaseSensitive, actualCaseSensitive) }) @@ -274,8 +282,8 @@ func Test_OptimizeRegex(t *testing.T) { {"^$", emptyStringMatcher{}}, {"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}}, {"^(?i:foo)$", &equalStringMatcher{s: "FOO", caseSensitive: false}}, - {"^(?i:foo)|(bar)$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})}, - {"^(?i:foo|oo)|(bar)$", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}}), &equalStringMatcher{s: "bar", caseSensitive: true}})}, + {"^((?i:foo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})}, + {"^((?i:foo|oo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})}, {"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})}, {".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, {"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, @@ -348,3 +356,21 @@ func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) } }) } + +func FuzzFastRegexMatcher_WithFuzzyRegularExpressions(f *testing.F) { + for _, re := range regexes { + for _, text := range values { + f.Add(re, text) + } + } + + f.Fuzz(func(t *testing.T, re, text string) { + m, err := NewFastRegexMatcher(re) + if err != nil { + // Ignore invalid regexes. + return + } + + require.Equalf(t, m.re.MatchString(text), m.MatchString(text), "regexp: %s text: %s", m.re.String(), text) + }) +}