From 74c5667b632dfa46336a4f1269081a76eb5decef Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Thu, 7 Oct 2021 13:56:31 +0200 Subject: [PATCH] implement optimization. Signed-off-by: Cyril Tovena --- pkg/labels/regexp.go | 177 ++++++++++++++++++++++++++++++++------ pkg/labels/regexp_test.go | 57 ++++++++++++ 2 files changed, 207 insertions(+), 27 deletions(-) diff --git a/pkg/labels/regexp.go b/pkg/labels/regexp.go index 559d2a4ae0..19fc9724d8 100644 --- a/pkg/labels/regexp.go +++ b/pkg/labels/regexp.go @@ -250,41 +250,164 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) { return } -// todo remove anchors ^foo$ -// remove captures. -// .+ | .* | .*POD.* | test-.* | .*-test | test-.+ | .+-test - -// regexp prefix fn ???? can we use this ? - -// benchmark them - type StringMatcher interface { Matches(s string) bool } -func contains(s, substr string) bool { - pos := strings.Index(s, substr) - if pos < 0 { - return false +func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { + clearCapture(re) + clearBeginEndText(re) + switch re.Op { + case syntax.OpStar: + return anyStringMatcher{ + allowEmpty: true, + matchNL: re.Flags&syntax.DotNL != 0, + } + case syntax.OpEmptyMatch: + return emptyStringMatcher{} + case syntax.OpPlus: + return anyStringMatcher{ + allowEmpty: false, + matchNL: re.Flags&syntax.DotNL != 0, + } + case syntax.OpLiteral: + return equalStringMatcher{ + s: string(re.Rune), + caseSensitive: !isCaseInsensitive(re), + } + case syntax.OpAlternate: + or := make([]StringMatcher, 0, len(re.Sub)) + for _, sub := range re.Sub { + m := stringMatcherFromRegexp(sub) + if m == nil { + return nil + } + or = append(or, m) + } + return orStringMatcher(or) + case syntax.OpConcat: + clearCapture(re.Sub...) + if len(re.Sub) == 0 { + return emptyStringMatcher{} + } + if len(re.Sub) == 1 { + return stringMatcherFromRegexp(re.Sub[0]) + } + var left, right StringMatcher + + if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar { + left = stringMatcherFromRegexp(re.Sub[0]) + if left == nil { + return nil + } + re.Sub = re.Sub[1:] + } + if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar { + right = stringMatcherFromRegexp(re.Sub[len(re.Sub)-1]) + if right == nil { + return nil + } + re.Sub = re.Sub[:len(re.Sub)-1] + } + matches := findSetMatches(re, "") + if left == nil && right == nil { + if len(matches) > 0 { + var or []StringMatcher + for _, match := range matches { + or = append(or, equalStringMatcher{ + s: match, + caseSensitive: true, + }) + } + return orStringMatcher(or) + } + } + if len(matches) > 0 { + return containsStringMatcher{ + substr: matches, + left: left, + right: right, + } + } } - return matchesAnyZeroOrMoreNotNL(s[:pos]) && matchesAnyZeroOrMoreNotNL(s[pos+len(substr):]) + return nil } -func matchesAnyZeroOrMoreNotNL(s string) bool { - // strings.Contains(s string, substr string) faster ?? - for _, r := range s { - if r == '\n' { - return false +type containsStringMatcher struct { + substr []string + left StringMatcher + right StringMatcher +} + +func (m containsStringMatcher) Matches(s string) bool { + var pos int + for _, substr := range m.substr { + pos = strings.Index(s, substr) + if pos < 0 { + continue } + if m.right != nil && m.left != nil { + if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(m.substr):]) { + return true + } + continue + } + if m.left != nil { + if m.left.Matches(s[:pos]) { + return true + } + continue + } + if m.right != nil { + if m.right.Matches(s[pos+len(m.substr):]) { + return true + } + continue + } + } + return false +} + +type emptyStringMatcher struct{} + +func (m emptyStringMatcher) Matches(s string) bool { + return len(s) == 0 +} + +type orStringMatcher []StringMatcher + +func (m orStringMatcher) Matches(s string) bool { + for _, matcher := range m { + if matcher.Matches(s) { + return true + } + } + return false +} + +type equalStringMatcher struct { + s string + caseSensitive bool +} + +func (m equalStringMatcher) Matches(s string) bool { + if !m.caseSensitive { + return m.s == s + } + return strings.EqualFold(m.s, s) +} + +type anyStringMatcher struct { + allowEmpty bool + matchNL bool +} + +func (m anyStringMatcher) Matches(s string) bool { + if !m.matchNL && strings.ContainsRune(s, '\n') { + return false + } + if !m.allowEmpty && len(s) == 0 { + return false } return true } - -func matchesOneZeroOrMoreNotNL(s string) bool { - for _, r := range s { - if r == '\n' { - return false - } - } - return len(s) > 0 -} diff --git a/pkg/labels/regexp_test.go b/pkg/labels/regexp_test.go index 4e29bf21e2..3a25679627 100644 --- a/pkg/labels/regexp_test.go +++ b/pkg/labels/regexp_test.go @@ -175,3 +175,60 @@ func TestFindSetMatches(t *testing.T) { } } + +func Test_OptimizeRegex(t *testing.T) { + for _, c := range []struct { + pattern string + exp StringMatcher + }{ + {".*", anyStringMatcher{allowEmpty: true, matchNL: false}}, + {"(?s:.*)", anyStringMatcher{allowEmpty: true, matchNL: true}}, + {"(.*)", anyStringMatcher{allowEmpty: true, matchNL: false}}, + {"^.*$", anyStringMatcher{allowEmpty: true, matchNL: false}}, + {".+", anyStringMatcher{allowEmpty: false, matchNL: false}}, + {"(?s:.+)", anyStringMatcher{allowEmpty: false, matchNL: true}}, + {"^.+$", anyStringMatcher{allowEmpty: false, matchNL: false}}, + {"(.+)", anyStringMatcher{allowEmpty: false, matchNL: false}}, + {"", emptyStringMatcher{}}, + {"^$", emptyStringMatcher{}}, + {"^foo$", equalStringMatcher{s: "foo", caseSensitive: true}}, + {"^(?i:foo)$", equalStringMatcher{s: "FOO", caseSensitive: false}}, + {"^(?i:foo)|(bar)$", orStringMatcher([]StringMatcher{equalStringMatcher{s: "FOO", caseSensitive: false}, equalStringMatcher{s: "bar", caseSensitive: true}})}, + {"^(?i:foo|oo)|(bar)$", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{equalStringMatcher{s: "FOO", caseSensitive: false}, equalStringMatcher{s: "OO", caseSensitive: false}}), equalStringMatcher{s: "bar", caseSensitive: true}})}, + {".*foo.*", containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: true, matchNL: false}}}, + {"(.*)foo.*", containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: true, matchNL: false}}}, + {"(.*)foo(.*)", containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: true, matchNL: false}}}, + {"(.+)foo(.*)", containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: false, matchNL: false}, right: anyStringMatcher{allowEmpty: true, matchNL: false}}}, + {"^.+foo.+", containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: false, matchNL: false}, right: anyStringMatcher{allowEmpty: false, matchNL: false}}}, + {"^(.*)(foo)(.*)$", containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: true, matchNL: false}}}, + {"^(.*)(foo|foobar)(.*)$", containsStringMatcher{substr: []string{"foo", "foobar"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: true, matchNL: false}}}, + {"^(.*)(foo|foobar)(.+)$", containsStringMatcher{substr: []string{"foo", "foobar"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: false, matchNL: false}}}, + {"^(.*)(bar|b|buzz)(.+)$", containsStringMatcher{substr: []string{"bar", "b", "buzz"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: false, matchNL: false}}}, + {"10\\.0\\.(1|2)\\.+", containsStringMatcher{substr: []string{"10.0.1", "10.0.2"}, left: nil, right: anyStringMatcher{allowEmpty: false, matchNL: false}}}, + {"^.+foo", containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}}, + {"foo-.*$", containsStringMatcher{substr: []string{"foo-"}, left: nil, right: anyStringMatcher{allowEmpty: true, matchNL: false}}}, + {"(prometheus|api_prom)_api_v1_.+", containsStringMatcher{substr: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: anyStringMatcher{allowEmpty: false, matchNL: false}}}, + {"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{containsStringMatcher{substr: []string{"bar", "b", "buzz"}, left: anyStringMatcher{allowEmpty: true, matchNL: false}, right: anyStringMatcher{allowEmpty: false, matchNL: false}}, equalStringMatcher{s: "foo", caseSensitive: true}})}, + {"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{equalStringMatcher{s: "fobar", caseSensitive: true}}), containsStringMatcher{substr: []string{"foo"}, left: anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}})}, + // we don't support case insensitive matching for contains. + // This is because there's no strings.IndexOfFold function. + // We can revisit later if this is really popular by using strings.ToUpper. + {"^(.*)((?i)foo|foobar)(.*)$", nil}, + {"(api|rpc)_(v1|prom)_((?i)push|query)", nil}, + {"[a-z][a-z]", nil}, + {"[1^3]", nil}, + // This one is not supported because `stringMatcherFromRegexp` is not reentrant for syntax.OpConcat. + // It would make the code too complex to handle it. + {"/|/bar.*", nil}, + {"(.+)/(foo.*|bar$)", nil}, + } { + c := c + t.Run(c.pattern, func(t *testing.T) { + t.Parallel() + parsed, err := syntax.Parse(c.pattern, syntax.Perl) + require.NoError(t, err) + matches := stringMatcherFromRegexp(parsed) + require.Equal(t, c.exp, matches) + }) + } +}