diff --git a/model/labels/regexp.go b/model/labels/regexp.go index 3fba36430..8173f7e42 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -37,6 +37,9 @@ type FastRegexMatcher struct { prefix string suffix string contains string + + // matchString is the "compiled" function to run by MatchString(). + matchString func(string) bool } func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { @@ -61,9 +64,42 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { } m.stringMatcher = stringMatcherFromRegexp(parsed) + m.matchString = m.compileMatchStringFunction() return m, nil } +// compileMatchStringFunction returns the function to run by MatchString(). +func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool { + // If the only optimization available is the string matcher, then we can just run it. + if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && m.contains == "" && m.stringMatcher != nil { + return m.stringMatcher.Matches + } + + return func(s string) bool { + if len(m.setMatches) != 0 { + for _, match := range m.setMatches { + if match == s { + return true + } + } + return false + } + if m.prefix != "" && !strings.HasPrefix(s, m.prefix) { + return false + } + if m.suffix != "" && !strings.HasSuffix(s, m.suffix) { + return false + } + if m.contains != "" && !strings.Contains(s, m.contains) { + return false + } + if m.stringMatcher != nil { + return m.stringMatcher.Matches(s) + } + return m.re.MatchString(s) + } +} + // isOptimized returns true if any fast-path optimization is applied to the // regex matcher. // @@ -250,27 +286,7 @@ func tooManyMatches(matches []string, new ...string) bool { } func (m *FastRegexMatcher) MatchString(s string) bool { - if len(m.setMatches) != 0 { - for _, match := range m.setMatches { - if match == s { - return true - } - } - return false - } - if m.prefix != "" && !strings.HasPrefix(s, m.prefix) { - return false - } - if m.suffix != "" && !strings.HasSuffix(s, m.suffix) { - return false - } - if m.contains != "" && !strings.Contains(s, m.contains) { - return false - } - if m.stringMatcher != nil { - return m.stringMatcher.Matches(s) - } - return m.re.MatchString(s) + return m.matchString(s) } func (m *FastRegexMatcher) SetMatches() []string { @@ -351,14 +367,25 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { // Correctly handling the end text operator inside a regex is tricky, // so in this case we fallback to the regex engine. return nil - case syntax.OpPlus, syntax.OpStar: + case syntax.OpPlus: if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { return nil } - return &anyStringMatcher{ - allowEmpty: re.Op == syntax.OpStar, - matchNL: re.Sub[0].Op == syntax.OpAnyChar, + return &anyNonEmptyStringMatcher{ + matchNL: re.Sub[0].Op == syntax.OpAnyChar, } + case syntax.OpStar: + if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { + return nil + } + + // If the newline is valid, than this matcher literally match any string (even empty). + if re.Sub[0].Op == syntax.OpAnyChar { + return trueMatcher{} + } + + // Any string is fine (including an empty one), as far as it doesn't contain any newline. + return anyStringWithoutNewlineMatcher{} case syntax.OpEmptyMatch: return emptyStringMatcher{} @@ -531,20 +558,37 @@ func (m *equalMultiStringMatcher) Matches(s string) bool { return ok } -// anyStringMatcher is a matcher that matches any string. -// It is used for the + and * operator. matchNL tells if it should matches newlines or not. -type anyStringMatcher struct { - allowEmpty bool - matchNL bool +// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string +// (including an empty one) as far as it doesn't contain any newline character. +type anyStringWithoutNewlineMatcher struct{} + +func (m anyStringWithoutNewlineMatcher) Matches(s string) bool { + // We need to make sure it doesn't contain a newline. Since the newline is + // an ASCII character, we can use strings.IndexByte(). + return strings.IndexByte(s, '\n') == -1 } -func (m *anyStringMatcher) Matches(s string) bool { - if !m.allowEmpty && len(s) == 0 { - return false - } - if !m.matchNL && strings.ContainsRune(s, '\n') { - return false +// anyNonEmptyStringMatcher is a stringMatcher which matches any non-empty string. +type anyNonEmptyStringMatcher struct { + matchNL bool +} + +func (m *anyNonEmptyStringMatcher) Matches(s string) bool { + if m.matchNL { + // It's OK if the string contains a newline so we just need to make + // sure it's non-empty. + return len(s) > 0 } + + // We need to make sure it non-empty and doesn't contain a newline. + // Since the newline is an ASCII character, we can use strings.IndexByte(). + return len(s) > 0 && strings.IndexByte(s, '\n') == -1 +} + +// trueMatcher is a stringMatcher which matches any string (always returns true). +type trueMatcher struct{} + +func (m trueMatcher) Matches(_ string) bool { return true } diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index ad0576a42..ead22a741 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -27,13 +27,9 @@ import ( "github.com/stretchr/testify/require" ) -func init() { - rand.Seed(time.Now().UnixNano()) -} - var ( - letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") - regexes = []string{ + asciiRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_") + regexes = []string{ "foo", "^foo", "(foo|bar)", @@ -225,20 +221,25 @@ func TestFindSetMatches(t *testing.T) { } func BenchmarkFastRegexMatcher(b *testing.B) { - var ( - x = strings.Repeat("x", 50) - y = "foo" + x - z = x + "foo" - ) + // Init the random seed with a constant, so that it doesn't change between runs. + randGenerator := rand.New(rand.NewSource(1)) + + // Generate variable lengths random texts to match against. + texts := append([]string{}, randStrings(randGenerator, 10, 10)...) + texts = append(texts, randStrings(randGenerator, 5, 30)...) + texts = append(texts, randStrings(randGenerator, 1, 100)...) + texts = append(texts, "foo"+randString(randGenerator, 50)) + texts = append(texts, randString(randGenerator, 50)+"foo") + for _, r := range regexes { b.Run(getTestNameFromRegexp(r), func(b *testing.B) { m, err := NewFastRegexMatcher(r) require.NoError(b, err) b.ResetTimer() for i := 0; i < b.N; i++ { - _ = m.MatchString(x) - _ = m.MatchString(y) - _ = m.MatchString(z) + for _, text := range texts { + _ = m.MatchString(text) + } } }) } @@ -249,15 +250,15 @@ func Test_OptimizeRegex(t *testing.T) { pattern string exp StringMatcher }{ - {".*", &anyStringMatcher{allowEmpty: true, matchNL: false}}, - {".*?", &anyStringMatcher{allowEmpty: true, matchNL: false}}, - {"(?s:.*)", &anyStringMatcher{allowEmpty: true, matchNL: true}}, - {"(.*)", &anyStringMatcher{allowEmpty: true, matchNL: false}}, - {"^.*$", &anyStringMatcher{allowEmpty: true, matchNL: false}}, - {".+", &anyStringMatcher{allowEmpty: false, matchNL: false}}, - {"(?s:.+)", &anyStringMatcher{allowEmpty: false, matchNL: true}}, - {"^.+$", &anyStringMatcher{allowEmpty: false, matchNL: false}}, - {"(.+)", &anyStringMatcher{allowEmpty: false, matchNL: false}}, + {".*", anyStringWithoutNewlineMatcher{}}, + {".*?", anyStringWithoutNewlineMatcher{}}, + {"(?s:.*)", trueMatcher{}}, + {"(.*)", anyStringWithoutNewlineMatcher{}}, + {"^.*$", anyStringWithoutNewlineMatcher{}}, + {".+", &anyNonEmptyStringMatcher{matchNL: false}}, + {"(?s:.+)", &anyNonEmptyStringMatcher{matchNL: true}}, + {"^.+$", &anyNonEmptyStringMatcher{matchNL: false}}, + {"(.+)", &anyNonEmptyStringMatcher{matchNL: false}}, {"", emptyStringMatcher{}}, {"^$", emptyStringMatcher{}}, {"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}}, @@ -265,23 +266,23 @@ func Test_OptimizeRegex(t *testing.T) { {"^((?i:foo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})}, {"^((?i:foo|oo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})}, {"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})}, - {".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, - {"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, - {"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, - {"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, - {"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}}, - {"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, - {"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, - {"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}}, - {"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}}, + {".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}}, + {"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}}, + {"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}}, + {"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: anyStringWithoutNewlineMatcher{}}}, + {"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: &anyNonEmptyStringMatcher{matchNL: false}}}, + {"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}}, + {"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}}, + {"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}}, + {"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}}, {"10\\.0\\.(1|2)\\.+", nil}, - {"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}}, - {"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}}, - {"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}}, - {"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}}, - {"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})}, - {"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}})}, - {"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}}, + {"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}}, + {"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}}, + {"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: anyStringWithoutNewlineMatcher{}}}, + {"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}}, + {"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})}, + {"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}})}, + {"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}}, // we don't support case insensitive matching for contains. // This is because there's no strings.IndexOfFold function. // We can revisit later if this is really popular by using strings.ToUpper. @@ -308,18 +309,18 @@ func Test_OptimizeRegex(t *testing.T) { } } -func randString(length int) string { +func randString(randGenerator *rand.Rand, length int) string { b := make([]rune, length) for i := range b { - b[i] = letterRunes[rand.Intn(len(letterRunes))] + b[i] = asciiRunes[randGenerator.Intn(len(asciiRunes))] } return string(b) } -func randStrings(many, length int) []string { +func randStrings(randGenerator *rand.Rand, many, length int) []string { out := make([]string, 0, many) for i := 0; i < many; i++ { - out = append(out, randString(length)) + out = append(out, randString(randGenerator, length)) } return out } @@ -524,16 +525,18 @@ func TestOptimizeEqualStringMatchers(t *testing.T) { // This benchmark is used to find a good threshold to use to apply the optimization // done by optimizeEqualStringMatchers() func BenchmarkOptimizeEqualStringMatchers(b *testing.B) { + randGenerator := rand.New(rand.NewSource(time.Now().UnixNano())) + // Generate variable lengths random texts to match against. - texts := append([]string{}, randStrings(10, 10)...) - texts = append(texts, randStrings(5, 30)...) - texts = append(texts, randStrings(1, 100)...) + texts := append([]string{}, randStrings(randGenerator, 10, 10)...) + texts = append(texts, randStrings(randGenerator, 5, 30)...) + texts = append(texts, randStrings(randGenerator, 1, 100)...) for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 { for _, caseSensitive := range []bool{true, false} { b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) { // Generate a regex with the expected number of alternations. - re := strings.Join(randStrings(numAlternations, 10), "|") + re := strings.Join(randStrings(randGenerator, numAlternations, 10), "|") if !caseSensitive { re = "(?i:(" + re + "))" } diff --git a/promql/parser/parse_test.go b/promql/parser/parse_test.go index 511d2d26e..1230b39b7 100644 --- a/promql/parser/parse_test.go +++ b/promql/parser/parse_test.go @@ -3565,7 +3565,32 @@ func TestParseExpressions(t *testing.T) { if !test.fail { require.NoError(t, err) - require.Equal(t, test.expected, expr, "error on input '%s'", test.input) + expected := test.expected + + // The FastRegexMatcher introduced in mimir-prometheus is not comparable with + // a deep equal, so only compare its String() version. + if actualVector, ok := expr.(*VectorSelector); ok { + require.IsType(t, &VectorSelector{}, test.expected, "error on input '%s'", test.input) + expectedVector := test.expected.(*VectorSelector) + + require.Len(t, actualVector.LabelMatchers, len(expectedVector.LabelMatchers), "error on input '%s'", test.input) + + for i := 0; i < len(actualVector.LabelMatchers); i++ { + expectedMatcher := expectedVector.LabelMatchers[i].String() + actualMatcher := actualVector.LabelMatchers[i].String() + + require.Equal(t, expectedMatcher, actualMatcher, "unexpected label matcher '%s' on input '%s'", actualMatcher, test.input) + } + + // Make a shallow copy of the expected expr (because the test cases are defined in a global variable) + // and then reset the LabelMatcher to not compared them with the following deep equal. + expectedCopy := *expectedVector + expectedCopy.LabelMatchers = nil + expected = &expectedCopy + actualVector.LabelMatchers = nil + } + + require.Equal(t, expected, expr, "error on input '%s'", test.input) } else { require.Error(t, err) require.Contains(t, err.Error(), test.errMsg, "unexpected error on input '%s', expected '%s', got '%s'", test.input, test.errMsg, err.Error())