Optimize regex star operation (#448)

* Optimize .* regex matcher

  Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Consistent benchmark runs for BenchmarkFastRegexMatcher

  Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Fixed TestParseExpressions

  Signed-off-by: Marco Pracucci <marco@pracucci.com>

---------

Signed-off-by: Marco Pracucci <marco@pracucci.com>
2025-03-05 20:59:13 -08:00 · 2023-03-09 09:38:41 +01:00 · 2023-03-09 09:38:41 +01:00 · 242e82b8e6
parent cfdf2a0594
commit 242e82b8e6
3 changed files with 156 additions and 84 deletions
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@ -37,6 +37,9 @@ type FastRegexMatcher struct {
 	prefix        string
 	suffix        string
 	contains      string
+
+	// matchString is the "compiled" function run by MatchString().
+	matchString func(string) bool
 }

 func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
@ -61,9 +64,42 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
 	}
 	m.stringMatcher = stringMatcherFromRegexp(parsed)

+	m.matchString = m.compileMatchStringFunction()
 	return m, nil
 }

+// compileMatchStringFunction returns the function run by MatchString().
+func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
+	// If the only optimization available is the string matcher, then we can just run it.
+	if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && m.contains == "" && m.stringMatcher != nil {
+		return m.stringMatcher.Matches
+	}
+
+	return func(s string) bool {
+		if len(m.setMatches) != 0 {
+			for _, match := range m.setMatches {
+				if match == s {
+					return true
+				}
+			}
+			return false
+		}
+		if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
+			return false
+		}
+		if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
+			return false
+		}
+		if m.contains != "" && !strings.Contains(s, m.contains) {
+			return false
+		}
+		if m.stringMatcher != nil {
+			return m.stringMatcher.Matches(s)
+		}
+		return m.re.MatchString(s)
+	}
+}
+
 // isOptimized returns true if any fast-path optimization is applied to the
 // regex matcher.
 //
@ -250,27 +286,7 @@ func tooManyMatches(matches []string, new ...string) bool {
 }

 func (m *FastRegexMatcher) MatchString(s string) bool {
-	if len(m.setMatches) != 0 {
-		for _, match := range m.setMatches {
-			if match == s {
-				return true
-			}
-		}
-		return false
-	}
-	if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
-		return false
-	}
-	if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
-		return false
-	}
-	if m.contains != "" && !strings.Contains(s, m.contains) {
-		return false
-	}
-	if m.stringMatcher != nil {
-		return m.stringMatcher.Matches(s)
-	}
-	return m.re.MatchString(s)
+	return m.matchString(s)
 }

 func (m *FastRegexMatcher) SetMatches() []string {
@ -351,14 +367,25 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
 		// Correctly handling the end text operator inside a regex is tricky,
 		// so in this case we fallback to the regex engine.
 		return nil
-	case syntax.OpPlus, syntax.OpStar:
+	case syntax.OpPlus:
 		if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
 			return nil
 		}
-		return &anyStringMatcher{
-			allowEmpty: re.Op == syntax.OpStar,
+		return &anyNonEmptyStringMatcher{
 			matchNL: re.Sub[0].Op == syntax.OpAnyChar,
 		}
+	case syntax.OpStar:
+		if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
+			return nil
+		}
+
+		// If the newline is valid, then this matcher literally matches any string (even empty).
+		if re.Sub[0].Op == syntax.OpAnyChar {
+			return trueMatcher{}
+		}
+
+		// Any string is fine (including an empty one), as long as it doesn't contain any newline.
+		return anyStringWithoutNewlineMatcher{}
 	case syntax.OpEmptyMatch:
 		return emptyStringMatcher{}

@ -531,20 +558,37 @@ func (m *equalMultiStringMatcher) Matches(s string) bool {
 	return ok
 }

-// anyStringMatcher is a matcher that matches any string.
-// It is used for the + and * operator. matchNL tells if it should matches newlines or not.
-type anyStringMatcher struct {
-	allowEmpty bool
+// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string
+// (including an empty one) as long as it doesn't contain any newline character.
+type anyStringWithoutNewlineMatcher struct{}
+
+func (m anyStringWithoutNewlineMatcher) Matches(s string) bool {
+	// We need to make sure it doesn't contain a newline. Since the newline is
+	// an ASCII character, we can use strings.IndexByte().
+	return strings.IndexByte(s, '\n') == -1
+}
+
+// anyNonEmptyStringMatcher is a stringMatcher which matches any non-empty string.
+type anyNonEmptyStringMatcher struct {
 	matchNL bool
 }

-func (m *anyStringMatcher) Matches(s string) bool {
-	if !m.allowEmpty && len(s) == 0 {
-		return false
+func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
+	if m.matchNL {
+		// It's OK if the string contains a newline so we just need to make
+		// sure it's non-empty.
+		return len(s) > 0
 	}
-	if !m.matchNL && strings.ContainsRune(s, '\n') {
-		return false
+
+	// We need to make sure it's non-empty and doesn't contain a newline.
+	// Since the newline is an ASCII character, we can use strings.IndexByte().
+	return len(s) > 0 && strings.IndexByte(s, '\n') == -1
 }
+
+// trueMatcher is a stringMatcher which matches any string (always returns true).
+type trueMatcher struct{}
+
+func (m trueMatcher) Matches(_ string) bool {
 	return true
 }

--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go
@ -27,12 +27,8 @@ import (
 	"github.com/stretchr/testify/require"
 )

-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
-
 var (
-	letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
+	asciiRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_")
 	regexes    = []string{
 		"foo",
 		"^foo",
@ -225,20 +221,25 @@ func TestFindSetMatches(t *testing.T) {
 }

 func BenchmarkFastRegexMatcher(b *testing.B) {
-	var (
-		x = strings.Repeat("x", 50)
-		y = "foo" + x
-		z = x + "foo"
-	)
+	// Init the random seed with a constant, so that it doesn't change between runs.
+	randGenerator := rand.New(rand.NewSource(1))
+
+	// Generate variable lengths random texts to match against.
+	texts := append([]string{}, randStrings(randGenerator, 10, 10)...)
+	texts = append(texts, randStrings(randGenerator, 5, 30)...)
+	texts = append(texts, randStrings(randGenerator, 1, 100)...)
+	texts = append(texts, "foo"+randString(randGenerator, 50))
+	texts = append(texts, randString(randGenerator, 50)+"foo")
+
 	for _, r := range regexes {
 		b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
 			m, err := NewFastRegexMatcher(r)
 			require.NoError(b, err)
 			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
-				_ = m.MatchString(x)
-				_ = m.MatchString(y)
-				_ = m.MatchString(z)
+				for _, text := range texts {
+					_ = m.MatchString(text)
+				}
 			}
 		})
 	}
@ -249,15 +250,15 @@ func Test_OptimizeRegex(t *testing.T) {
 		pattern string
 		exp     StringMatcher
 	}{
-		{".*", &anyStringMatcher{allowEmpty: true, matchNL: false}},
-		{".*?", &anyStringMatcher{allowEmpty: true, matchNL: false}},
-		{"(?s:.*)", &anyStringMatcher{allowEmpty: true, matchNL: true}},
-		{"(.*)", &anyStringMatcher{allowEmpty: true, matchNL: false}},
-		{"^.*$", &anyStringMatcher{allowEmpty: true, matchNL: false}},
-		{".+", &anyStringMatcher{allowEmpty: false, matchNL: false}},
-		{"(?s:.+)", &anyStringMatcher{allowEmpty: false, matchNL: true}},
-		{"^.+$", &anyStringMatcher{allowEmpty: false, matchNL: false}},
-		{"(.+)", &anyStringMatcher{allowEmpty: false, matchNL: false}},
+		{".*", anyStringWithoutNewlineMatcher{}},
+		{".*?", anyStringWithoutNewlineMatcher{}},
+		{"(?s:.*)", trueMatcher{}},
+		{"(.*)", anyStringWithoutNewlineMatcher{}},
+		{"^.*$", anyStringWithoutNewlineMatcher{}},
+		{".+", &anyNonEmptyStringMatcher{matchNL: false}},
+		{"(?s:.+)", &anyNonEmptyStringMatcher{matchNL: true}},
+		{"^.+$", &anyNonEmptyStringMatcher{matchNL: false}},
+		{"(.+)", &anyNonEmptyStringMatcher{matchNL: false}},
 		{"", emptyStringMatcher{}},
 		{"^$", emptyStringMatcher{}},
 		{"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}},
@ -265,23 +266,23 @@ func Test_OptimizeRegex(t *testing.T) {
 		{"^((?i:foo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
 		{"^((?i:foo|oo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
 		{"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})},
-		{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
-		{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
-		{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
-		{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
-		{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
-		{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
-		{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
-		{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
-		{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
+		{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
+		{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
+		{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
+		{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: anyStringWithoutNewlineMatcher{}}},
+		{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
+		{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
+		{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
+		{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
+		{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
 		{"10\\.0\\.(1|2)\\.+", nil},
-		{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
-		{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
-		{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
-		{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
-		{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
-		{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}})},
-		{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
+		{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
+		{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
+		{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: anyStringWithoutNewlineMatcher{}}},
+		{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
+		{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
+		{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}})},
+		{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
 		// we don't support case insensitive matching for contains.
 		// This is because there's no strings.IndexOfFold function.
 		// We can revisit later if this is really popular by using strings.ToUpper.
@ -308,18 +309,18 @@ func Test_OptimizeRegex(t *testing.T) {
 	}
 }

-func randString(length int) string {
+func randString(randGenerator *rand.Rand, length int) string {
 	b := make([]rune, length)
 	for i := range b {
-		b[i] = letterRunes[rand.Intn(len(letterRunes))]
+		b[i] = asciiRunes[randGenerator.Intn(len(asciiRunes))]
 	}
 	return string(b)
 }

-func randStrings(many, length int) []string {
+func randStrings(randGenerator *rand.Rand, many, length int) []string {
 	out := make([]string, 0, many)
 	for i := 0; i < many; i++ {
-		out = append(out, randString(length))
+		out = append(out, randString(randGenerator, length))
 	}
 	return out
 }
@ -524,16 +525,18 @@ func TestOptimizeEqualStringMatchers(t *testing.T) {
 // This benchmark is used to find a good threshold to use to apply the optimization
 // done by optimizeEqualStringMatchers()
 func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
+	randGenerator := rand.New(rand.NewSource(time.Now().UnixNano()))
+
 	// Generate variable lengths random texts to match against.
-	texts := append([]string{}, randStrings(10, 10)...)
-	texts = append(texts, randStrings(5, 30)...)
-	texts = append(texts, randStrings(1, 100)...)
+	texts := append([]string{}, randStrings(randGenerator, 10, 10)...)
+	texts = append(texts, randStrings(randGenerator, 5, 30)...)
+	texts = append(texts, randStrings(randGenerator, 1, 100)...)

 	for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
 		for _, caseSensitive := range []bool{true, false} {
 			b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) {
 				// Generate a regex with the expected number of alternations.
-				re := strings.Join(randStrings(numAlternations, 10), "|")
+				re := strings.Join(randStrings(randGenerator, numAlternations, 10), "|")
 				if !caseSensitive {
 					re = "(?i:(" + re + "))"
 				}
--- a/promql/parser/parse_test.go
+++ b/promql/parser/parse_test.go
@ -3565,7 +3565,32 @@ func TestParseExpressions(t *testing.T) {

 			if !test.fail {
 				require.NoError(t, err)
-				require.Equal(t, test.expected, expr, "error on input '%s'", test.input)
+				expected := test.expected
+
+				// The FastRegexMatcher introduced in mimir-prometheus is not comparable with
+				// a deep equal, so only compare its String() version.
+				if actualVector, ok := expr.(*VectorSelector); ok {
+					require.IsType(t, &VectorSelector{}, test.expected, "error on input '%s'", test.input)
+					expectedVector := test.expected.(*VectorSelector)
+
+					require.Len(t, actualVector.LabelMatchers, len(expectedVector.LabelMatchers), "error on input '%s'", test.input)
+
+					for i := 0; i < len(actualVector.LabelMatchers); i++ {
+						expectedMatcher := expectedVector.LabelMatchers[i].String()
+						actualMatcher := actualVector.LabelMatchers[i].String()
+
+						require.Equal(t, expectedMatcher, actualMatcher, "unexpected label matcher '%s' on input '%s'", actualMatcher, test.input)
+					}
+
+					// Make a shallow copy of the expected expr (because the test cases are defined in a global variable)
+					// and then reset the LabelMatchers so they are not compared by the following deep equal.
+					expectedCopy := *expectedVector
+					expectedCopy.LabelMatchers = nil
+					expected = &expectedCopy
+					actualVector.LabelMatchers = nil
+				}
+
+				require.Equal(t, expected, expr, "error on input '%s'", test.input)
 			} else {
 				require.Error(t, err)
 				require.Contains(t, err.Error(), test.errMsg, "unexpected error on input '%s', expected '%s', got '%s'", test.input, test.errMsg, err.Error())