Optimized very long case insensitive alternations (#444)

* Optimized very long case insensitive alternations
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Run common regexps in BenchmarkFastRegexMatcher
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Modify BenchmarkNewFastRegexMatcher to benchmark the NewFastRegexMatcher() function
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Reduced allocations by optimizeEqualStringMatchers()
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Fixed typo in comments
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Fixed typo in test case name
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
---------
Signed-off-by: Marco Pracucci <marco@pracucci.com>
2025-03-05 20:59:13 -08:00 · 2023-03-02 17:20:52 +01:00 · 2023-03-02 17:20:52 +01:00 · 1e7ad0ec11
parent 383ea59ce1
commit 1e7ad0ec11
2 changed files with 306 additions and 44 deletions
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@ -20,7 +20,14 @@ import (
 	"github.com/grafana/regexp/syntax"
 )

-const maxSetMatches = 256
+const (
+	maxSetMatches = 256
+
+	// The minimum number of alternate values a regex should have to trigger
+	// the optimization done by optimizeEqualStringMatchers(). This value has
+	// been computed by running BenchmarkOptimizeEqualStringMatchers.
+	optimizeEqualStringMatchersThreshold = 16
+)

 type FastRegexMatcher struct {
 	re *regexp.Regexp
@ -326,7 +333,10 @@ type StringMatcher interface {
 func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
 	clearBeginEndText(re)

-	return stringMatcherFromRegexpInternal(re)
+	m := stringMatcherFromRegexpInternal(re)
+	m = optimizeEqualStringMatchers(m, optimizeEqualStringMatchersThreshold)
+
+	return m
 }

 func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
@ -503,6 +513,24 @@ func (m *equalStringMatcher) Matches(s string) bool {
 	return strings.EqualFold(m.s, s)
 }

+// equalMultiStringMatcher matches a string exactly against a set of valid values.
+// Membership is checked with a single map lookup instead of comparing the input
+// against each candidate value in turn.
+type equalMultiStringMatcher struct {
+	// values to match a string against. If the matching is case insensitive,
+	// the values here must be lowercase.
+	values map[string]struct{}
+
+	// caseSensitive tells whether the input is looked up as-is (true) or
+	// lowercased with strings.ToLower before the lookup (false).
+	caseSensitive bool
+}
+
+// Matches returns true if s is one of the configured values. When the matcher
+// is case insensitive, s is lowercased first so that it can be looked up among
+// the lowercase keys stored in values.
+//
+// NOTE(review): equalStringMatcher.Matches relies on strings.EqualFold, whereas
+// this lookup relies on strings.ToLower. The two may disagree on code points
+// whose case folding is not their lowercase form (e.g. U+017F) — confirm this
+// is acceptable for non-ASCII values.
+func (m *equalMultiStringMatcher) Matches(s string) bool {
+	if !m.caseSensitive {
+		s = strings.ToLower(s)
+	}
+
+	_, ok := m.values[s]
+	return ok
+}
+
 // anyStringMatcher is a matcher that matches any string.
 // It is used for the + and * operator. matchNL tells if it should match newlines or not.
 type anyStringMatcher struct {
@ -519,3 +547,92 @@ func (m *anyStringMatcher) Matches(s string) bool {
 	}
 	return true
 }
+
+// optimizeEqualStringMatchers optimizes a specific case where all matchers are made by an
+// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher). In
+// this specific case, when we have many strings to match against we can use a map instead
+// of iterating over the list of strings.
+//
+// The input is returned unchanged when it is not composed only of (possibly nested)
+// orStringMatcher and equalStringMatcher, when the equalStringMatcher found do not all
+// share the same case sensitivity, or when fewer than threshold values are found.
+// Otherwise an *equalMultiStringMatcher holding all the values is returned.
+func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher {
+	var (
+		caseSensitive    bool
+		caseSensitiveSet bool
+		numValues        int
+	)
+
+	// Analyse the input StringMatcher to count the number of occurrences
+	// and ensure all of them have the same case sensitivity.
+	analyseCallback := func(matcher *equalStringMatcher) bool {
+		// Ensure we don't have mixed case sensitivity. The first matcher found
+		// defines the case sensitivity all the other ones are compared to.
+		if caseSensitiveSet && caseSensitive != matcher.caseSensitive {
+			return false
+		} else if !caseSensitiveSet {
+			caseSensitive = matcher.caseSensitive
+			caseSensitiveSet = true
+		}
+
+		numValues++
+		return true
+	}
+
+	if !findEqualStringMatchers(input, analyseCallback) {
+		return input
+	}
+
+	// If the number of values found is less than the threshold, then we should skip the optimization.
+	if numValues < threshold {
+		return input
+	}
+
+	// Walk the input StringMatcher again to extract all values and store them.
+	// We can skip the case sensitivity check because we've already checked it and
+	// if the code reaches this point then it means all matchers have the same case sensitivity.
+	values := make(map[string]struct{}, numValues)
+
+	// Ignore the return value because we already iterated over the input StringMatcher
+	// and it was all good.
+	findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
+		if caseSensitive {
+			values[matcher.s] = struct{}{}
+		} else {
+			// Store the lowercase value because equalMultiStringMatcher lowercases
+			// the input string before the lookup when it's case insensitive.
+			values[strings.ToLower(matcher.s)] = struct{}{}
+		}
+
+		return true
+	})
+
+	return &equalMultiStringMatcher{
+		values:        values,
+		caseSensitive: caseSensitive,
+	}
+}
+
+// findEqualStringMatchers analyzes the input StringMatcher and calls the callback for each
+// equalStringMatcher found, descending into nested orStringMatcher. Returns true if and only
+// if the input StringMatcher is *only* composed of an alternation of equalStringMatcher and
+// the callback returned true for each of them. The search stops at the first matcher of any
+// other type, or as soon as the callback returns false.
+func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool {
+	// The top level matcher must be an alternation.
+	orInput, ok := input.(orStringMatcher)
+	if !ok {
+		return false
+	}
+
+	for _, m := range orInput {
+		switch casted := m.(type) {
+		case orStringMatcher:
+			// Nested alternation: its items are visited with the same callback.
+			if !findEqualStringMatchers(m, callback) {
+				return false
+			}
+
+		case *equalStringMatcher:
+			if !callback(casted) {
+				return false
+			}
+
+		default:
+			// It's not an equal string matcher, so we have to stop searching
+			// because this optimization can't be applied.
+			return false
+		}
+	}
+
+	return true
+}
--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go
@ -15,6 +15,7 @@ package labels

 import (
 	"bufio"
+	"fmt"
 	"math/rand"
 	"os"
 	"strings"
@ -33,6 +34,8 @@ func init() {
 var (
 	letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 	regexes     = []string{
+		"foo",
+		"^foo",
 		"(foo|bar)",
 		"foo.*",
 		".*foo",
@ -46,17 +49,24 @@ var (
 		"foo\n.*",
 		".*foo.*",
 		".+foo.+",
-		"",
 		"(?s:.*)",
 		"(?s:.+)",
 		"(?s:^.*foo$)",
+		"(?i:foo)",
+		"(?i:(foo|bar))",
+		"(?i:(foo1|foo2|bar))",
 		"^(?i:foo|oo)|(bar)$",
+		"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
 		"((.*)(bar|b|buzz)(.+)|foo)$",
 		"^$",
 		"(prometheus|api_prom)_api_v1_.+",
 		"10\\.0\\.(1|2)\\.+",
 		"10\\.0\\.(1|2).+",
 		"((fo(bar))|.+foo)",
+		// A long case sensitive alternation.
+		"zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb",
+		// A long case insensitive alternation.
+		"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))",
 	}
 	values = []string{
 		"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
@ -83,27 +93,15 @@ func TestNewFastRegexMatcher(t *testing.T) {
 }

 func BenchmarkNewFastRegexMatcher(b *testing.B) {
-	benchValues := values
-	for _, v := range values {
-		for i := 5; i < 50; i = i + 5 {
-			benchValues = append(benchValues, v+RandStringRunes(i))
-			benchValues = append(benchValues, RandStringRunes(i)+v+RandStringRunes(i))
-			benchValues = append(benchValues, RandStringRunes(i)+v)
-		}
-	}
 	for _, r := range regexes {
-		r := r
-		b.Run(r, func(b *testing.B) {
-			m, err := NewFastRegexMatcher(r)
-			require.NoError(b, err)
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				for _, v := range benchValues {
-					_ = m.MatchString(v)
+		b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
+			for n := 0; n < b.N; n++ {
+				_, err := NewFastRegexMatcher(r)
+				if err != nil {
+					b.Fatal(err)
 				}
 			}
 		})
-
 	}
 }

@ -232,29 +230,8 @@ func BenchmarkFastRegexMatcher(b *testing.B) {
 		y = "foo" + x
 		z = x + "foo"
 	)
-	regexes := []string{
-		"foo",
-		"^foo",
-		"(foo|bar)",
-		"foo.*",
-		".*foo",
-		"^.*foo$",
-		"^.+foo$",
-		".*",
-		".+",
-		"foo.+",
-		".+foo",
-		".*foo.*",
-		"(?i:foo)",
-		"(?i:(foo|bar))",
-		"(?i:(foo1|foo2|bar))",
-		"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
-		"(prometheus|api_prom)_api_v1_.+",
-		"((fo(bar))|.+foo)",
-	}
 	for _, r := range regexes {
-		r := r
-		b.Run(r, func(b *testing.B) {
+		b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
 			m, err := NewFastRegexMatcher(r)
 			require.NoError(b, err)
 			b.ResetTimer()
@ -331,14 +308,22 @@ func Test_OptimizeRegex(t *testing.T) {
 	}
 }

-func RandStringRunes(n int) string {
-	b := make([]rune, n)
+func randString(length int) string {
+	b := make([]rune, length)
 	for i := range b {
 		b[i] = letterRunes[rand.Intn(len(letterRunes))]
 	}
 	return string(b)
 }

+// randStrings returns a slice of many strings, each one generated by randString(length).
+func randStrings(many, length int) []string {
+	out := make([]string, 0, many)
+	for i := 0; i < many; i++ {
+		out = append(out, randString(length))
+	}
+	return out
+}
+
 func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) {
 	// Create all matchers.
 	matchers := make([]*FastRegexMatcher, 0, len(regexes))
@ -428,3 +413,163 @@ func TestAnalyzeRealQueries(t *testing.T) {

 	t.Logf("Found %d (%.2f%%) optimized matchers out of %d", numOptimized, (float64(numOptimized)/float64(numChecked))*100, numChecked)
 }
+
+// TestOptimizeEqualStringMatchers checks in which cases optimizeEqualStringMatchers replaces
+// the input with an equalMultiStringMatcher, and which values and case sensitivity the
+// returned matcher holds.
+func TestOptimizeEqualStringMatchers(t *testing.T) {
+	tests := map[string]struct {
+		input                 StringMatcher
+		expectedValues        map[string]struct{} // nil means the optimization is expected to be skipped.
+		expectedCaseSensitive bool
+	}{
+		"should skip optimization on orStringMatcher with containsStringMatcher": {
+			input: orStringMatcher{
+				&equalStringMatcher{s: "FOO", caseSensitive: true},
+				&containsStringMatcher{substrings: []string{"a", "b", "c"}},
+			},
+			expectedValues: nil,
+		},
+		"should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity": {
+			input: orStringMatcher{
+				&equalStringMatcher{s: "FOO", caseSensitive: true},
+				&equalStringMatcher{s: "bar", caseSensitive: true},
+				&equalStringMatcher{s: "baz", caseSensitive: true},
+			},
+			expectedValues: map[string]struct{}{
+				"FOO": {},
+				"bar": {},
+				"baz": {},
+			},
+			expectedCaseSensitive: true,
+		},
+		"should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity": {
+			input: orStringMatcher{
+				&equalStringMatcher{s: "FOO", caseSensitive: true},
+				&equalStringMatcher{s: "bar", caseSensitive: false},
+				&equalStringMatcher{s: "baz", caseSensitive: true},
+			},
+			expectedValues: nil,
+		},
+		"should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity": {
+			input: orStringMatcher{
+				&equalStringMatcher{s: "FOO", caseSensitive: true},
+				orStringMatcher{
+					&equalStringMatcher{s: "bar", caseSensitive: true},
+					&equalStringMatcher{s: "xxx", caseSensitive: true},
+				},
+				&equalStringMatcher{s: "baz", caseSensitive: true},
+			},
+			expectedValues: map[string]struct{}{
+				"FOO": {},
+				"bar": {},
+				"xxx": {},
+				"baz": {},
+			},
+			expectedCaseSensitive: true,
+		},
+		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity": {
+			input: orStringMatcher{
+				&equalStringMatcher{s: "FOO", caseSensitive: true},
+				orStringMatcher{
+					// Case sensitivity is different within items at the same level.
+					&equalStringMatcher{s: "bar", caseSensitive: true},
+					&equalStringMatcher{s: "xxx", caseSensitive: false},
+				},
+				&equalStringMatcher{s: "baz", caseSensitive: true},
+			},
+			expectedValues: nil,
+		},
+		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one": {
+			input: orStringMatcher{
+				&equalStringMatcher{s: "FOO", caseSensitive: true},
+				// Case sensitivity is different between the parent and child.
+				orStringMatcher{
+					&equalStringMatcher{s: "bar", caseSensitive: false},
+					&equalStringMatcher{s: "xxx", caseSensitive: false},
+				},
+				&equalStringMatcher{s: "baz", caseSensitive: true},
+			},
+			expectedValues: nil,
+		},
+		"should return lowercase values on case insensitive matchers": {
+			input: orStringMatcher{
+				&equalStringMatcher{s: "FOO", caseSensitive: false},
+				orStringMatcher{
+					&equalStringMatcher{s: "bAr", caseSensitive: false},
+				},
+				&equalStringMatcher{s: "baZ", caseSensitive: false},
+			},
+			expectedValues: map[string]struct{}{
+				"foo": {},
+				"bar": {},
+				"baz": {},
+			},
+			expectedCaseSensitive: false,
+		},
+	}
+
+	for testName, testData := range tests {
+		t.Run(testName, func(t *testing.T) {
+			// Use a threshold of 0 so that the optimization is never skipped
+			// because of the low number of values in the test cases.
+			actualMatcher := optimizeEqualStringMatchers(testData.input, 0)
+
+			if testData.expectedValues == nil {
+				// The optimization is skipped: the returned matcher has the same type as the input.
+				require.IsType(t, testData.input, actualMatcher)
+			} else {
+				require.IsType(t, &equalMultiStringMatcher{}, actualMatcher)
+				require.Equal(t, testData.expectedValues, actualMatcher.(*equalMultiStringMatcher).values)
+				require.Equal(t, testData.expectedCaseSensitive, actualMatcher.(*equalMultiStringMatcher).caseSensitive)
+			}
+		})
+	}
+}
+
+// This benchmark is used to find a good threshold to use to apply the optimization
+// done by optimizeEqualStringMatchers(). For each number of alternations and case
+// sensitivity, the same texts are matched with and without the optimization.
+func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
+	// Generate random texts of variable length to match against.
+	texts := append([]string{}, randStrings(10, 10)...)
+	texts = append(texts, randStrings(5, 30)...)
+	texts = append(texts, randStrings(1, 100)...)
+
+	for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
+		for _, caseSensitive := range []bool{true, false} {
+			b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) {
+				// Generate a regex with the expected number of alternations.
+				re := strings.Join(randStrings(numAlternations, 10), "|")
+				if !caseSensitive {
+					re = "(?i:(" + re + "))"
+				}
+
+				parsed, err := syntax.Parse(re, syntax.Perl)
+				require.NoError(b, err)
+
+				unoptimized := stringMatcherFromRegexpInternal(parsed)
+				require.IsType(b, orStringMatcher{}, unoptimized)
+
+				// Use a threshold of 0 to apply the optimization regardless of the number of alternations.
+				optimized := optimizeEqualStringMatchers(unoptimized, 0)
+				require.IsType(b, &equalMultiStringMatcher{}, optimized)
+
+				b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) {
+					for n := 0; n < b.N; n++ {
+						for _, t := range texts {
+							unoptimized.Matches(t)
+						}
+					}
+				})
+
+				b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) {
+					for n := 0; n < b.N; n++ {
+						for _, t := range texts {
+							optimized.Matches(t)
+						}
+					}
+				})
+			})
+		}
+	}
+}
+
+// getTestNameFromRegexp returns the name used for the sub-benchmark of the input
+// regexp: the regexp itself, cut to its first 32 bytes when it's longer than that.
+func getTestNameFromRegexp(re string) string {
+	if len(re) > 32 {
+		// NOTE(review): the cut is byte-based, so it could split a multi-byte
+		// character. The regexps visible in this change are ASCII only.
+		return re[:32]
+	}
+	return re
+}