Optimized very long case insensitive alternations (#444)

* Optimized very long case insensitive alternations
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Run common regexps in BenchmarkFastRegexMatcher
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Modify BenchmarkNewFastRegexMatcher to benchmark the NewFastRegexMatcher() function
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Reduced allocations by optimizeEqualStringMatchers()
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Fixed typo in comments
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
* Fixed typo in test case name
  Signed-off-by: Marco Pracucci <marco@pracucci.com>
---------
Signed-off-by: Marco Pracucci <marco@pracucci.com>
2025-03-05 20:59:13 -08:00 · 2023-03-02 17:20:52 +01:00 · 2023-03-02 17:20:52 +01:00 · 1e7ad0ec11
parent 383ea59ce1
commit 1e7ad0ec11
2 changed files with 306 additions and 44 deletions
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@ -20,7 +20,14 @@ import (
 	"github.com/grafana/regexp/syntax"
 )
-const maxSetMatches = 256
+const (
 	maxSetMatches = 256
 	// The minimum number of alternate values a regex should have to trigger
 	// the optimization done by optimizeEqualStringMatchers(). This value has
 	// been computed running BenchmarkOptimizeEqualStringMatchers.
 	optimizeEqualStringMatchersThreshold = 16
 )
 type FastRegexMatcher struct {
 	re *regexp.Regexp
@ -326,7 +333,10 @@ type StringMatcher interface {
 func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
 	clearBeginEndText(re)
-	return stringMatcherFromRegexpInternal(re)
+	m := stringMatcherFromRegexpInternal(re)
 	m = optimizeEqualStringMatchers(m, optimizeEqualStringMatchersThreshold)
 	return m
 }
 func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
@ -503,6 +513,24 @@ func (m *equalStringMatcher) Matches(s string) bool {
 	return strings.EqualFold(m.s, s)
 }
 // equalMultiStringMatcher reports whether a string is equal to any member of
 // a fixed set of accepted values.
 type equalMultiStringMatcher struct {
 	// values is the set of accepted strings. When caseSensitive is false the
 	// keys are expected to be stored in lowercase, because Matches lowercases
 	// its argument before the lookup.
 	values map[string]struct{}
 	// caseSensitive selects exact comparison (true) or lowercase comparison (false).
 	caseSensitive bool
 }
 // Matches returns true when s is one of the accepted values. For a case
 // insensitive matcher s is lowercased with strings.ToLower before the lookup.
 //
 // NOTE(review): equalStringMatcher compares with strings.EqualFold, which is
 // not the same operation as lowercasing for some non-ASCII runes; confirm the
 // two matchers are expected to agree on such input.
 func (m *equalMultiStringMatcher) Matches(s string) bool {
 	key := s
 	if !m.caseSensitive {
 		key = strings.ToLower(s)
 	}
 	_, found := m.values[key]
 	return found
 }
 // anyStringMatcher is a matcher that matches any string.
 // It is used for the + and * operators. matchNL tells whether it should match newlines or not.
 type anyStringMatcher struct {
@ -519,3 +547,92 @@ func (m *anyStringMatcher) Matches(s string) bool {
 	}
 	return true
 }
 // optimizeEqualStringMatchers rewrites an alternation (orStringMatcher) that is made
 // exclusively of equality checks (equalStringMatcher) into a single
 // equalMultiStringMatcher, so that matching becomes one map lookup instead of a scan
 // over every alternative.
 //
 // The input is returned unchanged when it is not such an alternation, when the
 // equality checks do not all share the same case sensitivity, or when fewer than
 // threshold values were found.
 func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher {
 	var (
 		total         int
 		sensitive     bool
 		sensitiveSeen bool
 	)
 	// First pass: count the equality matchers and stop as soon as one of them
 	// disagrees with the case sensitivity of the first one seen.
 	countAndCheck := func(em *equalStringMatcher) bool {
 		switch {
 		case !sensitiveSeen:
 			sensitive, sensitiveSeen = em.caseSensitive, true
 		case sensitive != em.caseSensitive:
 			return false
 		}
 		total++
 		return true
 	}
 	// Not optimizable, or too few values for the map to pay off.
 	if !findEqualStringMatchers(input, countAndCheck) || total < threshold {
 		return input
 	}
 	// Second pass: collect the values into a set sized from the first pass.
 	// Case sensitivity was validated above, so it is not checked again here;
 	// for case insensitive matching the keys are stored lowercased.
 	set := make(map[string]struct{}, total)
 	// The result is deliberately discarded: the first pass already proved that
 	// the whole input can be walked successfully.
 	_ = findEqualStringMatchers(input, func(em *equalStringMatcher) bool {
 		key := em.s
 		if !sensitive {
 			key = strings.ToLower(key)
 		}
 		set[key] = struct{}{}
 		return true
 	})
 	return &equalMultiStringMatcher{values: set, caseSensitive: sensitive}
 }
 // findEqualStringMatchers walks input, descending into nested orStringMatcher
 // values, and invokes callback once for every equalStringMatcher it meets.
 //
 // It returns true only when input is an orStringMatcher whose (possibly nested)
 // members are all equalStringMatcher and callback returned true for each of them.
 // The walk stops at the first member of any other kind or the first callback
 // that returns false.
 func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool {
 	alternation, isOr := input.(orStringMatcher)
 	if !isOr {
 		return false
 	}
 	for _, child := range alternation {
 		// Nested alternation: recurse and propagate a failure.
 		if nested, isNested := child.(orStringMatcher); isNested {
 			if !findEqualStringMatchers(nested, callback) {
 				return false
 			}
 			continue
 		}
 		// Anything that is not an equality matcher makes the optimization
 		// inapplicable, as does a callback that rejects the matcher.
 		leaf, isEqual := child.(*equalStringMatcher)
 		if !isEqual || !callback(leaf) {
 			return false
 		}
 	}
 	return true
 }
--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go
@ -15,6 +15,7 @@ package labels
 import (
 	"bufio"
 	"fmt"
 	"math/rand"
 	"os"
 	"strings"
@ -33,6 +34,8 @@ func init() {
 var (
 	letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 	regexes     = []string{
 		"foo",
 		"^foo",
 		"(foo|bar)",
 		"foo.*",
 		".*foo",
@ -46,17 +49,24 @@ var (
 		"foo\n.*",
 		".*foo.*",
 		".+foo.+",
 		"",
 		"(?s:.*)",
 		"(?s:.+)",
 		"(?s:^.*foo$)",
 		"(?i:foo)",
 		"(?i:(foo|bar))",
 		"(?i:(foo1|foo2|bar))",
 		"^(?i:foo|oo)|(bar)$",
 		"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
 		"((.*)(bar|b|buzz)(.+)|foo)$",
 		"^$",
 		"(prometheus|api_prom)_api_v1_.+",
 		"10\\.0\\.(1|2)\\.+",
 		"10\\.0\\.(1|2).+",
 		"((fo(bar))|.+foo)",
 		// A long case sensitive alternation.
 		"zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb",
 		// A long case insensitive alternation.
 		"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))",
 	}
 	values = []string{
 		"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
@ -83,27 +93,15 @@ func TestNewFastRegexMatcher(t *testing.T) {
 }
 func BenchmarkNewFastRegexMatcher(b *testing.B) {
 	benchValues := values
 	for _, v := range values {
 		for i := 5; i < 50; i = i + 5 {
 			benchValues = append(benchValues, v+RandStringRunes(i))
 			benchValues = append(benchValues, RandStringRunes(i)+v+RandStringRunes(i))
 			benchValues = append(benchValues, RandStringRunes(i)+v)
 		}
 	}
 	for _, r := range regexes {
-		r := r
+		b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
-		b.Run(r, func(b *testing.B) {
+			for n := 0; n < b.N; n++ {
-			m, err := NewFastRegexMatcher(r)
+				_, err := NewFastRegexMatcher(r)
-			require.NoError(b, err)
+				if err != nil {
-			b.ResetTimer()
+					b.Fatal(err)
 			for i := 0; i < b.N; i++ {
 				for _, v := range benchValues {
 					_ = m.MatchString(v)
 				}
 			}
 		})
 	}
 }
@ -232,29 +230,8 @@ func BenchmarkFastRegexMatcher(b *testing.B) {
 		y = "foo" + x
 		z = x + "foo"
 	)
 	regexes := []string{
 		"foo",
 		"^foo",
 		"(foo|bar)",
 		"foo.*",
 		".*foo",
 		"^.*foo$",
 		"^.+foo$",
 		".*",
 		".+",
 		"foo.+",
 		".+foo",
 		".*foo.*",
 		"(?i:foo)",
 		"(?i:(foo|bar))",
 		"(?i:(foo1|foo2|bar))",
 		"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
 		"(prometheus|api_prom)_api_v1_.+",
 		"((fo(bar))|.+foo)",
 	}
 	for _, r := range regexes {
-		r := r
+		b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
 		b.Run(r, func(b *testing.B) {
 			m, err := NewFastRegexMatcher(r)
 			require.NoError(b, err)
 			b.ResetTimer()
@ -331,14 +308,22 @@ func Test_OptimizeRegex(t *testing.T) {
 	}
 }
-func RandStringRunes(n int) string {
+func randString(length int) string {
-	b := make([]rune, n)
+	b := make([]rune, length)
 	for i := range b {
 		b[i] = letterRunes[rand.Intn(len(letterRunes))]
 	}
 	return string(b)
 }
 // randStrings returns a slice of `many` strings, each one produced by a call
 // to randString(length).
 func randStrings(many, length int) []string {
 	result := make([]string, many)
 	for idx := range result {
 		result[idx] = randString(length)
 	}
 	return result
 }
 func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) {
 	// Create all matchers.
 	matchers := make([]*FastRegexMatcher, 0, len(regexes))
@ -428,3 +413,163 @@ func TestAnalyzeRealQueries(t *testing.T) {
 	t.Logf("Found %d (%.2f%%) optimized matchers out of %d", numOptimized, (float64(numOptimized)/float64(numChecked))*100, numChecked)
 }
 // TestOptimizeEqualStringMatchers runs optimizeEqualStringMatchers with a
 // threshold of 0 over a table of hand-built matchers. A nil wantValues means
 // the optimization must be skipped (the result keeps the input's type);
 // otherwise the result must be an equalMultiStringMatcher holding exactly
 // wantValues and wantCaseSensitive.
 func TestOptimizeEqualStringMatchers(t *testing.T) {
 	cases := map[string]struct {
 		in                StringMatcher
 		wantValues        map[string]struct{}
 		wantCaseSensitive bool
 	}{
 		"should skip optimization on orStringMatcher with containsStringMatcher": {
 			in: orStringMatcher{
 				&equalStringMatcher{s: "FOO", caseSensitive: true},
 				&containsStringMatcher{substrings: []string{"a", "b", "c"}},
 			},
 			wantValues: nil,
 		},
 		"should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity": {
 			in: orStringMatcher{
 				&equalStringMatcher{s: "FOO", caseSensitive: true},
 				&equalStringMatcher{s: "bar", caseSensitive: true},
 				&equalStringMatcher{s: "baz", caseSensitive: true},
 			},
 			wantValues: map[string]struct{}{
 				"FOO": {},
 				"bar": {},
 				"baz": {},
 			},
 			wantCaseSensitive: true,
 		},
 		"should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity": {
 			in: orStringMatcher{
 				&equalStringMatcher{s: "FOO", caseSensitive: true},
 				&equalStringMatcher{s: "bar", caseSensitive: false},
 				&equalStringMatcher{s: "baz", caseSensitive: true},
 			},
 			wantValues: nil,
 		},
 		"should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity": {
 			in: orStringMatcher{
 				&equalStringMatcher{s: "FOO", caseSensitive: true},
 				orStringMatcher{
 					&equalStringMatcher{s: "bar", caseSensitive: true},
 					&equalStringMatcher{s: "xxx", caseSensitive: true},
 				},
 				&equalStringMatcher{s: "baz", caseSensitive: true},
 			},
 			wantValues: map[string]struct{}{
 				"FOO": {},
 				"bar": {},
 				"xxx": {},
 				"baz": {},
 			},
 			wantCaseSensitive: true,
 		},
 		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity": {
 			in: orStringMatcher{
 				&equalStringMatcher{s: "FOO", caseSensitive: true},
 				orStringMatcher{
 					// The two nested items disagree with each other.
 					&equalStringMatcher{s: "bar", caseSensitive: true},
 					&equalStringMatcher{s: "xxx", caseSensitive: false},
 				},
 				&equalStringMatcher{s: "baz", caseSensitive: true},
 			},
 			wantValues: nil,
 		},
 		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one": {
 			in: orStringMatcher{
 				&equalStringMatcher{s: "FOO", caseSensitive: true},
 				// The nested items agree with each other but not with the parent's items.
 				orStringMatcher{
 					&equalStringMatcher{s: "bar", caseSensitive: false},
 					&equalStringMatcher{s: "xxx", caseSensitive: false},
 				},
 				&equalStringMatcher{s: "baz", caseSensitive: true},
 			},
 			wantValues: nil,
 		},
 		"should return lowercase values on case insensitive matchers": {
 			in: orStringMatcher{
 				&equalStringMatcher{s: "FOO", caseSensitive: false},
 				orStringMatcher{
 					&equalStringMatcher{s: "bAr", caseSensitive: false},
 				},
 				&equalStringMatcher{s: "baZ", caseSensitive: false},
 			},
 			wantValues: map[string]struct{}{
 				"foo": {},
 				"bar": {},
 				"baz": {},
 			},
 			wantCaseSensitive: false,
 		},
 	}
 	for name, tc := range cases {
 		t.Run(name, func(t *testing.T) {
 			// Threshold 0 so that the number of values never prevents the optimization.
 			got := optimizeEqualStringMatchers(tc.in, 0)
 			if tc.wantValues == nil {
 				// Optimization skipped: the result has the same type as the input.
 				require.IsType(t, tc.in, got)
 				return
 			}
 			require.IsType(t, &equalMultiStringMatcher{}, got)
 			multi := got.(*equalMultiStringMatcher)
 			require.Equal(t, tc.wantValues, multi.values)
 			require.Equal(t, tc.wantCaseSensitive, multi.caseSensitive)
 		})
 	}
 }
 // BenchmarkOptimizeEqualStringMatchers measures matching with and without the
 // rewrite done by optimizeEqualStringMatchers(), for alternations of 2 to 256
 // values, both case sensitive and case insensitive. Its results are meant to
 // help choose the threshold at which the optimization is applied.
 func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
 	// Random texts of different lengths to match against.
 	var texts []string
 	texts = append(texts, randStrings(10, 10)...)
 	texts = append(texts, randStrings(5, 30)...)
 	texts = append(texts, randStrings(1, 100)...)
 	for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
 		for _, caseSensitive := range []bool{true, false} {
 			name := fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive)
 			b.Run(name, func(b *testing.B) {
 				// Build a regex with the requested number of alternatives.
 				re := strings.Join(randStrings(numAlternations, 10), "|")
 				if !caseSensitive {
 					re = "(?i:(" + re + "))"
 				}
 				parsed, err := syntax.Parse(re, syntax.Perl)
 				require.NoError(b, err)
 				// Sanity-check that both matchers have the shape being compared.
 				unoptimized := stringMatcherFromRegexpInternal(parsed)
 				require.IsType(b, orStringMatcher{}, unoptimized)
 				optimized := optimizeEqualStringMatchers(unoptimized, 0)
 				require.IsType(b, &equalMultiStringMatcher{}, optimized)
 				variants := []struct {
 					name    string
 					matcher StringMatcher
 				}{
 					{name: "without optimizeEqualStringMatchers()", matcher: unoptimized},
 					{name: "with optimizeEqualStringMatchers()", matcher: optimized},
 				}
 				for _, variant := range variants {
 					matcher := variant.matcher
 					b.Run(variant.name, func(b *testing.B) {
 						for n := 0; n < b.N; n++ {
 							for _, text := range texts {
 								matcher.Matches(text)
 							}
 						}
 					})
 				}
 			})
 		}
 	}
 }
 // getTestNameFromRegexp returns a sub-test/sub-benchmark name derived from re.
 // Regexps of at most 32 bytes are returned as is; longer ones are shortened to
 // at most 32 bytes. The cut is always made at the start of a rune, so a
 // multi-byte UTF-8 character is never split in half (which would yield a name
 // that is not valid UTF-8).
 func getTestNameFromRegexp(re string) string {
 	const maxLen = 32
 	if len(re) <= maxLen {
 		return re
 	}
 	// Ranging over a string yields the byte offset at which each rune starts:
 	// keep the largest such offset that does not exceed maxLen.
 	end := 0
 	for offset := range re {
 		if offset > maxLen {
 			break
 		}
 		end = offset
 	}
 	return re[:end]
 }