diff --git a/model/labels/regexp.go b/model/labels/regexp.go index d2c4906fc..3fba36430 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -20,7 +20,14 @@ import ( "github.com/grafana/regexp/syntax" ) -const maxSetMatches = 256 +const ( + maxSetMatches = 256 + + // The minimum number of alternate values a regex should have to trigger + // the optimization done by optimizeEqualStringMatchers(). This value has + // been computed running BenchmarkOptimizeEqualStringMatchers. + optimizeEqualStringMatchersThreshold = 16 +) type FastRegexMatcher struct { re *regexp.Regexp @@ -326,7 +333,10 @@ type StringMatcher interface { func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { clearBeginEndText(re) - return stringMatcherFromRegexpInternal(re) + m := stringMatcherFromRegexpInternal(re) + m = optimizeEqualStringMatchers(m, optimizeEqualStringMatchersThreshold) + + return m } func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { @@ -503,6 +513,24 @@ func (m *equalStringMatcher) Matches(s string) bool { return strings.EqualFold(m.s, s) } +// equalMultiStringMatcher matches a string exactly against a set of valid values. +type equalMultiStringMatcher struct { + // values to match a string against. If the matching is case insensitive, + // the values here must be lowercase. + values map[string]struct{} + + caseSensitive bool +} + +func (m *equalMultiStringMatcher) Matches(s string) bool { + if !m.caseSensitive { + s = strings.ToLower(s) + } + + _, ok := m.values[s] + return ok +} + // anyStringMatcher is a matcher that matches any string. // It is used for the + and * operator. matchNL tells if it should matches newlines or not. type anyStringMatcher struct { @@ -519,3 +547,92 @@ func (m *anyStringMatcher) Matches(s string) bool { } return true } + +// optimizeEqualStringMatchers optimize a specific case where all matchers are made by an +// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher). In +// this specific case, when we have many strings to match against we can use a map instead +// of iterating over the list of strings. +func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher { + var ( + caseSensitive bool + caseSensitiveSet bool + numValues int + ) + + // Analyse the input StringMatcher to count the number of occurrences + // and ensure all of them have the same case sensitivity. + analyseCallback := func(matcher *equalStringMatcher) bool { + // Ensure we don't have mixed case sensitivity. + if caseSensitiveSet && caseSensitive != matcher.caseSensitive { + return false + } else if !caseSensitiveSet { + caseSensitive = matcher.caseSensitive + caseSensitiveSet = true + } + + numValues++ + return true + } + + if !findEqualStringMatchers(input, analyseCallback) { + return input + } + + // If the number of values found is less than the threshold, then we should skip the optimization. + if numValues < threshold { + return input + } + + // Parse again the input StringMatcher to extract all values and storing them. + // We can skip the case sensitivity check because we've already checked it and + // if the code reach this point then it means all matchers have the same case sensitivity. + values := make(map[string]struct{}, numValues) + + // Ignore the return value because we already iterated over the input StringMatcher + // and it was all good. + findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool { + if caseSensitive { + values[matcher.s] = struct{}{} + } else { + values[strings.ToLower(matcher.s)] = struct{}{} + } + + return true + }) + + return &equalMultiStringMatcher{ + values: values, + caseSensitive: caseSensitive, + } +} + +// findEqualStringMatchers analyze the input StringMatcher and calls the callback for each +// equalStringMatcher found. Returns true if and only if the input StringMatcher is *only* +// composed by an alternation of equalStringMatcher. +func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool { + orInput, ok := input.(orStringMatcher) + if !ok { + return false + } + + for _, m := range orInput { + switch casted := m.(type) { + case orStringMatcher: + if !findEqualStringMatchers(m, callback) { + return false + } + + case *equalStringMatcher: + if !callback(casted) { + return false + } + + default: + // It's not an equal string matcher, so we have to stop searching + // cause this optimization can't be applied. + return false + } + } + + return true +} diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index b0fefc512..ad0576a42 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -15,6 +15,7 @@ package labels import ( "bufio" + "fmt" "math/rand" "os" "strings" @@ -33,6 +34,8 @@ func init() { var ( letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") regexes = []string{ + "foo", + "^foo", "(foo|bar)", "foo.*", ".*foo", @@ -46,17 +49,24 @@ var ( "foo\n.*", ".*foo.*", ".+foo.+", - "", "(?s:.*)", "(?s:.+)", "(?s:^.*foo$)", + "(?i:foo)", + "(?i:(foo|bar))", + "(?i:(foo1|foo2|bar))", "^(?i:foo|oo)|(bar)$", + "(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))", "((.*)(bar|b|buzz)(.+)|foo)$", "^$", "(prometheus|api_prom)_api_v1_.+", "10\\.0\\.(1|2)\\.+", "10\\.0\\.(1|2).+", "((fo(bar))|.+foo)", + // A long case sensitive alternation. + "zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb", + // A long case insensitive alternation. + "(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))", } values = []string{ "foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "", @@ -83,27 +93,15 @@ func TestNewFastRegexMatcher(t *testing.T) { } func BenchmarkNewFastRegexMatcher(b *testing.B) { - benchValues := values - for _, v := range values { - for i := 5; i < 50; i = i + 5 { - benchValues = append(benchValues, v+RandStringRunes(i)) - benchValues = append(benchValues, RandStringRunes(i)+v+RandStringRunes(i)) - benchValues = append(benchValues, RandStringRunes(i)+v) - } - } for _, r := range regexes { - r := r - b.Run(r, func(b *testing.B) { - m, err := NewFastRegexMatcher(r) - require.NoError(b, err) - b.ResetTimer() - for i := 0; i < b.N; i++ { - for _, v := range benchValues { - _ = m.MatchString(v) + b.Run(getTestNameFromRegexp(r), func(b *testing.B) { + for n := 0; n < b.N; n++ { + _, err := NewFastRegexMatcher(r) + if err != nil { + b.Fatal(err) } } }) - } } @@ -232,29 +230,8 @@ func BenchmarkFastRegexMatcher(b *testing.B) { y = "foo" + x z = x + "foo" ) - regexes := []string{ - "foo", - "^foo", - "(foo|bar)", - "foo.*", - ".*foo", - "^.*foo$", - "^.+foo$", - ".*", - ".+", - "foo.+", - ".+foo", - ".*foo.*", - "(?i:foo)", - "(?i:(foo|bar))", - "(?i:(foo1|foo2|bar))", - "(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))", - "(prometheus|api_prom)_api_v1_.+", - "((fo(bar))|.+foo)", - } for _, r := range regexes { - r := r - b.Run(r, func(b *testing.B) { + b.Run(getTestNameFromRegexp(r), func(b *testing.B) { m, err := NewFastRegexMatcher(r) require.NoError(b, err) b.ResetTimer() @@ -331,14 +308,22 @@ func Test_OptimizeRegex(t *testing.T) { } } -func RandStringRunes(n int) string { - b := make([]rune, n) +func randString(length int) string { + b := make([]rune, length) for i := range b { b[i] = letterRunes[rand.Intn(len(letterRunes))] } return string(b) } +func randStrings(many, length int) []string { + out := make([]string, 0, many) + for i := 0; i < many; i++ { + out = append(out, randString(length)) + } + return out +} + func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) { // Create all matchers. matchers := make([]*FastRegexMatcher, 0, len(regexes)) @@ -428,3 +413,163 @@ func TestAnalyzeRealQueries(t *testing.T) { t.Logf("Found %d (%.2f%%) optimized matchers out of %d", numOptimized, (float64(numOptimized)/float64(numChecked))*100, numChecked) } + +func TestOptimizeEqualStringMatchers(t *testing.T) { + tests := map[string]struct { + input StringMatcher + expectedValues map[string]struct{} + expectedCaseSensitive bool + }{ + "should skip optimization on orStringMatcher with containsStringMatcher": { + input: orStringMatcher{ + &equalStringMatcher{s: "FOO", caseSensitive: true}, + &containsStringMatcher{substrings: []string{"a", "b", "c"}}, + }, + expectedValues: nil, + }, + "should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity": { + input: orStringMatcher{ + &equalStringMatcher{s: "FOO", caseSensitive: true}, + &equalStringMatcher{s: "bar", caseSensitive: true}, + &equalStringMatcher{s: "baz", caseSensitive: true}, + }, + expectedValues: map[string]struct{}{ + "FOO": {}, + "bar": {}, + "baz": {}, + }, + expectedCaseSensitive: true, + }, + "should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity": { + input: orStringMatcher{ + &equalStringMatcher{s: "FOO", caseSensitive: true}, + &equalStringMatcher{s: "bar", caseSensitive: false}, + &equalStringMatcher{s: "baz", caseSensitive: true}, + }, + expectedValues: nil, + }, + "should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity": { + input: orStringMatcher{ + &equalStringMatcher{s: "FOO", caseSensitive: true}, + orStringMatcher{ + &equalStringMatcher{s: "bar", caseSensitive: true}, + &equalStringMatcher{s: "xxx", caseSensitive: true}, + }, + &equalStringMatcher{s: "baz", caseSensitive: true}, + }, + expectedValues: map[string]struct{}{ + "FOO": {}, + "bar": {}, + "xxx": {}, + "baz": {}, + }, + expectedCaseSensitive: true, + }, + "should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity": { + input: orStringMatcher{ + &equalStringMatcher{s: "FOO", caseSensitive: true}, + orStringMatcher{ + // Case sensitivity is different within items at the same level. + &equalStringMatcher{s: "bar", caseSensitive: true}, + &equalStringMatcher{s: "xxx", caseSensitive: false}, + }, + &equalStringMatcher{s: "baz", caseSensitive: true}, + }, + expectedValues: nil, + }, + "should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one": { + input: orStringMatcher{ + &equalStringMatcher{s: "FOO", caseSensitive: true}, + // Case sensitivity is different between the parent and child. + orStringMatcher{ + &equalStringMatcher{s: "bar", caseSensitive: false}, + &equalStringMatcher{s: "xxx", caseSensitive: false}, + }, + &equalStringMatcher{s: "baz", caseSensitive: true}, + }, + expectedValues: nil, + }, + "should return lowercase values on case insensitive matchers": { + input: orStringMatcher{ + &equalStringMatcher{s: "FOO", caseSensitive: false}, + orStringMatcher{ + &equalStringMatcher{s: "bAr", caseSensitive: false}, + }, + &equalStringMatcher{s: "baZ", caseSensitive: false}, + }, + expectedValues: map[string]struct{}{ + "foo": {}, + "bar": {}, + "baz": {}, + }, + expectedCaseSensitive: false, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + actualMatcher := optimizeEqualStringMatchers(testData.input, 0) + + if testData.expectedValues == nil { + require.IsType(t, testData.input, actualMatcher) + } else { + require.IsType(t, &equalMultiStringMatcher{}, actualMatcher) + require.Equal(t, testData.expectedValues, actualMatcher.(*equalMultiStringMatcher).values) + require.Equal(t, testData.expectedCaseSensitive, actualMatcher.(*equalMultiStringMatcher).caseSensitive) + } + }) + } +} + +// This benchmark is used to find a good threshold to use to apply the optimization +// done by optimizeEqualStringMatchers() +func BenchmarkOptimizeEqualStringMatchers(b *testing.B) { + // Generate variable lengths random texts to match against. + texts := append([]string{}, randStrings(10, 10)...) + texts = append(texts, randStrings(5, 30)...) + texts = append(texts, randStrings(1, 100)...) + + for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 { + for _, caseSensitive := range []bool{true, false} { + b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) { + // Generate a regex with the expected number of alternations. + re := strings.Join(randStrings(numAlternations, 10), "|") + if !caseSensitive { + re = "(?i:(" + re + "))" + } + + parsed, err := syntax.Parse(re, syntax.Perl) + require.NoError(b, err) + + unoptimized := stringMatcherFromRegexpInternal(parsed) + require.IsType(b, orStringMatcher{}, unoptimized) + + optimized := optimizeEqualStringMatchers(unoptimized, 0) + require.IsType(b, &equalMultiStringMatcher{}, optimized) + + b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) { + for n := 0; n < b.N; n++ { + for _, t := range texts { + unoptimized.Matches(t) + } + } + }) + + b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) { + for n := 0; n < b.N; n++ { + for _, t := range texts { + optimized.Matches(t) + } + } + }) + }) + } + } +} + +func getTestNameFromRegexp(re string) string { + if len(re) > 32 { + return re[:32] + } + return re +}