From 82a8c6abe23a2afa3d7cbf3ccebc35e59d646dbb Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Wed, 3 Jul 2024 18:45:36 +0100 Subject: [PATCH] [ENHANCEMENT] Optimize regexps with multiple prefixes (#13843) For example `foo.*|bar.*|baz.*`. Instead of checking each one in turn, we build a map of prefixes, then check the smaller set that could match the string supplied. Signed-off-by: Bryan Boreham * Improve testing and readability Address review comments on #13843 Signed-off-by: Marco Pracucci --- model/labels/regexp.go | 129 +++++++++++++---- model/labels/regexp_test.go | 279 ++++++++++++++++++++++++++++-------- 2 files changed, 323 insertions(+), 85 deletions(-) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index 767bd6942..d2151d83d 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -28,7 +28,7 @@ const ( maxSetMatches = 256 // The minimum number of alternate values a regex should have to trigger - // the optimization done by optimizeEqualStringMatchers() and so use a map + // the optimization done by optimizeEqualOrPrefixStringMatchers() and so use a map // to match values instead of iterating over a list. This value has // been computed running BenchmarkOptimizeEqualStringMatchers. minEqualMultiStringMatcherMapThreshold = 16 @@ -337,7 +337,7 @@ func optimizeAlternatingLiterals(s string) (StringMatcher, []string) { return nil, nil } - multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates) + multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates, 0, 0) for end := strings.IndexByte(s, '|'); end > -1; end = strings.IndexByte(s, '|') { // Split the string into the next literal and the remainder @@ -412,7 +412,7 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { clearBeginEndText(re) m := stringMatcherFromRegexpInternal(re) - m = optimizeEqualStringMatchers(m, minEqualMultiStringMatcherMapThreshold) + m = optimizeEqualOrPrefixStringMatchers(m, minEqualMultiStringMatcherMapThreshold) return m } @@ -732,17 +732,20 @@ func (m *equalStringMatcher) Matches(s string) bool { type multiStringMatcherBuilder interface { StringMatcher add(s string) + addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) setMatches() []string } -func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize int) multiStringMatcherBuilder { +func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize, estimatedPrefixes, minPrefixLength int) multiStringMatcherBuilder { // If the estimated size is low enough, it's faster to use a slice instead of a map. - if estimatedSize < minEqualMultiStringMatcherMapThreshold { + if estimatedSize < minEqualMultiStringMatcherMapThreshold && estimatedPrefixes == 0 { return &equalMultiStringSliceMatcher{caseSensitive: caseSensitive, values: make([]string, 0, estimatedSize)} } return &equalMultiStringMapMatcher{ values: make(map[string]struct{}, estimatedSize), + prefixes: make(map[string][]StringMatcher, estimatedPrefixes), + minPrefixLen: minPrefixLength, caseSensitive: caseSensitive, } } @@ -758,6 +761,10 @@ func (m *equalMultiStringSliceMatcher) add(s string) { m.values = append(m.values, s) } +func (m *equalMultiStringSliceMatcher) addPrefix(_ string, _ bool, _ StringMatcher) { + panic("not implemented") +} + func (m *equalMultiStringSliceMatcher) setMatches() []string { return m.values } @@ -779,12 +786,17 @@ func (m *equalMultiStringSliceMatcher) Matches(s string) bool { return false } -// equalMultiStringMapMatcher matches a string exactly against a map of valid values. +// equalMultiStringMapMatcher matches a string exactly against a map of valid values +// or against a set of prefix matchers. type equalMultiStringMapMatcher struct { // values contains values to match a string against. If the matching is case insensitive, // the values here must be lowercase. values map[string]struct{} - + // prefixes maps strings, all of length minPrefixLen, to sets of matchers to check the rest of the string. + // If the matching is case insensitive, prefixes are all lowercase. + prefixes map[string][]StringMatcher + // minPrefixLen can be zero, meaning there are no prefix matchers. + minPrefixLen int caseSensitive bool } @@ -796,8 +808,27 @@ func (m *equalMultiStringMapMatcher) add(s string) { m.values[s] = struct{}{} } +func (m *equalMultiStringMapMatcher) addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) { + if m.minPrefixLen == 0 { + panic("addPrefix called when no prefix length defined") + } + if len(prefix) < m.minPrefixLen { + panic("addPrefix called with a too short prefix") + } + if m.caseSensitive != prefixCaseSensitive { + panic("addPrefix called with a prefix whose case sensitivity is different than the expected one") + } + + s := prefix[:m.minPrefixLen] + if !m.caseSensitive { + s = strings.ToLower(s) + } + + m.prefixes[s] = append(m.prefixes[s], matcher) +} + func (m *equalMultiStringMapMatcher) setMatches() []string { - if len(m.values) >= maxSetMatches { + if len(m.values) >= maxSetMatches || len(m.prefixes) > 0 { return nil } @@ -813,8 +844,17 @@ func (m *equalMultiStringMapMatcher) Matches(s string) bool { s = toNormalisedLower(s) } - _, ok := m.values[s] - return ok + if _, ok := m.values[s]; ok { + return true + } + if m.minPrefixLen > 0 && len(s) >= m.minPrefixLen { + for _, matcher := range m.prefixes[s[:m.minPrefixLen]] { + if matcher.Matches(s) { + return true + } + } + } + return false } // toNormalisedLower normalise the input string using "Unicode Normalization Form D" and then convert @@ -897,20 +937,24 @@ func (m trueMatcher) Matches(_ string) bool { return true } -// optimizeEqualStringMatchers optimize a specific case where all matchers are made by an -// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher). In -// this specific case, when we have many strings to match against we can use a map instead +// optimizeEqualOrPrefixStringMatchers optimize a specific case where all matchers are made by an +// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher) or +// with a literal prefix (literalPrefixSensitiveStringMatcher or literalPrefixInsensitiveStringMatcher). +// +// In this specific case, when we have many strings to match against we can use a map instead // of iterating over the list of strings. -func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher { +func optimizeEqualOrPrefixStringMatchers(input StringMatcher, threshold int) StringMatcher { var ( caseSensitive bool caseSensitiveSet bool numValues int + numPrefixes int + minPrefixLength int ) // Analyse the input StringMatcher to count the number of occurrences // and ensure all of them have the same case sensitivity. - analyseCallback := func(matcher *equalStringMatcher) bool { + analyseEqualMatcherCallback := func(matcher *equalStringMatcher) bool { // Ensure we don't have mixed case sensitivity. if caseSensitiveSet && caseSensitive != matcher.caseSensitive { return false @@ -923,34 +967,55 @@ func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatch return true } - if !findEqualStringMatchers(input, analyseCallback) { + analysePrefixMatcherCallback := func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool { + // Ensure we don't have mixed case sensitivity. + if caseSensitiveSet && caseSensitive != prefixCaseSensitive { + return false + } else if !caseSensitiveSet { + caseSensitive = prefixCaseSensitive + caseSensitiveSet = true + } + if numPrefixes == 0 || len(prefix) < minPrefixLength { + minPrefixLength = len(prefix) + } + + numPrefixes++ + return true + } + + if !findEqualOrPrefixStringMatchers(input, analyseEqualMatcherCallback, analysePrefixMatcherCallback) { return input } - // If the number of values found is less than the threshold, then we should skip the optimization. - if numValues < threshold { + // If the number of values and prefixes found is less than the threshold, then we should skip the optimization. + if (numValues + numPrefixes) < threshold { return input } // Parse again the input StringMatcher to extract all values and storing them. // We can skip the case sensitivity check because we've already checked it and // if the code reach this point then it means all matchers have the same case sensitivity. - multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues) + multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues, numPrefixes, minPrefixLength) // Ignore the return value because we already iterated over the input StringMatcher // and it was all good. - findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool { + findEqualOrPrefixStringMatchers(input, func(matcher *equalStringMatcher) bool { multiMatcher.add(matcher.s) return true + }, func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool { + multiMatcher.addPrefix(prefix, caseSensitive, matcher) + return true }) return multiMatcher } -// findEqualStringMatchers analyze the input StringMatcher and calls the callback for each -// equalStringMatcher found. Returns true if and only if the input StringMatcher is *only* -// composed by an alternation of equalStringMatcher. -func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool { +// findEqualOrPrefixStringMatchers analyze the input StringMatcher and calls the equalMatcherCallback for each +// equalStringMatcher found, and prefixMatcherCallback for each literalPrefixSensitiveStringMatcher and literalPrefixInsensitiveStringMatcher found. +// +// Returns true if and only if the input StringMatcher is *only* composed by an alternation of equalStringMatcher and/or +// literal prefix matcher. Returns false if prefixMatcherCallback is nil and a literal prefix matcher is encountered. +func findEqualOrPrefixStringMatchers(input StringMatcher, equalMatcherCallback func(matcher *equalStringMatcher) bool, prefixMatcherCallback func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool) bool { orInput, ok := input.(orStringMatcher) if !ok { return false @@ -959,17 +1024,27 @@ func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalSt for _, m := range orInput { switch casted := m.(type) { case orStringMatcher: - if !findEqualStringMatchers(m, callback) { + if !findEqualOrPrefixStringMatchers(m, equalMatcherCallback, prefixMatcherCallback) { return false } case *equalStringMatcher: - if !callback(casted) { + if !equalMatcherCallback(casted) { + return false + } + + case *literalPrefixSensitiveStringMatcher: + if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, true, casted) { + return false + } + + case *literalPrefixInsensitiveStringMatcher: + if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, false, casted) { return false } default: - // It's not an equal string matcher, so we have to stop searching + // It's not an equal or prefix string matcher, so we have to stop searching // cause this optimization can't be applied. return false } diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index fa5c96f42..24875e64e 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -71,6 +71,8 @@ var ( // A long case insensitive alternation. "(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))", "(?i:(AAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBB|cccccccccccccccccccccccC|ſſſſſſſſſſſſſſſſſſſſſſſſS|SSSSSSSSSSSSSSSSSSSSSSSSſ))", + // A short case insensitive alternation where each entry ends with ".*". + "(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*))", // A long case insensitive alternation where each entry ends with ".*". "(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))", // A long case insensitive alternation where each entry starts with ".*". @@ -686,7 +688,15 @@ func randStrings(randGenerator *rand.Rand, many, length int) []string { return out } -func TestOptimizeEqualStringMatchers(t *testing.T) { +func randStringsWithSuffix(randGenerator *rand.Rand, many, length int, suffix string) []string { + out := randStrings(randGenerator, many, length) + for i := range out { + out[i] += suffix + } + return out +} + +func TestOptimizeEqualOrPrefixStringMatchers(t *testing.T) { tests := map[string]struct { input StringMatcher expectedValues []string @@ -767,7 +777,7 @@ func TestOptimizeEqualStringMatchers(t *testing.T) { for testName, testData := range tests { t.Run(testName, func(t *testing.T) { - actualMatcher := optimizeEqualStringMatchers(testData.input, 0) + actualMatcher := optimizeEqualOrPrefixStringMatchers(testData.input, 0) if testData.expectedValues == nil { require.IsType(t, testData.input, actualMatcher) @@ -782,10 +792,12 @@ func TestOptimizeEqualStringMatchers(t *testing.T) { func TestNewEqualMultiStringMatcher(t *testing.T) { tests := map[string]struct { - values []string - caseSensitive bool - expectedValuesMap map[string]struct{} - expectedValuesList []string + values []string + caseSensitivePrefixes []*literalPrefixSensitiveStringMatcher + caseSensitive bool + expectedValuesMap map[string]struct{} + expectedPrefixesMap map[string][]StringMatcher + expectedValuesList []string }{ "few case sensitive values": { values: []string{"a", "B"}, @@ -797,27 +809,47 @@ func TestNewEqualMultiStringMatcher(t *testing.T) { caseSensitive: false, expectedValuesList: []string{"a", "B"}, }, + "few case sensitive values and prefixes": { + values: []string{"a"}, + caseSensitivePrefixes: []*literalPrefixSensitiveStringMatcher{{prefix: "B", right: anyStringWithoutNewlineMatcher{}}}, + caseSensitive: true, + expectedValuesMap: map[string]struct{}{"a": {}}, + expectedPrefixesMap: map[string][]StringMatcher{"B": {&literalPrefixSensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}}}}, + }, "many case sensitive values": { - values: []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"}, - caseSensitive: true, - expectedValuesMap: map[string]struct{}{"a": {}, "B": {}, "c": {}, "D": {}, "e": {}, "F": {}, "g": {}, "H": {}, "i": {}, "L": {}, "m": {}, "N": {}, "o": {}, "P": {}, "q": {}, "r": {}}, + values: []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"}, + caseSensitive: true, + expectedValuesMap: map[string]struct{}{"a": {}, "B": {}, "c": {}, "D": {}, "e": {}, "F": {}, "g": {}, "H": {}, "i": {}, "L": {}, "m": {}, "N": {}, "o": {}, "P": {}, "q": {}, "r": {}}, + expectedPrefixesMap: map[string][]StringMatcher{}, }, "many case insensitive values": { - values: []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"}, - caseSensitive: false, - expectedValuesMap: map[string]struct{}{"a": {}, "b": {}, "c": {}, "d": {}, "e": {}, "f": {}, "g": {}, "h": {}, "i": {}, "l": {}, "m": {}, "n": {}, "o": {}, "p": {}, "q": {}, "r": {}}, + values: []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"}, + caseSensitive: false, + expectedValuesMap: map[string]struct{}{"a": {}, "b": {}, "c": {}, "d": {}, "e": {}, "f": {}, "g": {}, "h": {}, "i": {}, "l": {}, "m": {}, "n": {}, "o": {}, "p": {}, "q": {}, "r": {}}, + expectedPrefixesMap: map[string][]StringMatcher{}, }, } for testName, testData := range tests { t.Run(testName, func(t *testing.T) { - matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values)) + // To keep this test simple, we always assume a min prefix length of 1. + minPrefixLength := 0 + if len(testData.caseSensitivePrefixes) > 0 { + minPrefixLength = 1 + } + + matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values), len(testData.caseSensitivePrefixes), minPrefixLength) for _, v := range testData.values { matcher.add(v) } - if testData.expectedValuesMap != nil { + for _, p := range testData.caseSensitivePrefixes { + matcher.addPrefix(p.prefix, true, p) + } + + if testData.expectedValuesMap != nil || testData.expectedPrefixesMap != nil { require.IsType(t, &equalMultiStringMapMatcher{}, matcher) require.Equal(t, testData.expectedValuesMap, matcher.(*equalMultiStringMapMatcher).values) + require.Equal(t, testData.expectedPrefixesMap, matcher.(*equalMultiStringMapMatcher).prefixes) require.Equal(t, testData.caseSensitive, matcher.(*equalMultiStringMapMatcher).caseSensitive) } if testData.expectedValuesList != nil { @@ -829,9 +861,32 @@ func TestNewEqualMultiStringMatcher(t *testing.T) { } } +func TestEqualMultiStringMapMatcher_addPrefix(t *testing.T) { + t.Run("should panic if the matcher is case sensitive but the prefix is not case sensitive", func(t *testing.T) { + matcher := newEqualMultiStringMatcher(true, 0, 1, 1) + + require.Panics(t, func() { + matcher.addPrefix("a", false, &literalPrefixInsensitiveStringMatcher{ + prefix: "a", + }) + }) + }) + + t.Run("should panic if the matcher is not case sensitive but the prefix is case sensitive", func(t *testing.T) { + matcher := newEqualMultiStringMatcher(false, 0, 1, 1) + + require.Panics(t, func() { + matcher.addPrefix("a", true, &literalPrefixSensitiveStringMatcher{ + prefix: "a", + }) + }) + }) +} + func TestEqualMultiStringMatcher_Matches(t *testing.T) { tests := map[string]struct { values []string + prefixes []StringMatcher caseSensitive bool expectedMatches []string expectedNotMatches []string @@ -848,6 +903,24 @@ func TestEqualMultiStringMatcher_Matches(t *testing.T) { expectedMatches: []string{"a", "A", "b", "B"}, expectedNotMatches: []string{"c", "C"}, }, + "few case sensitive prefixes": { + prefixes: []StringMatcher{ + &literalPrefixSensitiveStringMatcher{prefix: "a", right: anyStringWithoutNewlineMatcher{}}, + &literalPrefixSensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}}, + }, + caseSensitive: true, + expectedMatches: []string{"a", "aX", "B", "BX"}, + expectedNotMatches: []string{"A", "b"}, + }, + "few case insensitive prefixes": { + prefixes: []StringMatcher{ + &literalPrefixInsensitiveStringMatcher{prefix: "a", right: anyStringWithoutNewlineMatcher{}}, + &literalPrefixInsensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}}, + }, + caseSensitive: false, + expectedMatches: []string{"a", "aX", "A", "AX", "b", "bX", "B", "BX"}, + expectedNotMatches: []string{"c", "cX", "C", "CX"}, + }, "many case sensitive values": { values: []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"}, caseSensitive: true, @@ -860,14 +933,37 @@ func TestEqualMultiStringMatcher_Matches(t *testing.T) { expectedMatches: []string{"a", "A", "b", "B"}, expectedNotMatches: []string{"x", "X"}, }, + "mixed values and prefixes": { + values: []string{"a"}, + prefixes: []StringMatcher{&literalPrefixSensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}}}, + caseSensitive: true, + expectedMatches: []string{"a", "B", "BX"}, + expectedNotMatches: []string{"aX", "A", "b", "bX"}, + }, } for testName, testData := range tests { t.Run(testName, func(t *testing.T) { - matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values)) + // To keep this test simple, we always assume a min prefix length of 1. + minPrefixLength := 0 + if len(testData.prefixes) > 0 { + minPrefixLength = 1 + } + + matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values), len(testData.prefixes), minPrefixLength) for _, v := range testData.values { matcher.add(v) } + for _, p := range testData.prefixes { + switch m := p.(type) { + case *literalPrefixSensitiveStringMatcher: + matcher.addPrefix(m.prefix, true, p) + case *literalPrefixInsensitiveStringMatcher: + matcher.addPrefix(m.prefix, false, p) + default: + panic("Unexpected type in test case") + } + } for _, v := range testData.expectedMatches { require.True(t, matcher.Matches(v), "value: %s", v) @@ -879,29 +975,33 @@ func TestEqualMultiStringMatcher_Matches(t *testing.T) { } } -func TestFindEqualStringMatchers(t *testing.T) { +func TestFindEqualOrPrefixStringMatchers(t *testing.T) { type match struct { s string caseSensitive bool } - // Utility to call findEqualStringMatchers() and collect all callback invocations. - findEqualStringMatchersAndCollectMatches := func(input StringMatcher) (matches []match, ok bool) { - ok = findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool { + // Utility to call findEqualOrPrefixStringMatchers() and collect all callback invocations. + findEqualOrPrefixStringMatchersAndCollectMatches := func(input StringMatcher) (matches []match, ok bool) { + ok = findEqualOrPrefixStringMatchers(input, func(matcher *equalStringMatcher) bool { matches = append(matches, match{matcher.s, matcher.caseSensitive}) return true + }, func(prefix string, prefixCaseSensitive bool, right StringMatcher) bool { + matches = append(matches, match{prefix, prefixCaseSensitive}) + return true }) + return } t.Run("empty matcher", func(t *testing.T) { - actualMatches, actualOk := findEqualStringMatchersAndCollectMatches(emptyStringMatcher{}) + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(emptyStringMatcher{}) require.False(t, actualOk) require.Empty(t, actualMatches) }) t.Run("concat of literal matchers (case sensitive)", func(t *testing.T) { - actualMatches, actualOk := findEqualStringMatchersAndCollectMatches( + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches( orStringMatcher{ &equalStringMatcher{s: "test-1", caseSensitive: true}, &equalStringMatcher{s: "test-2", caseSensitive: true}, @@ -913,7 +1013,7 @@ func TestFindEqualStringMatchers(t *testing.T) { }) t.Run("concat of literal matchers (case insensitive)", func(t *testing.T) { - actualMatches, actualOk := findEqualStringMatchersAndCollectMatches( + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches( orStringMatcher{ &equalStringMatcher{s: "test-1", caseSensitive: false}, &equalStringMatcher{s: "test-2", caseSensitive: false}, @@ -925,7 +1025,7 @@ func TestFindEqualStringMatchers(t *testing.T) { }) t.Run("concat of literal matchers (mixed case)", func(t *testing.T) { - actualMatches, actualOk := findEqualStringMatchersAndCollectMatches( + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches( orStringMatcher{ &equalStringMatcher{s: "test-1", caseSensitive: false}, &equalStringMatcher{s: "test-2", caseSensitive: true}, @@ -935,11 +1035,59 @@ func TestFindEqualStringMatchers(t *testing.T) { require.True(t, actualOk) require.Equal(t, []match{{"test-1", false}, {"test-2", true}}, actualMatches) }) + + t.Run("concat of literal prefix matchers (case sensitive)", func(t *testing.T) { + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches( + orStringMatcher{ + &literalPrefixSensitiveStringMatcher{prefix: "test-1"}, + &literalPrefixSensitiveStringMatcher{prefix: "test-2"}, + }, + ) + + require.True(t, actualOk) + require.Equal(t, []match{{"test-1", true}, {"test-2", true}}, actualMatches) + }) + + t.Run("concat of literal prefix matchers (case insensitive)", func(t *testing.T) { + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches( + orStringMatcher{ + &literalPrefixInsensitiveStringMatcher{prefix: "test-1"}, + &literalPrefixInsensitiveStringMatcher{prefix: "test-2"}, + }, + ) + + require.True(t, actualOk) + require.Equal(t, []match{{"test-1", false}, {"test-2", false}}, actualMatches) + }) + + t.Run("concat of literal prefix matchers (mixed case)", func(t *testing.T) { + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches( + orStringMatcher{ + &literalPrefixInsensitiveStringMatcher{prefix: "test-1"}, + &literalPrefixSensitiveStringMatcher{prefix: "test-2"}, + }, + ) + + require.True(t, actualOk) + require.Equal(t, []match{{"test-1", false}, {"test-2", true}}, actualMatches) + }) + + t.Run("concat of literal string and prefix matchers (case sensitive)", func(t *testing.T) { + actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches( + orStringMatcher{ + &equalStringMatcher{s: "test-1", caseSensitive: true}, + &literalPrefixSensitiveStringMatcher{prefix: "test-2"}, + }, + ) + + require.True(t, actualOk) + require.Equal(t, []match{{"test-1", true}, {"test-2", true}}, actualMatches) + }) } // This benchmark is used to find a good threshold to use to apply the optimization -// done by optimizeEqualStringMatchers(). -func BenchmarkOptimizeEqualStringMatchers(b *testing.B) { +// done by optimizeEqualOrPrefixStringMatchers(). +func BenchmarkOptimizeEqualOrPrefixStringMatchers(b *testing.B) { randGenerator := rand.New(rand.NewSource(time.Now().UnixNano())) // Generate variable lengths random texts to match against. @@ -949,42 +1097,51 @@ func BenchmarkOptimizeEqualStringMatchers(b *testing.B) { for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 { for _, caseSensitive := range []bool{true, false} { - b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) { - // Generate a regex with the expected number of alternations. - re := strings.Join(randStrings(randGenerator, numAlternations, 10), "|") - if !caseSensitive { - re = "(?i:(" + re + "))" - } - - parsed, err := syntax.Parse(re, syntax.Perl) - require.NoError(b, err) - - unoptimized := stringMatcherFromRegexpInternal(parsed) - require.IsType(b, orStringMatcher{}, unoptimized) - - optimized := optimizeEqualStringMatchers(unoptimized, 0) - if numAlternations < minEqualMultiStringMatcherMapThreshold { - require.IsType(b, &equalMultiStringSliceMatcher{}, optimized) - } else { - require.IsType(b, &equalMultiStringMapMatcher{}, optimized) - } - - b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) { - for n := 0; n < b.N; n++ { - for _, t := range texts { - unoptimized.Matches(t) - } + for _, prefixMatcher := range []bool{true, false} { + b.Run(fmt.Sprintf("alternations: %d case sensitive: %t prefix matcher: %t", numAlternations, caseSensitive, prefixMatcher), func(b *testing.B) { + // If the test should run on prefix matchers, we add a wildcard matcher as suffix (prefix will be a literal). + suffix := "" + if prefixMatcher { + suffix = ".*" } - }) - b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) { - for n := 0; n < b.N; n++ { - for _, t := range texts { - optimized.Matches(t) - } + // Generate a regex with the expected number of alternations. + re := strings.Join(randStringsWithSuffix(randGenerator, numAlternations, 10, suffix), "|") + if !caseSensitive { + re = "(?i:(" + re + "))" } + b.Logf("regexp: %s", re) + + parsed, err := syntax.Parse(re, syntax.Perl) + require.NoError(b, err) + + unoptimized := stringMatcherFromRegexpInternal(parsed) + require.IsType(b, orStringMatcher{}, unoptimized) + + optimized := optimizeEqualOrPrefixStringMatchers(unoptimized, 0) + if numAlternations < minEqualMultiStringMatcherMapThreshold && !prefixMatcher { + require.IsType(b, &equalMultiStringSliceMatcher{}, optimized) + } else { + require.IsType(b, &equalMultiStringMapMatcher{}, optimized) + } + + b.Run("without optimizeEqualOrPrefixStringMatchers()", func(b *testing.B) { + for n := 0; n < b.N; n++ { + for _, t := range texts { + unoptimized.Matches(t) + } + } + }) + + b.Run("with optimizeEqualOrPrefixStringMatchers()", func(b *testing.B) { + for n := 0; n < b.N; n++ { + for _, t := range texts { + optimized.Matches(t) + } + } + }) }) - }) + } } } } @@ -1204,10 +1361,16 @@ func visitStringMatcher(matcher StringMatcher, callback func(matcher StringMatch } // No nested matchers for the following ones. + case *equalMultiStringMapMatcher: + for _, prefixes := range casted.prefixes { + for _, matcher := range prefixes { + visitStringMatcher(matcher, callback) + } + } + case emptyStringMatcher: case *equalStringMatcher: case *equalMultiStringSliceMatcher: - case *equalMultiStringMapMatcher: case anyStringWithoutNewlineMatcher: case *anyNonEmptyStringMatcher: case trueMatcher: