Optimized very long case insensitive alternations (#444)

* Optimized very long case insensitive alternations

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Run common regexps in BenchmarkFastRegexMatcher

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Modify BenchmarkNewFastRegexMatcher to benchmark the NewFastRegexMatcher() function

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Reduced allocations by optimizeEqualStringMatchers()

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Fixed typo in comments

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Fixed typo in test case name

Signed-off-by: Marco Pracucci <marco@pracucci.com>

---------

Signed-off-by: Marco Pracucci <marco@pracucci.com>
This commit is contained in:
Marco Pracucci 2023-03-02 17:20:52 +01:00 committed by GitHub
parent 383ea59ce1
commit 1e7ad0ec11
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 306 additions and 44 deletions

View file

@ -20,7 +20,14 @@ import (
"github.com/grafana/regexp/syntax"
)
// Package-level tuning constants, grouped in a single block so that each name
// is declared exactly once.
const (
	maxSetMatches = 256

	// optimizeEqualStringMatchersThreshold is the minimum number of alternate
	// values a regex should have to trigger the optimization done by
	// optimizeEqualStringMatchers(). This value has been computed running
	// BenchmarkOptimizeEqualStringMatchers.
	optimizeEqualStringMatchersThreshold = 16
)
type FastRegexMatcher struct {
re *regexp.Regexp
@ -326,7 +333,10 @@ type StringMatcher interface {
// stringMatcherFromRegexp builds a StringMatcher for the parsed regexp re.
//
// It first calls clearBeginEndText on re (presumably stripping the begin/end
// text anchors in place; verify against its definition), converts the result
// with stringMatcherFromRegexpInternal and finally hands the matcher to
// optimizeEqualStringMatchers, which may replace a long alternation of
// equality matchers with a single map-based matcher.
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
	clearBeginEndText(re)

	// The optimization step must run on the converted matcher before it is
	// returned; returning the internal matcher directly would skip it.
	m := stringMatcherFromRegexpInternal(re)
	return optimizeEqualStringMatchers(m, optimizeEqualStringMatchersThreshold)
}
func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
@ -503,6 +513,24 @@ func (m *equalStringMatcher) Matches(s string) bool {
return strings.EqualFold(m.s, s)
}
// equalMultiStringMatcher reports whether a string is exactly equal to one of
// a fixed set of values, using a map lookup rather than a linear scan.
type equalMultiStringMatcher struct {
	// values is the set of accepted strings. When caseSensitive is false the
	// keys must already be lowercase, because Matches lowercases its input
	// before the lookup.
	values map[string]struct{}

	// caseSensitive selects between an exact lookup (true) and a lookup on the
	// lowercased input (false).
	caseSensitive bool
}

// Matches returns true if s is one of the configured values.
//
// NOTE(review): the case insensitive path relies on strings.ToLower, whereas
// equalStringMatcher relies on strings.EqualFold. The two are not guaranteed
// to agree for every Unicode input; confirm this is acceptable for the
// supported label values.
func (m *equalMultiStringMatcher) Matches(s string) bool {
	key := s
	if !m.caseSensitive {
		key = strings.ToLower(s)
	}

	_, found := m.values[key]
	return found
}
// anyStringMatcher is a matcher that matches any string.
// It is used for the + and * operators. matchNL tells whether it should match newlines or not.
type anyStringMatcher struct {
@ -519,3 +547,92 @@ func (m *anyStringMatcher) Matches(s string) bool {
}
return true
}
// optimizeEqualStringMatchers optimizes the specific case where the input is an
// alternation (orStringMatcher, possibly nested) made exclusively of strings
// checked for equality (equalStringMatcher), all sharing the same case
// sensitivity. When at least threshold such strings are found, the whole
// alternation is replaced by an equalMultiStringMatcher, which uses a map
// lookup instead of iterating over the list of strings. In every other case
// the input is returned unchanged.
func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher {
	var (
		total          int
		sensitivity    bool
		sensitivitySet bool
	)

	// First pass: count the equality matchers and verify that they all agree
	// on case sensitivity. Nothing is allocated here, so inputs that cannot be
	// optimized are rejected cheaply.
	eligible := findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
		if !sensitivitySet {
			sensitivity, sensitivitySet = matcher.caseSensitive, true
		} else if sensitivity != matcher.caseSensitive {
			// Mixed case sensitivity can't be represented by a single set.
			return false
		}

		total++
		return true
	})

	// Skip the optimization if the input has an unsupported shape or if it
	// holds fewer values than the threshold.
	if !eligible || total < threshold {
		return input
	}

	// Second pass: collect the values into a set sized upfront. The first pass
	// already validated the input, so the return value can be ignored and the
	// case sensitivity doesn't need to be checked again.
	set := make(map[string]struct{}, total)
	findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
		key := matcher.s
		if !sensitivity {
			// Case insensitive sets store lowercase keys.
			key = strings.ToLower(key)
		}

		set[key] = struct{}{}
		return true
	})

	return &equalMultiStringMatcher{
		values:        set,
		caseSensitive: sensitivity,
	}
}
// findEqualStringMatchers walks the input StringMatcher and invokes callback
// for each equalStringMatcher it finds, descending into nested
// orStringMatcher values. It returns true if and only if the input is an
// orStringMatcher composed *only* of equalStringMatcher and nested
// orStringMatcher entries, and the callback returned true for every one of
// them. The walk stops at the first entry that fails either condition.
func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool {
	alternation, isAlternation := input.(orStringMatcher)
	if !isAlternation {
		return false
	}

	for _, child := range alternation {
		if equal, isEqual := child.(*equalStringMatcher); isEqual {
			if !callback(equal) {
				return false
			}
			continue
		}

		// Anything else must be a nested alternation. The recursive call
		// returns false for every other matcher type, which means the
		// optimization can't be applied and the search has to stop.
		if !findEqualStringMatchers(child, callback) {
			return false
		}
	}

	return true
}

View file

@ -15,6 +15,7 @@ package labels
import (
"bufio"
"fmt"
"math/rand"
"os"
"strings"
@ -33,6 +34,8 @@ func init() {
var (
letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
regexes = []string{
"foo",
"^foo",
"(foo|bar)",
"foo.*",
".*foo",
@ -46,17 +49,24 @@ var (
"foo\n.*",
".*foo.*",
".+foo.+",
"",
"(?s:.*)",
"(?s:.+)",
"(?s:^.*foo$)",
"(?i:foo)",
"(?i:(foo|bar))",
"(?i:(foo1|foo2|bar))",
"^(?i:foo|oo)|(bar)$",
"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
"((.*)(bar|b|buzz)(.+)|foo)$",
"^$",
"(prometheus|api_prom)_api_v1_.+",
"10\\.0\\.(1|2)\\.+",
"10\\.0\\.(1|2).+",
"((fo(bar))|.+foo)",
// A long case sensitive alternation.
"zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb",
// A long case insensitive alternation.
"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))",
}
values = []string{
"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
@ -83,27 +93,15 @@ func TestNewFastRegexMatcher(t *testing.T) {
}
func BenchmarkNewFastRegexMatcher(b *testing.B) {
benchValues := values
for _, v := range values {
for i := 5; i < 50; i = i + 5 {
benchValues = append(benchValues, v+RandStringRunes(i))
benchValues = append(benchValues, RandStringRunes(i)+v+RandStringRunes(i))
benchValues = append(benchValues, RandStringRunes(i)+v)
}
}
for _, r := range regexes {
r := r
b.Run(r, func(b *testing.B) {
m, err := NewFastRegexMatcher(r)
require.NoError(b, err)
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, v := range benchValues {
_ = m.MatchString(v)
b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
for n := 0; n < b.N; n++ {
_, err := NewFastRegexMatcher(r)
if err != nil {
b.Fatal(err)
}
}
})
}
}
@ -232,29 +230,8 @@ func BenchmarkFastRegexMatcher(b *testing.B) {
y = "foo" + x
z = x + "foo"
)
regexes := []string{
"foo",
"^foo",
"(foo|bar)",
"foo.*",
".*foo",
"^.*foo$",
"^.+foo$",
".*",
".+",
"foo.+",
".+foo",
".*foo.*",
"(?i:foo)",
"(?i:(foo|bar))",
"(?i:(foo1|foo2|bar))",
"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
"(prometheus|api_prom)_api_v1_.+",
"((fo(bar))|.+foo)",
}
for _, r := range regexes {
r := r
b.Run(r, func(b *testing.B) {
b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
m, err := NewFastRegexMatcher(r)
require.NoError(b, err)
b.ResetTimer()
@ -331,14 +308,22 @@ func Test_OptimizeRegex(t *testing.T) {
}
}
func RandStringRunes(n int) string {
b := make([]rune, n)
func randString(length int) string {
b := make([]rune, length)
for i := range b {
b[i] = letterRunes[rand.Intn(len(letterRunes))]
}
return string(b)
}
// randStrings returns many random strings, each one generated by randString
// with the given length.
func randStrings(many, length int) []string {
	generated := make([]string, 0, many)
	for remaining := many; remaining > 0; remaining-- {
		generated = append(generated, randString(length))
	}
	return generated
}
func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) {
// Create all matchers.
matchers := make([]*FastRegexMatcher, 0, len(regexes))
@ -428,3 +413,163 @@ func TestAnalyzeRealQueries(t *testing.T) {
t.Logf("Found %d (%.2f%%) optimized matchers out of %d", numOptimized, (float64(numOptimized)/float64(numChecked))*100, numChecked)
}
// TestOptimizeEqualStringMatchers checks which input shapes are converted by
// optimizeEqualStringMatchers into an equalMultiStringMatcher and which are
// left untouched. A nil wantValues means the optimization is expected to be
// skipped.
func TestOptimizeEqualStringMatchers(t *testing.T) {
	cases := map[string]struct {
		input             StringMatcher
		wantValues        map[string]struct{}
		wantCaseSensitive bool
	}{
		"should skip optimization on orStringMatcher with containsStringMatcher": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&containsStringMatcher{substrings: []string{"a", "b", "c"}},
			},
			wantValues: nil,
		},
		"should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&equalStringMatcher{s: "bar", caseSensitive: true},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			wantValues: map[string]struct{}{
				"FOO": {},
				"bar": {},
				"baz": {},
			},
			wantCaseSensitive: true,
		},
		"should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&equalStringMatcher{s: "bar", caseSensitive: false},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			wantValues: nil,
		},
		"should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				orStringMatcher{
					&equalStringMatcher{s: "bar", caseSensitive: true},
					&equalStringMatcher{s: "xxx", caseSensitive: true},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			wantValues: map[string]struct{}{
				"FOO": {},
				"bar": {},
				"xxx": {},
				"baz": {},
			},
			wantCaseSensitive: true,
		},
		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				orStringMatcher{
					// The nested items disagree with each other on case sensitivity.
					&equalStringMatcher{s: "bar", caseSensitive: true},
					&equalStringMatcher{s: "xxx", caseSensitive: false},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			wantValues: nil,
		},
		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				// The nested items agree with each other but not with the parent.
				orStringMatcher{
					&equalStringMatcher{s: "bar", caseSensitive: false},
					&equalStringMatcher{s: "xxx", caseSensitive: false},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			wantValues: nil,
		},
		"should return lowercase values on case insensitive matchers": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: false},
				orStringMatcher{
					&equalStringMatcher{s: "bAr", caseSensitive: false},
				},
				&equalStringMatcher{s: "baZ", caseSensitive: false},
			},
			wantValues: map[string]struct{}{
				"foo": {},
				"bar": {},
				"baz": {},
			},
			wantCaseSensitive: false,
		},
	}

	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			// A threshold of 0 lets the optimization run regardless of how
			// many values the input holds.
			got := optimizeEqualStringMatchers(tc.input, 0)

			if tc.wantValues == nil {
				// Optimization skipped: the matcher keeps the input's type.
				require.IsType(t, tc.input, got)
				return
			}

			require.IsType(t, &equalMultiStringMatcher{}, got)
			multi := got.(*equalMultiStringMatcher)
			require.Equal(t, tc.wantValues, multi.values)
			require.Equal(t, tc.wantCaseSensitive, multi.caseSensitive)
		})
	}
}
// BenchmarkOptimizeEqualStringMatchers is used to find a good threshold at
// which to apply the optimization done by optimizeEqualStringMatchers(). For a
// growing number of alternations, and for both case sensitivities, it matches
// the same texts against the unoptimized and the optimized matcher.
func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
	// Random texts of variable length to match against.
	var texts []string
	texts = append(texts, randStrings(10, 10)...)
	texts = append(texts, randStrings(5, 30)...)
	texts = append(texts, randStrings(1, 100)...)

	// matchAll is the timed loop: it runs every text through the matcher.
	matchAll := func(b *testing.B, matcher StringMatcher) {
		for n := 0; n < b.N; n++ {
			for _, text := range texts {
				matcher.Matches(text)
			}
		}
	}

	for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
		for _, caseSensitive := range []bool{true, false} {
			name := fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive)

			b.Run(name, func(b *testing.B) {
				// Build a regex with the expected number of alternations.
				pattern := strings.Join(randStrings(numAlternations, 10), "|")
				if !caseSensitive {
					pattern = "(?i:(" + pattern + "))"
				}

				parsed, err := syntax.Parse(pattern, syntax.Perl)
				require.NoError(b, err)

				unoptimized := stringMatcherFromRegexpInternal(parsed)
				require.IsType(b, orStringMatcher{}, unoptimized)

				// A threshold of 0 forces the optimization to be applied.
				optimized := optimizeEqualStringMatchers(unoptimized, 0)
				require.IsType(b, &equalMultiStringMatcher{}, optimized)

				b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) {
					matchAll(b, unoptimized)
				})

				b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) {
					matchAll(b, optimized)
				})
			})
		}
	}
}
// getTestNameFromRegexp returns the name to use for the sub-test or
// sub-benchmark of the regexp re. Regexps up to 32 bytes long are returned
// as is; longer ones are cut to at most 32 bytes. The cut always falls on a
// rune boundary, so a multi-byte UTF-8 character is never split in half.
func getTestNameFromRegexp(re string) string {
	const maxNameLen = 32

	if len(re) <= maxNameLen {
		return re
	}

	// Ranging over a string yields the byte offset at which each rune starts:
	// keep the greatest offset that does not exceed the limit.
	cut := 0
	for offset := range re {
		if offset > maxNameLen {
			break
		}
		cut = offset
	}

	return re[:cut]
}