mirror of
https://github.com/prometheus/prometheus.git
synced 2024-11-14 17:44:06 -08:00
Optimized very long case insensitive alternations (#444)
* Optimized very long case insensitive alternations Signed-off-by: Marco Pracucci <marco@pracucci.com> * Run common regexps in BenchmarkFastRegexMatcher Signed-off-by: Marco Pracucci <marco@pracucci.com> * Modify BenchmarkNewFastRegexMatcher to benchmark the NewFastRegexMatcher() function Signed-off-by: Marco Pracucci <marco@pracucci.com> * Reduced allocations by optimizeEqualStringMatchers() Signed-off-by: Marco Pracucci <marco@pracucci.com> * Fixed typo in comments Signed-off-by: Marco Pracucci <marco@pracucci.com> * Fixed typo in test case name Signed-off-by: Marco Pracucci <marco@pracucci.com> --------- Signed-off-by: Marco Pracucci <marco@pracucci.com>
This commit is contained in:
parent
383ea59ce1
commit
1e7ad0ec11
|
@ -20,7 +20,14 @@ import (
|
|||
"github.com/grafana/regexp/syntax"
|
||||
)
|
||||
|
||||
const (
	// NOTE(review): the consumers of maxSetMatches are not visible in this
	// chunk; presumably it caps how many literal set matches are extracted
	// from a regexp — confirm against the rest of the file.
	maxSetMatches = 256

	// The minimum number of alternate values a regex should have to trigger
	// the optimization done by optimizeEqualStringMatchers(). This value has
	// been computed running BenchmarkOptimizeEqualStringMatchers.
	optimizeEqualStringMatchersThreshold = 16
)
|
||||
|
||||
type FastRegexMatcher struct {
|
||||
re *regexp.Regexp
|
||||
|
@ -326,7 +333,10 @@ type StringMatcher interface {
|
|||
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
|
||||
clearBeginEndText(re)
|
||||
|
||||
return stringMatcherFromRegexpInternal(re)
|
||||
m := stringMatcherFromRegexpInternal(re)
|
||||
m = optimizeEqualStringMatchers(m, optimizeEqualStringMatchersThreshold)
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
|
||||
|
@ -503,6 +513,24 @@ func (m *equalStringMatcher) Matches(s string) bool {
|
|||
return strings.EqualFold(m.s, s)
|
||||
}
|
||||
|
||||
// equalMultiStringMatcher reports whether a string is exactly equal to one
// of a fixed set of values, using a map lookup instead of a linear scan.
//
// NOTE(review): equalStringMatcher compares with strings.EqualFold, while
// this matcher lowercases with strings.ToLower. The two are not equivalent
// for every rune (e.g. U+017F 'ſ' folds to 's' but is left unchanged by
// ToLower), so case insensitive results may differ for such input — confirm
// whether that matters for callers.
type equalMultiStringMatcher struct {
	// values is the set of accepted strings. When caseSensitive is false the
	// keys must already be lowercase, because Matches lowercases its input
	// before the lookup.
	values map[string]struct{}

	caseSensitive bool
}

// Matches returns true if s is one of the configured values. For a case
// insensitive matcher the lookup is done on the lowercase form of s.
func (m *equalMultiStringMatcher) Matches(s string) bool {
	if m.caseSensitive {
		_, found := m.values[s]
		return found
	}

	_, found := m.values[strings.ToLower(s)]
	return found
}
|
||||
|
||||
// anyStringMatcher is a matcher that matches any string.
|
||||
// It is used for the + and * operator. matchNL tells if it should match newlines or not.
|
||||
type anyStringMatcher struct {
|
||||
|
@ -519,3 +547,92 @@ func (m *anyStringMatcher) Matches(s string) bool {
|
|||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// optimizeEqualStringMatchers optimize a specific case where all matchers are made by an
|
||||
// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher). In
|
||||
// this specific case, when we have many strings to match against we can use a map instead
|
||||
// of iterating over the list of strings.
|
||||
func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher {
|
||||
var (
|
||||
caseSensitive bool
|
||||
caseSensitiveSet bool
|
||||
numValues int
|
||||
)
|
||||
|
||||
// Analyse the input StringMatcher to count the number of occurrences
|
||||
// and ensure all of them have the same case sensitivity.
|
||||
analyseCallback := func(matcher *equalStringMatcher) bool {
|
||||
// Ensure we don't have mixed case sensitivity.
|
||||
if caseSensitiveSet && caseSensitive != matcher.caseSensitive {
|
||||
return false
|
||||
} else if !caseSensitiveSet {
|
||||
caseSensitive = matcher.caseSensitive
|
||||
caseSensitiveSet = true
|
||||
}
|
||||
|
||||
numValues++
|
||||
return true
|
||||
}
|
||||
|
||||
if !findEqualStringMatchers(input, analyseCallback) {
|
||||
return input
|
||||
}
|
||||
|
||||
// If the number of values found is less than the threshold, then we should skip the optimization.
|
||||
if numValues < threshold {
|
||||
return input
|
||||
}
|
||||
|
||||
// Parse again the input StringMatcher to extract all values and storing them.
|
||||
// We can skip the case sensitivity check because we've already checked it and
|
||||
// if the code reach this point then it means all matchers have the same case sensitivity.
|
||||
values := make(map[string]struct{}, numValues)
|
||||
|
||||
// Ignore the return value because we already iterated over the input StringMatcher
|
||||
// and it was all good.
|
||||
findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
|
||||
if caseSensitive {
|
||||
values[matcher.s] = struct{}{}
|
||||
} else {
|
||||
values[strings.ToLower(matcher.s)] = struct{}{}
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
|
||||
return &equalMultiStringMatcher{
|
||||
values: values,
|
||||
caseSensitive: caseSensitive,
|
||||
}
|
||||
}
|
||||
|
||||
// findEqualStringMatchers analyze the input StringMatcher and calls the callback for each
|
||||
// equalStringMatcher found. Returns true if and only if the input StringMatcher is *only*
|
||||
// composed by an alternation of equalStringMatcher.
|
||||
func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool {
|
||||
orInput, ok := input.(orStringMatcher)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, m := range orInput {
|
||||
switch casted := m.(type) {
|
||||
case orStringMatcher:
|
||||
if !findEqualStringMatchers(m, callback) {
|
||||
return false
|
||||
}
|
||||
|
||||
case *equalStringMatcher:
|
||||
if !callback(casted) {
|
||||
return false
|
||||
}
|
||||
|
||||
default:
|
||||
// It's not an equal string matcher, so we have to stop searching
|
||||
// cause this optimization can't be applied.
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@ package labels
|
|||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"os"
|
||||
"strings"
|
||||
|
@ -33,6 +34,8 @@ func init() {
|
|||
var (
|
||||
letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
||||
regexes = []string{
|
||||
"foo",
|
||||
"^foo",
|
||||
"(foo|bar)",
|
||||
"foo.*",
|
||||
".*foo",
|
||||
|
@ -46,17 +49,24 @@ var (
|
|||
"foo\n.*",
|
||||
".*foo.*",
|
||||
".+foo.+",
|
||||
"",
|
||||
"(?s:.*)",
|
||||
"(?s:.+)",
|
||||
"(?s:^.*foo$)",
|
||||
"(?i:foo)",
|
||||
"(?i:(foo|bar))",
|
||||
"(?i:(foo1|foo2|bar))",
|
||||
"^(?i:foo|oo)|(bar)$",
|
||||
"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
|
||||
"((.*)(bar|b|buzz)(.+)|foo)$",
|
||||
"^$",
|
||||
"(prometheus|api_prom)_api_v1_.+",
|
||||
"10\\.0\\.(1|2)\\.+",
|
||||
"10\\.0\\.(1|2).+",
|
||||
"((fo(bar))|.+foo)",
|
||||
// A long case sensitive alternation.
|
||||
"zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb",
|
||||
// A long case insensitive alternation.
|
||||
"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))",
|
||||
}
|
||||
values = []string{
|
||||
"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
|
||||
|
@ -83,27 +93,15 @@ func TestNewFastRegexMatcher(t *testing.T) {
|
|||
}
|
||||
|
||||
func BenchmarkNewFastRegexMatcher(b *testing.B) {
|
||||
benchValues := values
|
||||
for _, v := range values {
|
||||
for i := 5; i < 50; i = i + 5 {
|
||||
benchValues = append(benchValues, v+RandStringRunes(i))
|
||||
benchValues = append(benchValues, RandStringRunes(i)+v+RandStringRunes(i))
|
||||
benchValues = append(benchValues, RandStringRunes(i)+v)
|
||||
}
|
||||
}
|
||||
for _, r := range regexes {
|
||||
r := r
|
||||
b.Run(r, func(b *testing.B) {
|
||||
m, err := NewFastRegexMatcher(r)
|
||||
require.NoError(b, err)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, v := range benchValues {
|
||||
_ = m.MatchString(v)
|
||||
b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
|
||||
for n := 0; n < b.N; n++ {
|
||||
_, err := NewFastRegexMatcher(r)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -232,29 +230,8 @@ func BenchmarkFastRegexMatcher(b *testing.B) {
|
|||
y = "foo" + x
|
||||
z = x + "foo"
|
||||
)
|
||||
regexes := []string{
|
||||
"foo",
|
||||
"^foo",
|
||||
"(foo|bar)",
|
||||
"foo.*",
|
||||
".*foo",
|
||||
"^.*foo$",
|
||||
"^.+foo$",
|
||||
".*",
|
||||
".+",
|
||||
"foo.+",
|
||||
".+foo",
|
||||
".*foo.*",
|
||||
"(?i:foo)",
|
||||
"(?i:(foo|bar))",
|
||||
"(?i:(foo1|foo2|bar))",
|
||||
"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
|
||||
"(prometheus|api_prom)_api_v1_.+",
|
||||
"((fo(bar))|.+foo)",
|
||||
}
|
||||
for _, r := range regexes {
|
||||
r := r
|
||||
b.Run(r, func(b *testing.B) {
|
||||
b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
|
||||
m, err := NewFastRegexMatcher(r)
|
||||
require.NoError(b, err)
|
||||
b.ResetTimer()
|
||||
|
@ -331,14 +308,22 @@ func Test_OptimizeRegex(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func RandStringRunes(n int) string {
|
||||
b := make([]rune, n)
|
||||
func randString(length int) string {
|
||||
b := make([]rune, length)
|
||||
for i := range b {
|
||||
b[i] = letterRunes[rand.Intn(len(letterRunes))]
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
func randStrings(many, length int) []string {
|
||||
out := make([]string, 0, many)
|
||||
for i := 0; i < many; i++ {
|
||||
out = append(out, randString(length))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) {
|
||||
// Create all matchers.
|
||||
matchers := make([]*FastRegexMatcher, 0, len(regexes))
|
||||
|
@ -428,3 +413,163 @@ func TestAnalyzeRealQueries(t *testing.T) {
|
|||
|
||||
t.Logf("Found %d (%.2f%%) optimized matchers out of %d", numOptimized, (float64(numOptimized)/float64(numChecked))*100, numChecked)
|
||||
}
|
||||
|
||||
func TestOptimizeEqualStringMatchers(t *testing.T) {
|
||||
tests := map[string]struct {
|
||||
input StringMatcher
|
||||
expectedValues map[string]struct{}
|
||||
expectedCaseSensitive bool
|
||||
}{
|
||||
"should skip optimization on orStringMatcher with containsStringMatcher": {
|
||||
input: orStringMatcher{
|
||||
&equalStringMatcher{s: "FOO", caseSensitive: true},
|
||||
&containsStringMatcher{substrings: []string{"a", "b", "c"}},
|
||||
},
|
||||
expectedValues: nil,
|
||||
},
|
||||
"should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity": {
|
||||
input: orStringMatcher{
|
||||
&equalStringMatcher{s: "FOO", caseSensitive: true},
|
||||
&equalStringMatcher{s: "bar", caseSensitive: true},
|
||||
&equalStringMatcher{s: "baz", caseSensitive: true},
|
||||
},
|
||||
expectedValues: map[string]struct{}{
|
||||
"FOO": {},
|
||||
"bar": {},
|
||||
"baz": {},
|
||||
},
|
||||
expectedCaseSensitive: true,
|
||||
},
|
||||
"should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity": {
|
||||
input: orStringMatcher{
|
||||
&equalStringMatcher{s: "FOO", caseSensitive: true},
|
||||
&equalStringMatcher{s: "bar", caseSensitive: false},
|
||||
&equalStringMatcher{s: "baz", caseSensitive: true},
|
||||
},
|
||||
expectedValues: nil,
|
||||
},
|
||||
"should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity": {
|
||||
input: orStringMatcher{
|
||||
&equalStringMatcher{s: "FOO", caseSensitive: true},
|
||||
orStringMatcher{
|
||||
&equalStringMatcher{s: "bar", caseSensitive: true},
|
||||
&equalStringMatcher{s: "xxx", caseSensitive: true},
|
||||
},
|
||||
&equalStringMatcher{s: "baz", caseSensitive: true},
|
||||
},
|
||||
expectedValues: map[string]struct{}{
|
||||
"FOO": {},
|
||||
"bar": {},
|
||||
"xxx": {},
|
||||
"baz": {},
|
||||
},
|
||||
expectedCaseSensitive: true,
|
||||
},
|
||||
"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity": {
|
||||
input: orStringMatcher{
|
||||
&equalStringMatcher{s: "FOO", caseSensitive: true},
|
||||
orStringMatcher{
|
||||
// Case sensitivity is different within items at the same level.
|
||||
&equalStringMatcher{s: "bar", caseSensitive: true},
|
||||
&equalStringMatcher{s: "xxx", caseSensitive: false},
|
||||
},
|
||||
&equalStringMatcher{s: "baz", caseSensitive: true},
|
||||
},
|
||||
expectedValues: nil,
|
||||
},
|
||||
"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one": {
|
||||
input: orStringMatcher{
|
||||
&equalStringMatcher{s: "FOO", caseSensitive: true},
|
||||
// Case sensitivity is different between the parent and child.
|
||||
orStringMatcher{
|
||||
&equalStringMatcher{s: "bar", caseSensitive: false},
|
||||
&equalStringMatcher{s: "xxx", caseSensitive: false},
|
||||
},
|
||||
&equalStringMatcher{s: "baz", caseSensitive: true},
|
||||
},
|
||||
expectedValues: nil,
|
||||
},
|
||||
"should return lowercase values on case insensitive matchers": {
|
||||
input: orStringMatcher{
|
||||
&equalStringMatcher{s: "FOO", caseSensitive: false},
|
||||
orStringMatcher{
|
||||
&equalStringMatcher{s: "bAr", caseSensitive: false},
|
||||
},
|
||||
&equalStringMatcher{s: "baZ", caseSensitive: false},
|
||||
},
|
||||
expectedValues: map[string]struct{}{
|
||||
"foo": {},
|
||||
"bar": {},
|
||||
"baz": {},
|
||||
},
|
||||
expectedCaseSensitive: false,
|
||||
},
|
||||
}
|
||||
|
||||
for testName, testData := range tests {
|
||||
t.Run(testName, func(t *testing.T) {
|
||||
actualMatcher := optimizeEqualStringMatchers(testData.input, 0)
|
||||
|
||||
if testData.expectedValues == nil {
|
||||
require.IsType(t, testData.input, actualMatcher)
|
||||
} else {
|
||||
require.IsType(t, &equalMultiStringMatcher{}, actualMatcher)
|
||||
require.Equal(t, testData.expectedValues, actualMatcher.(*equalMultiStringMatcher).values)
|
||||
require.Equal(t, testData.expectedCaseSensitive, actualMatcher.(*equalMultiStringMatcher).caseSensitive)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// This benchmark is used to find a good threshold to use to apply the optimization
|
||||
// done by optimizeEqualStringMatchers()
|
||||
func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
|
||||
// Generate variable lengths random texts to match against.
|
||||
texts := append([]string{}, randStrings(10, 10)...)
|
||||
texts = append(texts, randStrings(5, 30)...)
|
||||
texts = append(texts, randStrings(1, 100)...)
|
||||
|
||||
for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
|
||||
for _, caseSensitive := range []bool{true, false} {
|
||||
b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) {
|
||||
// Generate a regex with the expected number of alternations.
|
||||
re := strings.Join(randStrings(numAlternations, 10), "|")
|
||||
if !caseSensitive {
|
||||
re = "(?i:(" + re + "))"
|
||||
}
|
||||
|
||||
parsed, err := syntax.Parse(re, syntax.Perl)
|
||||
require.NoError(b, err)
|
||||
|
||||
unoptimized := stringMatcherFromRegexpInternal(parsed)
|
||||
require.IsType(b, orStringMatcher{}, unoptimized)
|
||||
|
||||
optimized := optimizeEqualStringMatchers(unoptimized, 0)
|
||||
require.IsType(b, &equalMultiStringMatcher{}, optimized)
|
||||
|
||||
b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) {
|
||||
for n := 0; n < b.N; n++ {
|
||||
for _, t := range texts {
|
||||
unoptimized.Matches(t)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) {
|
||||
for n := 0; n < b.N; n++ {
|
||||
for _, t := range texts {
|
||||
optimized.Matches(t)
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getTestNameFromRegexp returns a name for the test or benchmark of the
// regexp re. Regexps up to 32 bytes long are returned as they are; longer
// ones are truncated to at most 32 bytes, never splitting a multi-byte
// UTF-8 sequence so the name stays valid text.
func getTestNameFromRegexp(re string) string {
	const maxNameLen = 32

	if len(re) <= maxNameLen {
		return re
	}

	// Ranging over a string yields the byte offset at which each rune
	// starts, so cut always lands on a rune boundary. For ASCII input it
	// ends up being exactly maxNameLen.
	cut := 0
	for offset := range re {
		if offset > maxNameLen {
			break
		}
		cut = offset
	}

	return re[:cut]
}
|
||||
|
|
Loading…
Reference in a new issue