// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package labels
import (
"bufio"
"fmt"
"math/rand"
"os"
"sort"
"strconv"
"strings"
"testing"
"time"
"github.com/DmitriyVTitov/size"
"github.com/grafana/regexp"
"github.com/grafana/regexp/syntax"
"github.com/stretchr/testify/require"
)
var (
	// asciiRunes is the alphabet used by randString to build random test inputs.
	asciiRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_")
	// regexes is the shared corpus of patterns exercised by the tests,
	// benchmarks and fuzzers below. It intentionally mixes trivially
	// optimizable patterns (literals, alternations, prefix/suffix matches)
	// with pathological ones (huge alternations, case-insensitive variants).
	regexes = []string{
		"",
		"foo",
		"^foo",
		"(foo|bar)",
		"foo.*",
		".*foo",
		"^.*foo$",
		"^.+foo$",
		".*",
		".+",
		"foo.+",
		".+foo",
		"foo\n.+",
		"foo\n.*",
		".*foo.*",
		".+foo.+",
		"(?s:.*)",
		"(?s:.+)",
		"(?s:^.*foo$)",
		"(?i:foo)",
		"(?i:(foo|bar))",
		"(?i:(foo1|foo2|bar))",
		"^(?i:foo|oo)|(bar)$",
		"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
		"((.*)(bar|b|buzz)(.+)|foo)$",
		"^$",
		"(prometheus|api_prom)_api_v1_.+",
		"10\\.0\\.(1|2)\\.+",
		"10\\.0\\.(1|2).+",
		"((fo(bar))|.+foo)",
		// A long case sensitive alternation.
		"zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb",
		// An extremely long case sensitive alternation. This is a special
		// case because the values share common prefixes rather than being
		// entirely random. This is common in the real world. For example, the
		// values of a label like kubernetes pod will often include the
		// deployment name as a prefix.
		// NOTE(review): this literal appears truncated here (no closing quote
		// visible) — likely an extraction artifact; verify against upstream.
		"jyyfj00j0061|jyyfj00j0062|jyyfj94j0093|jyyfj99j0093|jyyfm01j0021|jyyfm02j0021|jyefj00j0192|jyefj00j0193|jyefj00j0194|jyefj00j0195|jyefj00j0196|jyefj00j0197|jyefj00j0290|jyefj00j0291|jyefj00j0292|jyefj00j0293|jyefj00j0294|jyefj00j0295|jyefj00j0296|jyefj00j0297|jyefj89j0394|jyefj90j0394|jyefj91j0394|jyefj95j0347|jyefj96j0322|jyefj96j0347|jyefj97j0322|jyefj97j0347|jyefj98j0322|jyefj98j0347|jyefj99j0320|jyefj99j0322|jyefj99j0323|jyefj99j0335|jyefj99j0336|jyefj99j0344|jyefj99j0347|jyefj99j0349|jyefj99j0351|jyeff00j0117|lyyfm01j0025|lyyfm01j0028|lyyfm01j0041|lyyfm01j0133|lyyfm01j0701|lyyfm02j0025|lyyfm02j0028|lyyfm02j0041|lyyfm02j0133|lyyfm02j0701|lyyfm03j0701|lyefj00j0775|lyefj00j0776|lyefj00j0777|lyefj00j0778|lyefj00j0779|lyefj00j0780|lyefj00j0781|lyefj00j0782|lyefj50j3807|lyefj50j3852|lyefj51j3807|lyefj51j3852|lyefj52j3807|lyefj52j3852|lyefj53j3807|lyefj53j3852|lyefj54j3807|lyefj54j3852|lyefj54j3886|lyefj55j3807|lyefj55j3852|lyefj55j3886|lyefj56j3807|lyefj56j3852|lyefj56j3886|lyefj57j3807|lyefj57j3852|lyefj57j3886|lyefj58j3807|lyefj58j3852|lyefj58j3886|lyefj59j3807|lyefj59j3852|lyefj59j3886|lyefj60j3807|lyefj60j3852|lyefj60j3886|lyefj61j3807|lyefj61j3852|lyefj61j3886|lyefj62j3807|lyefj62j3852|lyefj62j3886|lyefj63j3807|lyefj63j3852|lyefj63j3886|lyefj64j3807|lyefj64j3852|lyefj64j3886|lyefj65j3807|lyefj65j3852|lyefj65j3886|lyefj66j3807|lyefj66j3852|lyefj66j3886|lyefj67j3807|lyefj67j3852|lyefj67j3886|lyefj68j3807|lyefj68j3852|lyefj68j3886|lyefj69j3807|lyefj69j3846|lyefj69j3852|lyefj69j3886|lyefj70j3807|lyefj70j3846|lyefj70j3852|lyefj70j3886|lyefj71j3807|lyefj71j3846|lyefj71j3852|lyefj71j3886|lyefj72j3807|lyefj72j3846|lyefj72j3852|lyefj72j3886|lyefj73j3807|lyefj73j3846|lyefj73j3852|lyefj73j3886|lyefj74j3807|lyefj74j3846|lyefj74j3852|lyefj74j3886|lyefj75j3807|lyefj75j3808|lyefj75j3846|lyefj75j3852|lyefj75j3886|lyefj76j3732|lyefj76j3807|lyefj76j3808|lyefj76j3846|lyefj76j3852|lyefj76j3886|lyefj77j3732|lyefj77j3807|lyefj77j3808|lyefj77j3846|lyefj77j3852|lyefj77j3886|lyefj78j32
78|lyefj78j3732|lyefj78j3807|lyefj78j3808|lyefj78j3846|lyefj78j3852|lyefj78j3886|lyefj79j3732|lyefj79j3807|lyefj79j3808|lyefj79j3846|lyefj79j3852|lyefj79j3886|lyefj80j3732|lyefj80j3807|lyefj80j3808|lyefj80j3846|lyefj80j3852|lyefj80j3886|lyefj81j3732|lyefj81j3807|lyefj81j3808|lyefj81j3846|lyefj81j3852|lyefj81j3886|lyefj82j3732|lyefj82j3807|lyefj82j3808|lyefj82j3846|lyefj82j3852|lyefj82j3886|lyefj83j3732|lyefj83j3807|lyefj83j3808|lyefj83j3846|lyefj83j3852|lyefj83j3886|lyefj84j3732|lyefj84j3807|lyefj84j3808|lyefj84j3846|lyefj84j3852|lyefj84j3886|lyefj85j3732|lyefj85j3807|lyefj85j3808|lyefj85j3846|lyefj85j3852|lyefj85j3886|lyefj86j3278|lyefj86j3732|lyefj86j3807|lyefj86j3808|lyefj86j3846|lyefj86j3852|lyefj86j3886|lyefj87j3278|lyefj87j3732|lyefj87j3807|lyefj87j3808|lyefj87j3846|lyefj87j3852|lyefj87j3886|lyefj88j3732|lyefj88j3807|lyefj88j3808|lyefj88j3846|lyefj88j3852|lyefj88j3886|lyefj89j3732|lyefj89j3807|lyefj89j3808|lyefj89j3846|lyefj89j3852|lyefj89j3886|lyefj90j3732|lyefj90j3807|lyefj90j3808|lyefj90j3846|lyefj90j3852|lyefj90j3886|lyefj91j3732|lyefj91j3807|lyefj91j3808|lyefj91j3846|lyefj91j3852|lyefj91j3886|lyefj92j3732|lyefj92j3807|lyefj92j3808|lyefj92j3846|lyefj92j3852|lyefj92j3886|lyefj93j3732|lyefj93j3807|lyefj93j3808|lyefj93j3846|lyefj93j3852|lyefj93j3885|lyefj93j3886|lyefj94j3525|lyefj94j3732|lyefj94j3807|lyefj94j3808|lyefj94j3846|lyefj94j3852|lyefj94j3885|lyefj94j3886|lyefj95j3525|lyefj95j3732|lyefj95j3807|lyefj95j3808|lyefj95j3846|lyefj95j3852|lyefj95j3886|lyefj96j3732|lyefj96j3803|lyefj96j3807|lyefj96j3808|lyefj96j3846|lyefj96j3852|lyefj96j3886|lyefj97j3333|lyefj97j3732|lyefj97j3792|lyefj97j3803|lyefj97j3807|lyefj97j3808|lyefj97j3838|lyefj97j3843|lyefj97j3846|lyefj97j3852|lyefj97j3886|lyefj98j3083|lyefj98j3333|lyefj98j3732|lyefj98j3807|lyefj98j3808|lyefj98j3838|lyefj98j3843|lyefj98j3846|lyefj98j3852|lyefj98j3873|lyefj98j3877|lyefj98j3882|lyefj98j3886|lyefj99j2984|lyefj99j3083|lyefj99j3333|lyefj99j3732|lyefj99j3807|lyefj99j3808|lyefj99j3846|lyefj99j3849|lyefj99j
3852|lyefj99j3873|lyefj99j3877|lyefj99j3882|lyefj99j3884|lyefj99j3886|lyeff00j0106|lyeff00j01
		// A long case insensitive alternation.
		"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))",
	}
	// values is the shared corpus of input strings matched against the
	// regexes above. It covers empty strings, newlines, case variations and
	// values crafted to hit prefix/suffix/contains optimizations.
	values = []string{
		"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
		"FOO", "Foo", "OO", "Oo", "\nfoo\n", strings.Repeat("f", 20), "prometheus", "prometheus_api_v1", "prometheus_api_v1_foo",
		"10.0.1.20", "10.0.2.10", "10.0.3.30", "10.0.4.40",
		"foofoo0", "foofoo",
	}
)
// TestNewFastRegexMatcher asserts that FastRegexMatcher agrees with the
// standard regexp engine (fully anchored) for every regex/value combination
// in the shared corpora.
func TestNewFastRegexMatcher(t *testing.T) {
	for _, pattern := range regexes {
		pattern := pattern
		for _, input := range values {
			input := input
			t.Run(pattern+` on "`+input+`"`, func(t *testing.T) {
				t.Parallel()
				matcher, err := NewFastRegexMatcher(pattern)
				require.NoError(t, err)
				// The reference implementation: the same pattern, anchored at
				// both ends, compiled by the standard engine.
				anchored := regexp.MustCompile("^(?:" + pattern + ")$")
				require.Equal(t, anchored.MatchString(input), matcher.MatchString(input))
			})
		}
	}
}
// TestNewFastRegexMatcher_CacheSizeLimit checks that the matcher cache is
// bounded by fastRegexMatcherCacheMaxSizeBytes: filling it to (estimated)
// capacity and adding one more entry must evict exactly one existing entry.
func TestNewFastRegexMatcher_CacheSizeLimit(t *testing.T) {
	// Start with an empty cache.
	fastRegexMatcherCache.Clear()
	// Init the random seed with a constant, so that it doesn't change between runs.
	randGenerator := rand.New(rand.NewSource(1))
	// Generate a very expensive regex.
	alternates := make([]string, 1000)
	for i := 0; i < len(alternates); i++ {
		alternates[i] = randString(randGenerator, 100) + fmt.Sprintf(".%d", i)
	}
	expensiveRegexp := strings.Join(alternates, "|")
	// Utility function to get a unique expensive regexp.
	getExpensiveRegexp := func(id int) string {
		return expensiveRegexp + fmt.Sprintf("%d", id)
	}
	// Estimate the size of the matcher with the expensive regexp.
	// Build it without the cache so the size reflects a single fresh matcher.
	m, err := newFastRegexMatcherWithoutCache(expensiveRegexp)
	require.NoError(t, err)
	expensiveRegexpSizeBytes := size.Of(m)
	t.Logf("expensive regexp estimated size (bytes): %d", expensiveRegexpSizeBytes)
	// Estimate the max number of items in the cache.
	estimatedMaxItemsInCache := fastRegexMatcherCacheMaxSizeBytes / expensiveRegexpSizeBytes
	// Fill the cache.
	for i := 0; i < estimatedMaxItemsInCache; i++ {
		_, err := NewFastRegexMatcher(getExpensiveRegexp(i))
		require.NoError(t, err)
	}
	// Ensure all regexp matchers are still in the cache.
	// Wait() acts as a barrier for the cache's asynchronous writes.
	fastRegexMatcherCache.Wait()
	for i := 0; i < estimatedMaxItemsInCache; i++ {
		_, ok := fastRegexMatcherCache.Get(getExpensiveRegexp(i))
		require.True(t, ok, "checking if regexp matcher #%d is still in the cache", i)
	}
	// Add one more regexp matcher to the cache.
	_, err = NewFastRegexMatcher(getExpensiveRegexp(estimatedMaxItemsInCache + 1))
	require.NoError(t, err)
	// Ensure one item has been evicted from the cache to make room for the new entry.
	fastRegexMatcherCache.Wait()
	numEvicted := 0
	for i := 0; i < estimatedMaxItemsInCache; i++ {
		if _, ok := fastRegexMatcherCache.Get(getExpensiveRegexp(i)); !ok {
			t.Logf("the regexp matcher #%d has been evicted from the cache", i)
			numEvicted++
		}
	}
	require.Equal(t, 1, numEvicted)
}
// BenchmarkNewFastRegexMatcher measures matcher construction cost for every
// regex in the corpus, both going through the cache and bypassing it.
func BenchmarkNewFastRegexMatcher(b *testing.B) {
	// bench returns a sub-benchmark that builds each corpus regex b.N times
	// using the given constructor.
	bench := func(construct func(v string) (*FastRegexMatcher, error)) func(b *testing.B) {
		return func(b *testing.B) {
			for _, pattern := range regexes {
				b.Run(getTestNameFromRegexp(pattern), func(b *testing.B) {
					for i := 0; i < b.N; i++ {
						if _, err := construct(pattern); err != nil {
							b.Fatal(err)
						}
					}
				})
			}
		}
	}
	b.Run("with cache", bench(NewFastRegexMatcher))
	b.Run("without cache", bench(newFastRegexMatcherWithoutCache))
}
// BenchmarkNewFastRegexMatcher_CacheMisses measures matcher construction cost
// when every lookup misses the cache: each iteration builds a unique regexp
// string so the cache can never serve it.
func BenchmarkNewFastRegexMatcher_CacheMisses(b *testing.B) {
	// Init the random seed with a constant, so that it doesn't change between runs.
	randGenerator := rand.New(rand.NewSource(1))
	tests := map[string]string{
		"simple regexp":  randString(randGenerator, 10),
		"complex regexp": strings.Join(randStrings(randGenerator, 100, 10), "|"),
	}
	for testName, regexpPrefix := range tests {
		b.Run(testName, func(b *testing.B) {
			// Ensure the cache is empty.
			fastRegexMatcherCache.Clear()
			b.ResetTimer()
			for n := 0; n < b.N; n++ {
				// Unique regexp to emulate 100% cache misses.
				// Named "pattern" (not "regexp") to avoid shadowing the
				// imported regexp package.
				pattern := regexpPrefix + strconv.Itoa(n)
				if _, err := NewFastRegexMatcher(pattern); err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}
// TestOptimizeConcatRegex checks the prefix/suffix/contains literals that
// optimizeConcatRegex extracts from a parsed concatenation regexp.
func TestOptimizeConcatRegex(t *testing.T) {
	cases := []struct {
		regex    string
		prefix   string
		suffix   string
		contains string
	}{
		{regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: ""},
		{regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: ""},
		{regex: "foo.*", prefix: "foo", suffix: "", contains: ""},
		{regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: "hello"},
		{regex: ".*foo", prefix: "", suffix: "foo", contains: ""},
		{regex: "^.*foo$", prefix: "", suffix: "foo", contains: ""},
		{regex: ".*foo.*", prefix: "", suffix: "", contains: "foo"},
		{regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: "foo"},
		{regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: ""},
		{regex: ".*[abc].*", prefix: "", suffix: "", contains: ""},
		{regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: ""},
		{regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: ""},
		{regex: "(?i:abc).*", prefix: "", suffix: "", contains: ""},
		{regex: ".*(?i:abc)", prefix: "", suffix: "", contains: ""},
		{regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: "def"},
		{regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: "abc"},
		{regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: "abc"},
		{regex: "[aA]bc.*", prefix: "", suffix: "", contains: "bc"},
		{regex: "^5..$", prefix: "5", suffix: "", contains: ""},
		{regex: "^release.*", prefix: "release", suffix: "", contains: ""},
		{regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: "laio"},
	}
	for _, tc := range cases {
		parsed, err := syntax.Parse(tc.regex, syntax.Perl)
		require.NoError(t, err)
		actualPrefix, actualSuffix, actualContains := optimizeConcatRegex(parsed)
		require.Equal(t, tc.prefix, actualPrefix)
		require.Equal(t, tc.suffix, actualSuffix)
		require.Equal(t, tc.contains, actualContains)
	}
}
// Refer to https://github.com/prometheus/prometheus/issues/2651.
//
// TestFindSetMatches checks which regexps findSetMatches can expand into a
// finite set of literal values (and whether the match is case sensitive);
// a nil expected slice means the optimization must be skipped.
func TestFindSetMatches(t *testing.T) {
	for _, c := range []struct {
		pattern          string
		expMatches       []string
		expCaseSensitive bool
	}{
		// Single value, coming from a `bar=~"foo"` selector.
		{"foo", []string{"foo"}, true},
		{"^foo", []string{"foo"}, true},
		{"^foo$", []string{"foo"}, true},
		// Simple sets alternates.
		{"foo|bar|zz", []string{"foo", "bar", "zz"}, true},
		// Simple sets alternate and concat (bar|baz is parsed as "ba[rz]").
		{"foo|bar|baz", []string{"foo", "bar", "baz"}, true},
		// Simple sets alternate and concat and capture
		{"foo|bar|baz|(zz)", []string{"foo", "bar", "baz", "zz"}, true},
		// Simple sets alternate and concat and alternates with empty matches
		// parsed as b(ar|(?:)|uzz) where b(?:) means literal b.
		{"bar|b|buzz", []string{"bar", "b", "buzz"}, true},
		// Skip outer anchors (it's enforced anyway at the root).
		{"^(bar|b|buzz)$", []string{"bar", "b", "buzz"}, true},
		{"^(?:prod|production)$", []string{"prod", "production"}, true},
		// Do not optimize regexp with inner anchors.
		{"(bar|b|b^uz$z)", nil, false},
		// Do not optimize regexp with empty string matcher.
		{"^$|Running", nil, false},
		// Simple sets containing escaped characters.
		{"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}, true},
		// using charclass
		{"[abc]d", []string{"ad", "bd", "cd"}, true},
		// high low charset different => A(B[CD]|EF)|BC[XY]
		{"ABC|ABD|AEF|BCX|BCY", []string{"ABC", "ABD", "AEF", "BCX", "BCY"}, true},
		// triple concat
		{"api_(v1|prom)_push", []string{"api_v1_push", "api_prom_push"}, true},
		// triple concat with multiple alternates
		{"(api|rpc)_(v1|prom)_push", []string{"api_v1_push", "api_prom_push", "rpc_v1_push", "rpc_prom_push"}, true},
		{"(api|rpc)_(v1|prom)_(push|query)", []string{"api_v1_push", "api_v1_query", "api_prom_push", "api_prom_query", "rpc_v1_push", "rpc_v1_query", "rpc_prom_push", "rpc_prom_query"}, true},
		// class starting with "-"
		{"[-1-2][a-c]", []string{"-a", "-b", "-c", "1a", "1b", "1c", "2a", "2b", "2c"}, true},
		{"[1^3]", []string{"1", "3", "^"}, true},
		// OpPlus with concat
		{"(.+)/(foo|bar)", nil, false},
		// Simple sets containing special characters without escaping.
		{"fo.o|bar?|^baz", nil, false},
		// case sensitive wrapper.
		{"(?i)foo", []string{"FOO"}, false},
		// case sensitive wrapper on alternate.
		{"(?i)foo|bar|baz", []string{"FOO", "BAR", "BAZ", "BAr", "BAz"}, false},
		// mixed case sensitivity.
		{"(api|rpc)_(v1|prom)_((?i)push|query)", nil, false},
		// mixed case sensitivity concatenation only without capture group.
		{"api_v1_(?i)push", nil, false},
		// mixed case sensitivity alternation only without capture group.
		{"api|(?i)rpc", nil, false},
		// case sensitive after unsetting insensitivity.
		{"rpc|(?i)(?-i)api", []string{"rpc", "api"}, true},
		// case sensitive after unsetting insensitivity in all alternation options.
		{"(?i)((?-i)api|(?-i)rpc)", []string{"api", "rpc"}, true},
		// mixed case sensitivity after unsetting insensitivity.
		{"(?i)rpc|(?-i)api", nil, false},
		// too high charset combination
		{"(api|rpc)_[^0-9]", nil, false},
		// too many combinations
		{"[a-z][a-z]", nil, false},
	} {
		c := c
		t.Run(c.pattern, func(t *testing.T) {
			t.Parallel()
			parsed, err := syntax.Parse(c.pattern, syntax.Perl)
			require.NoError(t, err)
			matches, actualCaseSensitive := findSetMatches(parsed)
			require.Equal(t, c.expMatches, matches)
			require.Equal(t, c.expCaseSensitive, actualCaseSensitive)
		})
	}
}
// BenchmarkFastRegexMatcher measures match throughput of each corpus regex
// against a mix of random texts of different lengths, plus texts that start
// or end with "foo" to exercise prefix/suffix fast paths.
func BenchmarkFastRegexMatcher(b *testing.B) {
	// Seed with a constant so results don't change between runs.
	rng := rand.New(rand.NewSource(1))
	// Build texts of various lengths to match against.
	texts := append([]string{}, randStrings(rng, 10, 10)...)
	texts = append(texts, randStrings(rng, 5, 30)...)
	texts = append(texts, randStrings(rng, 1, 100)...)
	texts = append(texts, "foo"+randString(rng, 50))
	texts = append(texts, randString(rng, 50)+"foo")
	for _, pattern := range regexes {
		b.Run(getTestNameFromRegexp(pattern), func(b *testing.B) {
			matcher, err := NewFastRegexMatcher(pattern)
			require.NoError(b, err)
			b.ResetTimer()
			for n := 0; n < b.N; n++ {
				for _, text := range texts {
					_ = matcher.MatchString(text)
				}
			}
		})
	}
}
// Test_OptimizeRegex checks the StringMatcher tree that
// stringMatcherFromRegexp builds for each pattern; a nil expectation means
// the pattern is not optimizable and must fall back to the regexp engine.
func Test_OptimizeRegex(t *testing.T) {
	for _, c := range []struct {
		pattern string
		exp     StringMatcher
	}{
		{".*", anyStringWithoutNewlineMatcher{}},
		{".*?", anyStringWithoutNewlineMatcher{}},
		{"(?s:.*)", trueMatcher{}},
		{"(.*)", anyStringWithoutNewlineMatcher{}},
		{"^.*$", anyStringWithoutNewlineMatcher{}},
		{".+", &anyNonEmptyStringMatcher{matchNL: false}},
		{"(?s:.+)", &anyNonEmptyStringMatcher{matchNL: true}},
		{"^.+$", &anyNonEmptyStringMatcher{matchNL: false}},
		{"(.+)", &anyNonEmptyStringMatcher{matchNL: false}},
		{"", emptyStringMatcher{}},
		{"^$", emptyStringMatcher{}},
		{"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}},
		{"^(?i:foo)$", &equalStringMatcher{s: "FOO", caseSensitive: false}},
		{"^((?i:foo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
		{"^((?i:foo|oo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
		{"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})},
		{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: anyStringWithoutNewlineMatcher{}}},
		{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"10\\.0\\.(1|2)\\.+", nil},
		{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
		{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: anyStringWithoutNewlineMatcher{}}},
		{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
		{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}})},
		{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
		// we don't support case insensitive matching for contains.
		// This is because there's no strings.IndexOfFold function.
		// We can revisit later if this is really popular by using strings.ToUpper.
		{"^(.*)((?i)foo|foobar)(.*)$", nil},
		{"(api|rpc)_(v1|prom)_((?i)push|query)", nil},
		{"[a-z][a-z]", nil},
		{"[1^3]", nil},
		{".*foo.*bar.*", nil},
		{`\d*`, nil},
		{".", nil},
		// This one is not supported because `stringMatcherFromRegexp` is not reentrant for syntax.OpConcat.
		// It would make the code too complex to handle it.
		{"/|/bar.*", nil},
		{"(.+)/(foo.*|bar$)", nil},
	} {
		c := c
		t.Run(c.pattern, func(t *testing.T) {
			t.Parallel()
			parsed, err := syntax.Parse(c.pattern, syntax.Perl)
			require.NoError(t, err)
			matches := stringMatcherFromRegexp(parsed)
			require.Equal(t, c.exp, matches)
		})
	}
}
// randString returns a random string of the given length, with each
// character drawn from asciiRunes using the provided generator.
func randString(randGenerator *rand.Rand, length int) string {
	runes := make([]rune, length)
	for i := 0; i < length; i++ {
		runes[i] = asciiRunes[randGenerator.Intn(len(asciiRunes))]
	}
	return string(runes)
}
// randStrings returns `many` random strings, each of the given length.
func randStrings(randGenerator *rand.Rand, many, length int) []string {
	result := make([]string, 0, many)
	for len(result) < many {
		result = append(result, randString(randGenerator, length))
	}
	return result
}
// FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions fuzzes input
// texts against the fixed regexes corpus, asserting FastRegexMatcher agrees
// with the standard (fully anchored) regexp engine on every pattern.
func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) {
	// Build each FastRegexMatcher together with its reference matcher once,
	// outside the fuzz loop.
	fastMatchers := make([]*FastRegexMatcher, 0, len(regexes))
	stdMatchers := make([]*regexp.Regexp, 0, len(regexes))
	for _, pattern := range regexes {
		fast, err := NewFastRegexMatcher(pattern)
		require.NoError(f, err)
		fastMatchers = append(fastMatchers, fast)
		stdMatchers = append(stdMatchers, regexp.MustCompile("^(?:"+pattern+")$"))
	}
	// Seed the corpus with the known test values.
	for _, seed := range values {
		f.Add(seed)
	}
	f.Fuzz(func(t *testing.T, text string) {
		for i, fast := range fastMatchers {
			require.Equalf(t, stdMatchers[i].MatchString(text), fast.MatchString(text), "regexp: %s text: %s", stdMatchers[i].String(), text)
		}
	})
}
// FuzzFastRegexMatcher_WithFuzzyRegularExpressions fuzzes both the pattern
// and the input text, asserting FastRegexMatcher agrees with the standard
// (fully anchored) regexp engine whenever both accept the pattern.
func FuzzFastRegexMatcher_WithFuzzyRegularExpressions(f *testing.F) {
	// Seed with every (pattern, value) pair from the static corpora.
	for _, pattern := range regexes {
		for _, text := range values {
			f.Add(pattern, text)
		}
	}
	f.Fuzz(func(t *testing.T, re, text string) {
		fast, err := NewFastRegexMatcher(re)
		if err != nil {
			// Ignore invalid regexes.
			return
		}
		std, err := regexp.Compile("^(?:" + re + ")$")
		if err != nil {
			// Ignore invalid regexes.
			return
		}
		require.Equalf(t, std.MatchString(text), fast.MatchString(text), "regexp: %s text: %s", std.String(), text)
	})
}
// This test can be used to analyze real queries from Mimir logs. You can extract real queries with a regexp matcher
// running the following command:
//
// logcli --addr=XXX --username=YYY --password=ZZZ query '{namespace=~"(cortex|mimir).*",name="query-frontend"} |= "query stats" |= "=~" --limit=100000 > logs.txt
//
// It is skipped by default: it reads "logs.txt" from the working directory,
// counts per-label-value query stats, checks which regexps FastRegexMatcher
// can optimize, estimates parsing cost, and logs a summary per regexp.
func TestAnalyzeRealQueries(t *testing.T) {
	t.Skip("Decomment this test only to manually analyze real queries")
	// labelValueInfo accumulates per-regexp statistics extracted from the logs.
	type labelValueInfo struct {
		numMatchingQueries       int     //nolint:unused
		numShardedQueries        int     //nolint:unused
		numSplitQueries          int     //nolint:unused
		optimized                bool    //nolint:unused
		averageParsingTimeMillis float64 //nolint:unused
		// Sorted list of timestamps when the queries have been received.
		queryStartTimes []time.Time
	}
	// Regexps to pull the label matcher value and the query stats out of each log line.
	labelValueRE := regexp.MustCompile(`=~\\"([^"]+)\\"`)
	tsRE := regexp.MustCompile(`ts=([^ ]+)`)
	shardedQueriesRE := regexp.MustCompile(`sharded_queries=(\d+)`)
	splitQueriesRE := regexp.MustCompile(`split_queries=(\d+)`)
	labelValues := make(map[string]*labelValueInfo)
	// Read the logs file line-by-line, and find all values for regex label matchers.
	readFile, err := os.Open("logs.txt")
	require.NoError(t, err)
	fileScanner := bufio.NewScanner(readFile)
	fileScanner.Split(bufio.ScanLines)
	numQueries := 0
	for fileScanner.Scan() {
		line := fileScanner.Text()
		matches := labelValueRE.FindAllStringSubmatch(line, -1)
		if len(matches) == 0 {
			continue
		}
		// Look up query stats.
		tsRaw := tsRE.FindStringSubmatch(line)
		shardedQueriesRaw := shardedQueriesRE.FindStringSubmatch(line)
		splitQueriesRaw := splitQueriesRE.FindStringSubmatch(line)
		shardedQueries := 0
		splitQueries := 0
		var ts time.Time
		// Stats are best-effort: parse errors leave the zero value in place.
		if len(tsRaw) > 0 {
			ts, _ = time.Parse(time.RFC3339Nano, tsRaw[1])
		}
		if len(shardedQueriesRaw) > 0 {
			shardedQueries, _ = strconv.Atoi(shardedQueriesRaw[1])
		}
		if len(splitQueriesRaw) > 0 {
			splitQueries, _ = strconv.Atoi(splitQueriesRaw[1])
		}
		numQueries++
		for _, match := range matches {
			info := labelValues[match[1]]
			if info == nil {
				info = &labelValueInfo{}
				labelValues[match[1]] = info
			}
			info.numMatchingQueries++
			info.numShardedQueries += shardedQueries
			info.numSplitQueries += splitQueries
			if !ts.IsZero() {
				info.queryStartTimes = append(info.queryStartTimes, ts)
			}
		}
	}
	// Sort query start times.
	for _, info := range labelValues {
		sort.Slice(info.queryStartTimes, func(i, j int) bool {
			return info.queryStartTimes[i].Before(info.queryStartTimes[j])
		})
	}
	require.NoError(t, readFile.Close())
	t.Logf("Found %d unique regexp matchers out of %d queries", len(labelValues), numQueries)
	// Analyze each regexp matcher found.
	numChecked := 0
	numOptimized := 0
	for re, info := range labelValues {
		m, err := NewFastRegexMatcher(re)
		if err != nil {
			// Ignore it, because we may have failed to extract the label matcher.
			continue
		}
		numChecked++
		// Check if each regexp matcher is supported by our optimization.
		if m.IsOptimized() {
			numOptimized++
			info.optimized = true
		}
		// Estimate the parsing complexity.
		startTime := time.Now()
		const numParsingRuns = 1000
		for i := 0; i < numParsingRuns; i++ {
			NewFastRegexMatcher(re)
		}
		info.averageParsingTimeMillis = float64(time.Since(startTime).Milliseconds()) / float64(numParsingRuns)
	}
	t.Logf("Found %d out of %d (%.2f%%) regexp matchers optimized by FastRegexMatcher", numOptimized, numChecked, (float64(numOptimized)/float64(numChecked))*100)
	// Print some statistics.
	for labelValue, info := range labelValues {
		// Find the min/avg/max difference between query start times.
		var (
			minQueryStartTimeDiff time.Duration
			maxQueryStartTimeDiff time.Duration
			avgQueryStartTimeDiff time.Duration
			sumQueryStartTime     time.Duration
			countQueryStartTime   int
		)
		for i := 1; i < len(info.queryStartTimes); i++ {
			diff := info.queryStartTimes[i].Sub(info.queryStartTimes[i-1])
			sumQueryStartTime += diff
			countQueryStartTime++
			if minQueryStartTimeDiff == 0 || diff < minQueryStartTimeDiff {
				minQueryStartTimeDiff = diff
			}
			if diff > maxQueryStartTimeDiff {
				maxQueryStartTimeDiff = diff
			}
		}
		if countQueryStartTime > 0 {
			avgQueryStartTimeDiff = sumQueryStartTime / time.Duration(countQueryStartTime)
		}
		t.Logf("num queries: %d\t num split queries: %d\t num sharded queries: %d\t optimized: %t\t parsing time: %.0fms\t min/avg/max query start time diff (sec): %.2f/%.2f/%.2f regexp: %s",
			info.numMatchingQueries, info.numSplitQueries, info.numShardedQueries, info.optimized, info.averageParsingTimeMillis,
			minQueryStartTimeDiff.Seconds(), avgQueryStartTimeDiff.Seconds(), maxQueryStartTimeDiff.Seconds(), labelValue)
	}
}
// TestOptimizeEqualStringMatchers checks when an or-tree of equality matchers
// can be flattened into a single multi-string matcher: all leaves must be
// equalStringMatcher with the same case sensitivity; nil expectedValues means
// the input must be returned unoptimized.
func TestOptimizeEqualStringMatchers(t *testing.T) {
	tests := map[string]struct {
		input                 StringMatcher
		expectedValues        []string
		expectedCaseSensitive bool
	}{
		"should skip optimization on orStringMatcher with containsStringMatcher": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&containsStringMatcher{substrings: []string{"a", "b", "c"}},
			},
			expectedValues: nil,
		},
		"should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&equalStringMatcher{s: "bar", caseSensitive: true},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues:        []string{"FOO", "bar", "baz"},
			expectedCaseSensitive: true,
		},
		"should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&equalStringMatcher{s: "bar", caseSensitive: false},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: nil,
		},
		"should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				orStringMatcher{
					&equalStringMatcher{s: "bar", caseSensitive: true},
					&equalStringMatcher{s: "xxx", caseSensitive: true},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues:        []string{"FOO", "bar", "xxx", "baz"},
			expectedCaseSensitive: true,
		},
		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				orStringMatcher{
					// Case sensitivity is different within items at the same level.
					&equalStringMatcher{s: "bar", caseSensitive: true},
					&equalStringMatcher{s: "xxx", caseSensitive: false},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: nil,
		},
		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				// Case sensitivity is different between the parent and child.
				orStringMatcher{
					&equalStringMatcher{s: "bar", caseSensitive: false},
					&equalStringMatcher{s: "xxx", caseSensitive: false},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: nil,
		},
		"should return unchanged values on few case insensitive matchers": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: false},
				orStringMatcher{
					&equalStringMatcher{s: "bAr", caseSensitive: false},
				},
				&equalStringMatcher{s: "baZ", caseSensitive: false},
			},
			expectedValues:        []string{"FOO", "bAr", "baZ"},
			expectedCaseSensitive: false,
		},
	}
	for testName, testData := range tests {
		t.Run(testName, func(t *testing.T) {
			actualMatcher := optimizeEqualStringMatchers(testData.input, 0)
			if testData.expectedValues == nil {
				// Not optimizable: the input must come back with its type unchanged.
				require.IsType(t, testData.input, actualMatcher)
			} else {
				require.IsType(t, &equalMultiStringSliceMatcher{}, actualMatcher)
				require.Equal(t, testData.expectedValues, actualMatcher.(*equalMultiStringSliceMatcher).values)
				require.Equal(t, testData.expectedCaseSensitive, actualMatcher.(*equalMultiStringSliceMatcher).caseSensitive)
			}
		})
	}
}
// TestNewEqualMultiStringMatcher verifies that newEqualMultiStringMatcher()
// picks the slice-based implementation for few values and the map-based one
// for many values, and that the added values are stored as expected
// (lowercased when case insensitive and map-backed).
func TestNewEqualMultiStringMatcher(t *testing.T) {
	for name, tc := range map[string]struct {
		values             []string
		caseSensitive      bool
		expectedValuesMap  map[string]struct{}
		expectedValuesList []string
	}{
		"few case sensitive values": {
			values:             []string{"a", "B"},
			caseSensitive:      true,
			expectedValuesList: []string{"a", "B"},
		},
		"few case insensitive values": {
			values:             []string{"a", "B"},
			caseSensitive:      false,
			expectedValuesList: []string{"a", "B"},
		},
		"many case sensitive values": {
			values:            []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
			caseSensitive:     true,
			expectedValuesMap: map[string]struct{}{"a": {}, "B": {}, "c": {}, "D": {}, "e": {}, "F": {}, "g": {}, "H": {}, "i": {}, "L": {}, "m": {}, "N": {}, "o": {}, "P": {}, "q": {}, "r": {}},
		},
		"many case insensitive values": {
			values:            []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
			caseSensitive:     false,
			expectedValuesMap: map[string]struct{}{"a": {}, "b": {}, "c": {}, "d": {}, "e": {}, "f": {}, "g": {}, "h": {}, "i": {}, "l": {}, "m": {}, "n": {}, "o": {}, "p": {}, "q": {}, "r": {}},
		},
	} {
		t.Run(name, func(t *testing.T) {
			m := newEqualMultiStringMatcher(tc.caseSensitive, len(tc.values))
			for _, value := range tc.values {
				m.add(value)
			}

			// A non-nil expected map means we expect the map-backed matcher.
			if tc.expectedValuesMap != nil {
				require.IsType(t, &equalMultiStringMapMatcher{}, m)
				mapMatcher := m.(*equalMultiStringMapMatcher)
				require.Equal(t, tc.expectedValuesMap, mapMatcher.values)
				require.Equal(t, tc.caseSensitive, mapMatcher.caseSensitive)
			}

			// A non-nil expected list means we expect the slice-backed matcher.
			if tc.expectedValuesList != nil {
				require.IsType(t, &equalMultiStringSliceMatcher{}, m)
				sliceMatcher := m.(*equalMultiStringSliceMatcher)
				require.Equal(t, tc.expectedValuesList, sliceMatcher.values)
				require.Equal(t, tc.caseSensitive, sliceMatcher.caseSensitive)
			}
		})
	}
}
// TestEqualMultiStringMatcher_Matches checks Matches() for both the
// slice-backed (few values) and map-backed (many values) matcher
// implementations, in case sensitive and insensitive modes.
func TestEqualMultiStringMatcher_Matches(t *testing.T) {
	for name, tc := range map[string]struct {
		values             []string
		caseSensitive      bool
		expectedMatches    []string
		expectedNotMatches []string
	}{
		"few case sensitive values": {
			values:             []string{"a", "B"},
			caseSensitive:      true,
			expectedMatches:    []string{"a", "B"},
			expectedNotMatches: []string{"A", "b"},
		},
		"few case insensitive values": {
			values:             []string{"a", "B"},
			caseSensitive:      false,
			expectedMatches:    []string{"a", "A", "b", "B"},
			expectedNotMatches: []string{"c", "C"},
		},
		"many case sensitive values": {
			values:             []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
			caseSensitive:      true,
			expectedMatches:    []string{"a", "B"},
			expectedNotMatches: []string{"A", "b"},
		},
		"many case insensitive values": {
			values:             []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
			caseSensitive:      false,
			expectedMatches:    []string{"a", "A", "b", "B"},
			expectedNotMatches: []string{"x", "X"},
		},
	} {
		t.Run(name, func(t *testing.T) {
			m := newEqualMultiStringMatcher(tc.caseSensitive, len(tc.values))
			for _, value := range tc.values {
				m.add(value)
			}

			for _, value := range tc.expectedMatches {
				require.True(t, m.Matches(value), "value: %s", value)
			}
			for _, value := range tc.expectedNotMatches {
				require.False(t, m.Matches(value), "value: %s", value)
			}
		})
	}
}
// This benchmark is used to find a good threshold to use to apply the optimization
// done by optimizeEqualStringMatchers().
func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
	rnd := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Build a corpus of random texts, with varying lengths, to match against.
	var texts []string
	texts = append(texts, randStrings(rnd, 10, 10)...)
	texts = append(texts, randStrings(rnd, 5, 30)...)
	texts = append(texts, randStrings(rnd, 1, 100)...)

	for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
		for _, caseSensitive := range []bool{true, false} {
			b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) {
				// Build a regexp with the desired number of alternations.
				pattern := strings.Join(randStrings(rnd, numAlternations, 10), "|")
				if !caseSensitive {
					pattern = "(?i:(" + pattern + "))"
				}

				parsed, err := syntax.Parse(pattern, syntax.Perl)
				require.NoError(b, err)

				// Build both the unoptimized and optimized matchers, asserting
				// the optimization actually kicked in.
				unoptimized := stringMatcherFromRegexpInternal(parsed)
				require.IsType(b, orStringMatcher{}, unoptimized)

				optimized := optimizeEqualStringMatchers(unoptimized, 0)
				require.IsType(b, &equalMultiStringMapMatcher{}, optimized)

				b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) {
					for i := 0; i < b.N; i++ {
						for _, text := range texts {
							unoptimized.Matches(text)
						}
					}
				})

				b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) {
					for i := 0; i < b.N; i++ {
						for _, text := range texts {
							optimized.Matches(text)
						}
					}
				})
			})
		}
	}
}
// getTestNameFromRegexp builds a short test name out of the given regexp,
// capping it at 32 bytes so long patterns don't produce unwieldy test names.
func getTestNameFromRegexp(re string) string {
	if len(re) <= 32 {
		return re
	}
	return re[:32]
}