Optimize regex star operation (#448)

* Optimize .* regex matcher

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Consistent benchmark runs for BenchmarkFastRegexMatcher

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Fixed TestParseExpressions

Signed-off-by: Marco Pracucci <marco@pracucci.com>

---------

Signed-off-by: Marco Pracucci <marco@pracucci.com>
This commit is contained in:
Marco Pracucci 2023-03-09 09:38:41 +01:00 committed by GitHub
parent cfdf2a0594
commit 242e82b8e6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 156 additions and 84 deletions

View file

@ -37,6 +37,9 @@ type FastRegexMatcher struct {
prefix string
suffix string
contains string
// matchString is the "compiled" function to run by MatchString().
matchString func(string) bool
}
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
@ -61,9 +64,42 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
}
m.stringMatcher = stringMatcherFromRegexp(parsed)
m.matchString = m.compileMatchStringFunction()
return m, nil
}
// compileMatchStringFunction returns the function to run by MatchString().
func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
// If the only optimization available is the string matcher, then we can just run it.
if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && m.contains == "" && m.stringMatcher != nil {
return m.stringMatcher.Matches
}
return func(s string) bool {
if len(m.setMatches) != 0 {
for _, match := range m.setMatches {
if match == s {
return true
}
}
return false
}
if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
return false
}
if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
return false
}
if m.contains != "" && !strings.Contains(s, m.contains) {
return false
}
if m.stringMatcher != nil {
return m.stringMatcher.Matches(s)
}
return m.re.MatchString(s)
}
}
// isOptimized returns true if any fast-path optimization is applied to the
// regex matcher.
//
@ -250,27 +286,7 @@ func tooManyMatches(matches []string, new ...string) bool {
}
func (m *FastRegexMatcher) MatchString(s string) bool {
if len(m.setMatches) != 0 {
for _, match := range m.setMatches {
if match == s {
return true
}
}
return false
}
if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
return false
}
if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
return false
}
if m.contains != "" && !strings.Contains(s, m.contains) {
return false
}
if m.stringMatcher != nil {
return m.stringMatcher.Matches(s)
}
return m.re.MatchString(s)
return m.matchString(s)
}
func (m *FastRegexMatcher) SetMatches() []string {
@ -351,14 +367,25 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
// Correctly handling the end text operator inside a regex is tricky,
// so in this case we fallback to the regex engine.
return nil
case syntax.OpPlus, syntax.OpStar:
case syntax.OpPlus:
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
return nil
}
return &anyStringMatcher{
allowEmpty: re.Op == syntax.OpStar,
return &anyNonEmptyStringMatcher{
matchNL: re.Sub[0].Op == syntax.OpAnyChar,
}
case syntax.OpStar:
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
return nil
}
// If the newline is valid, than this matcher literally match any string (even empty).
if re.Sub[0].Op == syntax.OpAnyChar {
return trueMatcher{}
}
// Any string is fine (including an empty one), as far as it doesn't contain any newline.
return anyStringWithoutNewlineMatcher{}
case syntax.OpEmptyMatch:
return emptyStringMatcher{}
@ -531,20 +558,37 @@ func (m *equalMultiStringMatcher) Matches(s string) bool {
return ok
}
// anyStringMatcher is a matcher that matches any string.
// It is used for the + and * operator. matchNL tells if it should matches newlines or not.
type anyStringMatcher struct {
allowEmpty bool
// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string
// (including an empty one) as far as it doesn't contain any newline character.
type anyStringWithoutNewlineMatcher struct{}
func (m anyStringWithoutNewlineMatcher) Matches(s string) bool {
// We need to make sure it doesn't contain a newline. Since the newline is
// an ASCII character, we can use strings.IndexByte().
return strings.IndexByte(s, '\n') == -1
}
// anyNonEmptyStringMatcher is a stringMatcher which matches any non-empty string.
type anyNonEmptyStringMatcher struct {
matchNL bool
}
func (m *anyStringMatcher) Matches(s string) bool {
if !m.allowEmpty && len(s) == 0 {
return false
func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
if m.matchNL {
// It's OK if the string contains a newline so we just need to make
// sure it's non-empty.
return len(s) > 0
}
if !m.matchNL && strings.ContainsRune(s, '\n') {
return false
// We need to make sure it non-empty and doesn't contain a newline.
// Since the newline is an ASCII character, we can use strings.IndexByte().
return len(s) > 0 && strings.IndexByte(s, '\n') == -1
}
// trueMatcher is a stringMatcher which matches any string (always returns true).
type trueMatcher struct{}
func (m trueMatcher) Matches(_ string) bool {
return true
}

View file

@ -27,12 +27,8 @@ import (
"github.com/stretchr/testify/require"
)
func init() {
rand.Seed(time.Now().UnixNano())
}
var (
letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
asciiRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_")
regexes = []string{
"foo",
"^foo",
@ -225,20 +221,25 @@ func TestFindSetMatches(t *testing.T) {
}
func BenchmarkFastRegexMatcher(b *testing.B) {
var (
x = strings.Repeat("x", 50)
y = "foo" + x
z = x + "foo"
)
// Init the random seed with a constant, so that it doesn't change between runs.
randGenerator := rand.New(rand.NewSource(1))
// Generate variable lengths random texts to match against.
texts := append([]string{}, randStrings(randGenerator, 10, 10)...)
texts = append(texts, randStrings(randGenerator, 5, 30)...)
texts = append(texts, randStrings(randGenerator, 1, 100)...)
texts = append(texts, "foo"+randString(randGenerator, 50))
texts = append(texts, randString(randGenerator, 50)+"foo")
for _, r := range regexes {
b.Run(getTestNameFromRegexp(r), func(b *testing.B) {
m, err := NewFastRegexMatcher(r)
require.NoError(b, err)
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = m.MatchString(x)
_ = m.MatchString(y)
_ = m.MatchString(z)
for _, text := range texts {
_ = m.MatchString(text)
}
}
})
}
@ -249,15 +250,15 @@ func Test_OptimizeRegex(t *testing.T) {
pattern string
exp StringMatcher
}{
{".*", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{".*?", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{"(?s:.*)", &anyStringMatcher{allowEmpty: true, matchNL: true}},
{"(.*)", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{"^.*$", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{".+", &anyStringMatcher{allowEmpty: false, matchNL: false}},
{"(?s:.+)", &anyStringMatcher{allowEmpty: false, matchNL: true}},
{"^.+$", &anyStringMatcher{allowEmpty: false, matchNL: false}},
{"(.+)", &anyStringMatcher{allowEmpty: false, matchNL: false}},
{".*", anyStringWithoutNewlineMatcher{}},
{".*?", anyStringWithoutNewlineMatcher{}},
{"(?s:.*)", trueMatcher{}},
{"(.*)", anyStringWithoutNewlineMatcher{}},
{"^.*$", anyStringWithoutNewlineMatcher{}},
{".+", &anyNonEmptyStringMatcher{matchNL: false}},
{"(?s:.+)", &anyNonEmptyStringMatcher{matchNL: true}},
{"^.+$", &anyNonEmptyStringMatcher{matchNL: false}},
{"(.+)", &anyNonEmptyStringMatcher{matchNL: false}},
{"", emptyStringMatcher{}},
{"^$", emptyStringMatcher{}},
{"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}},
@ -265,23 +266,23 @@ func Test_OptimizeRegex(t *testing.T) {
{"^((?i:foo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
{"^((?i:foo|oo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
{"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})},
{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: anyStringWithoutNewlineMatcher{}}},
{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
{"10\\.0\\.(1|2)\\.+", nil},
{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}})},
{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: anyStringWithoutNewlineMatcher{}}},
{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}})},
{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
// we don't support case insensitive matching for contains.
// This is because there's no strings.IndexOfFold function.
// We can revisit later if this is really popular by using strings.ToUpper.
@ -308,18 +309,18 @@ func Test_OptimizeRegex(t *testing.T) {
}
}
func randString(length int) string {
func randString(randGenerator *rand.Rand, length int) string {
b := make([]rune, length)
for i := range b {
b[i] = letterRunes[rand.Intn(len(letterRunes))]
b[i] = asciiRunes[randGenerator.Intn(len(asciiRunes))]
}
return string(b)
}
func randStrings(many, length int) []string {
func randStrings(randGenerator *rand.Rand, many, length int) []string {
out := make([]string, 0, many)
for i := 0; i < many; i++ {
out = append(out, randString(length))
out = append(out, randString(randGenerator, length))
}
return out
}
@ -524,16 +525,18 @@ func TestOptimizeEqualStringMatchers(t *testing.T) {
// This benchmark is used to find a good threshold to use to apply the optimization
// done by optimizeEqualStringMatchers()
func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
randGenerator := rand.New(rand.NewSource(time.Now().UnixNano()))
// Generate variable lengths random texts to match against.
texts := append([]string{}, randStrings(10, 10)...)
texts = append(texts, randStrings(5, 30)...)
texts = append(texts, randStrings(1, 100)...)
texts := append([]string{}, randStrings(randGenerator, 10, 10)...)
texts = append(texts, randStrings(randGenerator, 5, 30)...)
texts = append(texts, randStrings(randGenerator, 1, 100)...)
for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
for _, caseSensitive := range []bool{true, false} {
b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) {
// Generate a regex with the expected number of alternations.
re := strings.Join(randStrings(numAlternations, 10), "|")
re := strings.Join(randStrings(randGenerator, numAlternations, 10), "|")
if !caseSensitive {
re = "(?i:(" + re + "))"
}

View file

@ -3565,7 +3565,32 @@ func TestParseExpressions(t *testing.T) {
if !test.fail {
require.NoError(t, err)
require.Equal(t, test.expected, expr, "error on input '%s'", test.input)
expected := test.expected
// The FastRegexMatcher introduced in mimir-prometheus is not comparable with
// a deep equal, so only compare its String() version.
if actualVector, ok := expr.(*VectorSelector); ok {
require.IsType(t, &VectorSelector{}, test.expected, "error on input '%s'", test.input)
expectedVector := test.expected.(*VectorSelector)
require.Len(t, actualVector.LabelMatchers, len(expectedVector.LabelMatchers), "error on input '%s'", test.input)
for i := 0; i < len(actualVector.LabelMatchers); i++ {
expectedMatcher := expectedVector.LabelMatchers[i].String()
actualMatcher := actualVector.LabelMatchers[i].String()
require.Equal(t, expectedMatcher, actualMatcher, "unexpected label matcher '%s' on input '%s'", actualMatcher, test.input)
}
// Make a shallow copy of the expected expr (because the test cases are defined in a global variable)
// and then reset the LabelMatcher to not compared them with the following deep equal.
expectedCopy := *expectedVector
expectedCopy.LabelMatchers = nil
expected = &expectedCopy
actualVector.LabelMatchers = nil
}
require.Equal(t, expected, expr, "error on input '%s'", test.input)
} else {
require.Error(t, err)
require.Contains(t, err.Error(), test.errMsg, "unexpected error on input '%s', expected '%s', got '%s'", test.input, test.errMsg, err.Error())