Merge pull request #537 from grafana/more-regexp-optimizations-with-dot-quest-support

Optimize .? in the FastRegexMatcher
This commit is contained in:
Marco Pracucci 2023-10-02 09:10:56 +02:00 committed by GitHub
commit c7c149e1d9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 146 additions and 2 deletions

View file

@ -480,6 +480,15 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
// Any string is fine (including an empty one), as far as it doesn't contain any newline. // Any string is fine (including an empty one), as far as it doesn't contain any newline.
return anyStringWithoutNewlineMatcher{} return anyStringWithoutNewlineMatcher{}
case syntax.OpQuest:
// Only optimize for ".?".
if len(re.Sub) != 1 || (re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL) {
return nil
}
return &zeroOrOneCharacterStringMatcher{
matchNL: re.Sub[0].Op == syntax.OpAnyChar,
}
case syntax.OpEmptyMatch: case syntax.OpEmptyMatch:
return emptyStringMatcher{} return emptyStringMatcher{}
@ -511,14 +520,14 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
var left, right StringMatcher var left, right StringMatcher
// Let's try to find if there's a first and last any matchers. // Let's try to find if there's a first and last any matchers.
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar { if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest {
left = stringMatcherFromRegexpInternal(re.Sub[0]) left = stringMatcherFromRegexpInternal(re.Sub[0])
if left == nil { if left == nil {
return nil return nil
} }
re.Sub = re.Sub[1:] re.Sub = re.Sub[1:]
} }
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar { if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest {
right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1]) right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1])
if right == nil { if right == nil {
return nil return nil
@ -845,6 +854,26 @@ func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
return len(s) > 0 && strings.IndexByte(s, '\n') == -1 return len(s) > 0 && strings.IndexByte(s, '\n') == -1
} }
// zeroOrOneCharacterStringMatcher is a StringMatcher which matches zero or one occurrence
// of any character. The newline character is matches only if matchNL is set to true.
type zeroOrOneCharacterStringMatcher struct {
matchNL bool
}
func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool {
// Zero or one.
if len(s) > 1 {
return false
}
// No need to check for the newline if the string is empty or matching a newline is OK.
if m.matchNL || len(s) == 0 {
return true
}
return s[0] != '\n'
}
// trueMatcher is a stringMatcher which matches any string (always returns true). // trueMatcher is a stringMatcher which matches any string (always returns true).
type trueMatcher struct{} type trueMatcher struct{}

View file

@ -78,6 +78,13 @@ var (
"(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))", "(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))",
// A long case insensitive alternation where each entry starts with ".*". // A long case insensitive alternation where each entry starts with ".*".
"(?i:(.*zQPbMkNO|.*NNSPdvMi|.*iWuuSoAl|.*qbvKMimS|.*IecrXtPa|.*seTckYqt|.*NxnyHkgB|.*fIDlOgKb|.*UhlWIygH|.*OtNoJxHG|.*cUTkFVIV|.*mTgFIHjr|.*jQkoIDtE|.*PPMKxRXl|.*AwMfwVkQ|.*CQyMrTQJ|.*BzrqxVSi|.*nTpcWuhF|.*PertdywG|.*ZZDgCtXN|.*WWdDPyyE|.*uVtNQsKk|.*BdeCHvPZ|.*wshRnFlH|.*aOUIitIp|.*RxZeCdXT|.*CFZMslCj|.*AVBZRDxl|.*IzIGCnhw|.*ythYuWiz|.*oztXVXhl|.*VbLkwqQx|.*qvaUgyVC|.*VawUjPWC|.*ecloYJuj|.*boCLTdSU|.*uPrKeAZx|.*hrMWLWBq|.*JOnUNHRM|.*rYnujkPq|.*dDEdZhIj|.*DRrfvugG|.*yEGfDxVV|.*YMYdJWuP|.*PHUQZNWM|.*AmKNrLis|.*zTxndVfn|.*FPsHoJnc|.*EIulZTua|.*KlAPhdzg|.*ScHJJCLt|.*NtTfMzME|.*eMCwuFdo|.*SEpJVJbR|.*cdhXZeCx|.*sAVtBwRh|.*kVFEVcMI|.*jzJrxraA|.*tGLHTell|.*NNWoeSaw|.*DcOKSetX|.*UXZAJyka|.*THpMphDP|.*rizheevl|.*kDCBRidd|.*pCZZRqyu|.*pSygkitl|.*SwZGkAaW|.*wILOrfNX|.*QkwVOerj|.*kHOMxPDr|.*EwOVycJv|.*AJvtzQFS|.*yEOjKYYB|.*LizIINLL|.*JBRSsfcG|.*YPiUqqNl|.*IsdEbvee|.*MjEpGcBm|.*OxXZVgEQ|.*xClXGuxa|.*UzRCGFEb|.*buJbvfvA|.*IPZQxRet|.*oFYShsMc|.*oBHffuHO|.*bzzKrcBR|.*KAjzrGCl|.*IPUsAVls|.*OGMUMbIU|.*gyDccHuR|.*bjlalnDd|.*ZLWjeMna|.*fdsuIlxQ|.*dVXtiomV|.*XxedTjNg|.*XWMHlNoA|.*nnyqArQX|.*opfkWGhb|.*wYtnhdYb))", "(?i:(.*zQPbMkNO|.*NNSPdvMi|.*iWuuSoAl|.*qbvKMimS|.*IecrXtPa|.*seTckYqt|.*NxnyHkgB|.*fIDlOgKb|.*UhlWIygH|.*OtNoJxHG|.*cUTkFVIV|.*mTgFIHjr|.*jQkoIDtE|.*PPMKxRXl|.*AwMfwVkQ|.*CQyMrTQJ|.*BzrqxVSi|.*nTpcWuhF|.*PertdywG|.*ZZDgCtXN|.*WWdDPyyE|.*uVtNQsKk|.*BdeCHvPZ|.*wshRnFlH|.*aOUIitIp|.*RxZeCdXT|.*CFZMslCj|.*AVBZRDxl|.*IzIGCnhw|.*ythYuWiz|.*oztXVXhl|.*VbLkwqQx|.*qvaUgyVC|.*VawUjPWC|.*ecloYJuj|.*boCLTdSU|.*uPrKeAZx|.*hrMWLWBq|.*JOnUNHRM|.*rYnujkPq|.*dDEdZhIj|.*DRrfvugG|.*yEGfDxVV|.*YMYdJWuP|.*PHUQZNWM|.*AmKNrLis|.*zTxndVfn|.*FPsHoJnc|.*EIulZTua|.*KlAPhdzg|.*ScHJJCLt|.*NtTfMzME|.*eMCwuFdo|.*SEpJVJbR|.*cdhXZeCx|.*sAVtBwRh|.*kVFEVcMI|.*jzJrxraA|.*tGLHTell|.*NNWoeSaw|.*DcOKSetX|.*UXZAJyka|.*THpMphDP|.*rizheevl|.*kDCBRidd|.*pCZZRqyu|.*pSygkitl|.*SwZGkAaW|.*wILOrfNX|.*QkwVOerj|.*kHOMxPDr|.*EwOVycJv|.*AJvtzQFS|.*yEOjKYYB|.*LizIINLL|.*JBRSsfcG|.*YPiUqqNl|.*IsdEbvee|.*MjEpGcBm|.*OxXZVgEQ|.*xClXGuxa|.*UzRCGFEb|.*buJbvfvA|.*IPZQxRet|.*oFYShsMc|.*oBHffuHO|.*bzzKrcBR|.*KAjzrGCl|.*IPUsAVls|.*OGMUMbIU|.*gyDccHuR|.*bjlalnDd|.*ZLWjeMna|.*fdsuIlxQ|.*dVXtiomV|.*XxedTjNg|.*XWMHlNoA|.*nnyqArQX|.*opfkWGhb|.*wYtnhdYb))",
// Quest ".?".
"fo.?",
"foo.?",
"f.?o",
".*foo.?",
".?foo.+",
"foo.?|bar",
} }
values = []string{ values = []string{
"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "", "foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
@ -418,6 +425,13 @@ func TestStringMatcherFromRegexp(t *testing.T) {
{"foo.+.+", nil}, {"foo.+.+", nil},
{".*.*foo", nil}, {".*.*foo", nil},
{".+.+foo", nil}, {".+.+foo", nil},
{"aaa.?.?", nil},
{"aaa.?.*", nil},
// Regexps with ".?".
{"ext.?|xfs", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}},
{"(?s)(ext.?|xfs)", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: true}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}},
{"foo.?", &literalPrefixStringMatcher{prefix: "foo", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}},
{"f.?o", nil},
} { } {
c := c c := c
t.Run(c.pattern, func(t *testing.T) { t.Run(c.pattern, func(t *testing.T) {
@ -588,6 +602,91 @@ func TestStringMatcherFromRegexp_LiteralSuffix(t *testing.T) {
} }
} }
func TestStringMatcherFromRegexp_Quest(t *testing.T) {
for _, c := range []struct {
pattern string
expectedZeroOrOneMatchers int
expectedMatches []string
expectedNotMatches []string
}{
// Not match newline.
{
pattern: "test.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "test!"},
expectedNotMatches: []string{"test\n", "tes", "test!!"},
},
{
pattern: ".?test",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "!test"},
expectedNotMatches: []string{"\ntest", "tes", "test!"},
},
{
pattern: "(aaa.?|bbb.?)",
expectedZeroOrOneMatchers: 2,
expectedMatches: []string{"aaa", "aaaX", "bbb", "bbbX"},
expectedNotMatches: []string{"aa", "aaaXX", "aaa\n", "bb", "bbbXX", "bbb\n"},
},
{
pattern: ".*aaa.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"aaa", "Xaaa", "aaaX", "XXXaaa", "XXXaaaX"},
expectedNotMatches: []string{"aa", "aaaXX", "XXXaaaXXX", "XXXaaa\n"},
},
// Match newline.
{
pattern: "(?s)test.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "test!", "test\n"},
expectedNotMatches: []string{"tes", "test!!", "test\n\n"},
},
// Mixed flags (a part matches newline another doesn't).
{
pattern: "(aaa.?|((?s).?bbb.+))",
expectedZeroOrOneMatchers: 2,
expectedMatches: []string{"aaa", "aaaX", "bbbX", "XbbbX", "bbbXXX", "\nbbbX"},
expectedNotMatches: []string{"aa", "aaa\n", "Xbbb", "\nbbb"},
},
} {
t.Run(c.pattern, func(t *testing.T) {
parsed, err := syntax.Parse(c.pattern, syntax.Perl)
require.NoError(t, err)
matcher := stringMatcherFromRegexp(parsed)
require.NotNil(t, matcher)
re := regexp.MustCompile("^" + c.pattern + "$")
// Pre-condition check: ensure it contains zeroOrOneCharacterStringMatcher.
numZeroOrOneMatchers := 0
visitStringMatcher(matcher, func(matcher StringMatcher) {
if _, ok := matcher.(*zeroOrOneCharacterStringMatcher); ok {
numZeroOrOneMatchers++
}
})
require.Equal(t, c.expectedZeroOrOneMatchers, numZeroOrOneMatchers)
for _, value := range c.expectedMatches {
assert.Truef(t, matcher.Matches(value), "Value: %s", value)
// Ensure the golang regexp engine would return the same.
assert.Truef(t, re.MatchString(value), "Value: %s", value)
}
for _, value := range c.expectedNotMatches {
assert.Falsef(t, matcher.Matches(value), "Value: %s", value)
// Ensure the golang regexp engine would return the same.
assert.Falsef(t, re.MatchString(value), "Value: %s", value)
}
})
}
}
func randString(randGenerator *rand.Rand, length int) string { func randString(randGenerator *rand.Rand, length int) string {
b := make([]rune, length) b := make([]rune, length)
for i := range b { for i := range b {
@ -1106,6 +1205,22 @@ func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
} }
} }
func TestZeroOrOneCharacterStringMatcher(t *testing.T) {
matcher := &zeroOrOneCharacterStringMatcher{matchNL: true}
assert.True(t, matcher.Matches(""))
assert.True(t, matcher.Matches("x"))
assert.True(t, matcher.Matches("\n"))
assert.False(t, matcher.Matches("xx"))
assert.False(t, matcher.Matches("\n\n"))
matcher = &zeroOrOneCharacterStringMatcher{matchNL: false}
assert.True(t, matcher.Matches(""))
assert.True(t, matcher.Matches("x"))
assert.False(t, matcher.Matches("\n"))
assert.False(t, matcher.Matches("xx"))
assert.False(t, matcher.Matches("\n\n"))
}
func TestLiteralPrefixStringMatcher(t *testing.T) { func TestLiteralPrefixStringMatcher(t *testing.T) {
m := &literalPrefixStringMatcher{prefix: "mar", prefixCaseSensitive: true, right: &emptyStringMatcher{}} m := &literalPrefixStringMatcher{prefix: "mar", prefixCaseSensitive: true, right: &emptyStringMatcher{}}
assert.True(t, m.Matches("mar")) assert.True(t, m.Matches("mar"))