Improve contains check done by FastRegexMatcher

Signed-off-by: Marco Pracucci <marco@pracucci.com>
This commit is contained in:
Marco Pracucci 2024-05-31 15:35:58 +02:00
parent 42b546a43d
commit 78fdd2188d
No known key found for this signature in database
GPG key ID: 74C1BD403D2DF9B5
2 changed files with 60 additions and 32 deletions

View file

@ -42,7 +42,7 @@ type FastRegexMatcher struct {
stringMatcher StringMatcher stringMatcher StringMatcher
prefix string prefix string
suffix string suffix string
contains string contains []string
// matchString is the "compiled" function to run by MatchString(). // matchString is the "compiled" function to run by MatchString().
matchString func(string) bool matchString func(string) bool
@ -87,7 +87,7 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
// compileMatchStringFunction returns the function to run by MatchString(). // compileMatchStringFunction returns the function to run by MatchString().
func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool { func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
// If the only optimization available is the string matcher, then we can just run it. // If the only optimization available is the string matcher, then we can just run it.
if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && m.contains == "" && m.stringMatcher != nil { if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && len(m.contains) == 0 && m.stringMatcher != nil {
return m.stringMatcher.Matches return m.stringMatcher.Matches
} }
@ -106,7 +106,7 @@ func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
if m.suffix != "" && !strings.HasSuffix(s, m.suffix) { if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
return false return false
} }
if m.contains != "" && !strings.Contains(s, m.contains) { if len(m.contains) > 0 && !containsInOrder(s, m.contains) {
return false return false
} }
if m.stringMatcher != nil { if m.stringMatcher != nil {
@ -119,7 +119,7 @@ func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
// IsOptimized returns true if any fast-path optimization is applied to the // IsOptimized returns true if any fast-path optimization is applied to the
// regex matcher. // regex matcher.
func (m *FastRegexMatcher) IsOptimized() bool { func (m *FastRegexMatcher) IsOptimized() bool {
return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || m.contains != "" return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || len(m.contains) > 0
} }
// findSetMatches extract equality matches from a regexp. // findSetMatches extract equality matches from a regexp.
@ -361,8 +361,9 @@ func optimizeAlternatingLiterals(s string) (StringMatcher, []string) {
// optimizeConcatRegex returns literal prefix/suffix text that can be safely // optimizeConcatRegex returns literal prefix/suffix text that can be safely
// checked against the label value before running the regexp matcher. // checked against the label value before running the regexp matcher.
func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) { func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []string) {
sub := r.Sub sub := r.Sub
clearCapture(sub...)
// We can safely remove begin and end text matchers respectively // We can safely remove begin and end text matchers respectively
// at the beginning and end of the regexp. // at the beginning and end of the regexp.
@ -387,13 +388,12 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) {
suffix = string(sub[last].Rune) suffix = string(sub[last].Rune)
} }
// If contains any literal which is not a prefix/suffix, we keep the // If contains any literal which is not a prefix/suffix, we keep track of
// 1st one. We do not keep the whole list of literals to simplify the // all the ones which are case sensitive.
// fast path.
for i := 1; i < len(sub)-1; i++ { for i := 1; i < len(sub)-1; i++ {
// TODO if it's case insensitive we should return an contains list or is it safe to keep searching for case sensitive ones?
if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 { if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 {
contains = string(sub[i].Rune) contains = append(contains, string(sub[i].Rune))
break
} }
} }
@ -940,3 +940,18 @@ func hasPrefixCaseInsensitive(s, prefix string) bool {
func hasSuffixCaseInsensitive(s, suffix string) bool { func hasSuffixCaseInsensitive(s, suffix string) bool {
return len(s) >= len(suffix) && strings.EqualFold(s[len(s)-len(suffix):], suffix) return len(s) >= len(suffix) && strings.EqualFold(s[len(s)-len(suffix):], suffix)
} }
// containsInOrder reports whether every string listed in contains occurs in s,
// in the listed order. Each substring is searched only in the part of s that
// follows the previous match, so matches never overlap and an out-of-order
// occurrence does not count. An empty contains list always yields true.
func containsInOrder(s string, contains []string) bool {
	// remaining is the not-yet-consumed tail of s; it shrinks past each match.
	remaining := s
	for i := range contains {
		needle := contains[i]
		idx := strings.Index(remaining, needle)
		if idx < 0 {
			return false
		}
		// Skip past the match so the next substring must appear after it.
		remaining = remaining[idx+len(needle):]
	}
	return true
}

View file

@ -81,6 +81,10 @@ var (
".*foo.?", ".*foo.?",
".?foo.+", ".?foo.+",
"foo.?|bar", "foo.?|bar",
// Concat of literals and wildcards.
".*-.*-.*-.*-.*",
"(.+)-(.+)-(.+)-(.+)-(.+)",
"((.*))-((.*))-((.*))-((.*))-((.*))",
} }
values = []string{ values = []string{
"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "", "foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
@ -132,29 +136,29 @@ func TestOptimizeConcatRegex(t *testing.T) {
regex string regex string
prefix string prefix string
suffix string suffix string
contains string contains []string
}{ }{
{regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: ""}, {regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: nil},
{regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: ""}, {regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: nil},
{regex: "foo.*", prefix: "foo", suffix: "", contains: ""}, {regex: "foo.*", prefix: "foo", suffix: "", contains: nil},
{regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: "hello"}, {regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: []string{"hello"}},
{regex: ".*foo", prefix: "", suffix: "foo", contains: ""}, {regex: ".*foo", prefix: "", suffix: "foo", contains: nil},
{regex: "^.*foo$", prefix: "", suffix: "foo", contains: ""}, {regex: "^.*foo$", prefix: "", suffix: "foo", contains: nil},
{regex: ".*foo.*", prefix: "", suffix: "", contains: "foo"}, {regex: ".*foo.*", prefix: "", suffix: "", contains: []string{"foo"}},
{regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: "foo"}, {regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: []string{"foo", "bar"}},
{regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: ""}, {regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: nil},
{regex: ".*[abc].*", prefix: "", suffix: "", contains: ""}, {regex: ".*[abc].*", prefix: "", suffix: "", contains: nil},
{regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: ""}, {regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: nil},
{regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: ""}, {regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: nil},
{regex: "(?i:abc).*", prefix: "", suffix: "", contains: ""}, {regex: "(?i:abc).*", prefix: "", suffix: "", contains: nil},
{regex: ".*(?i:abc)", prefix: "", suffix: "", contains: ""}, {regex: ".*(?i:abc)", prefix: "", suffix: "", contains: nil},
{regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: "def"}, {regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: []string{"def"}},
{regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: "abc"}, {regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: []string{"abc"}},
{regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: "abc"}, {regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: []string{"abc"}},
{regex: "[aA]bc.*", prefix: "", suffix: "", contains: "bc"}, {regex: "[aA]bc.*", prefix: "", suffix: "", contains: []string{"bc"}},
{regex: "^5..$", prefix: "5", suffix: "", contains: ""}, {regex: "^5..$", prefix: "5", suffix: "", contains: nil},
{regex: "^release.*", prefix: "release", suffix: "", contains: ""}, {regex: "^release.*", prefix: "release", suffix: "", contains: nil},
{regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: "laio"}, {regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: []string{"laio"}},
} }
for _, c := range cases { for _, c := range cases {
@ -1089,6 +1093,15 @@ func TestHasSuffixCaseInsensitive(t *testing.T) {
require.False(t, hasSuffixCaseInsensitive("marco", "abcdefghi")) require.False(t, hasSuffixCaseInsensitive("marco", "abcdefghi"))
} }
// TestContainsInOrder checks that containsInOrder only reports a match when
// all substrings are found in the input, in order and without overlapping.
func TestContainsInOrder(t *testing.T) {
	const input = "abcdefghilmno"

	cases := []struct {
		contains []string
		expected bool
	}{
		// All substrings present, in order.
		{contains: []string{"ab", "cd", "no"}, expected: true},
		{contains: []string{"def", "hil"}, expected: true},
		// Substring not present at all.
		{contains: []string{"ac"}, expected: false},
		// Last substring only occurs overlapping the previous match.
		{contains: []string{"ab", "cd", "de"}, expected: false},
		// Substrings present but in the wrong order.
		{contains: []string{"cd", "ab"}, expected: false},
	}

	for _, c := range cases {
		require.Equal(t, c.expected, containsInOrder(input, c.contains), "contains: %v", c.contains)
	}
}
func getTestNameFromRegexp(re string) string { func getTestNameFromRegexp(re string) string {
if len(re) > 32 { if len(re) > 32 {
return re[:32] return re[:32]