Merge pull request #19 from grafana/regexpopti

Optimize most common regexp
This commit is contained in:
Cyril Tovena 2021-10-11 14:15:51 +02:00 committed by GitHub
commit 62935a1241
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 340 additions and 42 deletions

View file

@ -24,10 +24,11 @@ const maxSetMatches = 256
type FastRegexMatcher struct {
re *regexp.Regexp
setMatches []string
prefix string
suffix string
contains string
setMatches []string
stringMatcher StringMatcher
prefix string
suffix string
contains string
}
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
@ -42,13 +43,13 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
return nil, err
}
m := &FastRegexMatcher{
re: re,
setMatches: findSetMatches(parsed, ""),
re: re,
}
if parsed.Op == syntax.OpConcat {
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
}
m.setMatches = findSetMatches(parsed, "")
m.stringMatcher = stringMatcherFromRegexp(parsed)
return m, nil
}
@ -202,6 +203,9 @@ func (m *FastRegexMatcher) MatchString(s string) bool {
if m.contains != "" && !strings.Contains(s, m.contains) {
return false
}
if m.stringMatcher != nil {
return m.stringMatcher.Matches(s)
}
return m.re.MatchString(s)
}
@ -253,3 +257,179 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) {
return
}
// StringMatcher is a matcher that matches a string in place of a regular expression.
// Implementations in this file are produced by stringMatcherFromRegexp.
type StringMatcher interface {
// Matches reports whether s is accepted by the matcher.
Matches(s string) bool
}
// stringMatcherFromRegexp attempts to replace a common regexp with a string matcher.
// It returns nil if the regexp is not supported.
//
// For example `.*` and `.+` become an anyStringMatcher, a plain literal becomes
// an equalStringMatcher, an alternation becomes an orStringMatcher, and a
// concatenation such as `.*foo`, `foo.*` or `.*foo.*` becomes a
// containsStringMatcher (suffix, prefix and substring check respectively).
//
// re is passed through clearCapture and clearBeginEndText first, and the
// sub-expressions of a concatenation are trimmed in place while it is
// analysed, so callers must not rely on re being left untouched.
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
	clearCapture(re)
	clearBeginEndText(re)

	switch re.Op {
	case syntax.OpEmptyMatch:
		return emptyStringMatcher{}

	case syntax.OpLiteral:
		return &equalStringMatcher{
			s:             string(re.Rune),
			caseSensitive: !isCaseInsensitive(re),
		}

	case syntax.OpStar, syntax.OpPlus:
		// Only repetitions of "any character" are supported.
		repeated := re.Sub[0].Op
		if repeated == syntax.OpAnyChar || repeated == syntax.OpAnyCharNotNL {
			return &anyStringMatcher{
				allowEmpty: re.Op == syntax.OpStar,
				matchNL:    repeated == syntax.OpAnyChar,
			}
		}
		return nil

	case syntax.OpAlternate:
		// Every branch must be convertible, otherwise the whole alternation is unsupported.
		branches := make([]StringMatcher, len(re.Sub))
		for i, branch := range re.Sub {
			converted := stringMatcherFromRegexp(branch)
			if converted == nil {
				return nil
			}
			branches[i] = converted
		}
		return orStringMatcher(branches)

	case syntax.OpConcat:
		clearCapture(re.Sub...)
		switch len(re.Sub) {
		case 0:
			return emptyStringMatcher{}
		case 1:
			return stringMatcherFromRegexp(re.Sub[0])
		}

		// Peel off a leading and/or trailing repetition; what remains in
		// re.Sub is the part expected to be made of literals.
		var left, right StringMatcher
		if head := re.Sub[0]; head.Op == syntax.OpPlus || head.Op == syntax.OpStar {
			if left = stringMatcherFromRegexp(head); left == nil {
				return nil
			}
			re.Sub = re.Sub[1:]
		}
		if tail := re.Sub[len(re.Sub)-1]; tail.Op == syntax.OpPlus || tail.Op == syntax.OpStar {
			if right = stringMatcherFromRegexp(tail); right == nil {
				return nil
			}
			re.Sub = re.Sub[:len(re.Sub)-1]
		}

		// findSetMatches only returns literals that are case sensitive.
		literals := findSetMatches(re, "")
		if len(literals) == 0 {
			return nil
		}
		if left != nil || right != nil {
			// Literals surrounded by at least one "any" matcher.
			return &containsStringMatcher{
				substrings: literals,
				left:       left,
				right:      right,
			}
		}
		// No "any" matcher on either side: the concatenation is a plain set of literals.
		exact := make([]StringMatcher, len(literals))
		for i, literal := range literals {
			exact[i] = &equalStringMatcher{
				s:             literal,
				caseSensitive: true,
			}
		}
		return orStringMatcher(exact)
	}
	return nil
}
// containsStringMatcher matches a string if it contains any of the substrings.
// If left and right are not nil, it's a contains operation where left and right must match.
// If left is nil, it's a hasPrefix operation and right must match.
// Finally if right is nil it's a hasSuffix operation and left must match.
// When both left and right are nil the matcher never matches.
type containsStringMatcher struct {
	substrings []string
	left       StringMatcher
	right      StringMatcher
}

// Matches reports whether s can be split as <left><substring><right> for at
// least one of the configured substrings, where the text before the substring
// satisfies left and the text after it satisfies right (a nil side must be
// empty, i.e. the substring is anchored to that end of s).
func (m *containsStringMatcher) Matches(s string) bool {
	for _, substr := range m.substrings {
		switch {
		case m.left != nil && m.right != nil:
			// The substring may occur several times and only a later
			// occurrence may satisfy both sides (e.g. `.+foo.*` on "foofoo"),
			// so every occurrence has to be tried, not just the first one.
			for offset := 0; offset <= len(s); {
				idx := strings.Index(s[offset:], substr)
				if idx < 0 {
					break
				}
				// idx is relative to s[offset:]; convert it to a position in s.
				pos := offset + idx
				if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(substr):]) {
					return true
				}
				offset = pos + 1
			}

		case m.left != nil:
			// If we have to check for characters on the left then we need to match a suffix.
			if strings.HasSuffix(s, substr) && m.left.Matches(s[:len(s)-len(substr)]) {
				return true
			}

		case m.right != nil:
			// Only the right side is constrained, so the substring must be a prefix.
			if strings.HasPrefix(s, substr) && m.right.Matches(s[len(substr):]) {
				return true
			}
		}
	}
	return false
}
// emptyStringMatcher accepts the empty string and nothing else.
type emptyStringMatcher struct{}

// Matches reports whether s is the empty string.
func (m emptyStringMatcher) Matches(s string) bool {
	return s == ""
}
// orStringMatcher is a list of matchers combined with a logical OR.
type orStringMatcher []StringMatcher

// Matches reports whether at least one of the sub-matchers accepts s.
// Sub-matchers are tried in order and evaluation stops at the first success;
// an empty list matches nothing.
func (m orStringMatcher) Matches(s string) bool {
	for i := range m {
		if m[i].Matches(s) {
			return true
		}
	}
	return false
}
// equalStringMatcher matches a string that is equal to s, optionally ignoring case.
type equalStringMatcher struct {
	s             string
	caseSensitive bool
}

// Matches reports whether s equals the configured string. The comparison is
// exact when caseSensitive is set and uses Unicode case folding otherwise.
func (m *equalStringMatcher) Matches(s string) bool {
	if !m.caseSensitive {
		return strings.EqualFold(m.s, s)
	}
	return s == m.s
}
// anyStringMatcher is a matcher that matches any string.
// It stands in for `.*` and `.+`: allowEmpty tells whether the empty string is
// accepted (the * operator) and matchNL tells whether newlines are accepted.
type anyStringMatcher struct {
	allowEmpty bool
	matchNL    bool
}

// Matches reports whether s is accepted: the empty string only when
// allowEmpty is set, and strings containing a newline only when matchNL is set.
func (m *anyStringMatcher) Matches(s string) bool {
	switch {
	case s == "":
		return m.allowEmpty
	case m.matchNL:
		return true
	default:
		return !strings.Contains(s, "\n")
	}
}

View file

@ -14,48 +14,95 @@
package labels
import (
"math/rand"
"regexp"
"regexp/syntax"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestNewFastRegexMatcher(t *testing.T) {
cases := []struct {
regex string
value string
expected bool
}{
{regex: "(foo|bar)", value: "foo", expected: true},
{regex: "(foo|bar)", value: "foo bar", expected: false},
{regex: "(foo|bar)", value: "bar", expected: true},
{regex: "foo.*", value: "foo bar", expected: true},
{regex: "foo.*", value: "bar foo", expected: false},
{regex: ".*foo", value: "foo bar", expected: false},
{regex: ".*foo", value: "bar foo", expected: true},
{regex: ".*foo", value: "foo", expected: true},
{regex: "^.*foo$", value: "foo", expected: true},
{regex: "^.+foo$", value: "foo", expected: false},
{regex: "^.+foo$", value: "bfoo", expected: true},
{regex: ".*", value: "\n", expected: false},
{regex: ".*", value: "\nfoo", expected: false},
{regex: ".*foo", value: "\nfoo", expected: false},
{regex: "foo.*", value: "foo\n", expected: false},
{regex: "foo\n.*", value: "foo\n", expected: true},
{regex: ".*foo.*", value: "foo", expected: true},
{regex: ".*foo.*", value: "foo bar", expected: true},
{regex: ".*foo.*", value: "hello foo world", expected: true},
{regex: ".*foo.*", value: "hello foo\n world", expected: false},
{regex: ".*foo\n.*", value: "hello foo\n world", expected: true},
{regex: ".*", value: "foo", expected: true},
{regex: "", value: "foo", expected: false},
{regex: "", value: "", expected: true},
}
// init seeds the global math/rand source from the wall clock so that
// RandStringRunes yields different strings on every run.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}
// Fixtures shared by the matcher test and benchmark in this file.
var (
// letterRunes is the alphabet RandStringRunes draws its characters from.
letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
// regexes lists the patterns exercised by TestNewFastRegexMatcher and
// BenchmarkNewFastRegexMatcher.
regexes = []string{
"(foo|bar)",
"foo.*",
".*foo",
"^.*foo$",
"^.+foo$",
".*",
".+",
"foo.+",
".+foo",
"foo\n.+",
"foo\n.*",
".*foo.*",
".+foo.+",
"",
"(?s:.*)",
"(?s:.+)",
"(?s:^.*foo$)",
"^(?i:foo|oo)|(bar)$",
"((.*)(bar|b|buzz)(.+)|foo)$",
"^$",
"(prometheus|api_prom)_api_v1_.+",
"10\\.0\\.(1|2)\\.+",
"10\\.0\\.(1|2).+",
"((fo(bar))|.+foo)",
}
// values lists the inputs every pattern of regexes is matched against.
values = []string{
"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
"FOO", "Foo", "OO", "Oo", "\nfoo\n", strings.Repeat("f", 20), "prometheus", "prometheus_api_v1", "prometheus_api_v1_foo",
"10.0.1.20", "10.0.2.10", "10.0.3.30", "10.0.4.40",
}
)
// TestNewFastRegexMatcher checks, for every pattern in regexes and every input
// in values, that FastRegexMatcher gives the same answer as the standard
// regexp package evaluating the fully anchored pattern `^(?:pattern)$`.
func TestNewFastRegexMatcher(t *testing.T) {
	for _, pattern := range regexes {
		for _, input := range values {
			// Capture per-iteration copies for the parallel subtest closure.
			pattern, input := pattern, input
			name := pattern + ` on "` + input + `"`
			t.Run(name, func(t *testing.T) {
				t.Parallel()

				fast, err := NewFastRegexMatcher(pattern)
				require.NoError(t, err)

				reference, err := regexp.Compile("^(?:" + pattern + ")$")
				require.NoError(t, err)

				require.Equal(t, reference.MatchString(input), fast.MatchString(input))
			})
		}
	}
}
func BenchmarkNewFastRegexMatcher(b *testing.B) {
benchValues := values
for _, v := range values {
for i := 5; i < 50; i = i + 5 {
benchValues = append(benchValues, v+RandStringRunes(i))
benchValues = append(benchValues, RandStringRunes(i)+v+RandStringRunes(i))
benchValues = append(benchValues, RandStringRunes(i)+v)
}
}
for _, r := range regexes {
r := r
b.Run(r, func(b *testing.B) {
m, err := NewFastRegexMatcher(r)
require.NoError(b, err)
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, v := range benchValues {
_ = m.MatchString(v)
}
}
})
for _, c := range cases {
m, err := NewFastRegexMatcher(c.regex)
require.NoError(t, err)
require.Equal(t, c.expected, m.MatchString(c.value))
}
}
@ -158,3 +205,74 @@ func TestFindSetMatches(t *testing.T) {
}
}
// Test_OptimizeRegex parses each pattern with Perl syntax and checks that
// stringMatcherFromRegexp returns exactly the expected StringMatcher, or nil
// for patterns that are not optimized.
func Test_OptimizeRegex(t *testing.T) {
for _, c := range []struct {
pattern string
exp StringMatcher
}{
// Repetitions of "any character", with and without captures, anchors and the (?s) flag.
{".*", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{".*?", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{"(?s:.*)", &anyStringMatcher{allowEmpty: true, matchNL: true}},
{"(.*)", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{"^.*$", &anyStringMatcher{allowEmpty: true, matchNL: false}},
{".+", &anyStringMatcher{allowEmpty: false, matchNL: false}},
{"(?s:.+)", &anyStringMatcher{allowEmpty: false, matchNL: true}},
{"^.+$", &anyStringMatcher{allowEmpty: false, matchNL: false}},
{"(.+)", &anyStringMatcher{allowEmpty: false, matchNL: false}},
// Empty patterns, literals and alternations of literals.
{"", emptyStringMatcher{}},
{"^$", emptyStringMatcher{}},
{"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}},
{"^(?i:foo)$", &equalStringMatcher{s: "FOO", caseSensitive: false}},
{"^(?i:foo)|(bar)$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
{"^(?i:foo|oo)|(bar)$", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}}), &equalStringMatcher{s: "bar", caseSensitive: true}})},
// Literals surrounded by "any character" repetitions on one or both sides.
{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"10\\.0\\.(1|2)\\.+", nil},
{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}})},
{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
// we don't support case insensitive matching for contains.
// This is because there's no strings.IndexOfFold function.
// We can revisit later if this is really popular by using strings.ToUpper.
{"^(.*)((?i)foo|foobar)(.*)$", nil},
{"(api|rpc)_(v1|prom)_((?i)push|query)", nil},
// Patterns that are not optimized at all.
{"[a-z][a-z]", nil},
{"[1^3]", nil},
{".*foo.*bar.*", nil},
{`\d*`, nil},
{".", nil},
// This one is not supported because `stringMatcherFromRegexp` is not reentrant for syntax.OpConcat.
// It would make the code too complex to handle it.
{"/|/bar.*", nil},
{"(.+)/(foo.*|bar$)", nil},
} {
// Copy the loop variable: the parallel subtest closure outlives the iteration.
c := c
t.Run(c.pattern, func(t *testing.T) {
t.Parallel()
parsed, err := syntax.Parse(c.pattern, syntax.Perl)
require.NoError(t, err)
matches := stringMatcherFromRegexp(parsed)
require.Equal(t, c.exp, matches)
})
}
}
// RandStringRunes returns a string of n runes, each picked at random from
// letterRunes using the global math/rand source.
func RandStringRunes(n int) string {
	picked := make([]rune, n)
	for i := 0; i < n; i++ {
		picked[i] = letterRunes[rand.Intn(len(letterRunes))]
	}
	return string(picked)
}