mirror of
https://github.com/prometheus/prometheus.git
synced 2024-12-26 22:19:40 -08:00
Merge pull request #19 from grafana/regexpopti
Optimize most common regexp
This commit is contained in:
commit
62935a1241
|
@ -24,10 +24,11 @@ const maxSetMatches = 256
|
|||
type FastRegexMatcher struct {
|
||||
re *regexp.Regexp
|
||||
|
||||
setMatches []string
|
||||
prefix string
|
||||
suffix string
|
||||
contains string
|
||||
setMatches []string
|
||||
stringMatcher StringMatcher
|
||||
prefix string
|
||||
suffix string
|
||||
contains string
|
||||
}
|
||||
|
||||
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
|
||||
|
@ -42,13 +43,13 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
|
|||
return nil, err
|
||||
}
|
||||
m := &FastRegexMatcher{
|
||||
re: re,
|
||||
setMatches: findSetMatches(parsed, ""),
|
||||
re: re,
|
||||
}
|
||||
|
||||
if parsed.Op == syntax.OpConcat {
|
||||
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
|
||||
}
|
||||
m.setMatches = findSetMatches(parsed, "")
|
||||
m.stringMatcher = stringMatcherFromRegexp(parsed)
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
@ -202,6 +203,9 @@ func (m *FastRegexMatcher) MatchString(s string) bool {
|
|||
if m.contains != "" && !strings.Contains(s, m.contains) {
|
||||
return false
|
||||
}
|
||||
if m.stringMatcher != nil {
|
||||
return m.stringMatcher.Matches(s)
|
||||
}
|
||||
return m.re.MatchString(s)
|
||||
}
|
||||
|
||||
|
@ -253,3 +257,179 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) {
|
|||
|
||||
return
|
||||
}
|
||||
|
||||
// StringMatcher decides whether a string is accepted, using plain string
// operations where a regular expression would otherwise be evaluated.
type StringMatcher interface {
	// Matches reports whether s is accepted by the matcher.
	Matches(s string) bool
}
|
||||
|
||||
// stringMatcherFromRegexp attempts to replace a common regexp with a string matcher.
|
||||
// It returns nil if the regexp is not supported.
|
||||
// For examples, it will replace `.*foo` with `foo.*` and `.*foo.*` with `(?i)foo`.
|
||||
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
|
||||
clearCapture(re)
|
||||
clearBeginEndText(re)
|
||||
switch re.Op {
|
||||
case syntax.OpPlus, syntax.OpStar:
|
||||
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
|
||||
return nil
|
||||
}
|
||||
return &anyStringMatcher{
|
||||
allowEmpty: re.Op == syntax.OpStar,
|
||||
matchNL: re.Sub[0].Op == syntax.OpAnyChar,
|
||||
}
|
||||
case syntax.OpEmptyMatch:
|
||||
return emptyStringMatcher{}
|
||||
|
||||
case syntax.OpLiteral:
|
||||
return &equalStringMatcher{
|
||||
s: string(re.Rune),
|
||||
caseSensitive: !isCaseInsensitive(re),
|
||||
}
|
||||
case syntax.OpAlternate:
|
||||
or := make([]StringMatcher, 0, len(re.Sub))
|
||||
for _, sub := range re.Sub {
|
||||
m := stringMatcherFromRegexp(sub)
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
or = append(or, m)
|
||||
}
|
||||
return orStringMatcher(or)
|
||||
case syntax.OpConcat:
|
||||
clearCapture(re.Sub...)
|
||||
if len(re.Sub) == 0 {
|
||||
return emptyStringMatcher{}
|
||||
}
|
||||
if len(re.Sub) == 1 {
|
||||
return stringMatcherFromRegexp(re.Sub[0])
|
||||
}
|
||||
var left, right StringMatcher
|
||||
// Let's try to find if there's a first and last any matchers.
|
||||
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar {
|
||||
left = stringMatcherFromRegexp(re.Sub[0])
|
||||
if left == nil {
|
||||
return nil
|
||||
}
|
||||
re.Sub = re.Sub[1:]
|
||||
}
|
||||
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar {
|
||||
right = stringMatcherFromRegexp(re.Sub[len(re.Sub)-1])
|
||||
if right == nil {
|
||||
return nil
|
||||
}
|
||||
re.Sub = re.Sub[:len(re.Sub)-1]
|
||||
}
|
||||
// findSetMatches will returns only literals that are case sensitive.
|
||||
matches := findSetMatches(re, "")
|
||||
if left == nil && right == nil && len(matches) > 0 {
|
||||
// if there's no any matchers on both side it's a concat of literals
|
||||
|
||||
or := make([]StringMatcher, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
or = append(or, &equalStringMatcher{
|
||||
s: match,
|
||||
caseSensitive: true,
|
||||
})
|
||||
}
|
||||
return orStringMatcher(or)
|
||||
}
|
||||
// others we found literals in the middle.
|
||||
if len(matches) > 0 {
|
||||
return &containsStringMatcher{
|
||||
substrings: matches,
|
||||
left: left,
|
||||
right: right,
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// containsStringMatcher matches a string if it contains any of the substrings.
|
||||
// If left and right are not nil, it's a contains operation where left and right must match.
|
||||
// If left is nil, it's a hasPrefix operation and right must match.
|
||||
// Finally if right is nil it's a hasSuffix operation and left must match.
|
||||
type containsStringMatcher struct {
|
||||
substrings []string
|
||||
left StringMatcher
|
||||
right StringMatcher
|
||||
}
|
||||
|
||||
func (m *containsStringMatcher) Matches(s string) bool {
|
||||
for _, substr := range m.substrings {
|
||||
if m.right != nil && m.left != nil {
|
||||
pos := strings.Index(s, substr)
|
||||
if pos < 0 {
|
||||
continue
|
||||
}
|
||||
if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(substr):]) {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
// If we have to check for characters on the left then we need to match a suffix.
|
||||
if m.left != nil {
|
||||
if strings.HasSuffix(s, substr) && m.left.Matches(s[:len(s)-len(substr)]) {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
if m.right != nil {
|
||||
if strings.HasPrefix(s, substr) && m.right.Matches(s[len(substr):]) {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// emptyStringMatcher accepts the empty string and nothing else.
type emptyStringMatcher struct{}

// Matches reports whether s is the empty string.
func (m emptyStringMatcher) Matches(s string) bool {
	return s == ""
}
|
||||
|
||||
// orStringMatcher matches any of the sub-matchers.
|
||||
type orStringMatcher []StringMatcher
|
||||
|
||||
func (m orStringMatcher) Matches(s string) bool {
|
||||
for _, matcher := range m {
|
||||
if matcher.Matches(s) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// equalStringMatcher accepts exactly one string, compared either byte for
// byte or, when caseSensitive is false, under Unicode case folding.
type equalStringMatcher struct {
	s             string
	caseSensitive bool
}

// Matches reports whether s equals the configured string.
func (m *equalStringMatcher) Matches(s string) bool {
	if !m.caseSensitive {
		return strings.EqualFold(m.s, s)
	}
	return s == m.s
}
|
||||
|
||||
// anyStringMatcher stands for the `.*` and `.+` patterns. It accepts every
// string, except that the empty string is rejected unless allowEmpty is set
// and strings containing a newline are rejected unless matchNL is set.
type anyStringMatcher struct {
	allowEmpty bool
	matchNL    bool
}

// Matches reports whether s passes the empty-string and newline restrictions.
func (m *anyStringMatcher) Matches(s string) bool {
	switch {
	case s == "" && !m.allowEmpty:
		return false
	case !m.matchNL && strings.ContainsRune(s, '\n'):
		return false
	default:
		return true
	}
}
|
||||
|
|
|
@ -14,48 +14,95 @@
|
|||
package labels
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"regexp/syntax"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestNewFastRegexMatcher(t *testing.T) {
|
||||
cases := []struct {
|
||||
regex string
|
||||
value string
|
||||
expected bool
|
||||
}{
|
||||
{regex: "(foo|bar)", value: "foo", expected: true},
|
||||
{regex: "(foo|bar)", value: "foo bar", expected: false},
|
||||
{regex: "(foo|bar)", value: "bar", expected: true},
|
||||
{regex: "foo.*", value: "foo bar", expected: true},
|
||||
{regex: "foo.*", value: "bar foo", expected: false},
|
||||
{regex: ".*foo", value: "foo bar", expected: false},
|
||||
{regex: ".*foo", value: "bar foo", expected: true},
|
||||
{regex: ".*foo", value: "foo", expected: true},
|
||||
{regex: "^.*foo$", value: "foo", expected: true},
|
||||
{regex: "^.+foo$", value: "foo", expected: false},
|
||||
{regex: "^.+foo$", value: "bfoo", expected: true},
|
||||
{regex: ".*", value: "\n", expected: false},
|
||||
{regex: ".*", value: "\nfoo", expected: false},
|
||||
{regex: ".*foo", value: "\nfoo", expected: false},
|
||||
{regex: "foo.*", value: "foo\n", expected: false},
|
||||
{regex: "foo\n.*", value: "foo\n", expected: true},
|
||||
{regex: ".*foo.*", value: "foo", expected: true},
|
||||
{regex: ".*foo.*", value: "foo bar", expected: true},
|
||||
{regex: ".*foo.*", value: "hello foo world", expected: true},
|
||||
{regex: ".*foo.*", value: "hello foo\n world", expected: false},
|
||||
{regex: ".*foo\n.*", value: "hello foo\n world", expected: true},
|
||||
{regex: ".*", value: "foo", expected: true},
|
||||
{regex: "", value: "foo", expected: false},
|
||||
{regex: "", value: "", expected: true},
|
||||
}
|
||||
// init seeds the package-level math/rand source from the current time so the
// strings produced through rand.Intn differ from one run to the next.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}
|
||||
|
||||
// letterRunes is the alphabet random strings are drawn from: the ASCII
// letters, lower case first.
var letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

// regexes lists the patterns exercised against every entry of values.
var regexes = []string{
	"(foo|bar)",
	"foo.*",
	".*foo",
	"^.*foo$",
	"^.+foo$",
	".*",
	".+",
	"foo.+",
	".+foo",
	"foo\n.+",
	"foo\n.*",
	".*foo.*",
	".+foo.+",
	"",
	"(?s:.*)",
	"(?s:.+)",
	"(?s:^.*foo$)",
	"^(?i:foo|oo)|(bar)$",
	"((.*)(bar|b|buzz)(.+)|foo)$",
	"^$",
	"(prometheus|api_prom)_api_v1_.+",
	"10\\.0\\.(1|2)\\.+",
	"10\\.0\\.(1|2).+",
	"((fo(bar))|.+foo)",
}

// values lists the inputs every pattern of regexes is matched against.
var values = []string{
	"foo",
	" foo bar",
	"bar",
	"buzz\nbar",
	"bar foo",
	"bfoo",
	"\n",
	"\nfoo",
	"foo\n",
	"hello foo world",
	"hello foo\n world",
	"",
	"FOO",
	"Foo",
	"OO",
	"Oo",
	"\nfoo\n",
	strings.Repeat("f", 20),
	"prometheus",
	"prometheus_api_v1",
	"prometheus_api_v1_foo",
	"10.0.1.20",
	"10.0.2.10",
	"10.0.3.30",
	"10.0.4.40",
}
|
||||
|
||||
func TestNewFastRegexMatcher(t *testing.T) {
|
||||
for _, r := range regexes {
|
||||
r := r
|
||||
for _, v := range values {
|
||||
v := v
|
||||
t.Run(r+` on "`+v+`"`, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
m, err := NewFastRegexMatcher(r)
|
||||
require.NoError(t, err)
|
||||
re, err := regexp.Compile("^(?:" + r + ")$")
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, re.MatchString(v), m.MatchString(v))
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNewFastRegexMatcher(b *testing.B) {
|
||||
benchValues := values
|
||||
for _, v := range values {
|
||||
for i := 5; i < 50; i = i + 5 {
|
||||
benchValues = append(benchValues, v+RandStringRunes(i))
|
||||
benchValues = append(benchValues, RandStringRunes(i)+v+RandStringRunes(i))
|
||||
benchValues = append(benchValues, RandStringRunes(i)+v)
|
||||
}
|
||||
}
|
||||
for _, r := range regexes {
|
||||
r := r
|
||||
b.Run(r, func(b *testing.B) {
|
||||
m, err := NewFastRegexMatcher(r)
|
||||
require.NoError(b, err)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, v := range benchValues {
|
||||
_ = m.MatchString(v)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
for _, c := range cases {
|
||||
m, err := NewFastRegexMatcher(c.regex)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, c.expected, m.MatchString(c.value))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -158,3 +205,74 @@ func TestFindSetMatches(t *testing.T) {
|
|||
|
||||
}
|
||||
}
|
||||
|
||||
func Test_OptimizeRegex(t *testing.T) {
|
||||
for _, c := range []struct {
|
||||
pattern string
|
||||
exp StringMatcher
|
||||
}{
|
||||
{".*", &anyStringMatcher{allowEmpty: true, matchNL: false}},
|
||||
{".*?", &anyStringMatcher{allowEmpty: true, matchNL: false}},
|
||||
{"(?s:.*)", &anyStringMatcher{allowEmpty: true, matchNL: true}},
|
||||
{"(.*)", &anyStringMatcher{allowEmpty: true, matchNL: false}},
|
||||
{"^.*$", &anyStringMatcher{allowEmpty: true, matchNL: false}},
|
||||
{".+", &anyStringMatcher{allowEmpty: false, matchNL: false}},
|
||||
{"(?s:.+)", &anyStringMatcher{allowEmpty: false, matchNL: true}},
|
||||
{"^.+$", &anyStringMatcher{allowEmpty: false, matchNL: false}},
|
||||
{"(.+)", &anyStringMatcher{allowEmpty: false, matchNL: false}},
|
||||
{"", emptyStringMatcher{}},
|
||||
{"^$", emptyStringMatcher{}},
|
||||
{"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}},
|
||||
{"^(?i:foo)$", &equalStringMatcher{s: "FOO", caseSensitive: false}},
|
||||
{"^(?i:foo)|(bar)$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
|
||||
{"^(?i:foo|oo)|(bar)$", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}}), &equalStringMatcher{s: "bar", caseSensitive: true}})},
|
||||
{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
|
||||
{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
|
||||
{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
|
||||
{"10\\.0\\.(1|2)\\.+", nil},
|
||||
{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
|
||||
{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
|
||||
{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}},
|
||||
{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: false, matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
|
||||
{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}})},
|
||||
{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyStringMatcher{allowEmpty: false, matchNL: false}, right: nil}},
|
||||
// we don't support case insensitive matching for contains.
|
||||
// This is because there's no strings.IndexOfFold function.
|
||||
// We can revisit later if this is really popular by using strings.ToUpper.
|
||||
{"^(.*)((?i)foo|foobar)(.*)$", nil},
|
||||
{"(api|rpc)_(v1|prom)_((?i)push|query)", nil},
|
||||
{"[a-z][a-z]", nil},
|
||||
{"[1^3]", nil},
|
||||
{".*foo.*bar.*", nil},
|
||||
{`\d*`, nil},
|
||||
{".", nil},
|
||||
// This one is not supported because `stringMatcherFromRegexp` is not reentrant for syntax.OpConcat.
|
||||
// It would make the code too complex to handle it.
|
||||
{"/|/bar.*", nil},
|
||||
{"(.+)/(foo.*|bar$)", nil},
|
||||
} {
|
||||
c := c
|
||||
t.Run(c.pattern, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
parsed, err := syntax.Parse(c.pattern, syntax.Perl)
|
||||
require.NoError(t, err)
|
||||
matches := stringMatcherFromRegexp(parsed)
|
||||
require.Equal(t, c.exp, matches)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func RandStringRunes(n int) string {
|
||||
b := make([]rune, n)
|
||||
for i := range b {
|
||||
b[i] = letterRunes[rand.Intn(len(letterRunes))]
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue