mirror of
https://github.com/prometheus/prometheus.git
synced 2025-03-05 20:59:13 -08:00
Optimized FastRegexMatcher when the regex contains a case insensitive alternation made with concats too (#430)
* Optimized FastRegexMatcher when the regex contains a case insensitive alternation made with concats too Signed-off-by: Marco Pracucci <marco@pracucci.com> * Do not use a pointer to hold whether the matches are case sensitive Signed-off-by: Marco Pracucci <marco@pracucci.com> * Improved unit tests based on review feedback Signed-off-by: Marco Pracucci <marco@pracucci.com> --------- Signed-off-by: Marco Pracucci <marco@pracucci.com>
This commit is contained in:
parent
d26f584bfd
commit
c77900d58e
model/labels
|
@ -49,27 +49,26 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
|
|||
if parsed.Op == syntax.OpConcat {
|
||||
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
|
||||
}
|
||||
m.setMatches = findSetMatches(parsed, "")
|
||||
if matches, caseSensitive := findSetMatches(parsed, ""); caseSensitive {
|
||||
m.setMatches = matches
|
||||
}
|
||||
m.stringMatcher = stringMatcherFromRegexp(parsed)
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// findSetMatches extracts equality matches from a regexp.
|
||||
// Returns nil if we can't replace the regexp by only equality matchers.
|
||||
func findSetMatches(re *syntax.Regexp, base string) []string {
|
||||
// Matches are case sensitive, if we find a case insensitive regexp.
|
||||
// We have to abort.
|
||||
if isCaseInsensitive(re) {
|
||||
return nil
|
||||
}
|
||||
// Returns nil if we can't replace the regexp by only equality matchers or the regexp contains
|
||||
// a mix of case sensitive and case insensitive matchers.
|
||||
func findSetMatches(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) {
|
||||
clearBeginEndText(re)
|
||||
|
||||
switch re.Op {
|
||||
case syntax.OpLiteral:
|
||||
return []string{base + string(re.Rune)}
|
||||
return []string{base + string(re.Rune)}, isCaseSensitive(re)
|
||||
case syntax.OpEmptyMatch:
|
||||
if base != "" {
|
||||
return []string{base}
|
||||
return []string{base}, isCaseSensitive(re)
|
||||
}
|
||||
case syntax.OpAlternate:
|
||||
return findSetMatchesFromAlternate(re, base)
|
||||
|
@ -80,7 +79,7 @@ func findSetMatches(re *syntax.Regexp, base string) []string {
|
|||
return findSetMatchesFromConcat(re, base)
|
||||
case syntax.OpCharClass:
|
||||
if len(re.Rune)%2 != 0 {
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
var matches []string
|
||||
var totalSet int
|
||||
|
@ -91,60 +90,82 @@ func findSetMatches(re *syntax.Regexp, base string) []string {
|
|||
// In some cases, like the negation [^0-9], a lot of possibilities exist and that
|
||||
// can create thousands of possible matches, at which point we're better off using regexp.
|
||||
if totalSet > maxSetMatches {
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
for i := 0; i+1 < len(re.Rune); i = i + 2 {
|
||||
lo, hi := re.Rune[i], re.Rune[i+1]
|
||||
for c := lo; c <= hi; c++ {
|
||||
matches = append(matches, base+string(c))
|
||||
}
|
||||
|
||||
}
|
||||
return matches
|
||||
return matches, isCaseSensitive(re)
|
||||
default:
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
|
||||
func findSetMatchesFromConcat(re *syntax.Regexp, base string) []string {
|
||||
func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
|
||||
if len(re.Sub) == 0 {
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
clearCapture(re.Sub...)
|
||||
matches := []string{base}
|
||||
|
||||
matches = []string{base}
|
||||
|
||||
for i := 0; i < len(re.Sub); i++ {
|
||||
var newMatches []string
|
||||
for _, b := range matches {
|
||||
m := findSetMatches(re.Sub[i], b)
|
||||
for j, b := range matches {
|
||||
m, caseSensitive := findSetMatches(re.Sub[i], b)
|
||||
if m == nil {
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
if tooManyMatches(newMatches, m...) {
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// All matches must have the same case sensitivity. If it's the first set of matches
|
||||
// returned, we store its sensitivity as the expected case, and then we'll check all
|
||||
// other ones.
|
||||
if i == 0 && j == 0 {
|
||||
matchesCaseSensitive = caseSensitive
|
||||
}
|
||||
if matchesCaseSensitive != caseSensitive {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
newMatches = append(newMatches, m...)
|
||||
}
|
||||
matches = newMatches
|
||||
}
|
||||
|
||||
return matches
|
||||
return matches, matchesCaseSensitive
|
||||
}
|
||||
|
||||
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) []string {
|
||||
var setMatches []string
|
||||
for _, sub := range re.Sub {
|
||||
found := findSetMatches(sub, base)
|
||||
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
|
||||
for i, sub := range re.Sub {
|
||||
found, caseSensitive := findSetMatches(sub, base)
|
||||
if found == nil {
|
||||
return nil
|
||||
return nil, false
|
||||
}
|
||||
if tooManyMatches(setMatches, found...) {
|
||||
return nil
|
||||
if tooManyMatches(matches, found...) {
|
||||
return nil, false
|
||||
}
|
||||
setMatches = append(setMatches, found...)
|
||||
|
||||
// All matches must have the same case sensitivity. If it's the first set of matches
|
||||
// returned, we store its sensitivity as the expected case, and then we'll check all
|
||||
// other ones.
|
||||
if i == 0 {
|
||||
matchesCaseSensitive = caseSensitive
|
||||
}
|
||||
if matchesCaseSensitive != caseSensitive {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
matches = append(matches, found...)
|
||||
}
|
||||
return setMatches
|
||||
|
||||
return matches, matchesCaseSensitive
|
||||
}
|
||||
|
||||
// clearCapture removes capture operation as they are not used for matching.
|
||||
|
@ -184,6 +205,12 @@ func isCaseInsensitive(reg *syntax.Regexp) bool {
|
|||
return (reg.Flags & syntax.FoldCase) != 0
|
||||
}
|
||||
|
||||
// isCaseSensitive tells if a regexp is case sensitive.
|
||||
// The flag should be checked at each level of the syntax tree.
|
||||
func isCaseSensitive(reg *syntax.Regexp) bool {
|
||||
return !isCaseInsensitive(reg)
|
||||
}
|
||||
|
||||
// tooManyMatches guards against creating too many set matches
|
||||
func tooManyMatches(matches []string, new ...string) bool {
|
||||
return len(matches)+len(new) > maxSetMatches
|
||||
|
@ -273,6 +300,7 @@ type StringMatcher interface {
|
|||
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
|
||||
clearCapture(re)
|
||||
clearBeginEndText(re)
|
||||
|
||||
switch re.Op {
|
||||
case syntax.OpPlus, syntax.OpStar:
|
||||
if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
|
||||
|
@ -324,22 +352,28 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
|
|||
}
|
||||
re.Sub = re.Sub[:len(re.Sub)-1]
|
||||
}
|
||||
// findSetMatches will returns only literals that are case sensitive.
|
||||
matches := findSetMatches(re, "")
|
||||
if left == nil && right == nil && len(matches) > 0 {
|
||||
// if there's no any matchers on both side it's a concat of literals
|
||||
|
||||
matches, matchesCaseSensitive := findSetMatches(re, "")
|
||||
if len(matches) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if left == nil && right == nil {
|
||||
// If there are no "any" matchers on either side, it's a concat of literals.
|
||||
or := make([]StringMatcher, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
or = append(or, &equalStringMatcher{
|
||||
s: match,
|
||||
caseSensitive: true,
|
||||
caseSensitive: matchesCaseSensitive,
|
||||
})
|
||||
}
|
||||
return orStringMatcher(or)
|
||||
}
|
||||
// others we found literals in the middle.
|
||||
if len(matches) > 0 {
|
||||
|
||||
// We found literals in the middle. We can trigger the fast path only if
|
||||
// the matches are case sensitive because containsStringMatcher doesn't
|
||||
// support case insensitive matching.
|
||||
if matchesCaseSensitive {
|
||||
return &containsStringMatcher{
|
||||
substrings: matches,
|
||||
left: left,
|
||||
|
|
|
@ -147,60 +147,72 @@ func TestOptimizeConcatRegex(t *testing.T) {
|
|||
// Refer to https://github.com/prometheus/prometheus/issues/2651.
|
||||
func TestFindSetMatches(t *testing.T) {
|
||||
for _, c := range []struct {
|
||||
pattern string
|
||||
exp []string
|
||||
pattern string
|
||||
expMatches []string
|
||||
expCaseSensitive bool
|
||||
}{
|
||||
// Single value, coming from a `bar=~"foo"` selector.
|
||||
{"foo", []string{"foo"}},
|
||||
{"^foo", []string{"foo"}},
|
||||
{"^foo$", []string{"foo"}},
|
||||
{"foo", []string{"foo"}, true},
|
||||
{"^foo", []string{"foo"}, true},
|
||||
{"^foo$", []string{"foo"}, true},
|
||||
// Simple sets alternates.
|
||||
{"foo|bar|zz", []string{"foo", "bar", "zz"}},
|
||||
{"foo|bar|zz", []string{"foo", "bar", "zz"}, true},
|
||||
// Simple sets alternate and concat (bar|baz is parsed as "ba[rz]").
|
||||
{"foo|bar|baz", []string{"foo", "bar", "baz"}},
|
||||
{"foo|bar|baz", []string{"foo", "bar", "baz"}, true},
|
||||
// Simple sets alternate and concat and capture
|
||||
{"foo|bar|baz|(zz)", []string{"foo", "bar", "baz", "zz"}},
|
||||
{"foo|bar|baz|(zz)", []string{"foo", "bar", "baz", "zz"}, true},
|
||||
// Simple sets alternate and concat and alternates with empty matches
|
||||
// parsed as b(ar|(?:)|uzz) where b(?:) means literal b.
|
||||
{"bar|b|buzz", []string{"bar", "b", "buzz"}},
|
||||
{"bar|b|buzz", []string{"bar", "b", "buzz"}, true},
|
||||
// Skip anchors; they are enforced anyway at the root.
|
||||
{"(^bar$)|(b$)|(^buzz)", []string{"bar", "b", "buzz"}},
|
||||
{"(^bar$)|(b$)|(^buzz)", []string{"bar", "b", "buzz"}, true},
|
||||
// Simple sets containing escaped characters.
|
||||
{"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}},
|
||||
{"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}, true},
|
||||
// using charclass
|
||||
{"[abc]d", []string{"ad", "bd", "cd"}},
|
||||
{"[abc]d", []string{"ad", "bd", "cd"}, true},
|
||||
// high low charset different => A(B[CD]|EF)|BC[XY]
|
||||
{"ABC|ABD|AEF|BCX|BCY", []string{"ABC", "ABD", "AEF", "BCX", "BCY"}},
|
||||
{"ABC|ABD|AEF|BCX|BCY", []string{"ABC", "ABD", "AEF", "BCX", "BCY"}, true},
|
||||
// triple concat
|
||||
{"api_(v1|prom)_push", []string{"api_v1_push", "api_prom_push"}},
|
||||
{"api_(v1|prom)_push", []string{"api_v1_push", "api_prom_push"}, true},
|
||||
// triple concat with multiple alternates
|
||||
{"(api|rpc)_(v1|prom)_push", []string{"api_v1_push", "api_prom_push", "rpc_v1_push", "rpc_prom_push"}},
|
||||
{"(api|rpc)_(v1|prom)_(push|query)", []string{"api_v1_push", "api_v1_query", "api_prom_push", "api_prom_query", "rpc_v1_push", "rpc_v1_query", "rpc_prom_push", "rpc_prom_query"}},
|
||||
{"(api|rpc)_(v1|prom)_push", []string{"api_v1_push", "api_prom_push", "rpc_v1_push", "rpc_prom_push"}, true},
|
||||
{"(api|rpc)_(v1|prom)_(push|query)", []string{"api_v1_push", "api_v1_query", "api_prom_push", "api_prom_query", "rpc_v1_push", "rpc_v1_query", "rpc_prom_push", "rpc_prom_query"}, true},
|
||||
// class starting with "-"
|
||||
{"[-1-2][a-c]", []string{"-a", "-b", "-c", "1a", "1b", "1c", "2a", "2b", "2c"}},
|
||||
{"[1^3]", []string{"1", "3", "^"}},
|
||||
{"[-1-2][a-c]", []string{"-a", "-b", "-c", "1a", "1b", "1c", "2a", "2b", "2c"}, true},
|
||||
{"[1^3]", []string{"1", "3", "^"}, true},
|
||||
// OpPlus with concat
|
||||
{"(.+)/(foo|bar)", nil},
|
||||
{"(.+)/(foo|bar)", nil, false},
|
||||
// Simple sets containing special characters without escaping.
|
||||
{"fo.o|bar?|^baz", nil},
|
||||
{"fo.o|bar?|^baz", nil, false},
|
||||
// case insensitive wrapper.
|
||||
{"(?i)foo", nil},
|
||||
{"(?i)foo", []string{"FOO"}, false},
|
||||
// case insensitive wrapper on alternate.
|
||||
{"(?i)foo|bar|baz", nil},
|
||||
// case sensitive wrapper on concat.
|
||||
{"(api|rpc)_(v1|prom)_((?i)push|query)", nil},
|
||||
{"(?i)foo|bar|baz", []string{"FOO", "BAR", "BAZ", "BAr", "BAz"}, false},
|
||||
// mixed case sensitivity.
|
||||
{"(api|rpc)_(v1|prom)_((?i)push|query)", nil, false},
|
||||
// mixed case sensitivity concatenation only without capture group.
|
||||
{"api_v1_(?i)push", nil, false},
|
||||
// mixed case sensitivity alternation only without capture group.
|
||||
{"api|(?i)rpc", nil, false},
|
||||
// case sensitive after unsetting insensitivity.
|
||||
{"rpc|(?i)(?-i)api", []string{"rpc", "api"}, true},
|
||||
// case sensitive after unsetting insensitivity in all alternation options.
|
||||
{"(?i)((?-i)api|(?-i)rpc)", []string{"api", "rpc"}, true},
|
||||
// mixed case sensitivity after unsetting insensitivity.
|
||||
{"(?i)rpc|(?-i)api", nil, false},
|
||||
// too high charset combination
|
||||
{"(api|rpc)_[^0-9]", nil},
|
||||
{"(api|rpc)_[^0-9]", nil, false},
|
||||
// too many combinations
|
||||
{"[a-z][a-z]", nil},
|
||||
{"[a-z][a-z]", nil, false},
|
||||
} {
|
||||
c := c
|
||||
t.Run(c.pattern, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
parsed, err := syntax.Parse(c.pattern, syntax.Perl)
|
||||
require.NoError(t, err)
|
||||
matches := findSetMatches(parsed, "")
|
||||
require.Equal(t, c.exp, matches)
|
||||
matches, actualCaseSensitive := findSetMatches(parsed, "")
|
||||
require.Equal(t, c.expMatches, matches)
|
||||
require.Equal(t, c.expCaseSensitive, actualCaseSensitive)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -225,6 +237,9 @@ func BenchmarkFastRegexMatcher(b *testing.B) {
|
|||
".+foo",
|
||||
".*foo.*",
|
||||
"(?i:foo)",
|
||||
"(?i:(foo|bar))",
|
||||
"(?i:(foo1|foo2|bar))",
|
||||
"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
|
||||
"(prometheus|api_prom)_api_v1_.+",
|
||||
"((fo(bar))|.+foo)",
|
||||
}
|
||||
|
@ -263,6 +278,7 @@ func Test_OptimizeRegex(t *testing.T) {
|
|||
{"^(?i:foo)$", &equalStringMatcher{s: "FOO", caseSensitive: false}},
|
||||
{"^(?i:foo)|(bar)$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
|
||||
{"^(?i:foo|oo)|(bar)$", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}}), &equalStringMatcher{s: "bar", caseSensitive: true}})},
|
||||
{"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})},
|
||||
{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyStringMatcher{allowEmpty: true, matchNL: false}, right: &anyStringMatcher{allowEmpty: true, matchNL: false}}},
|
||||
|
|
Loading…
Reference in a new issue