// Source file: prometheus/model/labels/regexp.go

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package labels
import (
"strings"
"github.com/grafana/regexp"
"github.com/grafana/regexp/syntax"
)
// maxSetMatches caps how many equality matches may be generated for a single
// regexp. findSetMatches gives up (returns nil) once a character class or a
// combination of alternatives would exceed this number.
const maxSetMatches = 256
// FastRegexMatcher matches strings against a regular expression that is
// anchored at both ends, using cheaper string checks whenever the shape of the
// pattern allows it. Build it with NewFastRegexMatcher.
type FastRegexMatcher struct {
// re is the compiled pattern wrapped as ^(?:...)$; MatchString falls back to
// it when no shortcut decides the match.
re *regexp.Regexp
// setMatches is the list of exact values the pattern can match, as computed
// by findSetMatches. When non-empty, MatchString only compares against it.
setMatches []string
// stringMatcher is the string-based replacement built by
// stringMatcherFromRegexp, or nil when the pattern is not supported.
stringMatcher StringMatcher
// prefix, suffix and contains are literals extracted by optimizeConcatRegex
// when the top-level operation is a concatenation. Each non-empty value is
// checked against the input before any further matching is attempted.
prefix string
suffix string
contains string
}
// NewFastRegexMatcher parses v with Perl syntax and returns a matcher for it.
// The pattern is compiled wrapped in ^(?:...)$, so it always has to match the
// whole input. An error is returned when v cannot be parsed or compiled.
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
	tree, err := syntax.Parse(v, syntax.Perl)
	if err != nil {
		return nil, err
	}
	// A simplified syntax tree runs faster and is easier to analyse.
	tree = tree.Simplify()

	anchored := "^(?:" + tree.String() + ")$"
	compiled, err := regexp.Compile(anchored)
	if err != nil {
		return nil, err
	}

	// The literal extraction must run first: the two analyses below rewrite
	// the syntax tree in place.
	var prefix, suffix, contains string
	if tree.Op == syntax.OpConcat {
		prefix, suffix, contains = optimizeConcatRegex(tree)
	}
	sets := findSetMatches(tree, "")
	strMatcher := stringMatcherFromRegexp(tree)

	return &FastRegexMatcher{
		re:            compiled,
		setMatches:    sets,
		stringMatcher: strMatcher,
		prefix:        prefix,
		suffix:        suffix,
		contains:      contains,
	}, nil
}
// findSetMatches expands re into the list of exact strings it can match, each
// one prefixed with base. It returns nil when the regexp cannot be replaced by
// equality checks alone, or when the expansion would be too large.
//
// The syntax tree is modified in place: leading/trailing text anchors and
// capture groups are removed while walking it.
func findSetMatches(re *syntax.Regexp, base string) []string {
	// Set matches are compared exactly, so a case-insensitive node cannot be
	// represented and the whole expansion has to be abandoned.
	if isCaseInsensitive(re) {
		return nil
	}
	clearBeginEndText(re)

	switch re.Op {
	case syntax.OpCapture:
		clearCapture(re)
		return findSetMatches(re, base)
	case syntax.OpConcat:
		return findSetMatchesFromConcat(re, base)
	case syntax.OpAlternate:
		return findSetMatchesFromAlternate(re, base)
	case syntax.OpLiteral:
		return []string{base + string(re.Rune)}
	case syntax.OpEmptyMatch:
		if base == "" {
			return nil
		}
		return []string{base}
	case syntax.OpCharClass:
		// re.Rune holds the class as consecutive [lo, hi] pairs.
		ranges := re.Rune
		if len(ranges)%2 == 1 {
			return nil
		}
		size := 0
		for i := 0; i < len(ranges); i += 2 {
			size += int(ranges[i+1]-ranges[i]) + 1
		}
		// A negated class such as [^0-9] covers thousands of characters; past
		// the limit the regexp itself is the better tool.
		if size > maxSetMatches {
			return nil
		}
		var out []string
		for i := 0; i < len(ranges); i += 2 {
			for r := ranges[i]; r <= ranges[i+1]; r++ {
				out = append(out, base+string(r))
			}
		}
		return out
	}
	return nil
}
// findSetMatchesFromConcat expands a concatenation into exact matches by
// extending every match found so far with the expansion of the next
// sub-expression. It returns nil when the concatenation is empty, when any
// sub-expression cannot be expanded, or when the result would grow too large.
func findSetMatchesFromConcat(re *syntax.Regexp, base string) []string {
	if len(re.Sub) == 0 {
		return nil
	}
	clearCapture(re.Sub...)

	acc := []string{base}
	for _, part := range re.Sub {
		var next []string
		for _, prefix := range acc {
			expanded := findSetMatches(part, prefix)
			if expanded == nil || tooManyMatches(next, expanded...) {
				return nil
			}
			next = append(next, expanded...)
		}
		acc = next
	}
	return acc
}
// findSetMatchesFromAlternate expands an alternation into the union of the
// exact matches of its branches, in branch order. It returns nil as soon as
// one branch cannot be expanded or the union would grow too large.
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) []string {
	var all []string
	for i := range re.Sub {
		branch := findSetMatches(re.Sub[i], base)
		switch {
		case branch == nil:
			return nil
		case tooManyMatches(all, branch...):
			return nil
		}
		all = append(all, branch...)
	}
	return all
}
// clearCapture unwraps capture groups in place, since capturing is irrelevant
// for matching: every node in regs that is a capture is overwritten with its
// sub-expression. Nested captures such as ((a)) are unwrapped completely.
//
// A capture left without a sub-expression (clearBeginEndText produces one when
// it strips a lone anchor, as in `(^)`) is turned into an empty match rather
// than dereferencing a missing child.
func clearCapture(regs ...*syntax.Regexp) {
	for _, r := range regs {
		for r.Op == syntax.OpCapture {
			if len(r.Sub) == 0 {
				r.Op = syntax.OpEmptyMatch
				break
			}
			*r = *r.Sub[0]
		}
	}
}
// clearBeginEndText removes, in place, a begin-of-text anchor at the start and
// an end-of-text anchor at the end of re's sub-expressions. Prometheus regexps
// are always anchored to the beginning and end of the string, so these anchors
// are redundant for the analyses in this file.
//
// Alternations are left untouched: there an anchor is an entire branch (as in
// `^|foo`), and dropping the branch would change what the pattern matches.
//
// When the only sub-expression is an anchor, re is rewritten into an empty
// match instead of being left with its original operator and no operand, which
// would make later accesses to re.Sub[0] panic (for example with `(^)` or `^*`).
func clearBeginEndText(re *syntax.Regexp) {
	if re.Op == syntax.OpAlternate {
		return
	}
	if len(re.Sub) == 0 {
		return
	}
	if len(re.Sub) == 1 {
		if op := re.Sub[0].Op; op == syntax.OpBeginText || op == syntax.OpEndText {
			// OpEmptyMatch is the syntax package's "matches the empty string" node.
			re.Op = syntax.OpEmptyMatch
			re.Sub = nil
		}
		return
	}
	if re.Sub[0].Op == syntax.OpBeginText {
		re.Sub = re.Sub[1:]
	}
	// At least one element is left here because there were two or more.
	if last := len(re.Sub) - 1; re.Sub[last].Op == syntax.OpEndText {
		re.Sub = re.Sub[:last]
	}
}
// isCaseInsensitive reports whether reg itself carries the FoldCase flag.
// Only this node is inspected, so callers have to check every level of the
// syntax tree they walk.
func isCaseInsensitive(reg *syntax.Regexp) bool {
	folded := reg.Flags & syntax.FoldCase
	return folded != 0
}
// tooManyMatches reports whether appending new to matches would produce more
// than maxSetMatches entries.
func tooManyMatches(matches []string, new ...string) bool {
	total := len(matches) + len(new)
	return total > maxSetMatches
}
// MatchString reports whether s matches the regular expression. Checks run
// from cheapest to most expensive: the list of exact matches (which alone
// decides when present), the literal prefix/suffix/contains filters, the
// string matcher, and finally the compiled regexp.
func (m *FastRegexMatcher) MatchString(s string) bool {
	if n := len(m.setMatches); n > 0 {
		for i := 0; i < n; i++ {
			if s == m.setMatches[i] {
				return true
			}
		}
		return false
	}

	switch {
	case m.prefix != "" && !strings.HasPrefix(s, m.prefix):
		return false
	case m.suffix != "" && !strings.HasSuffix(s, m.suffix):
		return false
	case m.contains != "" && !strings.Contains(s, m.contains):
		return false
	case m.stringMatcher != nil:
		return m.stringMatcher.Matches(s)
	}
	return m.re.MatchString(s)
}
// SetMatches returns the exact values computed by findSetMatches when the
// matcher was built. The result is empty when the regexp could not be reduced
// to equality checks. The internal slice is returned without copying.
func (m *FastRegexMatcher) SetMatches() []string {
return m.setMatches
}
// GetRegexString returns the string form of the compiled regexp held by the
// matcher. NewFastRegexMatcher compiles the simplified pattern wrapped in
// ^(?:...)$, so this is presumably that anchored form rather than the original
// input string.
func (m *FastRegexMatcher) GetRegexString() string {
return m.re.String()
}
// optimizeConcatRegex extracts literals from a concatenation that can be
// checked against a label value before running the regexp matcher: a literal
// first element (prefix), a literal last element (suffix) and the first
// literal found strictly in between (contains). Case-insensitive literals are
// ignored, and each result is empty when no such literal exists.
func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) {
	parts := r.Sub

	// Anchors at either end are redundant because Prometheus regex matchers
	// are always anchored to the begin/end of the text.
	if n := len(parts); n > 0 && parts[0].Op == syntax.OpBeginText {
		parts = parts[1:]
	}
	if n := len(parts); n > 0 && parts[n-1].Op == syntax.OpEndText {
		parts = parts[:n-1]
	}
	if len(parts) == 0 {
		return "", "", ""
	}

	// literal returns the text of a case-sensitive literal node.
	literal := func(re *syntax.Regexp) (string, bool) {
		if re.Op != syntax.OpLiteral || re.Flags&syntax.FoldCase != 0 {
			return "", false
		}
		return string(re.Rune), true
	}

	// Thanks to the implicit anchoring, a literal in first/last position is
	// necessarily a prefix/suffix of every matching value.
	prefix, _ = literal(parts[0])
	suffix, _ = literal(parts[len(parts)-1])

	// Only the first inner literal is kept, to keep the fast path simple.
	for i := 1; i+1 < len(parts); i++ {
		if text, ok := literal(parts[i]); ok {
			contains = text
			break
		}
	}
	return prefix, suffix, contains
}
// StringMatcher is implemented by matchers that decide a match with plain
// string operations, in place of running a regular expression.
type StringMatcher interface {
// Matches reports whether s satisfies the matcher.
Matches(s string) bool
}
// stringMatcherFromRegexp tries to build a StringMatcher equivalent to re so
// that matching can avoid the regexp engine. It returns nil when the shape of
// re is not supported. For example, `.*foo` becomes a suffix check, `foo.*` a
// prefix check and `.*foo.*` a substring check.
//
// The syntax tree is modified in place: capture groups are unwrapped, leading
// and trailing text anchors are dropped, and a leading/trailing `.*` or `.+`
// is removed from a concatenation once it has been turned into a matcher.
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
	clearCapture(re)
	clearBeginEndText(re)
	switch re.Op {
	case syntax.OpPlus, syntax.OpStar:
		// A repetition whose operand has been stripped (for instance `^*` once
		// the anchor is removed) has nothing left to inspect: report it as
		// unsupported instead of indexing an empty Sub slice.
		if len(re.Sub) == 0 {
			return nil
		}
		if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
			return nil
		}
		return &anyStringMatcher{
			allowEmpty: re.Op == syntax.OpStar,
			matchNL:    re.Sub[0].Op == syntax.OpAnyChar,
		}
	case syntax.OpEmptyMatch:
		return emptyStringMatcher{}
	case syntax.OpLiteral:
		return &equalStringMatcher{
			s:             string(re.Rune),
			caseSensitive: !isCaseInsensitive(re),
		}
	case syntax.OpAlternate:
		// Every branch must be convertible, otherwise the alternation is not.
		or := make([]StringMatcher, 0, len(re.Sub))
		for _, sub := range re.Sub {
			m := stringMatcherFromRegexp(sub)
			if m == nil {
				return nil
			}
			or = append(or, m)
		}
		return orStringMatcher(or)
	case syntax.OpConcat:
		clearCapture(re.Sub...)
		if len(re.Sub) == 0 {
			return emptyStringMatcher{}
		}
		if len(re.Sub) == 1 {
			return stringMatcherFromRegexp(re.Sub[0])
		}
		var left, right StringMatcher
		// Peel off a leading and a trailing "any" matcher, if present. At
		// least two elements exist here, so one always remains after the
		// leading one is removed.
		if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar {
			left = stringMatcherFromRegexp(re.Sub[0])
			if left == nil {
				return nil
			}
			re.Sub = re.Sub[1:]
		}
		if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar {
			right = stringMatcherFromRegexp(re.Sub[len(re.Sub)-1])
			if right == nil {
				return nil
			}
			re.Sub = re.Sub[:len(re.Sub)-1]
		}
		// findSetMatches only returns case-sensitive literals.
		matches := findSetMatches(re, "")
		if left == nil && right == nil && len(matches) > 0 {
			// No "any" matcher on either side: the concatenation is a plain
			// set of literals.
			or := make([]StringMatcher, 0, len(matches))
			for _, match := range matches {
				or = append(or, &equalStringMatcher{
					s:             match,
					caseSensitive: true,
				})
			}
			return orStringMatcher(or)
		}
		// Otherwise the literals sit next to (or between) "any" matchers.
		if len(matches) > 0 {
			return &containsStringMatcher{
				substrings: matches,
				left:       left,
				right:      right,
			}
		}
	}
	return nil
}
// containsStringMatcher matches a string built from one of the substrings with
// optional surrounding text:
//   - left and right both set: the string must contain a substring such that
//     left matches what precedes it and right matches what follows it;
//   - only right set: the string must start with a substring and right must
//     match the remainder;
//   - only left set: the string must end with a substring and left must match
//     what precedes it.
//
// With neither left nor right set, nothing matches.
type containsStringMatcher struct {
	substrings []string
	left       StringMatcher
	right      StringMatcher
}

// Matches reports whether s satisfies the matcher for at least one of its
// substrings.
func (m *containsStringMatcher) Matches(s string) bool {
	for _, substr := range m.substrings {
		switch {
		case m.left != nil && m.right != nil:
			// Every occurrence has to be tried: the first one may be rejected
			// by left/right (e.g. `.+foo.+` against "foo_foo_x") while a later
			// one is accepted.
			for start := 0; start <= len(s); {
				idx := strings.Index(s[start:], substr)
				if idx < 0 {
					break
				}
				pos := start + idx
				if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(substr):]) {
					return true
				}
				start = pos + 1
			}
		case m.left != nil:
			// Text is only allowed on the left, so substr has to be a suffix.
			if strings.HasSuffix(s, substr) && m.left.Matches(s[:len(s)-len(substr)]) {
				return true
			}
		case m.right != nil:
			// Text is only allowed on the right, so substr has to be a prefix.
			if strings.HasPrefix(s, substr) && m.right.Matches(s[len(substr):]) {
				return true
			}
		}
	}
	return false
}
// emptyStringMatcher accepts the empty string and nothing else.
type emptyStringMatcher struct{}

// Matches reports whether s is the empty string.
func (emptyStringMatcher) Matches(s string) bool {
	return s == ""
}
// orStringMatcher accepts a string as soon as one of its sub-matchers does.
// Sub-matchers are tried in order; an empty list accepts nothing.
type orStringMatcher []StringMatcher

// Matches reports whether at least one sub-matcher matches s.
func (m orStringMatcher) Matches(s string) bool {
	for i := range m {
		if m[i].Matches(s) {
			return true
		}
	}
	return false
}
// equalStringMatcher accepts strings equal to s, either exactly or, when
// caseSensitive is false, under Unicode case folding.
type equalStringMatcher struct {
	s             string
	caseSensitive bool
}

// Matches reports whether s equals the configured string.
func (m *equalStringMatcher) Matches(s string) bool {
	if !m.caseSensitive {
		return strings.EqualFold(m.s, s)
	}
	return s == m.s
}
// anyStringMatcher stands in for the `.*` and `.+` patterns. allowEmpty is
// true for `*` (the empty string is accepted) and false for `+`. matchNL tells
// whether newlines are accepted, i.e. whether `.` also matches '\n'.
type anyStringMatcher struct {
	allowEmpty bool
	matchNL    bool
}

// Matches reports whether s is accepted: it is rejected when it is empty and
// allowEmpty is false, or when it contains a newline and matchNL is false.
func (m *anyStringMatcher) Matches(s string) bool {
	if !m.allowEmpty && len(s) == 0 {
		return false
	}
	if !m.matchNL && strings.ContainsRune(s, '\n') {
		return false
	}
	return true
}