Merge pull request #15 from grafana/improvesetmatches

Improve `findSetMatches` to support concatenation.
2025-03-05 20:59:13 -08:00 · 2021-10-07 14:48:53 +02:00 · 2021-10-07 14:48:53 +02:00 · 9173cade01
parent b3ae917f96 23ac1d73ef
commit 9173cade01
7 changed files with 249 additions and 140 deletions
--- a/Makefile.common
+++ b/Makefile.common
@ -118,6 +118,8 @@ ifeq ($(GOHOSTARCH),amd64)
        endif
 endif

+test-flags += -timeout 20m
+
 # This rule is used to forward a target like "build" to "common-build".  This
 # allows a new "build" target to be defined in a Makefile which includes this
 # one and override "common-build" without override warnings.
--- a/pkg/labels/matcher.go
+++ b/pkg/labels/matcher.go
@ -111,10 +111,19 @@ func (m *Matcher) Inverse() (*Matcher, error) {
 	panic("labels.Matcher.Matches: invalid match type")
 }

-// GetRegexString returns the regex string.
 func (m *Matcher) GetRegexString() string {
 	if m.re == nil {
 		return ""
 	}
 	return m.re.GetRegexString()
 }
+
+// SetMatches returns a set of equality matchers for the current regex matcher if possible.
+// For example, the regexp `a(b|f)` will return "ab" and "af".
+// Returns nil if we can't replace the regexp by only equality matchers.
+func (m *Matcher) SetMatches() []string {
+	if m.re == nil {
+		return nil
+	}
+	return m.re.setMatches
+}
--- a/pkg/labels/regexp.go
+++ b/pkg/labels/regexp.go
@ -19,26 +19,31 @@ import (
 	"strings"
 )

+const maxSetMatches = 256
+
 type FastRegexMatcher struct {
-	re       *regexp.Regexp
-	prefix   string
-	suffix   string
-	contains string
+	re *regexp.Regexp
+
+	setMatches []string
+	prefix     string
+	suffix     string
+	contains   string
 }

 func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
-	re, err := regexp.Compile("^(?:" + v + ")$")
-	if err != nil {
-		return nil, err
-	}
-
 	parsed, err := syntax.Parse(v, syntax.Perl)
 	if err != nil {
 		return nil, err
 	}
-
+	// Simplify the syntax tree to run faster.
+	parsed = parsed.Simplify()
+	re, err := regexp.Compile("^(?:" + parsed.String() + ")$")
+	if err != nil {
+		return nil, err
+	}
 	m := &FastRegexMatcher{
-		re: re,
+		re:         re,
+		setMatches: findSetMatches(parsed, ""),
 	}

 	if parsed.Op == syntax.OpConcat {
@ -48,7 +53,146 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
 	return m, nil
 }

+// findSetMatches extracts equality matches from a regexp.
+// Returns nil if we can't replace the regexp by only equality matchers.
+func findSetMatches(re *syntax.Regexp, base string) []string {
+	// Set matches are compared case sensitively, so if we find a case
+	// insensitive regexp we have to abort.
+	if isCaseInsensitive(re) {
+		return nil
+	}
+	clearBeginEndText(re)
+	switch re.Op {
+	case syntax.OpLiteral:
+		return []string{base + string(re.Rune)}
+	case syntax.OpEmptyMatch:
+		if base != "" {
+			return []string{base}
+		}
+	case syntax.OpAlternate:
+		return findSetMatchesFromAlternate(re, base)
+	case syntax.OpCapture:
+		clearCapture(re)
+		return findSetMatches(re, base)
+	case syntax.OpConcat:
+		return findSetMatchesFromConcat(re, base)
+	case syntax.OpCharClass:
+		if len(re.Rune)%2 != 0 {
+			return nil
+		}
+		var matches []string
+		var totalSet int
+		for i := 0; i+1 < len(re.Rune); i = i + 2 {
+			totalSet += int(re.Rune[i+1]-re.Rune[i]) + 1
+		}
+		// Limit the total number of characters that can be used to create matches.
+		// In some cases, like the negation [^0-9], a lot of possibilities exist and that
+		// can create thousands of possible matches, at which point we're better off using the regexp.
+		if totalSet > maxSetMatches {
+			return nil
+		}
+		for i := 0; i+1 < len(re.Rune); i = i + 2 {
+			lo, hi := re.Rune[i], re.Rune[i+1]
+			for c := lo; c <= hi; c++ {
+				matches = append(matches, base+string(c))
+			}
+
+		}
+		return matches
+	default:
+		return nil
+	}
+	return nil
+}
+
+func findSetMatchesFromConcat(re *syntax.Regexp, base string) []string {
+	if len(re.Sub) == 0 {
+		return nil
+	}
+	clearCapture(re.Sub...)
+	matches := []string{base}
+
+	for i := 0; i < len(re.Sub); i++ {
+		var newMatches []string
+		for _, b := range matches {
+			m := findSetMatches(re.Sub[i], b)
+			if m == nil {
+				return nil
+			}
+			if tooManyMatches(newMatches, m...) {
+				return nil
+			}
+			newMatches = append(newMatches, m...)
+		}
+		matches = newMatches
+	}
+
+	return matches
+}
+
+func findSetMatchesFromAlternate(re *syntax.Regexp, base string) []string {
+	var setMatches []string
+	for _, sub := range re.Sub {
+		found := findSetMatches(sub, base)
+		if found == nil {
+			return nil
+		}
+		if tooManyMatches(setMatches, found...) {
+			return nil
+		}
+		setMatches = append(setMatches, found...)
+	}
+	return setMatches
+}
+
+// clearCapture removes capture operations, as they are not used for matching.
+func clearCapture(regs ...*syntax.Regexp) {
+	for _, r := range regs {
+		if r.Op == syntax.OpCapture {
+			*r = *r.Sub[0]
+		}
+	}
+}
+
+// clearBeginEndText removes the begin and end text from the regexp. Prometheus regexps are anchored to the beginning and end of the string.
+func clearBeginEndText(re *syntax.Regexp) {
+	if len(re.Sub) == 0 {
+		return
+	}
+	if len(re.Sub) == 1 {
+		if re.Sub[0].Op == syntax.OpBeginText || re.Sub[0].Op == syntax.OpEndText {
+			re.Sub = nil
+			return
+		}
+	}
+	if re.Sub[0].Op == syntax.OpBeginText {
+		re.Sub = re.Sub[1:]
+	}
+	if re.Sub[len(re.Sub)-1].Op == syntax.OpEndText {
+		re.Sub = re.Sub[:len(re.Sub)-1]
+	}
+}
+
+// isCaseInsensitive tells if a regexp is case insensitive.
+// The flag should be checked at each level of the syntax tree.
+func isCaseInsensitive(reg *syntax.Regexp) bool {
+	return (reg.Flags & syntax.FoldCase) != 0
+}
+
+// tooManyMatches guards against creating too many set matches.
+func tooManyMatches(matches []string, new ...string) bool {
+	return len(matches)+len(new) > maxSetMatches
+}
+
 func (m *FastRegexMatcher) MatchString(s string) bool {
+	if len(m.setMatches) != 0 {
+		for _, match := range m.setMatches {
+			if match == s {
+				return true
+			}
+		}
+		return false
+	}
 	if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
 		return false
 	}
@ -61,6 +205,10 @@ func (m *FastRegexMatcher) MatchString(s string) bool {
 	return m.re.MatchString(s)
 }

+func (m *FastRegexMatcher) SetMatches() []string {
+	return m.setMatches
+}
+
 func (m *FastRegexMatcher) GetRegexString() string {
 	return m.re.String()
 }
--- a/pkg/labels/regexp_test.go
+++ b/pkg/labels/regexp_test.go
@ -96,3 +96,65 @@ func TestOptimizeConcatRegex(t *testing.T) {
 		require.Equal(t, c.contains, contains)
 	}
 }
+
+// Refer to https://github.com/prometheus/prometheus/issues/2651.
+func TestFindSetMatches(t *testing.T) {
+	for _, c := range []struct {
+		pattern string
+		exp     []string
+	}{
+		// Single value, coming from a `bar=~"foo"` selector.
+		{"foo", []string{"foo"}},
+		{"^foo", []string{"foo"}},
+		{"^foo$", []string{"foo"}},
+		// Simple sets alternates.
+		{"foo|bar|zz", []string{"foo", "bar", "zz"}},
+		// Simple sets alternate and concat (bar|baz is parsed as "ba[rz]").
+		{"foo|bar|baz", []string{"foo", "bar", "baz"}},
+		// Simple sets alternate and concat and capture
+		{"foo|bar|baz|(zz)", []string{"foo", "bar", "baz", "zz"}},
+		// Simple sets alternate and concat and alternates with empty matches
+		// parsed as  b(ar|(?:)|uzz) where b(?:) means literal b.
+		{"bar|b|buzz", []string{"bar", "b", "buzz"}},
+		// Skip anchors, they are enforced anyway at the root.
+		{"(^bar$)|(b$)|(^buzz)", []string{"bar", "b", "buzz"}},
+		// Simple sets containing escaped characters.
+		{"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}},
+		// using charclass
+		{"[abc]d", []string{"ad", "bd", "cd"}},
+		// high low charset different => A(B[CD]|EF)|BC[XY]
+		{"ABC|ABD|AEF|BCX|BCY", []string{"ABC", "ABD", "AEF", "BCX", "BCY"}},
+		// triple concat
+		{"api_(v1|prom)_push", []string{"api_v1_push", "api_prom_push"}},
+		// triple concat with multiple alternates
+		{"(api|rpc)_(v1|prom)_push", []string{"api_v1_push", "api_prom_push", "rpc_v1_push", "rpc_prom_push"}},
+		{"(api|rpc)_(v1|prom)_(push|query)", []string{"api_v1_push", "api_v1_query", "api_prom_push", "api_prom_query", "rpc_v1_push", "rpc_v1_query", "rpc_prom_push", "rpc_prom_query"}},
+		// class starting with "-"
+		{"[-1-2][a-c]", []string{"-a", "-b", "-c", "1a", "1b", "1c", "2a", "2b", "2c"}},
+		{"[1^3]", []string{"1", "3", "^"}},
+		// OpPlus with concat
+		{"(.+)/(foo|bar)", nil},
+		// Simple sets containing special characters without escaping.
+		{"fo.o|bar?|^baz", nil},
+		// case insensitive wrapper.
+		{"(?i)foo", nil},
+		// case insensitive wrapper on alternate.
+		{"(?i)foo|bar|baz", nil},
+		// case insensitive wrapper on concat.
+		{"(api|rpc)_(v1|prom)_((?i)push|query)", nil},
+		// too high charset combination
+		{"(api|rpc)_[^0-9]", nil},
+		// too many combinations
+		{"[a-z][a-z]", nil},
+	} {
+		c := c
+		t.Run(c.pattern, func(t *testing.T) {
+			t.Parallel()
+			parsed, err := syntax.Parse(c.pattern, syntax.Perl)
+			require.NoError(t, err)
+			matches := findSetMatches(parsed, "")
+			require.Equal(t, c.exp, matches)
+		})
+
+	}
+}
--- a/tsdb/querier.go
+++ b/tsdb/querier.go
@ -16,8 +16,6 @@ package tsdb
 import (
 	"math"
 	"sort"
-	"strings"
-	"unicode/utf8"

 	"github.com/pkg/errors"

@ -30,20 +28,6 @@ import (
 	"github.com/prometheus/prometheus/tsdb/tombstones"
 )

-// Bitmap used by func isRegexMetaCharacter to check whether a character needs to be escaped.
-var regexMetaCharacterBytes [16]byte
-
-// isRegexMetaCharacter reports whether byte b needs to be escaped.
-func isRegexMetaCharacter(b byte) bool {
-	return b < utf8.RuneSelf && regexMetaCharacterBytes[b%16]&(1<<(b/16)) != 0
-}
-
-func init() {
-	for _, b := range []byte(`.+*?()|[]{}^$`) {
-		regexMetaCharacterBytes[b%16] |= 1 << (b / 16)
-	}
-}
-
 type blockBaseQuerier struct {
 	index      IndexReader
 	chunks     ChunkReader
@ -180,48 +164,6 @@ func (q *blockChunkQuerier) Select(sortSeries bool, hints *storage.SelectHints,
 	return newBlockChunkSeriesSet(q.index, q.chunks, q.tombstones, p, mint, maxt)
 }

-func findSetMatches(pattern string) []string {
-	// Return empty matches if the wrapper from Prometheus is missing.
-	if len(pattern) < 6 || pattern[:4] != "^(?:" || pattern[len(pattern)-2:] != ")$" {
-		return nil
-	}
-	escaped := false
-	sets := []*strings.Builder{{}}
-	for i := 4; i < len(pattern)-2; i++ {
-		if escaped {
-			switch {
-			case isRegexMetaCharacter(pattern[i]):
-				sets[len(sets)-1].WriteByte(pattern[i])
-			case pattern[i] == '\\':
-				sets[len(sets)-1].WriteByte('\\')
-			default:
-				return nil
-			}
-			escaped = false
-		} else {
-			switch {
-			case isRegexMetaCharacter(pattern[i]):
-				if pattern[i] == '|' {
-					sets = append(sets, &strings.Builder{})
-				} else {
-					return nil
-				}
-			case pattern[i] == '\\':
-				escaped = true
-			default:
-				sets[len(sets)-1].WriteByte(pattern[i])
-			}
-		}
-	}
-	matches := make([]string, 0, len(sets))
-	for _, s := range sets {
-		if s.Len() > 0 {
-			matches = append(matches, s.String())
-		}
-	}
-	return matches
-}
-
 // PostingsForMatchers assembles a single postings iterator against the index reader
 // based on the given matchers. The resulting postings are not ordered by series.
 func PostingsForMatchers(ix IndexReader, ms ...*labels.Matcher) (index.Postings, error) {
@ -316,7 +258,7 @@ func postingsForMatcher(ix IndexReader, m *labels.Matcher) (index.Postings, erro

 	// Fast-path for set matching.
 	if m.Type == labels.MatchRegexp {
-		setMatches := findSetMatches(m.GetRegexString())
+		setMatches := m.SetMatches()
 		if len(setMatches) > 0 {
 			sort.Strings(setMatches)
 			return ix.Postings(m.Name, setMatches...)
@ -612,6 +554,7 @@ func (p *populateWithDelGenericSeriesIterator) Err() error { return p.err }
 func (p *populateWithDelGenericSeriesIterator) toSeriesIterator() chunkenc.Iterator {
 	return &populateWithDelSeriesIterator{populateWithDelGenericSeriesIterator: p}
 }
+
 func (p *populateWithDelGenericSeriesIterator) toChunkSeriesIterator() chunks.Iterator {
 	return &populateWithDelChunkSeriesIterator{populateWithDelGenericSeriesIterator: p}
 }
@ -881,7 +824,6 @@ Outer:

 			if ts <= tr.Maxt {
 				return true
-
 			}
 			it.Intervals = it.Intervals[1:]
 		}
--- a/tsdb/querier_bench_test.go
+++ b/tsdb/querier_bench_test.go
@ -108,7 +108,9 @@ func benchmarkPostingsForMatchers(b *testing.B, ir IndexReader) {
 	iNot2 := labels.MustNewMatcher(labels.MatchNotEqual, "n", "2"+postingsBenchSuffix)
 	iNot2Star := labels.MustNewMatcher(labels.MatchNotRegexp, "i", "^2.*$")
 	iNotStar2Star := labels.MustNewMatcher(labels.MatchNotRegexp, "i", "^.*2.*$")
-
+	jFooBar := labels.MustNewMatcher(labels.MatchRegexp, "j", "foo|bar")
+	iCharSet := labels.MustNewMatcher(labels.MatchRegexp, "i", "1[0-9]")
+	iAlternate := labels.MustNewMatcher(labels.MatchRegexp, "i", "(1|2|3|4|5|6|20|55)")
 	cases := []struct {
 		name     string
 		matchers []*labels.Matcher
@ -117,6 +119,9 @@ func benchmarkPostingsForMatchers(b *testing.B, ir IndexReader) {
 		{`n="1",j="foo"`, []*labels.Matcher{n1, jFoo}},
 		{`j="foo",n="1"`, []*labels.Matcher{jFoo, n1}},
 		{`n="1",j!="foo"`, []*labels.Matcher{n1, jNotFoo}},
+		{`i=~"1[0-9]",j=~"foo|bar"`, []*labels.Matcher{iCharSet, jFooBar}},
+		{`j=~"foo|bar"`, []*labels.Matcher{jFooBar}},
+		{`i=~"(1|2|3|4|5|6|20|55)"`, []*labels.Matcher{iAlternate}},
 		{`i=~".*"`, []*labels.Matcher{iStar}},
 		{`i=~"1.*"`, []*labels.Matcher{i1Star}},
 		{`i=~".*1"`, []*labels.Matcher{iStar1}},
--- a/tsdb/querier_test.go
+++ b/tsdb/querier_test.go
@ -918,7 +918,7 @@ func TestPopulateWithDelSeriesIterator_NextWithMinTime(t *testing.T) {
 // The subset are all equivalent so this does not capture merging of partial or non-overlapping sets well.
 // TODO(bwplotka): Merge with storage merged series set benchmark.
 func BenchmarkMergedSeriesSet(b *testing.B) {
-	var sel = func(sets []storage.SeriesSet) storage.SeriesSet {
+	sel := func(sets []storage.SeriesSet) storage.SeriesSet {
 		return storage.NewMergeSeriesSet(sets, storage.ChainedSeriesMerge)
 	}

@ -1560,69 +1560,6 @@ func BenchmarkSetMatcher(b *testing.B) {
 	}
 }

-// Refer to https://github.com/prometheus/prometheus/issues/2651.
-func TestFindSetMatches(t *testing.T) {
-	cases := []struct {
-		pattern string
-		exp     []string
-	}{
-		// Single value, coming from a `bar=~"foo"` selector.
-		{
-			pattern: "^(?:foo)$",
-			exp: []string{
-				"foo",
-			},
-		},
-		// Simple sets.
-		{
-			pattern: "^(?:foo|bar|baz)$",
-			exp: []string{
-				"foo",
-				"bar",
-				"baz",
-			},
-		},
-		// Simple sets containing escaped characters.
-		{
-			pattern: "^(?:fo\\.o|bar\\?|\\^baz)$",
-			exp: []string{
-				"fo.o",
-				"bar?",
-				"^baz",
-			},
-		},
-		// Simple sets containing special characters without escaping.
-		{
-			pattern: "^(?:fo.o|bar?|^baz)$",
-			exp:     nil,
-		},
-		// Missing wrapper.
-		{
-			pattern: "foo|bar|baz",
-			exp:     nil,
-		},
-	}
-
-	for _, c := range cases {
-		matches := findSetMatches(c.pattern)
-		if len(c.exp) == 0 {
-			if len(matches) != 0 {
-				t.Errorf("Evaluating %s, unexpected result %v", c.pattern, matches)
-			}
-		} else {
-			if len(matches) != len(c.exp) {
-				t.Errorf("Evaluating %s, length of result not equal to exp", c.pattern)
-			} else {
-				for i := 0; i < len(c.exp); i++ {
-					if c.exp[i] != matches[i] {
-						t.Errorf("Evaluating %s, unexpected result %s", c.pattern, matches[i])
-					}
-				}
-			}
-		}
-	}
-}
-
 func TestPostingsForMatchers(t *testing.T) {
 	chunkDir, err := ioutil.TempDir("", "chunk_dir")
 	require.NoError(t, err)
@ -1881,7 +1818,6 @@ func TestPostingsForMatchers(t *testing.T) {
 			t.Errorf("Evaluating %v, missing results %+v", c.matchers, exp)
 		}
 	}
-
 }

 // TestClose ensures that calling Close more than once doesn't block and doesn't panic.
@ -2106,7 +2042,7 @@ func TestPostingsForMatcher(t *testing.T) {
 		{
 			// Test case for double quoted regex matcher
 			matcher:  labels.MustNewMatcher(labels.MatchRegexp, "test", "^(?:a|b)$"),
-			hasError: true,
+			hasError: false,
 		},
 	}

@ -2141,7 +2077,12 @@ func TestBlockBaseSeriesSet(t *testing.T) {
 				{
 					lset: labels.New([]labels.Label{{Name: "a", Value: "a"}}...),
 					chunks: []chunks.Meta{
-						{Ref: 29}, {Ref: 45}, {Ref: 245}, {Ref: 123}, {Ref: 4232}, {Ref: 5344},
+						{Ref: 29},
+						{Ref: 45},
+						{Ref: 245},
+						{Ref: 123},
+						{Ref: 4232},
+						{Ref: 5344},
 						{Ref: 121},
 					},
 					ref: 12,