diff --git a/Makefile.common b/Makefile.common index 99e8f9f1b..7b0e6e32c 100644 --- a/Makefile.common +++ b/Makefile.common @@ -118,6 +118,8 @@ ifeq ($(GOHOSTARCH),amd64) endif endif +test-flags += -timeout 20m + # This rule is used to forward a target like "build" to "common-build". This # allows a new "build" target to be defined in a Makefile which includes this # one and override "common-build" without override warnings. diff --git a/pkg/labels/matcher.go b/pkg/labels/matcher.go index f299c40f6..c0d186b48 100644 --- a/pkg/labels/matcher.go +++ b/pkg/labels/matcher.go @@ -111,10 +111,19 @@ func (m *Matcher) Inverse() (*Matcher, error) { panic("labels.Matcher.Matches: invalid match type") } -// GetRegexString returns the regex string. func (m *Matcher) GetRegexString() string { if m.re == nil { return "" } return m.re.GetRegexString() } + +// SetMatches returns a set of equality matchers for the current regex matchers if possible. +// For examples the regexp `a(b|f)` will returns "ab" and "af". +// Returns nil if we can't replace the regexp by only equality matchers. +func (m *Matcher) SetMatches() []string { + if m.re == nil { + return nil + } + return m.re.setMatches +} diff --git a/pkg/labels/regexp.go b/pkg/labels/regexp.go index eb2b07995..f3664db2a 100644 --- a/pkg/labels/regexp.go +++ b/pkg/labels/regexp.go @@ -19,26 +19,31 @@ import ( "strings" ) +const maxSetMatches = 256 + type FastRegexMatcher struct { - re *regexp.Regexp - prefix string - suffix string - contains string + re *regexp.Regexp + + setMatches []string + prefix string + suffix string + contains string } func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { - re, err := regexp.Compile("^(?:" + v + ")$") - if err != nil { - return nil, err - } - parsed, err := syntax.Parse(v, syntax.Perl) if err != nil { return nil, err } - + // Simplify the syntax tree to run faster. + parsed = parsed.Simplify() + re, err := regexp.Compile("^(?:" + parsed.String() + ")$") + if err != nil { + return nil, err + } m := &FastRegexMatcher{ - re: re, + re: re, + setMatches: findSetMatches(parsed, ""), } if parsed.Op == syntax.OpConcat { @@ -48,7 +53,146 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { return m, nil } +// findSetMatches extract equality matches from a regexp. +// Returns nil if we can't replace the regexp by only equality matchers. +func findSetMatches(re *syntax.Regexp, base string) []string { + // Matches are case sensitive, if we find a case insensitive regexp. + // We have to abort. + if isCaseInsensitive(re) { + return nil + } + clearBeginEndText(re) + switch re.Op { + case syntax.OpLiteral: + return []string{base + string(re.Rune)} + case syntax.OpEmptyMatch: + if base != "" { + return []string{base} + } + case syntax.OpAlternate: + return findSetMatchesFromAlternate(re, base) + case syntax.OpCapture: + clearCapture(re) + return findSetMatches(re, base) + case syntax.OpConcat: + return findSetMatchesFromConcat(re, base) + case syntax.OpCharClass: + if len(re.Rune)%2 != 0 { + return nil + } + var matches []string + var totalSet int + for i := 0; i+1 < len(re.Rune); i = i + 2 { + totalSet += int(re.Rune[i+1]-re.Rune[i]) + 1 + } + // limits the total characters that can be used to create matches. + // In some case like negation [^0-9] a lot of possibilities exists and that + // can create thousands of possible matches at which points we're better off using regexp. + if totalSet > maxSetMatches { + return nil + } + for i := 0; i+1 < len(re.Rune); i = i + 2 { + lo, hi := re.Rune[i], re.Rune[i+1] + for c := lo; c <= hi; c++ { + matches = append(matches, base+string(c)) + } + + } + return matches + default: + return nil + } + return nil +} + +func findSetMatchesFromConcat(re *syntax.Regexp, base string) []string { + if len(re.Sub) == 0 { + return nil + } + clearCapture(re.Sub...) + matches := []string{base} + + for i := 0; i < len(re.Sub); i++ { + var newMatches []string + for _, b := range matches { + m := findSetMatches(re.Sub[i], b) + if m == nil { + return nil + } + if tooManyMatches(newMatches, m...) { + return nil + } + newMatches = append(newMatches, m...) + } + matches = newMatches + } + + return matches +} + +func findSetMatchesFromAlternate(re *syntax.Regexp, base string) []string { + var setMatches []string + for _, sub := range re.Sub { + found := findSetMatches(sub, base) + if found == nil { + return nil + } + if tooManyMatches(setMatches, found...) { + return nil + } + setMatches = append(setMatches, found...) + } + return setMatches +} + +// clearCapture removes capture operation as they are not used for matching. +func clearCapture(regs ...*syntax.Regexp) { + for _, r := range regs { + if r.Op == syntax.OpCapture { + *r = *r.Sub[0] + } + } +} + +// clearBeginEndText removes the begin and end text from the regexp. Prometheus regexp are anchored to the beginning and end of the string. +func clearBeginEndText(re *syntax.Regexp) { + if len(re.Sub) == 0 { + return + } + if len(re.Sub) == 1 { + if re.Sub[0].Op == syntax.OpBeginText || re.Sub[0].Op == syntax.OpEndText { + re.Sub = nil + return + } + } + if re.Sub[0].Op == syntax.OpBeginText { + re.Sub = re.Sub[1:] + } + if re.Sub[len(re.Sub)-1].Op == syntax.OpEndText { + re.Sub = re.Sub[:len(re.Sub)-1] + } +} + +// isCaseInsensitive tells if a regexp is case insensitive. +// The flag should be check at each level of the syntax tree. +func isCaseInsensitive(reg *syntax.Regexp) bool { + return (reg.Flags & syntax.FoldCase) != 0 +} + +// tooManyMatches guards against creating too many set matches +func tooManyMatches(matches []string, new ...string) bool { + return len(matches)+len(new) > maxSetMatches +} + func (m *FastRegexMatcher) MatchString(s string) bool { + if len(m.setMatches) != 0 { + for _, match := range m.setMatches { + if match == s { + return true + } + } + return false + } if m.prefix != "" && !strings.HasPrefix(s, m.prefix) { return false } @@ -61,6 +205,10 @@ func (m *FastRegexMatcher) MatchString(s string) bool { return m.re.MatchString(s) } +func (m *FastRegexMatcher) SetMatches() []string { + return m.setMatches +} + func (m *FastRegexMatcher) GetRegexString() string { return m.re.String() } diff --git a/pkg/labels/regexp_test.go b/pkg/labels/regexp_test.go index eed2711bb..1b0dc8fd3 100644 --- a/pkg/labels/regexp_test.go +++ b/pkg/labels/regexp_test.go @@ -96,3 +96,65 @@ func TestOptimizeConcatRegex(t *testing.T) { require.Equal(t, c.contains, contains) } } + +// Refer to https://github.com/prometheus/prometheus/issues/2651. +func TestFindSetMatches(t *testing.T) { + for _, c := range []struct { + pattern string + exp []string + }{ + // Single value, coming from a `bar=~"foo"` selector. + {"foo", []string{"foo"}}, + {"^foo", []string{"foo"}}, + {"^foo$", []string{"foo"}}, + // Simple sets alternates. + {"foo|bar|zz", []string{"foo", "bar", "zz"}}, + // Simple sets alternate and concat (bar|baz is parsed as "ba[rz]"). + {"foo|bar|baz", []string{"foo", "bar", "baz"}}, + // Simple sets alternate and concat and capture + {"foo|bar|baz|(zz)", []string{"foo", "bar", "baz", "zz"}}, + // Simple sets alternate and concat and alternates with empty matches + // parsed as b(ar|(?:)|uzz) where b(?:) means literal b. + {"bar|b|buzz", []string{"bar", "b", "buzz"}}, + // Skip anchors it's enforced anyway at the root. + {"(^bar$)|(b$)|(^buzz)", []string{"bar", "b", "buzz"}}, + // Simple sets containing escaped characters. + {"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}}, + // using charclass + {"[abc]d", []string{"ad", "bd", "cd"}}, + // high low charset different => A(B[CD]|EF)|BC[XY] + {"ABC|ABD|AEF|BCX|BCY", []string{"ABC", "ABD", "AEF", "BCX", "BCY"}}, + // triple concat + {"api_(v1|prom)_push", []string{"api_v1_push", "api_prom_push"}}, + // triple concat with multiple alternates + {"(api|rpc)_(v1|prom)_push", []string{"api_v1_push", "api_prom_push", "rpc_v1_push", "rpc_prom_push"}}, + {"(api|rpc)_(v1|prom)_(push|query)", []string{"api_v1_push", "api_v1_query", "api_prom_push", "api_prom_query", "rpc_v1_push", "rpc_v1_query", "rpc_prom_push", "rpc_prom_query"}}, + // class starting with "-" + {"[-1-2][a-c]", []string{"-a", "-b", "-c", "1a", "1b", "1c", "2a", "2b", "2c"}}, + {"[1^3]", []string{"1", "3", "^"}}, + // OpPlus with concat + {"(.+)/(foo|bar)", nil}, + // Simple sets containing special characters without escaping. + {"fo.o|bar?|^baz", nil}, + // case sensitive wrapper. + {"(?i)foo", nil}, + // case sensitive wrapper on alternate. + {"(?i)foo|bar|baz", nil}, + // case sensitive wrapper on concat. + {"(api|rpc)_(v1|prom)_((?i)push|query)", nil}, + // too high charset combination + {"(api|rpc)_[^0-9]", nil}, + // too many combinations + {"[a-z][a-z]", nil}, + } { + c := c + t.Run(c.pattern, func(t *testing.T) { + t.Parallel() + parsed, err := syntax.Parse(c.pattern, syntax.Perl) + require.NoError(t, err) + matches := findSetMatches(parsed, "") + require.Equal(t, c.exp, matches) + }) + + } +} diff --git a/tsdb/querier.go b/tsdb/querier.go index 0cab141ee..1693ac24b 100644 --- a/tsdb/querier.go +++ b/tsdb/querier.go @@ -16,8 +16,6 @@ package tsdb import ( "math" "sort" - "strings" - "unicode/utf8" "github.com/pkg/errors" @@ -30,20 +28,6 @@ import ( "github.com/prometheus/prometheus/tsdb/tombstones" ) -// Bitmap used by func isRegexMetaCharacter to check whether a character needs to be escaped. -var regexMetaCharacterBytes [16]byte - -// isRegexMetaCharacter reports whether byte b needs to be escaped. -func isRegexMetaCharacter(b byte) bool { - return b < utf8.RuneSelf && regexMetaCharacterBytes[b%16]&(1<<(b/16)) != 0 -} - -func init() { - for _, b := range []byte(`.+*?()|[]{}^$`) { - regexMetaCharacterBytes[b%16] |= 1 << (b / 16) - } -} - type blockBaseQuerier struct { index IndexReader chunks ChunkReader @@ -180,48 +164,6 @@ func (q *blockChunkQuerier) Select(sortSeries bool, hints *storage.SelectHints, return newBlockChunkSeriesSet(q.index, q.chunks, q.tombstones, p, mint, maxt) } -func findSetMatches(pattern string) []string { - // Return empty matches if the wrapper from Prometheus is missing. - if len(pattern) < 6 || pattern[:4] != "^(?:" || pattern[len(pattern)-2:] != ")$" { - return nil - } - escaped := false - sets := []*strings.Builder{{}} - for i := 4; i < len(pattern)-2; i++ { - if escaped { - switch { - case isRegexMetaCharacter(pattern[i]): - sets[len(sets)-1].WriteByte(pattern[i]) - case pattern[i] == '\\': - sets[len(sets)-1].WriteByte('\\') - default: - return nil - } - escaped = false - } else { - switch { - case isRegexMetaCharacter(pattern[i]): - if pattern[i] == '|' { - sets = append(sets, &strings.Builder{}) - } else { - return nil - } - case pattern[i] == '\\': - escaped = true - default: - sets[len(sets)-1].WriteByte(pattern[i]) - } - } - } - matches := make([]string, 0, len(sets)) - for _, s := range sets { - if s.Len() > 0 { - matches = append(matches, s.String()) - } - } - return matches -} - // PostingsForMatchers assembles a single postings iterator against the index reader // based on the given matchers. The resulting postings are not ordered by series. func PostingsForMatchers(ix IndexReader, ms ...*labels.Matcher) (index.Postings, error) { @@ -316,7 +258,7 @@ func postingsForMatcher(ix IndexReader, m *labels.Matcher) (index.Postings, erro // Fast-path for set matching. if m.Type == labels.MatchRegexp { - setMatches := findSetMatches(m.GetRegexString()) + setMatches := m.SetMatches() if len(setMatches) > 0 { sort.Strings(setMatches) return ix.Postings(m.Name, setMatches...) @@ -612,6 +554,7 @@ func (p *populateWithDelGenericSeriesIterator) Err() error { return p.err } func (p *populateWithDelGenericSeriesIterator) toSeriesIterator() chunkenc.Iterator { return &populateWithDelSeriesIterator{populateWithDelGenericSeriesIterator: p} } + func (p *populateWithDelGenericSeriesIterator) toChunkSeriesIterator() chunks.Iterator { return &populateWithDelChunkSeriesIterator{populateWithDelGenericSeriesIterator: p} } @@ -881,7 +824,6 @@ Outer: if ts <= tr.Maxt { return true - } it.Intervals = it.Intervals[1:] } diff --git a/tsdb/querier_bench_test.go b/tsdb/querier_bench_test.go index 242759b79..72e4b287f 100644 --- a/tsdb/querier_bench_test.go +++ b/tsdb/querier_bench_test.go @@ -108,7 +108,9 @@ func benchmarkPostingsForMatchers(b *testing.B, ir IndexReader) { iNot2 := labels.MustNewMatcher(labels.MatchNotEqual, "n", "2"+postingsBenchSuffix) iNot2Star := labels.MustNewMatcher(labels.MatchNotRegexp, "i", "^2.*$") iNotStar2Star := labels.MustNewMatcher(labels.MatchNotRegexp, "i", "^.*2.*$") - + jFooBar := labels.MustNewMatcher(labels.MatchRegexp, "j", "foo|bar") + iCharSet := labels.MustNewMatcher(labels.MatchRegexp, "i", "1[0-9]") + iAlternate := labels.MustNewMatcher(labels.MatchRegexp, "i", "(1|2|3|4|5|6|20|55)") cases := []struct { name string matchers []*labels.Matcher @@ -117,6 +119,9 @@ func benchmarkPostingsForMatchers(b *testing.B, ir IndexReader) { {`n="1",j="foo"`, []*labels.Matcher{n1, jFoo}}, {`j="foo",n="1"`, []*labels.Matcher{jFoo, n1}}, {`n="1",j!="foo"`, []*labels.Matcher{n1, jNotFoo}}, + {`i=~"1[0-9]",j=~"foo|bar"`, []*labels.Matcher{iCharSet, jFooBar}}, + {`j=~"foo|bar"`, []*labels.Matcher{jFooBar}}, + {`i=~"(1|2|3|4|5|6|20|55)"`, []*labels.Matcher{iAlternate}}, {`i=~".*"`, []*labels.Matcher{iStar}}, {`i=~"1.*"`, []*labels.Matcher{i1Star}}, {`i=~".*1"`, []*labels.Matcher{iStar1}}, diff --git a/tsdb/querier_test.go b/tsdb/querier_test.go index 0588f6619..224835578 100644 --- a/tsdb/querier_test.go +++ b/tsdb/querier_test.go @@ -918,7 +918,7 @@ func TestPopulateWithDelSeriesIterator_NextWithMinTime(t *testing.T) { // The subset are all equivalent so this does not capture merging of partial or non-overlapping sets well. // TODO(bwplotka): Merge with storage merged series set benchmark. func BenchmarkMergedSeriesSet(b *testing.B) { - var sel = func(sets []storage.SeriesSet) storage.SeriesSet { + sel := func(sets []storage.SeriesSet) storage.SeriesSet { return storage.NewMergeSeriesSet(sets, storage.ChainedSeriesMerge) } @@ -1560,69 +1560,6 @@ func BenchmarkSetMatcher(b *testing.B) { } } -// Refer to https://github.com/prometheus/prometheus/issues/2651. -func TestFindSetMatches(t *testing.T) { - cases := []struct { - pattern string - exp []string - }{ - // Single value, coming from a `bar=~"foo"` selector. - { - pattern: "^(?:foo)$", - exp: []string{ - "foo", - }, - }, - // Simple sets. - { - pattern: "^(?:foo|bar|baz)$", - exp: []string{ - "foo", - "bar", - "baz", - }, - }, - // Simple sets containing escaped characters. - { - pattern: "^(?:fo\\.o|bar\\?|\\^baz)$", - exp: []string{ - "fo.o", - "bar?", - "^baz", - }, - }, - // Simple sets containing special characters without escaping. - { - pattern: "^(?:fo.o|bar?|^baz)$", - exp: nil, - }, - // Missing wrapper. - { - pattern: "foo|bar|baz", - exp: nil, - }, - } - - for _, c := range cases { - matches := findSetMatches(c.pattern) - if len(c.exp) == 0 { - if len(matches) != 0 { - t.Errorf("Evaluating %s, unexpected result %v", c.pattern, matches) - } - } else { - if len(matches) != len(c.exp) { - t.Errorf("Evaluating %s, length of result not equal to exp", c.pattern) - } else { - for i := 0; i < len(c.exp); i++ { - if c.exp[i] != matches[i] { - t.Errorf("Evaluating %s, unexpected result %s", c.pattern, matches[i]) - } - } - } - } - } -} - func TestPostingsForMatchers(t *testing.T) { chunkDir, err := ioutil.TempDir("", "chunk_dir") require.NoError(t, err) @@ -1881,7 +1818,6 @@ func TestPostingsForMatchers(t *testing.T) { t.Errorf("Evaluating %v, missing results %+v", c.matchers, exp) } } - } // TestClose ensures that calling Close more than once doesn't block and doesn't panic. @@ -2106,7 +2042,7 @@ func TestPostingsForMatcher(t *testing.T) { { // Test case for double quoted regex matcher matcher: labels.MustNewMatcher(labels.MatchRegexp, "test", "^(?:a|b)$"), - hasError: true, + hasError: false, }, } @@ -2141,7 +2077,12 @@ func TestBlockBaseSeriesSet(t *testing.T) { { lset: labels.New([]labels.Label{{Name: "a", Value: "a"}}...), chunks: []chunks.Meta{ - {Ref: 29}, {Ref: 45}, {Ref: 245}, {Ref: 123}, {Ref: 4232}, {Ref: 5344}, + {Ref: 29}, + {Ref: 45}, + {Ref: 245}, + {Ref: 123}, + {Ref: 4232}, + {Ref: 5344}, {Ref: 121}, }, ref: 12,