Optimize queries using regex matchers for set lookups (#602)

* Original version of the set optimization

Signed-off-by: naivewong <867245430@qq.com>

* simple set matcher

Signed-off-by: naivewong <867245430@qq.com>

* simple set matcher

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* add benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update benchmark

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>

* use genSeries from #467

Signed-off-by: naivewong <867245430@qq.com>

* update

Signed-off-by: naivewong <867245430@qq.com>
This commit is contained in:
naivewong 2019-05-27 19:24:46 +08:00 committed by Ganesh Vernekar
parent 562e93e8e6
commit 13c80a5979
4 changed files with 324 additions and 36 deletions

View file

@ -21,6 +21,7 @@ import (
"math/rand" "math/rand"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"testing" "testing"
"github.com/go-kit/kit/log" "github.com/go-kit/kit/log"
@ -184,6 +185,11 @@ func createBlock(tb testing.TB, dir string, series []Series) string {
return filepath.Join(dir, ulid.String()) return filepath.Join(dir, ulid.String())
} }
// Base strings genSeries uses to build its label sets.
const (
// defaultLabelName is the label that genSeries sets to the series index; it
// is also the prefix of the numbered filler label names.
defaultLabelName = "labelName"
// defaultLabelValue is the prefix of the numbered filler label values.
defaultLabelValue = "labelValue"
)
// genSeries generates series with a given number of labels and values. // genSeries generates series with a given number of labels and values.
func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series { func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
if totalSeries == 0 || labelCount == 0 { if totalSeries == 0 || labelCount == 0 {
@ -193,8 +199,9 @@ func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
series := make([]Series, totalSeries) series := make([]Series, totalSeries)
for i := 0; i < totalSeries; i++ { for i := 0; i < totalSeries; i++ {
lbls := make(map[string]string, labelCount) lbls := make(map[string]string, labelCount)
for len(lbls) < labelCount { lbls[defaultLabelName] = strconv.Itoa(i)
lbls[randString()] = randString() for j := 1; len(lbls) < labelCount; j++ {
lbls[defaultLabelName+strconv.Itoa(j)] = defaultLabelValue + strconv.Itoa(j)
} }
samples := make([]tsdbutil.Sample, 0, maxt-mint+1) samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
for t := mint; t <= maxt; t++ { for t := mint; t <= maxt; t++ {
@ -224,31 +231,3 @@ func populateSeries(lbls []map[string]string, mint, maxt int64) []Series {
} }
return series return series
} }
// letterBytes is the alphabet randString draws its characters from.
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

const (
	letterIdxBits = 6                    // 6 bits to represent a letter index
	letterIdxMask = 1<<letterIdxBits - 1 // All 1-bits, as many as letterIdxBits
	letterIdxMax  = 63 / letterIdxBits   // # of letter indices fitting in 63 bits
)

// randString returns a string of 1 to 50 random ASCII letters.
//
// Each rand.Int63() call yields 63 random bits, which are consumed in groups
// of letterIdxBits. A group that does not index into letterBytes is discarded
// without producing a character.
func randString() string {
	const maxLength int32 = 50

	// last is the index of the final byte, so the result holds last+1 bytes.
	last := rand.Int31n(maxLength)
	out := make([]byte, last+1)

	bits := rand.Int63()
	left := letterIdxMax
	for pos := last; pos >= 0; {
		if left == 0 {
			// All groups of the current word are used up; fetch a new one.
			bits = rand.Int63()
			left = letterIdxMax
		}
		idx := int(bits & letterIdxMask)
		if idx < len(letterBytes) {
			out[pos] = letterBytes[idx]
			pos--
		}
		bits >>= letterIdxBits
		left--
	}
	return string(out)
}

View file

@ -63,14 +63,15 @@ func NewEqualMatcher(name, value string) Matcher {
return &EqualMatcher{name: name, value: value} return &EqualMatcher{name: name, value: value}
} }
type regexpMatcher struct { type RegexpMatcher struct {
name string name string
re *regexp.Regexp re *regexp.Regexp
} }
func (m regexpMatcher) Name() string { return m.name } func (m RegexpMatcher) Name() string { return m.name }
func (m regexpMatcher) Matches(v string) bool { return m.re.MatchString(v) } func (m RegexpMatcher) Matches(v string) bool { return m.re.MatchString(v) }
func (m regexpMatcher) String() string { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) } func (m RegexpMatcher) String() string { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) }
// Value returns the source text of the regular expression held by the matcher.
func (m RegexpMatcher) Value() string { return m.re.String() }
// NewRegexpMatcher returns a new matcher verifying that a value matches // NewRegexpMatcher returns a new matcher verifying that a value matches
// the regular expression pattern. // the regular expression pattern.
@ -79,7 +80,7 @@ func NewRegexpMatcher(name, pattern string) (Matcher, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
return &regexpMatcher{name: name, re: re}, nil return &RegexpMatcher{name: name, re: re}, nil
} }
// NewMustRegexpMatcher returns a new matcher verifying that a value matches // NewMustRegexpMatcher returns a new matcher verifying that a value matches
@ -90,7 +91,7 @@ func NewMustRegexpMatcher(name, pattern string) Matcher {
if err != nil { if err != nil {
panic(err) panic(err)
} }
return &regexpMatcher{name: name, re: re} return &RegexpMatcher{name: name, re: re}
} }

View file

@ -17,6 +17,7 @@ import (
"fmt" "fmt"
"sort" "sort"
"strings" "strings"
"unicode/utf8"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/prometheus/tsdb/chunkenc" "github.com/prometheus/tsdb/chunkenc"
@ -266,6 +267,62 @@ func (q *blockQuerier) Close() error {
return merr.Err() return merr.Err()
} }
// regexMetaCharacterBytes is a bitmap over the ASCII range that marks the
// regular expression meta characters `.+*?()|[]{}^$`. The bit for byte b is
// stored in element b%16 at bit position b/16. The backslash is not part of
// the set; findSetMatches handles it separately.
var regexMetaCharacterBytes = func() [16]byte {
	var bitmap [16]byte
	for _, b := range []byte(`.+*?()|[]{}^$`) {
		bitmap[b%16] |= 1 << (b / 16)
	}
	return bitmap
}()

// isRegexMetaCharacter reports whether byte b is a regular expression meta
// character, i.e. whether it needs to be escaped to be matched literally.
// Bytes outside the ASCII range are never meta characters.
func isRegexMetaCharacter(b byte) bool {
	return b < utf8.RuneSelf && regexMetaCharacterBytes[b%16]&(1<<(b/16)) != 0
}

// findSetMatches extracts the literal alternatives from a pattern of the form
// `^(?:a|b|c)$`.
//
// It returns nil when the pattern is not a plain set of literals, that is when
//   - the `^(?:` ... `)$` wrapper is missing,
//   - an unescaped meta character other than `|` occurs,
//   - a backslash escapes anything but a meta character or a backslash, or
//   - the body ends in the middle of an escape sequence.
//
// Escaped characters appear unescaped in the result. Empty alternatives are
// dropped, so the returned slice can be empty even for a well-formed set.
func findSetMatches(pattern string) []string {
	const prefix, suffix = "^(?:", ")$"

	// Return empty matches if the wrapper from Prometheus is missing.
	if len(pattern) < len(prefix)+len(suffix) ||
		!strings.HasPrefix(pattern, prefix) ||
		!strings.HasSuffix(pattern, suffix) {
		return nil
	}
	body := pattern[len(prefix) : len(pattern)-len(suffix)]

	// sets always holds at least one builder; the last one collects the
	// alternative that is currently being read.
	sets := []*strings.Builder{new(strings.Builder)}
	escaped := false
	for i := 0; i < len(body); i++ {
		c := body[i]
		cur := sets[len(sets)-1]
		switch {
		case escaped:
			// Only meta characters and the backslash itself may be escaped.
			// Anything else (\d, \w, \n, ...) is not a single literal byte.
			if !isRegexMetaCharacter(c) && c != '\\' {
				return nil
			}
			cur.WriteByte(c)
			escaped = false
		case c == '\\':
			escaped = true
		case c == '|':
			sets = append(sets, new(strings.Builder))
		case isRegexMetaCharacter(c):
			// Any other unescaped meta character means this is a real
			// regular expression, not a set of literals.
			return nil
		default:
			cur.WriteByte(c)
		}
	}
	if escaped {
		// The body ended right after a backslash, so the closing parenthesis
		// of the wrapper is escaped and the pattern is not a set.
		return nil
	}

	matches := make([]string, 0, len(sets))
	for _, s := range sets {
		if s.Len() > 0 {
			matches = append(matches, s.String())
		}
	}
	return matches
}
// PostingsForMatchers assembles a single postings iterator against the index reader // PostingsForMatchers assembles a single postings iterator against the index reader
// based on the given matchers. // based on the given matchers.
func PostingsForMatchers(ix IndexReader, ms ...labels.Matcher) (index.Postings, error) { func PostingsForMatchers(ix IndexReader, ms ...labels.Matcher) (index.Postings, error) {
@ -346,6 +403,14 @@ func postingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings, error
return ix.Postings(em.Name(), em.Value()) return ix.Postings(em.Name(), em.Value())
} }
// Fast-path for set matching.
if em, ok := m.(*labels.RegexpMatcher); ok {
setMatches := findSetMatches(em.Value())
if len(setMatches) > 0 {
return postingsForSetMatcher(ix, em.Name(), setMatches)
}
}
tpls, err := ix.LabelValues(m.Name()) tpls, err := ix.LabelValues(m.Name())
if err != nil { if err != nil {
return nil, err return nil, err
@ -411,6 +476,18 @@ func inversePostingsForMatcher(ix IndexReader, m labels.Matcher) (index.Postings
return index.Merge(rit...), nil return index.Merge(rit...), nil
} }
// postingsForSetMatcher looks up the postings of label name for each value in
// matches and combines them with index.Merge. The first lookup that fails
// aborts the operation and its error is returned unchanged.
func postingsForSetMatcher(ix IndexReader, name string, matches []string) (index.Postings, error) {
	var perValue []index.Postings
	for i := range matches {
		p, err := ix.Postings(name, matches[i])
		if err != nil {
			return nil, err
		}
		perValue = append(perValue, p)
	}
	return index.Merge(perValue...), nil
}
func mergeStrings(a, b []string) []string { func mergeStrings(a, b []string) []string {
maxl := len(a) maxl := len(a)
if len(b) > len(a) { if len(b) > len(a) {

View file

@ -1691,6 +1691,192 @@ func BenchmarkQuerySeek(b *testing.B) {
} }
} }
// BenchmarkSetMatcher measures querier.Select with regexp matchers of the form
// `^(?:a|b|c)$` over blocks filled with generated series.
// Refer to https://github.com/prometheus/prometheus/issues/2651.
func BenchmarkSetMatcher(b *testing.B) {
	cases := []struct {
		numBlocks                   int
		numSeries                   int
		numSamplesPerSeriesPerBlock int
		// cardinality is only used to label the sub-benchmark.
		cardinality int
		pattern     string
	}{
		// The first three cases are to find out whether the set
		// matcher is always faster than regex matcher.
		{
			numBlocks:                   1,
			numSeries:                   1,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 100,
			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
		},
		{
			numBlocks:                   1,
			numSeries:                   15,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 100,
			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
		},
		{
			numBlocks:                   1,
			numSeries:                   15,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 100,
			pattern:                     "^(?:1|2|3)$",
		},
		// Big data sizes benchmarks.
		{
			numBlocks:                   20,
			numSeries:                   1000,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 100,
			pattern:                     "^(?:1|2|3)$",
		},
		{
			numBlocks:                   20,
			numSeries:                   1000,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 100,
			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
		},
		// Increase cardinality.
		{
			numBlocks:                   1,
			numSeries:                   100000,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 100000,
			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
		},
		{
			numBlocks:                   1,
			numSeries:                   500000,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 500000,
			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
		},
		{
			numBlocks:                   10,
			numSeries:                   500000,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 500000,
			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
		},
		{
			numBlocks:                   1,
			numSeries:                   1000000,
			numSamplesPerSeriesPerBlock: 10,
			cardinality:                 1000000,
			pattern:                     "^(?:1|2|3|4|5|6|7|8|9|10)$",
		},
	}

	for _, c := range cases {
		// Each case runs in its own function scope so that the deferred
		// cleanups (querier, blocks, temp dir) fire when the case is done.
		// Deferring directly in the loop body would keep the data of every
		// case open and on disk until the whole benchmark returns.
		func() {
			dir, err := ioutil.TempDir("", "bench_postings_for_matchers")
			testutil.Ok(b, err)
			defer func() {
				testutil.Ok(b, os.RemoveAll(dir))
			}()

			var (
				blocks          []*Block
				prefilledLabels []map[string]string
				generatedSeries []Series
			)
			for i := int64(0); i < int64(c.numBlocks); i++ {
				mint := i * int64(c.numSamplesPerSeriesPerBlock)
				maxt := mint + int64(c.numSamplesPerSeriesPerBlock) - 1
				if len(prefilledLabels) == 0 {
					// First block: generate the series and remember their
					// label sets so later blocks reuse the same series.
					generatedSeries = genSeries(c.numSeries, 10, mint, maxt)
					for _, s := range generatedSeries {
						prefilledLabels = append(prefilledLabels, s.Labels().Map())
					}
				} else {
					generatedSeries = populateSeries(prefilledLabels, mint, maxt)
				}
				block, err := OpenBlock(nil, createBlock(b, dir, generatedSeries), nil)
				testutil.Ok(b, err)
				blocks = append(blocks, block)
				defer block.Close()
			}

			que := &querier{
				blocks: make([]Querier, 0, len(blocks)),
			}
			for _, blk := range blocks {
				q, err := NewBlockQuerier(blk, math.MinInt64, math.MaxInt64)
				testutil.Ok(b, err)
				que.blocks = append(que.blocks, q)
			}
			// Deferred last so that it runs first: the queriers are closed
			// before the blocks they read from.
			defer que.Close()

			benchMsg := fmt.Sprintf("nSeries=%d,nBlocks=%d,cardinality=%d,pattern=\"%s\"", c.numSeries, c.numBlocks, c.cardinality, c.pattern)
			b.Run(benchMsg, func(b *testing.B) {
				b.ResetTimer()
				b.ReportAllocs()
				for n := 0; n < b.N; n++ {
					// NOTE(review): the series come from genSeries, which
					// appears to only create labels named "labelName" and
					// "labelName<j>". A matcher on "test" would then never
					// select anything - confirm whether the label name
					// should be defaultLabelName instead.
					_, err := que.Select(labels.NewMustRegexpMatcher("test", c.pattern))
					testutil.Ok(b, err)
				}
			})
		}()
	}
}
// TestFindSetMatches checks which patterns findSetMatches accepts as plain
// sets and which alternatives it extracts from them.
// Refer to https://github.com/prometheus/prometheus/issues/2651.
func TestFindSetMatches(t *testing.T) {
	type testCase struct {
		pattern string
		exp     []string
	}
	cases := []testCase{
		// Simple sets.
		{
			pattern: "^(?:foo|bar|baz)$",
			exp:     []string{"foo", "bar", "baz"},
		},
		// Simple sets containing escaped characters.
		{
			pattern: "^(?:fo\\.o|bar\\?|\\^baz)$",
			exp:     []string{"fo.o", "bar?", "^baz"},
		},
		// Simple sets containing special characters without escaping.
		{
			pattern: "^(?:fo.o|bar?|^baz)$",
			exp:     nil,
		},
		// Missing wrapper.
		{
			pattern: "foo|bar|baz",
			exp:     nil,
		},
	}

	for _, tc := range cases {
		got := findSetMatches(tc.pattern)
		switch {
		case len(tc.exp) == 0:
			// No set expected: any returned alternative is an error.
			if len(got) != 0 {
				t.Errorf("Evaluating %s, unexpected result %v", tc.pattern, got)
			}
		case len(got) != len(tc.exp):
			t.Errorf("Evaluating %s, length of result not equal to exp", tc.pattern)
		default:
			// Same length: compare the alternatives position by position.
			for i, want := range tc.exp {
				if want != got[i] {
					t.Errorf("Evaluating %s, unexpected result %s", tc.pattern, got[i])
				}
			}
		}
	}
}
func TestPostingsForMatchers(t *testing.T) { func TestPostingsForMatchers(t *testing.T) {
h, err := NewHead(nil, nil, nil, 1000) h, err := NewHead(nil, nil, nil, 1000)
testutil.Ok(t, err) testutil.Ok(t, err)
@ -1703,6 +1889,7 @@ func TestPostingsForMatchers(t *testing.T) {
app.Add(labels.FromStrings("n", "1", "i", "a"), 0, 0) app.Add(labels.FromStrings("n", "1", "i", "a"), 0, 0)
app.Add(labels.FromStrings("n", "1", "i", "b"), 0, 0) app.Add(labels.FromStrings("n", "1", "i", "b"), 0, 0)
app.Add(labels.FromStrings("n", "2"), 0, 0) app.Add(labels.FromStrings("n", "2"), 0, 0)
app.Add(labels.FromStrings("n", "2.5"), 0, 0)
testutil.Ok(t, app.Commit()) testutil.Ok(t, app.Commit())
cases := []struct { cases := []struct {
@ -1735,6 +1922,7 @@ func TestPostingsForMatchers(t *testing.T) {
labels.FromStrings("n", "1", "i", "a"), labels.FromStrings("n", "1", "i", "a"),
labels.FromStrings("n", "1", "i", "b"), labels.FromStrings("n", "1", "i", "b"),
labels.FromStrings("n", "2"), labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
}, },
}, },
// Not equals. // Not equals.
@ -1742,6 +1930,7 @@ func TestPostingsForMatchers(t *testing.T) {
matchers: []labels.Matcher{labels.Not(labels.NewEqualMatcher("n", "1"))}, matchers: []labels.Matcher{labels.Not(labels.NewEqualMatcher("n", "1"))},
exp: []labels.Labels{ exp: []labels.Labels{
labels.FromStrings("n", "2"), labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
}, },
}, },
{ {
@ -1796,6 +1985,7 @@ func TestPostingsForMatchers(t *testing.T) {
exp: []labels.Labels{ exp: []labels.Labels{
labels.FromStrings("n", "1"), labels.FromStrings("n", "1"),
labels.FromStrings("n", "2"), labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
}, },
}, },
{ {
@ -1824,6 +2014,7 @@ func TestPostingsForMatchers(t *testing.T) {
matchers: []labels.Matcher{labels.Not(labels.NewMustRegexpMatcher("n", "^1$"))}, matchers: []labels.Matcher{labels.Not(labels.NewMustRegexpMatcher("n", "^1$"))},
exp: []labels.Labels{ exp: []labels.Labels{
labels.FromStrings("n", "2"), labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
}, },
}, },
{ {
@ -1869,6 +2060,46 @@ func TestPostingsForMatchers(t *testing.T) {
labels.FromStrings("n", "1", "i", "a"), labels.FromStrings("n", "1", "i", "a"),
}, },
}, },
// Set optimization for Regex.
// Refer to https://github.com/prometheus/prometheus/issues/2651.
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:1|2)$")},
exp: []labels.Labels{
labels.FromStrings("n", "1"),
labels.FromStrings("n", "1", "i", "a"),
labels.FromStrings("n", "1", "i", "b"),
labels.FromStrings("n", "2"),
},
},
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:a|b)$")},
exp: []labels.Labels{
labels.FromStrings("n", "1", "i", "a"),
labels.FromStrings("n", "1", "i", "b"),
},
},
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:x1|2)$")},
exp: []labels.Labels{
labels.FromStrings("n", "2"),
},
},
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("n", "^(?:2|2\\.5)$")},
exp: []labels.Labels{
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
// Empty value.
{
matchers: []labels.Matcher{labels.NewMustRegexpMatcher("i", "^(?:c||d)$")},
exp: []labels.Labels{
labels.FromStrings("n", "1"),
labels.FromStrings("n", "2"),
labels.FromStrings("n", "2.5"),
},
},
} }
ir, err := h.Index() ir, err := h.Index()