Improve TestAnalyzeRealQueries (#464)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
commit 9fe1bd2d63 (parent 05a3a79015)
Author: Marco Pracucci <marco@pracucci.com>
Date:   2023-03-31 10:27:43 +02:00 (committed by GitHub)


@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"math/rand"
 	"os"
+	"sort"
 	"strconv"
 	"strings"
 	"testing"
@@ -408,8 +409,23 @@ func FuzzFastRegexMatcher_WithFuzzyRegularExpressions(f *testing.F) {
 func TestAnalyzeRealQueries(t *testing.T) {
 	t.Skip("Decomment this test only to manually analyze real queries")
 
+	type labelValueInfo struct {
+		numMatchingQueries       int     //nolint:unused
+		numShardedQueries        int     //nolint:unused
+		numSplitQueries          int     //nolint:unused
+		optimized                bool    //nolint:unused
+		averageParsingTimeMillis float64 //nolint:unused
+
+		// Sorted list of timestamps when the queries have been received.
+		queryStartTimes []time.Time
+	}
+
 	labelValueRE := regexp.MustCompile(`=~\\"([^"]+)\\"`)
-	labelValues := make(map[string]struct{})
+	tsRE := regexp.MustCompile(`ts=([^ ]+)`)
+	shardedQueriesRE := regexp.MustCompile(`sharded_queries=(\d+)`)
+	splitQueriesRE := regexp.MustCompile(`split_queries=(\d+)`)
+	labelValues := make(map[string]*labelValueInfo)
 
 	// Read the logs file line-by-line, and find all values for regex label matchers.
 	readFile, err := os.Open("logs.txt")
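
Note: the new tsRE, shardedQueriesRE and splitQueriesRE expressions assume query-frontend log lines carrying a ts= timestamp and sharded_queries=/split_queries= counters next to the logged query. A minimal sketch of the extraction, run against an invented sample line (the exact log format here is an assumption, not part of this commit):

	package main

	import (
		"fmt"
		"regexp"
	)

	func main() {
		// Invented sample line: the real query-frontend log format may differ.
		line := `ts=2023-03-30T14:05:12.345Z msg="query stats" sharded_queries=16 split_queries=4 param_query="up{job=~\"(api|web)server\"}"`

		labelValueRE := regexp.MustCompile(`=~\\"([^"]+)\\"`)
		tsRE := regexp.MustCompile(`ts=([^ ]+)`)
		shardedQueriesRE := regexp.MustCompile(`sharded_queries=(\d+)`)
		splitQueriesRE := regexp.MustCompile(`split_queries=(\d+)`)

		fmt.Println(labelValueRE.FindAllStringSubmatch(line, -1)[0][1]) // (api|web)server
		fmt.Println(tsRE.FindStringSubmatch(line)[1])                   // 2023-03-30T14:05:12.345Z
		fmt.Println(shardedQueriesRE.FindStringSubmatch(line)[1])       // 16
		fmt.Println(splitQueriesRE.FindStringSubmatch(line)[1])         // 4
	}
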
@@ -418,23 +434,67 @@ func TestAnalyzeRealQueries(t *testing.T) {
 	fileScanner := bufio.NewScanner(readFile)
 	fileScanner.Split(bufio.ScanLines)
 
+	numQueries := 0
+
 	for fileScanner.Scan() {
 		line := fileScanner.Text()
 		matches := labelValueRE.FindAllStringSubmatch(line, -1)
+		if len(matches) == 0 {
+			continue
+		}
+
+		// Look up query stats.
+		tsRaw := tsRE.FindStringSubmatch(line)
+		shardedQueriesRaw := shardedQueriesRE.FindStringSubmatch(line)
+		splitQueriesRaw := splitQueriesRE.FindStringSubmatch(line)
+
+		shardedQueries := 0
+		splitQueries := 0
+		var ts time.Time
+
+		if len(tsRaw) > 0 {
+			ts, _ = time.Parse(time.RFC3339Nano, tsRaw[1])
+		}
+		if len(shardedQueriesRaw) > 0 {
+			shardedQueries, _ = strconv.Atoi(shardedQueriesRaw[1])
+		}
+		if len(splitQueriesRaw) > 0 {
+			splitQueries, _ = strconv.Atoi(splitQueriesRaw[1])
+		}
+
+		numQueries++
+
 		for _, match := range matches {
-			labelValues[match[1]] = struct{}{}
+			info := labelValues[match[1]]
+			if info == nil {
+				info = &labelValueInfo{}
+				labelValues[match[1]] = info
+			}
+
+			info.numMatchingQueries++
+			info.numShardedQueries += shardedQueries
+			info.numSplitQueries += splitQueries
+
+			if !ts.IsZero() {
+				info.queryStartTimes = append(info.queryStartTimes, ts)
+			}
 		}
 	}
 
+	// Sort query start times.
+	for _, info := range labelValues {
+		sort.Slice(info.queryStartTimes, func(i, j int) bool {
+			return info.queryStartTimes[i].Before(info.queryStartTimes[j])
+		})
+	}
+
 	require.NoError(t, readFile.Close())
-	t.Logf("Found %d unique regexp matchers", len(labelValues))
+	t.Logf("Found %d unique regexp matchers out of %d queries", len(labelValues), numQueries)
 
-	// Check if each regexp matcher is supported by our optimization.
+	// Analyze each regexp matcher found.
 	numChecked := 0
 	numOptimized := 0
 
-	for re := range labelValues {
+	for re, info := range labelValues {
 		m, err := NewFastRegexMatcher(re)
 		if err != nil {
 			// Ignore it, because we may have failed to extract the label matcher.
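
Note: replacing map[string]struct{} with map[string]*labelValueInfo relies on a get-or-create idiom: because the map stores pointers, per-matcher counters can be mutated in place without writing the value back. A standalone sketch of the same pattern (names here are illustrative only):

	package main

	import "fmt"

	type stats struct{ count int }

	func main() {
		byKey := make(map[string]*stats)

		for _, key := range []string{"a", "b", "a"} {
			s := byKey[key]
			if s == nil { // first occurrence of this key
				s = &stats{}
				byKey[key] = s
			}
			s.count++ // mutate through the pointer; no write-back needed
		}

		fmt.Println(byKey["a"].count, byKey["b"].count) // 2 1
	}
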
@@ -442,14 +502,59 @@ func TestAnalyzeRealQueries(t *testing.T) {
 		}
 
 		numChecked++
 
+		// Check if each regexp matcher is supported by our optimization.
 		if m.isOptimized() {
 			numOptimized++
-		} else {
-			t.Logf("Not optimized matcher: %s", re)
+			info.optimized = true
 		}
+
+		// Estimate the parsing complexity.
+		startTime := time.Now()
+		const numParsingRuns = 1000
+
+		for i := 0; i < numParsingRuns; i++ {
+			NewFastRegexMatcher(re)
+		}
+
+		info.averageParsingTimeMillis = float64(time.Since(startTime).Milliseconds()) / float64(numParsingRuns)
 	}
 
-	t.Logf("Found %d (%.2f%%) optimized matchers out of %d", numOptimized, (float64(numOptimized)/float64(numChecked))*100, numChecked)
+	t.Logf("Found %d out of %d (%.2f%%) regexp matchers optimized by FastRegexMatcher", numOptimized, numChecked, (float64(numOptimized)/float64(numChecked))*100)
+
+	// Print some statistics.
+	for labelValue, info := range labelValues {
+		// Find the min/avg/max difference between query start times.
+		var (
+			minQueryStartTimeDiff time.Duration
+			maxQueryStartTimeDiff time.Duration
+			avgQueryStartTimeDiff time.Duration
+			sumQueryStartTime     time.Duration
+			countQueryStartTime   int
+		)
+
+		for i := 1; i < len(info.queryStartTimes); i++ {
+			diff := info.queryStartTimes[i].Sub(info.queryStartTimes[i-1])
+
+			sumQueryStartTime += diff
+			countQueryStartTime++
+
+			if minQueryStartTimeDiff == 0 || diff < minQueryStartTimeDiff {
+				minQueryStartTimeDiff = diff
+			}
+			if diff > maxQueryStartTimeDiff {
+				maxQueryStartTimeDiff = diff
+			}
+		}
+
+		if countQueryStartTime > 0 {
+			avgQueryStartTimeDiff = sumQueryStartTime / time.Duration(countQueryStartTime)
+		}
+
+		t.Logf("num queries: %d\t num split queries: %d\t num sharded queries: %d\t optimized: %t\t parsing time: %.0fms\t min/avg/max query start time diff (sec): %.2f/%.2f/%.2f regexp: %s",
+			info.numMatchingQueries, info.numSplitQueries, info.numShardedQueries, info.optimized, info.averageParsingTimeMillis,
+			minQueryStartTimeDiff.Seconds(), avgQueryStartTimeDiff.Seconds(), maxQueryStartTimeDiff.Seconds(), labelValue)
+	}
 }
 
 func TestOptimizeEqualStringMatchers(t *testing.T) {
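
Note on the parsing-time estimate above: time.Duration.Milliseconds() truncates to whole milliseconds, which the 1000-run loop amortizes into a usable average. If finer resolution were ever wanted, the standard library's testing.Benchmark is one alternative; a rough sketch, with regexp.MustCompile standing in for NewFastRegexMatcher (the pattern is invented for illustration):

	package main

	import (
		"fmt"
		"regexp"
		"testing"
	)

	func main() {
		const pattern = `(api|web)server` // illustrative pattern, not from the commit

		// testing.Benchmark picks b.N automatically and reports ns/op,
		// avoiding the whole-millisecond truncation of Milliseconds().
		result := testing.Benchmark(func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				regexp.MustCompile(pattern) // stand-in for NewFastRegexMatcher(pattern)
			}
		})
		fmt.Printf("avg parsing time: %.3fms\n", float64(result.NsPerOp())/1e6)
	}
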