prometheus/model/labels/regexp_test.go

// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package labels

import (
	"bufio"
	"fmt"
	"math/rand"
	"os"
	"sort"
	"strconv"
	"strings"
	"testing"
	"time"

	"github.com/grafana/regexp"
	"github.com/grafana/regexp/syntax"
	"github.com/stretchr/testify/require"
)

// Shared fixtures for the tests, benchmarks and fuzz targets in this file:
//
//   - asciiRunes is the alphabet randString picks characters from.
//   - regexes is the list of patterns compiled by the matcher tests, the
//     benchmarks and the fuzz seed corpus.
//   - values is the list of input strings each pattern is matched against in
//     TestNewFastRegexMatcher and added to the fuzz seed corpus.
var (
	asciiRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_")
	regexes    = []string{
		"foo",
		"^foo",
		"(foo|bar)",
		"foo.*",
		".*foo",
		"^.*foo$",
		"^.+foo$",
		".*",
		".+",
		"foo.+",
		".+foo",
		"foo\n.+",
		"foo\n.*",
		".*foo.*",
		".+foo.+",
		"(?s:.*)",
		"(?s:.+)",
		"(?s:^.*foo$)",
		"(?i:foo)",
		"(?i:(foo|bar))",
		"(?i:(foo1|foo2|bar))",
		"^(?i:foo|oo)|(bar)$",
		"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))",
		"((.*)(bar|b|buzz)(.+)|foo)$",
		"^$",
		"(prometheus|api_prom)_api_v1_.+",
		"10\\.0\\.(1|2)\\.+",
		"10\\.0\\.(1|2).+",
		"((fo(bar))|.+foo)",
		// A long case sensitive alternation.
		"zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb",
		// A long case insensitive alternation.
		"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))",
	}
	values = []string{
		"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
		"FOO", "Foo", "OO", "Oo", "\nfoo\n", strings.Repeat("f", 20), "prometheus", "prometheus_api_v1", "prometheus_api_v1_foo",
		"10.0.1.20", "10.0.2.10", "10.0.3.30", "10.0.4.40",
		"foofoo0", "foofoo",
	}
)

// TestNewFastRegexMatcher checks, for every pattern in regexes and every input
// in values, that FastRegexMatcher.MatchString agrees with the result of the
// matcher's underlying compiled regexp (m.re).
func TestNewFastRegexMatcher(t *testing.T) {
	for _, pattern := range regexes {
		for _, input := range values {
			// Copy the loop variables: the subtests are parallel and outlive the iteration.
			pattern, input := pattern, input
			name := pattern + ` on "` + input + `"`

			t.Run(name, func(t *testing.T) {
				t.Parallel()

				matcher, err := NewFastRegexMatcher(pattern)
				require.NoError(t, err)

				want := matcher.re.MatchString(input)
				got := matcher.MatchString(input)
				require.Equal(t, want, got)
			})
		}
	}
}

// BenchmarkNewFastRegexMatcher measures the cost of building a matcher for each
// pattern in regexes, once through NewFastRegexMatcher ("with cache") and once
// through newFastRegexMatcherWithoutCache ("without cache").
func BenchmarkNewFastRegexMatcher(b *testing.B) {
	type constructor func(v string) (*FastRegexMatcher, error)

	// benchmarkAll runs one sub-benchmark per pattern using the given constructor.
	benchmarkAll := func(b *testing.B, build constructor) {
		for _, pattern := range regexes {
			b.Run(getTestNameFromRegexp(pattern), func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					if _, err := build(pattern); err != nil {
						b.Fatal(err)
					}
				}
			})
		}
	}

	b.Run("with cache", func(b *testing.B) {
		benchmarkAll(b, NewFastRegexMatcher)
	})
	b.Run("without cache", func(b *testing.B) {
		benchmarkAll(b, newFastRegexMatcherWithoutCache)
	})
}

// BenchmarkNewFastRegexMatcher_CacheMisses measures NewFastRegexMatcher when
// every call receives a pattern it has not seen before: the cache is purged
// before timing starts and each iteration appends its own index to the prefix.
func BenchmarkNewFastRegexMatcher_CacheMisses(b *testing.B) {
	// A constant seed keeps the generated prefixes identical between runs.
	rnd := rand.New(rand.NewSource(1))

	simplePrefix := randString(rnd, 10)
	complexPrefix := strings.Join(randStrings(rnd, 100, 10), "|")

	prefixes := map[string]string{
		"simple regexp":  simplePrefix,
		"complex regexp": complexPrefix,
	}

	for name, prefix := range prefixes {
		b.Run(name, func(b *testing.B) {
			// Start from an empty cache, and keep the purge out of the measurement.
			fastRegexMatcherCache.Purge()
			b.ResetTimer()

			for i := 0; i < b.N; i++ {
				// The iteration index makes every pattern unique, i.e. 100% cache misses.
				pattern := prefix + strconv.Itoa(i)

				if _, err := NewFastRegexMatcher(pattern); err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}

// TestOptimizeConcatRegex verifies the prefix, suffix and contains literals
// that optimizeConcatRegex returns for each parsed pattern in the table.
func TestOptimizeConcatRegex(t *testing.T) {
	cases := []struct {
		regex    string
		prefix   string
		suffix   string
		contains string
	}{
		{regex: "foo(hello|bar)", prefix: "foo", suffix: "", contains: ""},
		{regex: "foo(hello|bar)world", prefix: "foo", suffix: "world", contains: ""},
		{regex: "foo.*", prefix: "foo", suffix: "", contains: ""},
		{regex: "foo.*hello.*bar", prefix: "foo", suffix: "bar", contains: "hello"},
		{regex: ".*foo", prefix: "", suffix: "foo", contains: ""},
		{regex: "^.*foo$", prefix: "", suffix: "foo", contains: ""},
		{regex: ".*foo.*", prefix: "", suffix: "", contains: "foo"},
		{regex: ".*foo.*bar.*", prefix: "", suffix: "", contains: "foo"},
		{regex: ".*(foo|bar).*", prefix: "", suffix: "", contains: ""},
		{regex: ".*[abc].*", prefix: "", suffix: "", contains: ""},
		{regex: ".*((?i)abc).*", prefix: "", suffix: "", contains: ""},
		{regex: ".*(?i:abc).*", prefix: "", suffix: "", contains: ""},
		{regex: "(?i:abc).*", prefix: "", suffix: "", contains: ""},
		{regex: ".*(?i:abc)", prefix: "", suffix: "", contains: ""},
		{regex: ".*(?i:abc)def.*", prefix: "", suffix: "", contains: "def"},
		{regex: "(?i).*(?-i:abc)def", prefix: "", suffix: "", contains: "abc"},
		{regex: ".*(?msU:abc).*", prefix: "", suffix: "", contains: "abc"},
		{regex: "[aA]bc.*", prefix: "", suffix: "", contains: "bc"},
		{regex: "^5..$", prefix: "5", suffix: "", contains: ""},
		{regex: "^release.*", prefix: "release", suffix: "", contains: ""},
		{regex: "^env-[0-9]+laio[1]?[^0-9].*", prefix: "env-", suffix: "", contains: "laio"},
	}

	for _, c := range cases {
		parsed, err := syntax.Parse(c.regex, syntax.Perl)
		require.NoErrorf(t, err, "regex: %s", c.regex)

		prefix, suffix, contains := optimizeConcatRegex(parsed)

		// The cases are not run as subtests, so name the pattern in every
		// failure message to make the failing row identifiable.
		require.Equalf(t, c.prefix, prefix, "unexpected prefix for regex: %s", c.regex)
		require.Equalf(t, c.suffix, suffix, "unexpected suffix for regex: %s", c.regex)
		require.Equalf(t, c.contains, contains, "unexpected contains for regex: %s", c.regex)
	}
}

// TestFindSetMatches checks the set of literal matches and the case-sensitivity
// flag that findSetMatches returns for each parsed pattern.
//
// Refer to https://github.com/prometheus/prometheus/issues/2651.
func TestFindSetMatches(t *testing.T) {
	type testCase struct {
		pattern          string
		expMatches       []string
		expCaseSensitive bool
	}

	testCases := []testCase{
		// Single value, coming from a `bar=~"foo"` selector.
		{"foo", []string{"foo"}, true},
		{"^foo", []string{"foo"}, true},
		{"^foo$", []string{"foo"}, true},
		// Simple sets alternates.
		{"foo|bar|zz", []string{"foo", "bar", "zz"}, true},
		// Simple sets alternate and concat (bar|baz is parsed as "ba[rz]").
		{"foo|bar|baz", []string{"foo", "bar", "baz"}, true},
		// Simple sets alternate and concat and capture.
		{"foo|bar|baz|(zz)", []string{"foo", "bar", "baz", "zz"}, true},
		// Simple sets alternate and concat and alternates with empty matches,
		// parsed as b(ar|(?:)|uzz) where b(?:) means literal b.
		{"bar|b|buzz", []string{"bar", "b", "buzz"}, true},
		// Skip outer anchors (it's enforced anyway at the root).
		{"^(bar|b|buzz)$", []string{"bar", "b", "buzz"}, true},
		{"^(?:prod|production)$", []string{"prod", "production"}, true},
		// Do not optimize regexp with inner anchors.
		{"(bar|b|b^uz$z)", nil, false},
		// Do not optimize regexp with empty string matcher.
		{"^$|Running", nil, false},
		// Simple sets containing escaped characters.
		{"fo\\.o|bar\\?|\\^baz", []string{"fo.o", "bar?", "^baz"}, true},
		// Using a character class.
		{"[abc]d", []string{"ad", "bd", "cd"}, true},
		// High/low charset different => A(B[CD]|EF)|BC[XY].
		{"ABC|ABD|AEF|BCX|BCY", []string{"ABC", "ABD", "AEF", "BCX", "BCY"}, true},
		// Triple concat.
		{"api_(v1|prom)_push", []string{"api_v1_push", "api_prom_push"}, true},
		// Triple concat with multiple alternates.
		{"(api|rpc)_(v1|prom)_push", []string{"api_v1_push", "api_prom_push", "rpc_v1_push", "rpc_prom_push"}, true},
		{"(api|rpc)_(v1|prom)_(push|query)", []string{"api_v1_push", "api_v1_query", "api_prom_push", "api_prom_query", "rpc_v1_push", "rpc_v1_query", "rpc_prom_push", "rpc_prom_query"}, true},
		// Class starting with "-".
		{"[-1-2][a-c]", []string{"-a", "-b", "-c", "1a", "1b", "1c", "2a", "2b", "2c"}, true},
		{"[1^3]", []string{"1", "3", "^"}, true},
		// OpPlus with concat.
		{"(.+)/(foo|bar)", nil, false},
		// Simple sets containing special characters without escaping.
		{"fo.o|bar?|^baz", nil, false},
		// Case insensitive wrapper.
		{"(?i)foo", []string{"FOO"}, false},
		// Case insensitive wrapper on alternate.
		{"(?i)foo|bar|baz", []string{"FOO", "BAR", "BAZ", "BAr", "BAz"}, false},
		// Mixed case sensitivity.
		{"(api|rpc)_(v1|prom)_((?i)push|query)", nil, false},
		// Mixed case sensitivity, concatenation only, without capture group.
		{"api_v1_(?i)push", nil, false},
		// Mixed case sensitivity, alternation only, without capture group.
		{"api|(?i)rpc", nil, false},
		// Case sensitive after unsetting insensitivity.
		{"rpc|(?i)(?-i)api", []string{"rpc", "api"}, true},
		// Case sensitive after unsetting insensitivity in all alternation options.
		{"(?i)((?-i)api|(?-i)rpc)", []string{"api", "rpc"}, true},
		// Mixed case sensitivity after unsetting insensitivity.
		{"(?i)rpc|(?-i)api", nil, false},
		// Too high charset combination.
		{"(api|rpc)_[^0-9]", nil, false},
		// Too many combinations.
		{"[a-z][a-z]", nil, false},
	}

	for _, tc := range testCases {
		// Copy the loop variable: the subtests are parallel and outlive the iteration.
		tc := tc

		t.Run(tc.pattern, func(t *testing.T) {
			t.Parallel()

			parsed, err := syntax.Parse(tc.pattern, syntax.Perl)
			require.NoError(t, err)

			gotMatches, gotCaseSensitive := findSetMatches(parsed)
			require.Equal(t, tc.expMatches, gotMatches)
			require.Equal(t, tc.expCaseSensitive, gotCaseSensitive)
		})
	}
}

// BenchmarkFastRegexMatcher measures MatchString for every pattern in regexes
// against a fixed set of pseudo-random texts of different lengths.
func BenchmarkFastRegexMatcher(b *testing.B) {
	// A constant seed keeps the generated texts identical between runs.
	rnd := rand.New(rand.NewSource(1))

	// Texts of variable length, plus one starting and one ending with "foo".
	var texts []string
	texts = append(texts, randStrings(rnd, 10, 10)...)
	texts = append(texts, randStrings(rnd, 5, 30)...)
	texts = append(texts, randStrings(rnd, 1, 100)...)
	texts = append(texts, "foo"+randString(rnd, 50))
	texts = append(texts, randString(rnd, 50)+"foo")

	for _, pattern := range regexes {
		b.Run(getTestNameFromRegexp(pattern), func(b *testing.B) {
			matcher, err := NewFastRegexMatcher(pattern)
			require.NoError(b, err)

			// Exclude the matcher construction from the measurement.
			b.ResetTimer()

			for n := 0; n < b.N; n++ {
				for idx := range texts {
					_ = matcher.MatchString(texts[idx])
				}
			}
		})
	}
}

// Test_OptimizeRegex checks which StringMatcher stringMatcherFromRegexp builds
// for each parsed pattern. A nil expectation asserts that the function returns
// nil for that pattern.
func Test_OptimizeRegex(t *testing.T) {
	type testCase struct {
		pattern string
		exp     StringMatcher
	}

	testCases := []testCase{
		{".*", anyStringWithoutNewlineMatcher{}},
		{".*?", anyStringWithoutNewlineMatcher{}},
		{"(?s:.*)", trueMatcher{}},
		{"(.*)", anyStringWithoutNewlineMatcher{}},
		{"^.*$", anyStringWithoutNewlineMatcher{}},
		{".+", &anyNonEmptyStringMatcher{matchNL: false}},
		{"(?s:.+)", &anyNonEmptyStringMatcher{matchNL: true}},
		{"^.+$", &anyNonEmptyStringMatcher{matchNL: false}},
		{"(.+)", &anyNonEmptyStringMatcher{matchNL: false}},
		{"", emptyStringMatcher{}},
		{"^$", emptyStringMatcher{}},
		{"^foo$", &equalStringMatcher{s: "foo", caseSensitive: true}},
		{"^(?i:foo)$", &equalStringMatcher{s: "FOO", caseSensitive: false}},
		{"^((?i:foo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
		{"^((?i:foo|oo)|(bar))$", orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO", caseSensitive: false}, &equalStringMatcher{s: "OO", caseSensitive: false}, &equalStringMatcher{s: "bar", caseSensitive: true}})},
		{"(?i:(foo1|foo2|bar))", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "FOO1", caseSensitive: false}, &equalStringMatcher{s: "FOO2", caseSensitive: false}}), &equalStringMatcher{s: "BAR", caseSensitive: false}})},
		{".*foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"(.*)foo.*", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"(.*)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"(.+)foo(.*)", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: anyStringWithoutNewlineMatcher{}}},
		{"^.+foo.+", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^(.*)(foo)(.*)$", &containsStringMatcher{substrings: []string{"foo"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"^(.*)(foo|foobar)(.*)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: anyStringWithoutNewlineMatcher{}}},
		{"^(.*)(foo|foobar)(.+)$", &containsStringMatcher{substrings: []string{"foo", "foobar"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^(.*)(bar|b|buzz)(.+)$", &containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"10\\.0\\.(1|2)\\.+", nil},
		{"10\\.0\\.(1|2).+", &containsStringMatcher{substrings: []string{"10.0.1", "10.0.2"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^.+foo", &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
		{"foo-.*$", &containsStringMatcher{substrings: []string{"foo-"}, left: nil, right: anyStringWithoutNewlineMatcher{}}},
		{"(prometheus|api_prom)_api_v1_.+", &containsStringMatcher{substrings: []string{"prometheus_api_v1_", "api_prom_api_v1_"}, left: nil, right: &anyNonEmptyStringMatcher{matchNL: false}}},
		{"^((.*)(bar|b|buzz)(.+)|foo)$", orStringMatcher([]StringMatcher{&containsStringMatcher{substrings: []string{"bar", "b", "buzz"}, left: anyStringWithoutNewlineMatcher{}, right: &anyNonEmptyStringMatcher{matchNL: false}}, &equalStringMatcher{s: "foo", caseSensitive: true}})},
		{"((fo(bar))|.+foo)", orStringMatcher([]StringMatcher{orStringMatcher([]StringMatcher{&equalStringMatcher{s: "fobar", caseSensitive: true}}), &containsStringMatcher{substrings: []string{"foo"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}})},
		{"(.+)/(gateway|cortex-gw|cortex-gw-internal)", &containsStringMatcher{substrings: []string{"/gateway", "/cortex-gw", "/cortex-gw-internal"}, left: &anyNonEmptyStringMatcher{matchNL: false}, right: nil}},
		// Case insensitive matching is not supported for contains, because
		// there's no strings.IndexOfFold function. This can be revisited later,
		// if it turns out to be really popular, by using strings.ToUpper.
		{"^(.*)((?i)foo|foobar)(.*)$", nil},
		{"(api|rpc)_(v1|prom)_((?i)push|query)", nil},
		{"[a-z][a-z]", nil},
		{"[1^3]", nil},
		{".*foo.*bar.*", nil},
		{`\d*`, nil},
		{".", nil},
		// Not supported because `stringMatcherFromRegexp` is not reentrant for
		// syntax.OpConcat; handling it would make the code too complex.
		{"/|/bar.*", nil},
		{"(.+)/(foo.*|bar$)", nil},
	}

	for _, tc := range testCases {
		// Copy the loop variable: the subtests are parallel and outlive the iteration.
		tc := tc

		t.Run(tc.pattern, func(t *testing.T) {
			t.Parallel()

			parsed, err := syntax.Parse(tc.pattern, syntax.Perl)
			require.NoError(t, err)

			got := stringMatcherFromRegexp(parsed)
			require.Equal(t, tc.exp, got)
		})
	}
}

// randString returns a string of length characters, each one picked from
// asciiRunes using randGenerator.
func randString(randGenerator *rand.Rand, length int) string {
	picked := make([]rune, length)
	alphabetSize := len(asciiRunes)

	for pos := 0; pos < length; pos++ {
		picked[pos] = asciiRunes[randGenerator.Intn(alphabetSize)]
	}

	return string(picked)
}

// randStrings returns a slice holding many strings, each built by randString
// with the given length.
func randStrings(randGenerator *rand.Rand, many, length int) []string {
	generated := make([]string, 0, many)

	for remaining := many; remaining > 0; remaining-- {
		generated = append(generated, randString(randGenerator, length))
	}

	return generated
}

// FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions fuzzes the input
// text only: each fuzzed text is matched against every pattern in regexes, and
// the FastRegexMatcher result must agree with its underlying regexp (m.re).
func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions(f *testing.F) {
	// Build one matcher per static pattern up front.
	matchers := make([]*FastRegexMatcher, len(regexes))
	for i, pattern := range regexes {
		matcher, err := NewFastRegexMatcher(pattern)
		require.NoError(f, err)
		matchers[i] = matcher
	}

	// Seed the corpus with the known values.
	for i := range values {
		f.Add(values[i])
	}

	f.Fuzz(func(t *testing.T, text string) {
		for _, matcher := range matchers {
			want := matcher.re.MatchString(text)
			got := matcher.MatchString(text)
			require.Equalf(t, want, got, "regexp: %s text: %s", matcher.re.String(), text)
		}
	})
}

// FuzzFastRegexMatcher_WithFuzzyRegularExpressions fuzzes both the pattern and
// the input text. Patterns that NewFastRegexMatcher rejects are skipped; for
// the others, the FastRegexMatcher result must agree with its underlying
// regexp (m.re).
func FuzzFastRegexMatcher_WithFuzzyRegularExpressions(f *testing.F) {
	// Seed the corpus with every (pattern, value) combination.
	for _, pattern := range regexes {
		for _, value := range values {
			f.Add(pattern, value)
		}
	}

	f.Fuzz(func(t *testing.T, re, text string) {
		matcher, err := NewFastRegexMatcher(re)
		if err == nil {
			want := matcher.re.MatchString(text)
			got := matcher.MatchString(text)
			require.Equalf(t, want, got, "regexp: %s text: %s", matcher.re.String(), text)
		}
		// Otherwise the fuzzed regex is invalid and there is nothing to compare.
	})
}

// TestAnalyzeRealQueries is a manual analysis tool (skipped by default) for real queries taken from Mimir logs.
// It reads logs.txt from the working directory, extracts every regexp label matcher value found in it, and logs
// how many of them FastRegexMatcher reports as optimized, together with per-matcher statistics.
//
// You can extract real queries with a regexp matcher running the following command:
//
// logcli --addr=XXX --username=YYY --password=ZZZ query '{namespace=~"(cortex|mimir).*",name="query-frontend"} |= "query stats" |= "=~"' --limit=100000 > logs.txt
func TestAnalyzeRealQueries(t *testing.T) {
	t.Skip("Decomment this test only to manually analyze real queries")

	type labelValueInfo struct {
		numMatchingQueries       int     //nolint:unused
		numShardedQueries        int     //nolint:unused
		numSplitQueries          int     //nolint:unused
		optimized                bool    //nolint:unused
		averageParsingTimeMillis float64 //nolint:unused

		// Sorted list of timestamps when the queries have been received.
		queryStartTimes []time.Time
	}

	labelValueRE := regexp.MustCompile(`=~\\"([^"]+)\\"`)
	tsRE := regexp.MustCompile(`ts=([^ ]+)`)
	shardedQueriesRE := regexp.MustCompile(`sharded_queries=(\d+)`)
	splitQueriesRE := regexp.MustCompile(`split_queries=(\d+)`)

	labelValues := make(map[string]*labelValueInfo)

	// Read the logs file line-by-line, and find all values for regex label matchers.
	readFile, err := os.Open("logs.txt")
	require.NoError(t, err)
	// Close the file on every exit path, including early failures below.
	defer func() {
		if closeErr := readFile.Close(); closeErr != nil {
			t.Errorf("closing logs.txt: %v", closeErr)
		}
	}()

	fileScanner := bufio.NewScanner(readFile)
	fileScanner.Split(bufio.ScanLines)

	numQueries := 0

	for fileScanner.Scan() {
		line := fileScanner.Text()
		matches := labelValueRE.FindAllStringSubmatch(line, -1)
		if len(matches) == 0 {
			continue
		}

		// Look up query stats. Parsing is best-effort: a value that fails to
		// parse is left at its zero value instead of failing the analysis.
		tsRaw := tsRE.FindStringSubmatch(line)
		shardedQueriesRaw := shardedQueriesRE.FindStringSubmatch(line)
		splitQueriesRaw := splitQueriesRE.FindStringSubmatch(line)
		shardedQueries := 0
		splitQueries := 0
		var ts time.Time

		if len(tsRaw) > 0 {
			ts, _ = time.Parse(time.RFC3339Nano, tsRaw[1])
		}
		if len(shardedQueriesRaw) > 0 {
			shardedQueries, _ = strconv.Atoi(shardedQueriesRaw[1])
		}
		if len(splitQueriesRaw) > 0 {
			splitQueries, _ = strconv.Atoi(splitQueriesRaw[1])
		}

		numQueries++

		for _, match := range matches {
			info := labelValues[match[1]]
			if info == nil {
				info = &labelValueInfo{}
				labelValues[match[1]] = info
			}

			info.numMatchingQueries++
			info.numShardedQueries += shardedQueries
			info.numSplitQueries += splitQueries

			if !ts.IsZero() {
				info.queryStartTimes = append(info.queryStartTimes, ts)
			}
		}
	}

	// Scan() also returns false on a read error or an over-long line; without
	// this check the analysis would silently run on a truncated input.
	require.NoError(t, fileScanner.Err())

	// Sort query start times.
	for _, info := range labelValues {
		sort.Slice(info.queryStartTimes, func(i, j int) bool {
			return info.queryStartTimes[i].Before(info.queryStartTimes[j])
		})
	}

	t.Logf("Found %d unique regexp matchers out of %d queries", len(labelValues), numQueries)

	// Analyze each regexp matcher found.
	numChecked := 0
	numOptimized := 0

	for re, info := range labelValues {
		m, err := NewFastRegexMatcher(re)
		if err != nil {
			// Ignore it, because we may have failed to extract the label matcher.
			continue
		}

		numChecked++

		// Check if each regexp matcher is supported by our optimization.
		if m.isOptimized() {
			numOptimized++
			info.optimized = true
		}

		// Estimate the parsing complexity. The uncached constructor is used on
		// purpose: the cached one would only measure cache lookups after the
		// first call above.
		startTime := time.Now()
		const numParsingRuns = 1000

		for i := 0; i < numParsingRuns; i++ {
			// The error is ignored: the same pattern was already built successfully above.
			_, _ = newFastRegexMatcherWithoutCache(re)
		}

		info.averageParsingTimeMillis = float64(time.Since(startTime).Milliseconds()) / float64(numParsingRuns)
	}

	// Avoid a 0/0 division (NaN) when no matcher could be checked.
	optimizedPercent := 0.0
	if numChecked > 0 {
		optimizedPercent = (float64(numOptimized) / float64(numChecked)) * 100
	}

	t.Logf("Found %d out of %d (%.2f%%) regexp matchers optimized by FastRegexMatcher", numOptimized, numChecked, optimizedPercent)

	// Print some statistics.
	for labelValue, info := range labelValues {
		// Find the min/avg/max difference between query start times.
		var (
			minQueryStartTimeDiff time.Duration
			maxQueryStartTimeDiff time.Duration
			avgQueryStartTimeDiff time.Duration
			sumQueryStartTime     time.Duration
			countQueryStartTime   int
		)

		for i := 1; i < len(info.queryStartTimes); i++ {
			diff := info.queryStartTimes[i].Sub(info.queryStartTimes[i-1])

			sumQueryStartTime += diff
			countQueryStartTime++

			// Initialize the minimum from the first diff rather than treating 0
			// as "unset": two queries with the same timestamp have a real diff of 0.
			if i == 1 || diff < minQueryStartTimeDiff {
				minQueryStartTimeDiff = diff
			}
			if diff > maxQueryStartTimeDiff {
				maxQueryStartTimeDiff = diff
			}
		}

		if countQueryStartTime > 0 {
			avgQueryStartTimeDiff = sumQueryStartTime / time.Duration(countQueryStartTime)
		}

		t.Logf("num queries: %d\t num split queries: %d\t num sharded queries: %d\t optimized: %t\t parsing time: %.0fms\t min/avg/max query start time diff (sec): %.2f/%.2f/%.2f regexp: %s",
			info.numMatchingQueries, info.numSplitQueries, info.numShardedQueries, info.optimized, info.averageParsingTimeMillis,
			minQueryStartTimeDiff.Seconds(), avgQueryStartTimeDiff.Seconds(), maxQueryStartTimeDiff.Seconds(), labelValue)
	}
}

// TestOptimizeEqualStringMatchers checks optimizeEqualStringMatchers (called
// with a threshold of 0). When a case lists expected values, the result must be
// an *equalMultiStringMatcher holding exactly those values and the expected
// case sensitivity; otherwise the result must have the same type as the input.
func TestOptimizeEqualStringMatchers(t *testing.T) {
	type testCase struct {
		input                 StringMatcher
		expectedValues        map[string]struct{}
		expectedCaseSensitive bool
	}

	tests := map[string]testCase{
		"should skip optimization on orStringMatcher with containsStringMatcher": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&containsStringMatcher{substrings: []string{"a", "b", "c"}},
			},
			expectedValues: nil,
		},
		"should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&equalStringMatcher{s: "bar", caseSensitive: true},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: map[string]struct{}{
				"FOO": {},
				"bar": {},
				"baz": {},
			},
			expectedCaseSensitive: true,
		},
		"should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				&equalStringMatcher{s: "bar", caseSensitive: false},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: nil,
		},
		"should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				orStringMatcher{
					&equalStringMatcher{s: "bar", caseSensitive: true},
					&equalStringMatcher{s: "xxx", caseSensitive: true},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: map[string]struct{}{
				"FOO": {},
				"bar": {},
				"xxx": {},
				"baz": {},
			},
			expectedCaseSensitive: true,
		},
		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				orStringMatcher{
					// The items at the same nesting level disagree on case sensitivity.
					&equalStringMatcher{s: "bar", caseSensitive: true},
					&equalStringMatcher{s: "xxx", caseSensitive: false},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: nil,
		},
		"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: true},
				// The parent and the nested matcher disagree on case sensitivity.
				orStringMatcher{
					&equalStringMatcher{s: "bar", caseSensitive: false},
					&equalStringMatcher{s: "xxx", caseSensitive: false},
				},
				&equalStringMatcher{s: "baz", caseSensitive: true},
			},
			expectedValues: nil,
		},
		"should return lowercase values on case insensitive matchers": {
			input: orStringMatcher{
				&equalStringMatcher{s: "FOO", caseSensitive: false},
				orStringMatcher{
					&equalStringMatcher{s: "bAr", caseSensitive: false},
				},
				&equalStringMatcher{s: "baZ", caseSensitive: false},
			},
			expectedValues: map[string]struct{}{
				"foo": {},
				"bar": {},
				"baz": {},
			},
			expectedCaseSensitive: false,
		},
	}

	for name, tc := range tests {
		t.Run(name, func(t *testing.T) {
			got := optimizeEqualStringMatchers(tc.input, 0)

			// Without expected values, only the type of the result is checked:
			// it must be the same as the type of the input.
			if tc.expectedValues == nil {
				require.IsType(t, tc.input, got)
				return
			}

			require.IsType(t, &equalMultiStringMatcher{}, got)
			require.Equal(t, tc.expectedValues, got.(*equalMultiStringMatcher).values)
			require.Equal(t, tc.expectedCaseSensitive, got.(*equalMultiStringMatcher).caseSensitive)
		})
	}
}

// This benchmark is used to find a good threshold to use to apply the optimization
// done by optimizeEqualStringMatchers()
func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
	randGenerator := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Generate variable lengths random texts to match against.
	texts := append([]string{}, randStrings(randGenerator, 10, 10)...)
	texts = append(texts, randStrings(randGenerator, 5, 30)...)
	texts = append(texts, randStrings(randGenerator, 1, 100)...)

	for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
		for _, caseSensitive := range []bool{true, false} {
			b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) {
				// Generate a regex with the expected number of alternations.
				re := strings.Join(randStrings(randGenerator, numAlternations, 10), "|")
				if !caseSensitive {
					re = "(?i:(" + re + "))"
				}

				parsed, err := syntax.Parse(re, syntax.Perl)
				require.NoError(b, err)

				unoptimized := stringMatcherFromRegexpInternal(parsed)
				require.IsType(b, orStringMatcher{}, unoptimized)

				optimized := optimizeEqualStringMatchers(unoptimized, 0)
				require.IsType(b, &equalMultiStringMatcher{}, optimized)

				b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) {
					for n := 0; n < b.N; n++ {
						for _, t := range texts {
							unoptimized.Matches(t)
						}
					}
				})

				b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) {
					for n := 0; n < b.N; n++ {
						for _, t := range texts {
							optimized.Matches(t)
						}
					}
				})
			})
		}
	}
}

// getTestNameFromRegexp returns a name for the given regexp that is at most 32
// bytes long. Shorter regexps are returned unchanged; longer ones are cut at
// the last rune boundary that does not exceed 32 bytes, so the name never ends
// with a partial UTF-8 sequence. For ASCII input this is exactly the first 32
// bytes.
func getTestNameFromRegexp(re string) string {
	const maxLen = 32

	if len(re) <= maxLen {
		return re
	}

	// Ranging over a string yields the byte offset at which each rune starts,
	// so the last offset that is <= maxLen is a safe place to cut.
	cut := 0
	for offset := range re {
		if offset > maxLen {
			break
		}
		cut = offset
	}

	return re[:cut]
}