Optimize long alternate lists (#463)

* Use a prefix trie for long alternate lists * Add test for non terminal node * Fix panic in FuzzFastRegexMatcher_WithFuzzyRegularExpressions when the fuzzy regex is invalid Signed-off-by: Marco Pracucci <marco@pracucci.com> * Address PR feedback * Update model/labels/regexp_test.go Co-authored-by: Marco Pracucci <marco@pracucci.com> * Replace trie with slice or map depending on input size * Fix tests * Pull in tests from @pracucci's branch * Add setMatches back in * Use stringMatcher when it's faster * Fix linter * Estimate alternates ahead of time * Simplify construction with `IndexByte` * Add test and early return for empty regexp. * Fix race conditions in tests --------- Signed-off-by: Marco Pracucci <marco@pracucci.com> Co-authored-by: Marco Pracucci <marco@pracucci.com>
2025-03-05 20:59:13 -08:00 · 2023-04-01 01:35:35 -05:00 · 2023-04-01 01:35:35 -05:00 · ae170f644c
parent 9fe1bd2d63
commit ae170f644c
3 changed files with 290 additions and 66 deletions
--- a/model/labels/matcher.go
+++ b/model/labels/matcher.go
@ -126,7 +126,7 @@ func (m *Matcher) SetMatches() []string {
 	if m.re == nil {
 		return nil
 	}
-	return m.re.setMatches
+	return m.re.SetMatches()
 }

 // Prefix returns the required prefix of the value to match, if possible.
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@ -25,9 +25,10 @@ const (
 	maxSetMatches = 256

 	// The minimum number of alternate values a regex should have to trigger
-	// the optimization done by optimizeEqualStringMatchers(). This value has
+	// the optimization done by optimizeEqualStringMatchers() and so use a map
+	// to match values instead of iterating over a list. This value has
 	// been computed running BenchmarkOptimizeEqualStringMatchers.
-	optimizeEqualStringMatchersThreshold = 16
+	minEqualMultiStringMatcherMapThreshold = 16
 )

 var fastRegexMatcherCache *lru.Cache[string, *FastRegexMatcher]
@ -39,7 +40,10 @@ func init() {
 }

 type FastRegexMatcher struct {
-	re *regexp.Regexp
+	// Under some conditions, re is nil because the expression is never parsed.
+	// We store the original string to be able to return it in GetRegexString().
+	reString string
+	re       *regexp.Regexp

 	setMatches    []string
 	stringMatcher StringMatcher
@ -70,28 +74,38 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
 }

 func newFastRegexMatcherWithoutCache(v string) (*FastRegexMatcher, error) {
-	parsed, err := syntax.Parse(v, syntax.Perl)
-	if err != nil {
-		return nil, err
-	}
-	// Simplify the syntax tree to run faster.
-	parsed = parsed.Simplify()
-	re, err := regexp.Compile("^(?:" + parsed.String() + ")$")
-	if err != nil {
-		return nil, err
-	}
 	m := &FastRegexMatcher{
-		re: re,
+		reString: v,
+	}
+
+	m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v)
+	if m.stringMatcher != nil {
+		// If we already have a string matcher, we don't need to parse the regex
+		// or compile the matchString function. This also avoids the behavior in
+		// compileMatchStringFunction where it prefers to use setMatches when
+		// available, even if the string matcher is faster.
+		m.matchString = m.stringMatcher.Matches
+	} else {
+		parsed, err := syntax.Parse(v, syntax.Perl)
+		if err != nil {
+			return nil, err
+		}
+		// Simplify the syntax tree to run faster.
+		parsed = parsed.Simplify()
+		m.re, err = regexp.Compile("^(?:" + parsed.String() + ")$")
+		if err != nil {
+			return nil, err
+		}
+		if parsed.Op == syntax.OpConcat {
+			m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
+		}
+		if matches, caseSensitive := findSetMatches(parsed); caseSensitive {
+			m.setMatches = matches
+		}
+		m.stringMatcher = stringMatcherFromRegexp(parsed)
+		m.matchString = m.compileMatchStringFunction()
 	}
-	if parsed.Op == syntax.OpConcat {
-		m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
-	}
-	if matches, caseSensitive := findSetMatches(parsed); caseSensitive {
-		m.setMatches = matches
-	}
-	m.stringMatcher = stringMatcherFromRegexp(parsed)

-	m.matchString = m.compileMatchStringFunction()
 	return m, nil
 }

@ -321,7 +335,52 @@ func (m *FastRegexMatcher) SetMatches() []string {
 }

 func (m *FastRegexMatcher) GetRegexString() string {
-	return m.re.String()
+	return m.reString
+}
+
+// optimizeAlternatingLiterals optimizes a regex of the form
+//
+//	`literal1|literal2|literal3|...`
+//
+// this function returns an optimized StringMatcher or nil if the regex
+// cannot be optimized in this way, and a list of setMatches up to maxSetMatches
+func optimizeAlternatingLiterals(s string) (StringMatcher, []string) {
+	if len(s) == 0 {
+		return emptyStringMatcher{}, nil
+	}
+
+	estimatedAlternates := strings.Count(s, "|") + 1
+
+	// If there are no alternates, check if the string is a literal
+	if estimatedAlternates == 1 {
+		if regexp.QuoteMeta(s) == s {
+			return &equalStringMatcher{s: s, caseSensitive: true}, nil
+		}
+		return nil, nil
+	}
+
+	multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates)
+
+	for end := strings.IndexByte(s, '|'); end > -1; end = strings.IndexByte(s, '|') {
+		// Split the string into the next literal and the remainder
+		subMatch := s[:end]
+		s = s[end+1:]
+
+		// break if any of the submatches are not literals
+		if regexp.QuoteMeta(subMatch) != subMatch {
+			return nil, nil
+		}
+
+		multiMatcher.add(subMatch)
+	}
+
+	// break if the remainder is not a literal
+	if regexp.QuoteMeta(s) != s {
+		return nil, nil
+	}
+	multiMatcher.add(s)
+
+	return multiMatcher, multiMatcher.setMatches()
 }

 // optimizeConcatRegex returns literal prefix/suffix text that can be safely
@ -377,7 +436,7 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
 	clearBeginEndText(re)

 	m := stringMatcherFromRegexpInternal(re)
-	m = optimizeEqualStringMatchers(m, optimizeEqualStringMatchersThreshold)
+	m = optimizeEqualStringMatchers(m, minEqualMultiStringMatcherMapThreshold)

 	return m
 }
@ -567,16 +626,86 @@ func (m *equalStringMatcher) Matches(s string) bool {
 	return strings.EqualFold(m.s, s)
 }

-// equalMultiStringMatcher matches a string exactly against a set of valid values.
-type equalMultiStringMatcher struct {
-	// values to match a string against. If the matching is case insensitive,
+type multiStringMatcherBuilder interface {
+	StringMatcher
+	add(s string)
+	setMatches() []string
+}
+
+func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize int) multiStringMatcherBuilder {
+	// If the estimated size is low enough, it's faster to use a slice instead of a map.
+	if estimatedSize < minEqualMultiStringMatcherMapThreshold {
+		return &equalMultiStringSliceMatcher{caseSensitive: caseSensitive, values: make([]string, 0, estimatedSize)}
+	}
+
+	return &equalMultiStringMapMatcher{
+		values:        make(map[string]struct{}, estimatedSize),
+		caseSensitive: caseSensitive,
+	}
+}
+
+// equalMultiStringSliceMatcher matches a string exactly against a slice of valid values.
+type equalMultiStringSliceMatcher struct {
+	values []string
+
+	caseSensitive bool
+}
+
+func (m *equalMultiStringSliceMatcher) add(s string) {
+	m.values = append(m.values, s)
+}
+
+func (m *equalMultiStringSliceMatcher) setMatches() []string {
+	return m.values
+}
+
+func (m *equalMultiStringSliceMatcher) Matches(s string) bool {
+	if m.caseSensitive {
+		for _, v := range m.values {
+			if s == v {
+				return true
+			}
+		}
+	} else {
+		for _, v := range m.values {
+			if strings.EqualFold(s, v) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// equalMultiStringMapMatcher matches a string exactly against a map of valid values.
+type equalMultiStringMapMatcher struct {
+	// values contains values to match a string against. If the matching is case insensitive,
 	// the values here must be lowercase.
 	values map[string]struct{}

 	caseSensitive bool
 }

-func (m *equalMultiStringMatcher) Matches(s string) bool {
+func (m *equalMultiStringMapMatcher) add(s string) {
+	if !m.caseSensitive {
+		s = strings.ToLower(s)
+	}
+
+	m.values[s] = struct{}{}
+}
+
+func (m *equalMultiStringMapMatcher) setMatches() []string {
+	if len(m.values) >= maxSetMatches {
+		return nil
+	}
+
+	matches := make([]string, 0, len(m.values))
+	for s := range m.values {
+		matches = append(matches, s)
+	}
+	return matches
+}
+
+func (m *equalMultiStringMapMatcher) Matches(s string) bool {
 	if !m.caseSensitive {
 		s = strings.ToLower(s)
 	}
@ -657,24 +786,16 @@ func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatch
 	// Parse again the input StringMatcher to extract all values and storing them.
 	// We can skip the case sensitivity check because we've already checked it and
 	// if the code reach this point then it means all matchers have the same case sensitivity.
-	values := make(map[string]struct{}, numValues)
+	multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues)

 	// Ignore the return value because we already iterated over the input StringMatcher
 	// and it was all good.
 	findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
-		if caseSensitive {
-			values[matcher.s] = struct{}{}
-		} else {
-			values[strings.ToLower(matcher.s)] = struct{}{}
-		}
-
+		multiMatcher.add(matcher.s)
 		return true
 	})

-	return &equalMultiStringMatcher{
-		values:        values,
-		caseSensitive: caseSensitive,
-	}
+	return multiMatcher
 }

 // findEqualStringMatchers analyze the input StringMatcher and calls the callback for each
--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go