[BUGFIX] FastRegexpMatcher: do Unicode normalization as part of case-insensitive comparison (#14170)

* Converted string to standardized form * Added golang.org/x/text in Go dependencies * Added test cases for FastRegexMatcher * Added benchmark for toNormalizedLower Signed-off-by: RA <ranveeravhad777@gmail.com>
2025-03-05 20:59:13 -08:00 · 2024-06-11 04:01:41 +05:30 · 2024-06-11 04:01:41 +05:30 · 39902ba694
parent 64c5cc5134
commit 39902ba694
3 changed files with 105 additions and 4 deletions
--- a/go.mod
+++ b/go.mod
@ -77,6 +77,7 @@ require (
 	golang.org/x/oauth2 v0.21.0
 	golang.org/x/sync v0.7.0
 	golang.org/x/sys v0.21.0
+	golang.org/x/text v0.16.0
 	golang.org/x/time v0.5.0
 	golang.org/x/tools v0.22.0
 	google.golang.org/api v0.183.0
@ -188,7 +189,6 @@ require (
 	golang.org/x/exp v0.0.0-20240119083558-1b970713d09a // indirect
 	golang.org/x/mod v0.18.0 // indirect
 	golang.org/x/term v0.21.0 // indirect
-	golang.org/x/text v0.16.0 // indirect
 	google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/ini.v1 v1.67.0 // indirect
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@ -16,10 +16,12 @@ package labels
 import (
 	"slices"
 	"strings"
+	"unicode"
 	"unicode/utf8"

 	"github.com/grafana/regexp"
 	"github.com/grafana/regexp/syntax"
+	"golang.org/x/text/unicode/norm"
 )

 const (
@ -766,7 +768,7 @@ type equalMultiStringMapMatcher struct {

 func (m *equalMultiStringMapMatcher) add(s string) {
 	if !m.caseSensitive {
-		s = strings.ToLower(s)
+		s = toNormalisedLower(s)
 	}

 	m.values[s] = struct{}{}
@ -786,13 +788,51 @@ func (m *equalMultiStringMapMatcher) setMatches() []string {

 func (m *equalMultiStringMapMatcher) Matches(s string) bool {
 	if !m.caseSensitive {
-		s = strings.ToLower(s)
+		s = toNormalisedLower(s)
 	}

 	_, ok := m.values[s]
 	return ok
 }

+// toNormalisedLower normalises the input string using "Unicode Normalization Form KD" (NFKD) and
+// then converts it to lower case. ASCII-only input is lower-cased without any normalisation.
+func toNormalisedLower(s string) string {
+	// Scan byte by byte: lower-case ASCII 'A'-'Z' into b and stop at the first non-ASCII byte.
+	isASCII := true
+	var (
+		b   strings.Builder
+		pos int
+	)
+	b.Grow(len(s))
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		if isASCII && c >= utf8.RuneSelf {
+			isASCII = false
+			break
+		}
+		if 'A' <= c && c <= 'Z' {
+			c += 'a' - 'A'
+			if pos < i {
+				b.WriteString(s[pos:i])
+			}
+			b.WriteByte(c)
+			pos = i + 1
+		}
+	}
+	if pos < len(s) {
+		b.WriteString(s[pos:])
+	}
+
+	// Fast path: an ASCII-only string needs no normalisation, so the lower-cased copy is the result.
+	if isASCII {
+		return b.String()
+	}
+
+	// Non-ASCII input: b holds s with only its ASCII prefix lower-cased; apply NFKD, then lower-case every rune.
+	return strings.Map(unicode.ToLower, norm.NFKD.String(b.String()))
+}
+
 // anyStringWithoutNewlineMatcher is a stringMatcher which matches any string
 // (including an empty one) as far as it doesn't contain any newline character.
 type anyStringWithoutNewlineMatcher struct{}
--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go