Merge pull request #15210 from bboreham/faster-lowercase

FastRegexMatcher: use stack memory for lowercase copy of string
This commit is contained in:
Bryan Boreham 2024-11-12 11:46:08 +00:00 committed by GitHub
commit 49999b8a90
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 42 additions and 17 deletions

View file

@ -802,7 +802,7 @@ type equalMultiStringMapMatcher struct {
func (m *equalMultiStringMapMatcher) add(s string) { func (m *equalMultiStringMapMatcher) add(s string) {
if !m.caseSensitive { if !m.caseSensitive {
s = toNormalisedLower(s) s = toNormalisedLower(s, nil) // Don't pass a stack buffer here - it will always escape to heap.
} }
m.values[s] = struct{}{} m.values[s] = struct{}{}
@ -840,15 +840,24 @@ func (m *equalMultiStringMapMatcher) setMatches() []string {
} }
func (m *equalMultiStringMapMatcher) Matches(s string) bool { func (m *equalMultiStringMapMatcher) Matches(s string) bool {
if len(m.values) > 0 {
sNorm := s
var a [32]byte
if !m.caseSensitive { if !m.caseSensitive {
s = toNormalisedLower(s) sNorm = toNormalisedLower(s, a[:])
} }
if _, ok := m.values[sNorm]; ok {
if _, ok := m.values[s]; ok {
return true return true
} }
}
if m.minPrefixLen > 0 && len(s) >= m.minPrefixLen { if m.minPrefixLen > 0 && len(s) >= m.minPrefixLen {
for _, matcher := range m.prefixes[s[:m.minPrefixLen]] { prefix := s[:m.minPrefixLen]
var a [32]byte
if !m.caseSensitive {
prefix = toNormalisedLower(s[:m.minPrefixLen], a[:])
}
for _, matcher := range m.prefixes[prefix] {
if matcher.Matches(s) { if matcher.Matches(s) {
return true return true
} }
@ -859,23 +868,38 @@ func (m *equalMultiStringMapMatcher) Matches(s string) bool {
// toNormalisedLower normalises the input string using "Unicode Normalization Form KD" (NFKD) and then converts // toNormalisedLower normalises the input string using "Unicode Normalization Form KD" (NFKD) and then converts
// it to lower case. // it to lower case.
func toNormalisedLower(s string) string { func toNormalisedLower(s string, a []byte) string {
var buf []byte
for i := 0; i < len(s); i++ { for i := 0; i < len(s); i++ {
c := s[i] c := s[i]
if c >= utf8.RuneSelf { if c >= utf8.RuneSelf {
return strings.Map(unicode.ToLower, norm.NFKD.String(s)) return strings.Map(unicode.ToLower, norm.NFKD.String(s))
} }
if 'A' <= c && c <= 'Z' { if 'A' <= c && c <= 'Z' {
if buf == nil { return toNormalisedLowerSlow(s, i, a)
}
}
return s
}
// toNormalisedLowerSlow is split from toNormalisedLower because having a call
// to `copy` slows it down even when it is not called.
func toNormalisedLowerSlow(s string, i int, a []byte) string {
var buf []byte
if cap(a) > len(s) {
buf = a[:len(s)]
copy(buf, s)
} else {
buf = []byte(s) buf = []byte(s)
} }
for ; i < len(s); i++ {
c := s[i]
if c >= utf8.RuneSelf {
return strings.Map(unicode.ToLower, norm.NFKD.String(s))
}
if 'A' <= c && c <= 'Z' {
buf[i] = c + 'a' - 'A' buf[i] = c + 'a' - 'A'
} }
} }
if buf == nil {
return s
}
return yoloString(buf) return yoloString(buf)
} }

View file

@ -333,7 +333,8 @@ func BenchmarkToNormalizedLower(b *testing.B) {
} }
b.ResetTimer() b.ResetTimer()
for n := 0; n < b.N; n++ { for n := 0; n < b.N; n++ {
toNormalisedLower(inputs[n%len(inputs)]) var a [256]byte
toNormalisedLower(inputs[n%len(inputs)], a[:])
} }
}) })
} }
@ -1390,6 +1391,6 @@ func TestToNormalisedLower(t *testing.T) {
"ſſAſſa": "ssassa", "ſſAſſa": "ssassa",
} }
for input, expectedOutput := range testCases { for input, expectedOutput := range testCases {
require.Equal(t, expectedOutput, toNormalisedLower(input)) require.Equal(t, expectedOutput, toNormalisedLower(input, nil))
} }
} }