2020-06-26 02:49:09 -07:00
// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package labels
import (
2023-03-01 06:50:04 -08:00
"bufio"
2023-03-02 08:20:52 -08:00
"fmt"
2021-10-07 05:25:31 -07:00
"math/rand"
2023-03-01 06:50:04 -08:00
"os"
2023-03-31 01:27:43 -07:00
"sort"
2023-03-30 19:05:26 -07:00
"strconv"
2021-10-07 05:25:31 -07:00
"strings"
2020-06-26 02:49:09 -07:00
"testing"
2021-10-07 05:25:31 -07:00
"time"
2020-06-26 02:49:09 -07:00
2023-04-17 06:20:58 -07:00
"github.com/DmitriyVTitov/size"
2023-03-01 06:50:04 -08:00
"github.com/grafana/regexp"
2022-02-08 02:03:20 -08:00
"github.com/grafana/regexp/syntax"
2020-10-29 02:43:23 -07:00
"github.com/stretchr/testify/require"
2020-06-26 02:49:09 -07:00
)
2021-10-06 06:24:57 -07:00
var (
2023-03-09 00:38:41 -08:00
asciiRunes = [ ] rune ( "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_" )
regexes = [ ] string {
2023-03-31 23:35:35 -07:00
"" ,
2023-03-02 08:20:52 -08:00
"foo" ,
"^foo" ,
2021-10-06 06:24:57 -07:00
"(foo|bar)" ,
"foo.*" ,
".*foo" ,
"^.*foo$" ,
"^.+foo$" ,
".*" ,
".+" ,
"foo.+" ,
".+foo" ,
"foo\n.+" ,
"foo\n.*" ,
".*foo.*" ,
".+foo.+" ,
2021-10-07 05:25:31 -07:00
"(?s:.*)" ,
"(?s:.+)" ,
"(?s:^.*foo$)" ,
2023-03-02 08:20:52 -08:00
"(?i:foo)" ,
"(?i:(foo|bar))" ,
"(?i:(foo1|foo2|bar))" ,
2021-10-07 06:10:26 -07:00
"^(?i:foo|oo)|(bar)$" ,
2023-03-02 08:20:52 -08:00
"(?i:(foo1|foo2|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz))" ,
2021-10-07 05:25:31 -07:00
"((.*)(bar|b|buzz)(.+)|foo)$" ,
"^$" ,
"(prometheus|api_prom)_api_v1_.+" ,
"10\\.0\\.(1|2)\\.+" ,
2021-10-07 06:10:26 -07:00
"10\\.0\\.(1|2).+" ,
"((fo(bar))|.+foo)" ,
2023-03-02 08:20:52 -08:00
// A long case sensitive alternation.
"zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb" ,
2023-03-31 23:35:35 -07:00
// An extremely long case sensitive alternation. This is a special
// case because the values share common prefixes rather than being
// entirely random. This is common in the real world. For example, the
// values of a label like kubernetes pod will often include the
// deployment name as a prefix.
" jyyfj00j0061 | jyyfj00j0062 | jyyfj94j0093 | jyyfj99j0093 | jyyfm01j0021 | jyyfm02j0021 | jyefj00j0192 | jyefj00j0193 | jyefj00j0194 | jyefj00j0195 | jyefj00j0196 | jyefj00j0197 | jyefj00j0290 | jyefj00j0291 | jyefj00j0292 | jyefj00j0293 | jyefj00j0294 | jyefj00j0295 | jyefj00j0296 | jyefj00j0297 | jyefj89j0394 | jyefj90j0394 | jyefj91j0394 | jyefj95j0347 | jyefj96j0322 | jyefj96j0347 | jyefj97j0322 | jyefj97j0347 | jyefj98j0322 | jyefj98j0347 | jyefj99j0320 | jyefj99j0322 | jyefj99j0323 | jyefj99j0335 | jyefj99j0336 | jyefj99j0344 | jyefj99j0347 | jyefj99j0349 | jyefj99j0351 | jyeff00j0117 | lyyfm01j0025 | lyyfm01j0028 | lyyfm01j0041 | lyyfm01j0133 | lyyfm01j0701 | lyyfm02j0025 | lyyfm02j0028 | lyyfm02j0041 | lyyfm02j0133 | lyyfm02j0701 | lyyfm03j0701 | lyefj00j0775 | lyefj00j0776 | lyefj00j0777 | lyefj00j0778 | lyefj00j0779 | lyefj00j0780 | lyefj00j0781 | lyefj00j0782 | lyefj50j3807 | lyefj50j3852 | lyefj51j3807 | lyefj51j3852 | lyefj52j3807 | lyefj52j3852 | lyefj53j3807 | lyefj53j3852 | lyefj54j3807 | lyefj54j3852 | lyefj54j3886 | lyefj55j3807 | lyefj55j3852 | lyefj55j3886 | lyefj56j3807 | lyefj56j3852 | lyefj56j3886 | lyefj57j3807 | lyefj57j3852 | lyefj57j3886 | lyefj58j3807 | lyefj58j3852 | lyefj58j3886 | lyefj59j3807 | lyefj59j3852 | lyefj59j3886 | lyefj60j3807 | lyefj60j3852 | lyefj60j3886 | lyefj61j3807 | lyefj61j3852 | lyefj61j3886 | lyefj62j3807 | lyefj62j3852 | lyefj62j3886 | lyefj63j3807 | lyefj63j3852 | lyefj63j3886 | lyefj64j3807 | lyefj64j3852 | lyefj64j3886 | lyefj65j3807 | lyefj65j3852 | lyefj65j3886 | lyefj66j3807 | lyefj66j3852 | lyefj66j3886 | lyefj67j3807 | lyefj67j3852 | lyefj67j3886 | lyefj68j3807 | lyefj68j3852 | lyefj68j3886 | lyefj69j3807 | lyefj69j3846 | lyefj69j3852 | lyefj69j3886 | lyefj70j3807 | lyefj70j3846 | lyefj70j3852 | lyefj70j3886 | lyefj71j3807 | lyefj71j3846 | lyefj71j3852 | lyefj71j3886 | lyefj72j3807 | lyefj72j3846 | lyefj72j3852 | lyefj72j3886 | lyefj73j3807 | lyefj73j3846 | lyefj73j3852 | lyefj73j3886 | lyefj74j3807 | 
lyefj74j3846 | lyefj74j3852 | lyefj74j3886 | lyefj75j3807 | lyefj75j3808 | lyefj75j3846 | lyefj75j3852 | lyefj75j3886 | lyefj76j3732 | lyefj76j3807 | lyefj76j3808 | lyefj76j3846 | lyefj76j3852 | lyefj76j3886 | lyefj77j3732 | lyefj77j3807 | lyefj77j3808 | lyefj77j3846 | lyefj77j3852 | lyefj77j3886 | lyefj78j3278 | lyefj78j3732 | lyefj78j3807 | lyefj78j3808 | lyefj78j3846 | lyefj78j3852 | lyefj78j3886 | lyefj79j3732 | lyefj79j3807 | lyefj79j3808 | lyefj79j3846 | lyefj79j3852 | lyefj79j3886 | lyefj80j3732 | lyefj80j3807 | lyefj80j3808 | lyefj80j3846 | lyefj80j3852 | lyefj80j3886 | lyefj81j3732 | lyefj81j3807 | lyefj81j3808 | lyefj81j3846 | lyefj81j3852 | lyefj81j3886 | lyefj82j3732 | lyefj82j3807 | lyefj82j3808 | lyefj82j3846 | lyefj82j3852 | lyefj82j3886 | lyefj83j3732 | lyefj83j3807 | lyefj83j3808 | lyefj83j3846 | lyefj83j3852 | lyefj83j3886 | lyefj84j3732 | lyefj84j3807 | lyefj84j3808 | lyefj84j3846 | lyefj84j3852 | lyefj84j3886 | lyefj85j3732 | lyefj85j3807 | lyefj85j3808 | lyefj85j3846 | lyefj85j3852 | lyefj85j3886 | lyefj86j3278 | lyefj86j3732 | lyefj86j3807 | lyefj86j3808 | lyefj86j3846 | lyefj86j3852 | lyefj86j3886 | lyefj87j3278 | lyefj87j3732 | lyefj87j3807 | lyefj87j3808 | lyefj87j3846 | lyefj87j3852 | lyefj87j3886 | lyefj88j3732 | lyefj88j3807 | lyefj88j3808 | lyefj88j3846 | lyefj88j3852 | lyefj88j3886 | lyefj89j3732 | lyefj89j3807 | lyefj89j3808 | lyefj89j3846 | lyefj89j3852 | lyefj89j3886 | lyefj90j3732 | lyefj90j3807 | lyefj90j3808 | lyefj90j3846 | lyefj90j3852 | lyefj90j3886 | lyefj91j3732 | lyefj91j3807 | lyefj91j3808 | lyefj91j3846 | lyefj91j3852 | lyefj91j3886 | lyefj92j3732 | lyefj92j3807 | lyefj92j3808 | lyefj92j3846 | lyefj92j3852 | lyefj92j3886 | lyefj93j3732 | lyefj93j3807 | lyefj93j3808 | lyefj93j3846 | lyefj93j3852 | lyefj93j3885 | lyefj93j3886 | lyefj94j3525 | lyefj94j3732 | lyefj94j3807 | lyefj94j3808 | lyefj94j3846 | lyefj94j3852 | lyefj94j3885 | lyefj94j3886 | lyefj95j3525 | lyefj95j3732 | lyefj95j3807 | lyefj95j3808 | lyefj95j3846 | 
lyefj95j3852 | lyefj95j3886 | lyefj96j3732 | lyefj96j3803 | lyefj96j3807 | lyefj96j3808 | lyefj96j3846 | lyefj96j3852 | lyefj96j3886 | lyefj97j3333 | lyefj97j3732 | lyefj97j3792 | lyefj97j3803 | lyefj97j3807 | lyefj97j3808 | lyefj97j3838 | lyefj97j3843 | lyefj97j3846 | lyefj97j3852 | lyefj97j3886 | lyefj98j3083 | lyefj98j3333 | lyefj98j3732 | lyefj98j3807 | lyefj98j3808 | lyefj98j3838 | lyefj98j3843 | lyefj98j3846 | lyefj98j3852 | lyefj98j3873 | lyefj98j3877 | lyefj98j3882 | lyefj98j3886 | lyefj99j2984 | lyefj99j3083 | lyefj99j3333 | lyefj99j3732 | lyefj99j3807 | lyefj99j3808 | lyefj99j3846 | lyefj99j3849 | lyefj99j3852 | lyefj99j3873 | lyefj99j3877 | lyefj99j3882 | lyefj99j3884 | lyefj99j3886 | lyeff00j0106 | lyeff00j01
2023-03-02 08:20:52 -08:00
// A long case insensitive alternation.
"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))" ,
2023-09-28 08:36:37 -07:00
// A long case insensitive alternation where each entry ends with ".*".
"(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))" ,
// A long case insensitive alternation where each entry starts with ".*".
"(?i:(.*zQPbMkNO|.*NNSPdvMi|.*iWuuSoAl|.*qbvKMimS|.*IecrXtPa|.*seTckYqt|.*NxnyHkgB|.*fIDlOgKb|.*UhlWIygH|.*OtNoJxHG|.*cUTkFVIV|.*mTgFIHjr|.*jQkoIDtE|.*PPMKxRXl|.*AwMfwVkQ|.*CQyMrTQJ|.*BzrqxVSi|.*nTpcWuhF|.*PertdywG|.*ZZDgCtXN|.*WWdDPyyE|.*uVtNQsKk|.*BdeCHvPZ|.*wshRnFlH|.*aOUIitIp|.*RxZeCdXT|.*CFZMslCj|.*AVBZRDxl|.*IzIGCnhw|.*ythYuWiz|.*oztXVXhl|.*VbLkwqQx|.*qvaUgyVC|.*VawUjPWC|.*ecloYJuj|.*boCLTdSU|.*uPrKeAZx|.*hrMWLWBq|.*JOnUNHRM|.*rYnujkPq|.*dDEdZhIj|.*DRrfvugG|.*yEGfDxVV|.*YMYdJWuP|.*PHUQZNWM|.*AmKNrLis|.*zTxndVfn|.*FPsHoJnc|.*EIulZTua|.*KlAPhdzg|.*ScHJJCLt|.*NtTfMzME|.*eMCwuFdo|.*SEpJVJbR|.*cdhXZeCx|.*sAVtBwRh|.*kVFEVcMI|.*jzJrxraA|.*tGLHTell|.*NNWoeSaw|.*DcOKSetX|.*UXZAJyka|.*THpMphDP|.*rizheevl|.*kDCBRidd|.*pCZZRqyu|.*pSygkitl|.*SwZGkAaW|.*wILOrfNX|.*QkwVOerj|.*kHOMxPDr|.*EwOVycJv|.*AJvtzQFS|.*yEOjKYYB|.*LizIINLL|.*JBRSsfcG|.*YPiUqqNl|.*IsdEbvee|.*MjEpGcBm|.*OxXZVgEQ|.*xClXGuxa|.*UzRCGFEb|.*buJbvfvA|.*IPZQxRet|.*oFYShsMc|.*oBHffuHO|.*bzzKrcBR|.*KAjzrGCl|.*IPUsAVls|.*OGMUMbIU|.*gyDccHuR|.*bjlalnDd|.*ZLWjeMna|.*fdsuIlxQ|.*dVXtiomV|.*XxedTjNg|.*XWMHlNoA|.*nnyqArQX|.*opfkWGhb|.*wYtnhdYb))" ,
2023-09-29 05:12:41 -07:00
// Optional single character: ".?" (parsed as a syntax.OpQuest node).
"fo.?" ,
"foo.?" ,
"f.?o" ,
".*foo.?" ,
".?foo.+" ,
"foo.?|bar" ,
2021-10-07 05:25:31 -07:00
}
values = [ ] string {
"foo" , " foo bar" , "bar" , "buzz\nbar" , "bar foo" , "bfoo" , "\n" , "\nfoo" , "foo\n" , "hello foo world" , "hello foo\n world" , "" ,
2021-10-07 06:10:26 -07:00
"FOO" , "Foo" , "OO" , "Oo" , "\nfoo\n" , strings . Repeat ( "f" , 20 ) , "prometheus" , "prometheus_api_v1" , "prometheus_api_v1_foo" ,
2021-10-07 05:25:31 -07:00
"10.0.1.20" , "10.0.2.10" , "10.0.3.30" , "10.0.4.40" ,
2023-03-01 03:18:30 -08:00
"foofoo0" , "foofoo" ,
2023-09-28 08:36:37 -07:00
// Values matching / not matching the test regexps on long alternations.
"zQPbMkNO" , "zQPbMkNo" , "jyyfj00j0061" , "jyyfj00j006" , "jyyfj00j00612" , "NNSPdvMi" , "NNSPdvMiXXX" , "NNSPdvMixxx" , "nnSPdvMi" , "nnSPdvMiXXX" ,
2021-10-06 06:24:57 -07:00
}
)
2023-09-28 08:36:37 -07:00
func TestFastRegexMatcher_MatchString ( t * testing . T ) {
// Run the test both against a set of predefined values and a set of random ones.
testValues := append ( [ ] string { } , values ... )
testValues = append ( testValues , generateRandomValues ( ) ... )
2021-10-06 06:24:57 -07:00
for _ , r := range regexes {
r := r
2023-09-28 08:36:37 -07:00
for _ , v := range testValues {
2021-10-06 06:24:57 -07:00
v := v
t . Run ( r + ` on " ` + v + ` " ` , func ( t * testing . T ) {
t . Parallel ( )
m , err := NewFastRegexMatcher ( r )
require . NoError ( t , err )
2023-03-31 23:35:35 -07:00
re := regexp . MustCompile ( "^(?:" + r + ")$" )
require . Equal ( t , re . MatchString ( v ) , m . MatchString ( v ) )
2021-10-06 06:24:57 -07:00
} )
}
2020-06-26 02:49:09 -07:00
}
2021-10-06 06:24:57 -07:00
}
2020-06-26 02:49:09 -07:00
2023-04-17 06:20:58 -07:00
// TestNewFastRegexMatcher_CacheSizeLimit fills the matcher cache with as many
// expensive regexps as are estimated to fit, verifies that all of them are
// retained, then adds one more and expects exactly one earlier entry to have
// been evicted.
func TestNewFastRegexMatcher_CacheSizeLimit(t *testing.T) {
	// The cache must be empty before we start.
	fastRegexMatcherCache.Clear()

	// A constant seed keeps the generated regexp identical between runs.
	rnd := rand.New(rand.NewSource(1))

	// Build a very expensive regexp: an alternation of 1000 long random strings.
	parts := make([]string, 1000)
	for idx := range parts {
		parts[idx] = randString(rnd, 100) + fmt.Sprintf(".%d", idx)
	}
	baseRegexp := strings.Join(parts, "|")

	// uniqueRegexp derives a distinct expensive regexp from the given id.
	uniqueRegexp := func(id int) string {
		return baseRegexp + fmt.Sprintf("%d", id)
	}

	// Estimate how large a single matcher for the expensive regexp is.
	matcher, err := newFastRegexMatcherWithoutCache(baseRegexp)
	require.NoError(t, err)

	matcherSizeBytes := size.Of(matcher)
	t.Logf("expensive regexp estimated size (bytes): %d", matcherSizeBytes)

	// From that, estimate how many such matchers fit in the cache.
	capacity := fastRegexMatcherCacheMaxSizeBytes / matcherSizeBytes

	// Fill the cache up to the estimated capacity.
	for id := 0; id < capacity; id++ {
		_, err := NewFastRegexMatcher(uniqueRegexp(id))
		require.NoError(t, err)
	}

	// Every matcher added so far must still be cached.
	fastRegexMatcherCache.Wait()
	for id := 0; id < capacity; id++ {
		_, found := fastRegexMatcherCache.Get(uniqueRegexp(id))
		require.True(t, found, "checking if regexp matcher #%d is still in the cache", id)
	}

	// Push one additional matcher into the cache.
	_, err = NewFastRegexMatcher(uniqueRegexp(capacity + 1))
	require.NoError(t, err)

	// Exactly one of the earlier entries must have been evicted to make room.
	fastRegexMatcherCache.Wait()
	evicted := 0
	for id := 0; id < capacity; id++ {
		_, found := fastRegexMatcherCache.Get(uniqueRegexp(id))
		if found {
			continue
		}
		t.Logf("the regexp matcher #%d has been evicted from the cache", id)
		evicted++
	}
	require.Equal(t, 1, evicted)
}
2021-10-06 06:24:57 -07:00
func BenchmarkNewFastRegexMatcher ( b * testing . B ) {
2023-03-30 19:05:26 -07:00
runBenchmark := func ( newFunc func ( v string ) ( * FastRegexMatcher , error ) ) func ( b * testing . B ) {
return func ( b * testing . B ) {
for _ , r := range regexes {
b . Run ( getTestNameFromRegexp ( r ) , func ( b * testing . B ) {
for n := 0 ; n < b . N ; n ++ {
_ , err := newFunc ( r )
if err != nil {
b . Fatal ( err )
}
}
} )
}
}
}
b . Run ( "with cache" , runBenchmark ( NewFastRegexMatcher ) )
b . Run ( "without cache" , runBenchmark ( newFastRegexMatcherWithoutCache ) )
}
func BenchmarkNewFastRegexMatcher_CacheMisses ( b * testing . B ) {
// Init the random seed with a constant, so that it doesn't change between runs.
randGenerator := rand . New ( rand . NewSource ( 1 ) )
tests := map [ string ] string {
"simple regexp" : randString ( randGenerator , 10 ) ,
"complex regexp" : strings . Join ( randStrings ( randGenerator , 100 , 10 ) , "|" ) ,
}
for testName , regexpPrefix := range tests {
b . Run ( testName , func ( b * testing . B ) {
// Ensure the cache is empty.
2023-04-17 06:20:58 -07:00
fastRegexMatcherCache . Clear ( )
2023-03-30 19:05:26 -07:00
b . ResetTimer ( )
2023-03-02 08:20:52 -08:00
for n := 0 ; n < b . N ; n ++ {
2023-03-30 19:05:26 -07:00
// Unique regexp to emulate 100% cache misses.
regexp := regexpPrefix + strconv . Itoa ( n )
_ , err := NewFastRegexMatcher ( regexp )
2023-03-02 08:20:52 -08:00
if err != nil {
b . Fatal ( err )
2021-10-06 06:24:57 -07:00
}
2021-10-07 05:25:31 -07:00
}
} )
2020-06-26 02:49:09 -07:00
}
}
func TestOptimizeConcatRegex ( t * testing . T ) {
cases := [ ] struct {
2020-07-07 01:38:04 -07:00
regex string
prefix string
suffix string
contains string
2020-06-26 02:49:09 -07:00
} {
2020-07-07 01:38:04 -07:00
{ regex : "foo(hello|bar)" , prefix : "foo" , suffix : "" , contains : "" } ,
{ regex : "foo(hello|bar)world" , prefix : "foo" , suffix : "world" , contains : "" } ,
{ regex : "foo.*" , prefix : "foo" , suffix : "" , contains : "" } ,
{ regex : "foo.*hello.*bar" , prefix : "foo" , suffix : "bar" , contains : "hello" } ,
{ regex : ".*foo" , prefix : "" , suffix : "foo" , contains : "" } ,
{ regex : "^.*foo$" , prefix : "" , suffix : "foo" , contains : "" } ,
{ regex : ".*foo.*" , prefix : "" , suffix : "" , contains : "foo" } ,
{ regex : ".*foo.*bar.*" , prefix : "" , suffix : "" , contains : "foo" } ,
{ regex : ".*(foo|bar).*" , prefix : "" , suffix : "" , contains : "" } ,
{ regex : ".*[abc].*" , prefix : "" , suffix : "" , contains : "" } ,
2020-10-06 05:16:26 -07:00
{ regex : ".*((?i)abc).*" , prefix : "" , suffix : "" , contains : "" } ,
{ regex : ".*(?i:abc).*" , prefix : "" , suffix : "" , contains : "" } ,
{ regex : "(?i:abc).*" , prefix : "" , suffix : "" , contains : "" } ,
{ regex : ".*(?i:abc)" , prefix : "" , suffix : "" , contains : "" } ,
{ regex : ".*(?i:abc)def.*" , prefix : "" , suffix : "" , contains : "def" } ,
{ regex : "(?i).*(?-i:abc)def" , prefix : "" , suffix : "" , contains : "abc" } ,
{ regex : ".*(?msU:abc).*" , prefix : "" , suffix : "" , contains : "abc" } ,
2020-10-12 04:17:29 -07:00
{ regex : "[aA]bc.*" , prefix : "" , suffix : "" , contains : "bc" } ,
2023-03-01 05:50:26 -08:00
{ regex : "^5..$" , prefix : "5" , suffix : "" , contains : "" } ,
{ regex : "^release.*" , prefix : "release" , suffix : "" , contains : "" } ,
{ regex : "^env-[0-9]+laio[1]?[^0-9].*" , prefix : "env-" , suffix : "" , contains : "laio" } ,
2020-06-26 02:49:09 -07:00
}
for _ , c := range cases {
parsed , err := syntax . Parse ( c . regex , syntax . Perl )
2020-10-29 02:43:23 -07:00
require . NoError ( t , err )
2020-06-26 02:49:09 -07:00
2020-07-07 01:38:04 -07:00
prefix , suffix , contains := optimizeConcatRegex ( parsed )
2020-10-29 02:43:23 -07:00
require . Equal ( t , c . prefix , prefix )
require . Equal ( t , c . suffix , suffix )
require . Equal ( t , c . contains , contains )
2020-06-26 02:49:09 -07:00
}
}
2021-10-05 04:43:41 -07:00
// Refer to https://github.com/prometheus/prometheus/issues/2651.
func TestFindSetMatches ( t * testing . T ) {
for _ , c := range [ ] struct {
2023-03-01 01:49:25 -08:00
pattern string
expMatches [ ] string
expCaseSensitive bool
2021-10-05 04:43:41 -07:00
} {
// Single value, coming from a `bar=~"foo"` selector.
2023-03-01 01:49:25 -08:00
{ "foo" , [ ] string { "foo" } , true } ,
{ "^foo" , [ ] string { "foo" } , true } ,
{ "^foo$" , [ ] string { "foo" } , true } ,
2021-10-05 04:43:41 -07:00
// Simple sets alternates.
2023-03-01 01:49:25 -08:00
{ "foo|bar|zz" , [ ] string { "foo" , "bar" , "zz" } , true } ,
2021-10-06 07:44:26 -07:00
// Simple sets alternate and concat (bar|baz is parsed as "ba[rz]").
2023-03-01 01:49:25 -08:00
{ "foo|bar|baz" , [ ] string { "foo" , "bar" , "baz" } , true } ,
2021-10-05 04:43:41 -07:00
// Simple sets alternate and concat and capture
2023-03-01 01:49:25 -08:00
{ "foo|bar|baz|(zz)" , [ ] string { "foo" , "bar" , "baz" , "zz" } , true } ,
2021-10-05 04:43:41 -07:00
// Simple sets alternate and concat and alternates with empty matches
// parsed as b(ar|(?:)|uzz) where b(?:) means literal b.
2023-03-01 01:49:25 -08:00
{ "bar|b|buzz" , [ ] string { "bar" , "b" , "buzz" } , true } ,
2023-03-01 05:50:26 -08:00
// Skip outer anchors (it's enforced anyway at the root).
{ "^(bar|b|buzz)$" , [ ] string { "bar" , "b" , "buzz" } , true } ,
{ "^(?:prod|production)$" , [ ] string { "prod" , "production" } , true } ,
// Do not optimize regexp with inner anchors.
{ "(bar|b|b^uz$z)" , nil , false } ,
// Do not optimize regexp with empty string matcher.
{ "^$|Running" , nil , false } ,
2021-10-05 04:43:41 -07:00
// Simple sets containing escaped characters.
2023-03-01 01:49:25 -08:00
{ "fo\\.o|bar\\?|\\^baz" , [ ] string { "fo.o" , "bar?" , "^baz" } , true } ,
2021-10-05 06:59:40 -07:00
// using charclass
2023-03-01 01:49:25 -08:00
{ "[abc]d" , [ ] string { "ad" , "bd" , "cd" } , true } ,
2021-10-05 04:43:41 -07:00
// high low charset different => A(B[CD]|EF)|BC[XY]
2023-03-01 01:49:25 -08:00
{ "ABC|ABD|AEF|BCX|BCY" , [ ] string { "ABC" , "ABD" , "AEF" , "BCX" , "BCY" } , true } ,
2021-10-05 04:43:41 -07:00
// triple concat
2023-03-01 01:49:25 -08:00
{ "api_(v1|prom)_push" , [ ] string { "api_v1_push" , "api_prom_push" } , true } ,
2021-10-05 04:43:41 -07:00
// triple concat with multiple alternates
2023-03-01 01:49:25 -08:00
{ "(api|rpc)_(v1|prom)_push" , [ ] string { "api_v1_push" , "api_prom_push" , "rpc_v1_push" , "rpc_prom_push" } , true } ,
{ "(api|rpc)_(v1|prom)_(push|query)" , [ ] string { "api_v1_push" , "api_v1_query" , "api_prom_push" , "api_prom_query" , "rpc_v1_push" , "rpc_v1_query" , "rpc_prom_push" , "rpc_prom_query" } , true } ,
2021-10-06 07:22:48 -07:00
// class starting with "-"
2023-03-01 01:49:25 -08:00
{ "[-1-2][a-c]" , [ ] string { "-a" , "-b" , "-c" , "1a" , "1b" , "1c" , "2a" , "2b" , "2c" } , true } ,
{ "[1^3]" , [ ] string { "1" , "3" , "^" } , true } ,
2021-10-05 04:43:41 -07:00
// OpPlus with concat
2023-03-01 01:49:25 -08:00
{ "(.+)/(foo|bar)" , nil , false } ,
2021-10-05 04:43:41 -07:00
// Simple sets containing special characters without escaping.
2023-03-01 01:49:25 -08:00
{ "fo.o|bar?|^baz" , nil , false } ,
2021-10-05 04:43:41 -07:00
// case sensitive wrapper.
2023-03-01 01:49:25 -08:00
{ "(?i)foo" , [ ] string { "FOO" } , false } ,
2021-10-05 04:43:41 -07:00
// case sensitive wrapper on alternate.
2023-03-01 01:49:25 -08:00
{ "(?i)foo|bar|baz" , [ ] string { "FOO" , "BAR" , "BAZ" , "BAr" , "BAz" } , false } ,
// mixed case sensitivity.
{ "(api|rpc)_(v1|prom)_((?i)push|query)" , nil , false } ,
// mixed case sensitivity concatenation only without capture group.
{ "api_v1_(?i)push" , nil , false } ,
// mixed case sensitivity alternation only without capture group.
{ "api|(?i)rpc" , nil , false } ,
// case sensitive after unsetting insensitivity.
{ "rpc|(?i)(?-i)api" , [ ] string { "rpc" , "api" } , true } ,
// case sensitive after unsetting insensitivity in all alternation options.
{ "(?i)((?-i)api|(?-i)rpc)" , [ ] string { "api" , "rpc" } , true } ,
// mixed case sensitivity after unsetting insensitivity.
{ "(?i)rpc|(?-i)api" , nil , false } ,
2021-10-05 04:43:41 -07:00
// too high charset combination
2023-03-01 01:49:25 -08:00
{ "(api|rpc)_[^0-9]" , nil , false } ,
2021-10-05 07:46:24 -07:00
// too many combinations
2023-03-01 01:49:25 -08:00
{ "[a-z][a-z]" , nil , false } ,
2021-10-05 04:43:41 -07:00
} {
c := c
t . Run ( c . pattern , func ( t * testing . T ) {
t . Parallel ( )
parsed , err := syntax . Parse ( c . pattern , syntax . Perl )
require . NoError ( t , err )
2023-03-01 05:50:26 -08:00
matches , actualCaseSensitive := findSetMatches ( parsed )
2023-03-01 01:49:25 -08:00
require . Equal ( t , c . expMatches , matches )
require . Equal ( t , c . expCaseSensitive , actualCaseSensitive )
2021-10-05 04:43:41 -07:00
} )
2022-04-12 07:40:00 -07:00
}
}
2021-10-05 04:43:41 -07:00
2022-02-08 02:03:20 -08:00
func BenchmarkFastRegexMatcher ( b * testing . B ) {
2023-09-28 08:36:37 -07:00
texts := generateRandomValues ( )
2023-03-09 00:38:41 -08:00
2022-02-08 02:03:20 -08:00
for _ , r := range regexes {
2023-03-02 08:20:52 -08:00
b . Run ( getTestNameFromRegexp ( r ) , func ( b * testing . B ) {
2022-02-08 02:03:20 -08:00
m , err := NewFastRegexMatcher ( r )
require . NoError ( b , err )
2023-09-28 08:36:37 -07:00
2022-02-08 02:03:20 -08:00
b . ResetTimer ( )
for i := 0 ; i < b . N ; i ++ {
2023-03-09 00:38:41 -08:00
for _ , text := range texts {
_ = m . MatchString ( text )
}
2022-02-08 02:03:20 -08:00
}
} )
2021-10-05 04:43:41 -07:00
}
}
2021-10-07 04:56:31 -07:00
2023-09-28 08:36:37 -07:00
func TestStringMatcherFromRegexp ( t * testing . T ) {
2021-10-07 04:56:31 -07:00
for _ , c := range [ ] struct {
pattern string
exp StringMatcher
} {
2023-03-09 00:38:41 -08:00
{ ".*" , anyStringWithoutNewlineMatcher { } } ,
{ ".*?" , anyStringWithoutNewlineMatcher { } } ,
{ "(?s:.*)" , trueMatcher { } } ,
{ "(.*)" , anyStringWithoutNewlineMatcher { } } ,
{ "^.*$" , anyStringWithoutNewlineMatcher { } } ,
{ ".+" , & anyNonEmptyStringMatcher { matchNL : false } } ,
{ "(?s:.+)" , & anyNonEmptyStringMatcher { matchNL : true } } ,
{ "^.+$" , & anyNonEmptyStringMatcher { matchNL : false } } ,
{ "(.+)" , & anyNonEmptyStringMatcher { matchNL : false } } ,
2021-10-08 01:10:18 -07:00
{ "" , emptyStringMatcher { } } ,
{ "^$" , emptyStringMatcher { } } ,
{ "^foo$" , & equalStringMatcher { s : "foo" , caseSensitive : true } } ,
{ "^(?i:foo)$" , & equalStringMatcher { s : "FOO" , caseSensitive : false } } ,
2023-03-01 05:50:26 -08:00
{ "^((?i:foo)|(bar))$" , orStringMatcher ( [ ] StringMatcher { & equalStringMatcher { s : "FOO" , caseSensitive : false } , & equalStringMatcher { s : "bar" , caseSensitive : true } } ) } ,
{ "^((?i:foo|oo)|(bar))$" , orStringMatcher ( [ ] StringMatcher { & equalStringMatcher { s : "FOO" , caseSensitive : false } , & equalStringMatcher { s : "OO" , caseSensitive : false } , & equalStringMatcher { s : "bar" , caseSensitive : true } } ) } ,
2023-03-01 01:49:25 -08:00
{ "(?i:(foo1|foo2|bar))" , orStringMatcher ( [ ] StringMatcher { orStringMatcher ( [ ] StringMatcher { & equalStringMatcher { s : "FOO1" , caseSensitive : false } , & equalStringMatcher { s : "FOO2" , caseSensitive : false } } ) , & equalStringMatcher { s : "BAR" , caseSensitive : false } } ) } ,
2023-03-09 00:38:41 -08:00
{ ".*foo.*" , & containsStringMatcher { substrings : [ ] string { "foo" } , left : anyStringWithoutNewlineMatcher { } , right : anyStringWithoutNewlineMatcher { } } } ,
{ "(.*)foo.*" , & containsStringMatcher { substrings : [ ] string { "foo" } , left : anyStringWithoutNewlineMatcher { } , right : anyStringWithoutNewlineMatcher { } } } ,
{ "(.*)foo(.*)" , & containsStringMatcher { substrings : [ ] string { "foo" } , left : anyStringWithoutNewlineMatcher { } , right : anyStringWithoutNewlineMatcher { } } } ,
{ "(.+)foo(.*)" , & containsStringMatcher { substrings : [ ] string { "foo" } , left : & anyNonEmptyStringMatcher { matchNL : false } , right : anyStringWithoutNewlineMatcher { } } } ,
{ "^.+foo.+" , & containsStringMatcher { substrings : [ ] string { "foo" } , left : & anyNonEmptyStringMatcher { matchNL : false } , right : & anyNonEmptyStringMatcher { matchNL : false } } } ,
{ "^(.*)(foo)(.*)$" , & containsStringMatcher { substrings : [ ] string { "foo" } , left : anyStringWithoutNewlineMatcher { } , right : anyStringWithoutNewlineMatcher { } } } ,
{ "^(.*)(foo|foobar)(.*)$" , & containsStringMatcher { substrings : [ ] string { "foo" , "foobar" } , left : anyStringWithoutNewlineMatcher { } , right : anyStringWithoutNewlineMatcher { } } } ,
{ "^(.*)(foo|foobar)(.+)$" , & containsStringMatcher { substrings : [ ] string { "foo" , "foobar" } , left : anyStringWithoutNewlineMatcher { } , right : & anyNonEmptyStringMatcher { matchNL : false } } } ,
{ "^(.*)(bar|b|buzz)(.+)$" , & containsStringMatcher { substrings : [ ] string { "bar" , "b" , "buzz" } , left : anyStringWithoutNewlineMatcher { } , right : & anyNonEmptyStringMatcher { matchNL : false } } } ,
2021-10-08 01:10:18 -07:00
{ "10\\.0\\.(1|2)\\.+" , nil } ,
2023-03-09 00:38:41 -08:00
{ "10\\.0\\.(1|2).+" , & containsStringMatcher { substrings : [ ] string { "10.0.1" , "10.0.2" } , left : nil , right : & anyNonEmptyStringMatcher { matchNL : false } } } ,
2023-09-28 08:36:37 -07:00
{ "^.+foo" , & literalSuffixStringMatcher { left : & anyNonEmptyStringMatcher { } , suffix : "foo" , suffixCaseSensitive : true } } ,
{ "foo-.*$" , & literalPrefixStringMatcher { prefix : "foo-" , prefixCaseSensitive : true , right : anyStringWithoutNewlineMatcher { } } } ,
2023-03-09 00:38:41 -08:00
{ "(prometheus|api_prom)_api_v1_.+" , & containsStringMatcher { substrings : [ ] string { "prometheus_api_v1_" , "api_prom_api_v1_" } , left : nil , right : & anyNonEmptyStringMatcher { matchNL : false } } } ,
{ "^((.*)(bar|b|buzz)(.+)|foo)$" , orStringMatcher ( [ ] StringMatcher { & containsStringMatcher { substrings : [ ] string { "bar" , "b" , "buzz" } , left : anyStringWithoutNewlineMatcher { } , right : & anyNonEmptyStringMatcher { matchNL : false } } , & equalStringMatcher { s : "foo" , caseSensitive : true } } ) } ,
2023-09-28 08:36:37 -07:00
{ "((fo(bar))|.+foo)" , orStringMatcher ( [ ] StringMatcher { orStringMatcher ( [ ] StringMatcher { & equalStringMatcher { s : "fobar" , caseSensitive : true } } ) , & literalSuffixStringMatcher { suffix : "foo" , suffixCaseSensitive : true , left : & anyNonEmptyStringMatcher { matchNL : false } } } ) } ,
2023-03-09 00:38:41 -08:00
{ "(.+)/(gateway|cortex-gw|cortex-gw-internal)" , & containsStringMatcher { substrings : [ ] string { "/gateway" , "/cortex-gw" , "/cortex-gw-internal" } , left : & anyNonEmptyStringMatcher { matchNL : false } , right : nil } } ,
2021-10-08 01:10:18 -07:00
// we don't support case insensitive matching for contains.
// This is because there's no strings.IndexOfFold function.
// We can revisit later if this is really popular by using strings.ToUpper.
{ "^(.*)((?i)foo|foobar)(.*)$" , nil } ,
{ "(api|rpc)_(v1|prom)_((?i)push|query)" , nil } ,
{ "[a-z][a-z]" , nil } ,
{ "[1^3]" , nil } ,
2021-10-11 01:10:51 -07:00
{ ".*foo.*bar.*" , nil } ,
{ ` \d* ` , nil } ,
{ "." , nil } ,
2023-09-28 08:36:37 -07:00
{ "/|/bar.*" , & literalPrefixStringMatcher { prefix : "/" , prefixCaseSensitive : true , right : orStringMatcher { emptyStringMatcher { } , & literalPrefixStringMatcher { prefix : "bar" , prefixCaseSensitive : true , right : anyStringWithoutNewlineMatcher { } } } } } ,
2021-10-08 01:10:18 -07:00
// This one is not supported because `stringMatcherFromRegexp` is not reentrant for syntax.OpConcat.
// It would make the code too complex to handle it.
{ "(.+)/(foo.*|bar$)" , nil } ,
2023-09-28 08:36:37 -07:00
// Case sensitive alternate with same literal prefix and .* suffix.
{ "(xyz-016a-ixb-dp.*|xyz-016a-ixb-op.*)" , & literalPrefixStringMatcher { prefix : "xyz-016a-ixb-" , prefixCaseSensitive : true , right : orStringMatcher { & literalPrefixStringMatcher { prefix : "dp" , prefixCaseSensitive : true , right : anyStringWithoutNewlineMatcher { } } , & literalPrefixStringMatcher { prefix : "op" , prefixCaseSensitive : true , right : anyStringWithoutNewlineMatcher { } } } } } ,
// Case insensitive alternate with same literal prefix and .* suffix.
{ "(?i:(xyz-016a-ixb-dp.*|xyz-016a-ixb-op.*))" , & literalPrefixStringMatcher { prefix : "XYZ-016A-IXB-" , prefixCaseSensitive : false , right : orStringMatcher { & literalPrefixStringMatcher { prefix : "DP" , prefixCaseSensitive : false , right : anyStringWithoutNewlineMatcher { } } , & literalPrefixStringMatcher { prefix : "OP" , prefixCaseSensitive : false , right : anyStringWithoutNewlineMatcher { } } } } } ,
{ "(?i)(xyz-016a-ixb-dp.*|xyz-016a-ixb-op.*)" , & literalPrefixStringMatcher { prefix : "XYZ-016A-IXB-" , prefixCaseSensitive : false , right : orStringMatcher { & literalPrefixStringMatcher { prefix : "DP" , prefixCaseSensitive : false , right : anyStringWithoutNewlineMatcher { } } , & literalPrefixStringMatcher { prefix : "OP" , prefixCaseSensitive : false , right : anyStringWithoutNewlineMatcher { } } } } } ,
2023-09-29 05:17:17 -07:00
// Concatenated variable length selectors are not supported.
{ "foo.*.*" , nil } ,
{ "foo.+.+" , nil } ,
{ ".*.*foo" , nil } ,
{ ".+.+foo" , nil } ,
2023-09-29 05:12:41 -07:00
{ "aaa.?.?" , nil } ,
{ "aaa.?.*" , nil } ,
// Regexps with ".?".
{ "ext.?|xfs" , orStringMatcher { & literalPrefixStringMatcher { prefix : "ext" , prefixCaseSensitive : true , right : & zeroOrOneCharacterStringMatcher { matchNL : false } } , & equalStringMatcher { s : "xfs" , caseSensitive : true } } } ,
{ "(?s)(ext.?|xfs)" , orStringMatcher { & literalPrefixStringMatcher { prefix : "ext" , prefixCaseSensitive : true , right : & zeroOrOneCharacterStringMatcher { matchNL : true } } , & equalStringMatcher { s : "xfs" , caseSensitive : true } } } ,
{ "foo.?" , & literalPrefixStringMatcher { prefix : "foo" , prefixCaseSensitive : true , right : & zeroOrOneCharacterStringMatcher { matchNL : false } } } ,
{ "f.?o" , nil } ,
2021-10-07 04:56:31 -07:00
} {
c := c
t . Run ( c . pattern , func ( t * testing . T ) {
t . Parallel ( )
parsed , err := syntax . Parse ( c . pattern , syntax . Perl )
require . NoError ( t , err )
matches := stringMatcherFromRegexp ( parsed )
require . Equal ( t , c . exp , matches )
} )
}
}
2021-10-07 05:25:31 -07:00
2023-09-28 08:36:37 -07:00
func TestStringMatcherFromRegexp_LiteralPrefix ( t * testing . T ) {
for _ , c := range [ ] struct {
pattern string
expectedLiteralPrefixMatchers int
expectedMatches [ ] string
expectedNotMatches [ ] string
} {
// Case sensitive.
{
pattern : "(xyz-016a-ixb-dp.*|xyz-016a-ixb-op.*)" ,
expectedLiteralPrefixMatchers : 3 ,
expectedMatches : [ ] string { "xyz-016a-ixb-dp" , "xyz-016a-ixb-dpXXX" , "xyz-016a-ixb-op" , "xyz-016a-ixb-opXXX" } ,
expectedNotMatches : [ ] string { "XYZ-016a-ixb-dp" , "xyz-016a-ixb-d" , "XYZ-016a-ixb-op" , "xyz-016a-ixb-o" , "xyz" , "dp" , "xyz-016a-ixb-dp\n" } ,
} ,
// Case insensitive.
{
pattern : "(?i)(xyz-016a-ixb-dp.*|xyz-016a-ixb-op.*)" ,
expectedLiteralPrefixMatchers : 3 ,
expectedMatches : [ ] string { "xyz-016a-ixb-dp" , "XYZ-016a-ixb-dpXXX" , "xyz-016a-ixb-op" , "XYZ-016a-ixb-opXXX" } ,
expectedNotMatches : [ ] string { "xyz-016a-ixb-d" , "xyz" , "dp" , "xyz-016a-ixb-dp\n" } ,
} ,
// Nested literal prefixes, case sensitive.
{
pattern : "(xyz-(aaa-(111.*)|bbb-(222.*)))|(xyz-(aaa-(333.*)|bbb-(444.*)))" ,
expectedLiteralPrefixMatchers : 10 ,
expectedMatches : [ ] string { "xyz-aaa-111" , "xyz-aaa-111XXX" , "xyz-aaa-333" , "xyz-aaa-333XXX" , "xyz-bbb-222" , "xyz-bbb-222XXX" , "xyz-bbb-444" , "xyz-bbb-444XXX" } ,
expectedNotMatches : [ ] string { "XYZ-aaa-111" , "xyz-aaa-11" , "xyz-aaa-222" , "xyz-bbb-111" } ,
} ,
// Nested literal prefixes, case insensitive.
{
pattern : "(?i)(xyz-(aaa-(111.*)|bbb-(222.*)))|(xyz-(aaa-(333.*)|bbb-(444.*)))" ,
expectedLiteralPrefixMatchers : 10 ,
expectedMatches : [ ] string { "xyz-aaa-111" , "XYZ-aaa-111XXX" , "xyz-aaa-333" , "xyz-AAA-333XXX" , "xyz-bbb-222" , "xyz-BBB-222XXX" , "XYZ-bbb-444" , "xyz-bbb-444XXX" } ,
expectedNotMatches : [ ] string { "xyz-aaa-11" , "xyz-aaa-222" , "xyz-bbb-111" } ,
} ,
// Mixed case sensitivity.
{
pattern : "(xyz-((?i)(aaa.*|bbb.*)))" ,
expectedLiteralPrefixMatchers : 3 ,
expectedMatches : [ ] string { "xyz-aaa" , "xyz-AAA" , "xyz-aaaXXX" , "xyz-AAAXXX" , "xyz-bbb" , "xyz-BBBXXX" } ,
expectedNotMatches : [ ] string { "XYZ-aaa" , "xyz-aa" , "yz-aaa" , "aaa" } ,
} ,
} {
t . Run ( c . pattern , func ( t * testing . T ) {
parsed , err := syntax . Parse ( c . pattern , syntax . Perl )
require . NoError ( t , err )
matcher := stringMatcherFromRegexp ( parsed )
require . NotNil ( t , matcher )
re := regexp . MustCompile ( "^" + c . pattern + "$" )
// Pre-condition check: ensure it contains literalPrefixStringMatcher.
numPrefixMatchers := 0
visitStringMatcher ( matcher , func ( matcher StringMatcher ) {
if _ , ok := matcher . ( * literalPrefixStringMatcher ) ; ok {
numPrefixMatchers ++
}
} )
require . Equal ( t , c . expectedLiteralPrefixMatchers , numPrefixMatchers )
for _ , value := range c . expectedMatches {
2023-10-06 06:44:17 -07:00
require . Truef ( t , matcher . Matches ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
// Ensure the golang regexp engine would return the same.
2023-10-06 06:44:17 -07:00
require . Truef ( t , re . MatchString ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
}
for _ , value := range c . expectedNotMatches {
2023-10-06 06:44:17 -07:00
require . Falsef ( t , matcher . Matches ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
// Ensure the golang regexp engine would return the same.
2023-10-06 06:44:17 -07:00
require . Falsef ( t , re . MatchString ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
}
} )
}
}
func TestStringMatcherFromRegexp_LiteralSuffix ( t * testing . T ) {
for _ , c := range [ ] struct {
pattern string
expectedLiteralSuffixMatchers int
expectedMatches [ ] string
expectedNotMatches [ ] string
} {
// Case sensitive.
{
pattern : "(.*xyz-016a-ixb-dp|.*xyz-016a-ixb-op)" ,
expectedLiteralSuffixMatchers : 2 ,
expectedMatches : [ ] string { "xyz-016a-ixb-dp" , "XXXxyz-016a-ixb-dp" , "xyz-016a-ixb-op" , "XXXxyz-016a-ixb-op" } ,
expectedNotMatches : [ ] string { "XYZ-016a-ixb-dp" , "yz-016a-ixb-dp" , "XYZ-016a-ixb-op" , "xyz-016a-ixb-o" , "xyz" , "dp" , "\nxyz-016a-ixb-dp" } ,
} ,
// Case insensitive.
{
pattern : "(?i)(.*xyz-016a-ixb-dp|.*xyz-016a-ixb-op)" ,
expectedLiteralSuffixMatchers : 2 ,
expectedMatches : [ ] string { "xyz-016a-ixb-dp" , "XYZ-016a-ixb-dp" , "XXXxyz-016a-ixb-dp" , "XyZ-016a-ixb-op" , "XXXxyz-016a-ixb-op" } ,
expectedNotMatches : [ ] string { "yz-016a-ixb-dp" , "xyz-016a-ixb-o" , "xyz" , "dp" , "\nxyz-016a-ixb-dp" } ,
} ,
// Nested literal suffixes, case sensitive.
{
pattern : "(.*aaa|.*bbb(.*ccc|.*ddd))" ,
expectedLiteralSuffixMatchers : 3 ,
expectedMatches : [ ] string { "aaa" , "XXXaaa" , "bbbccc" , "XXXbbbccc" , "XXXbbbXXXccc" , "bbbddd" , "bbbddd" , "XXXbbbddd" , "XXXbbbXXXddd" , "bbbXXXccc" , "aaabbbccc" , "aaabbbddd" } ,
expectedNotMatches : [ ] string { "AAA" , "aa" , "Xaa" , "BBBCCC" , "bb" , "Xbb" , "bbccc" , "bbbcc" , "bbbdd" } ,
} ,
// Mixed case sensitivity.
{
pattern : "(.*aaa|.*bbb((?i)(.*ccc|.*ddd)))" ,
expectedLiteralSuffixMatchers : 3 ,
expectedMatches : [ ] string { "aaa" , "XXXaaa" , "bbbccc" , "bbbCCC" , "bbbXXXCCC" , "bbbddd" , "bbbDDD" , "bbbXXXddd" , "bbbXXXDDD" } ,
expectedNotMatches : [ ] string { "AAA" , "XXXAAA" , "BBBccc" , "BBBCCC" , "aaaBBB" } ,
} ,
} {
t . Run ( c . pattern , func ( t * testing . T ) {
parsed , err := syntax . Parse ( c . pattern , syntax . Perl )
require . NoError ( t , err )
matcher := stringMatcherFromRegexp ( parsed )
require . NotNil ( t , matcher )
re := regexp . MustCompile ( "^" + c . pattern + "$" )
// Pre-condition check: ensure it contains literalSuffixStringMatcher.
numSuffixMatchers := 0
visitStringMatcher ( matcher , func ( matcher StringMatcher ) {
if _ , ok := matcher . ( * literalSuffixStringMatcher ) ; ok {
numSuffixMatchers ++
}
} )
require . Equal ( t , c . expectedLiteralSuffixMatchers , numSuffixMatchers )
for _ , value := range c . expectedMatches {
2023-10-06 06:44:17 -07:00
require . Truef ( t , matcher . Matches ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
// Ensure the golang regexp engine would return the same.
2023-10-06 06:44:17 -07:00
require . Truef ( t , re . MatchString ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
}
for _ , value := range c . expectedNotMatches {
2023-10-06 06:44:17 -07:00
require . Falsef ( t , matcher . Matches ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
// Ensure the golang regexp engine would return the same.
2023-10-06 06:44:17 -07:00
require . Falsef ( t , re . MatchString ( value ) , "Value: %s" , value )
2023-09-28 08:36:37 -07:00
}
} )
}
}
2023-09-29 05:12:41 -07:00
func TestStringMatcherFromRegexp_Quest ( t * testing . T ) {
for _ , c := range [ ] struct {
pattern string
expectedZeroOrOneMatchers int
expectedMatches [ ] string
expectedNotMatches [ ] string
} {
// Not match newline.
{
pattern : "test.?" ,
expectedZeroOrOneMatchers : 1 ,
expectedMatches : [ ] string { "test" , "test!" } ,
expectedNotMatches : [ ] string { "test\n" , "tes" , "test!!" } ,
2023-09-29 06:01:41 -07:00
} ,
{
2023-09-29 05:12:41 -07:00
pattern : ".?test" ,
expectedZeroOrOneMatchers : 1 ,
expectedMatches : [ ] string { "test" , "!test" } ,
expectedNotMatches : [ ] string { "\ntest" , "tes" , "test!" } ,
2023-09-29 06:01:41 -07:00
} ,
{
2023-09-29 05:12:41 -07:00
pattern : "(aaa.?|bbb.?)" ,
expectedZeroOrOneMatchers : 2 ,
expectedMatches : [ ] string { "aaa" , "aaaX" , "bbb" , "bbbX" } ,
expectedNotMatches : [ ] string { "aa" , "aaaXX" , "aaa\n" , "bb" , "bbbXX" , "bbb\n" } ,
2023-09-29 06:01:41 -07:00
} ,
{
2023-09-29 05:12:41 -07:00
pattern : ".*aaa.?" ,
expectedZeroOrOneMatchers : 1 ,
expectedMatches : [ ] string { "aaa" , "Xaaa" , "aaaX" , "XXXaaa" , "XXXaaaX" } ,
expectedNotMatches : [ ] string { "aa" , "aaaXX" , "XXXaaaXXX" , "XXXaaa\n" } ,
} ,
// Match newline.
{
pattern : "(?s)test.?" ,
expectedZeroOrOneMatchers : 1 ,
expectedMatches : [ ] string { "test" , "test!" , "test\n" } ,
expectedNotMatches : [ ] string { "tes" , "test!!" , "test\n\n" } ,
} ,
// Mixed flags (a part matches newline another doesn't).
{
pattern : "(aaa.?|((?s).?bbb.+))" ,
expectedZeroOrOneMatchers : 2 ,
expectedMatches : [ ] string { "aaa" , "aaaX" , "bbbX" , "XbbbX" , "bbbXXX" , "\nbbbX" } ,
expectedNotMatches : [ ] string { "aa" , "aaa\n" , "Xbbb" , "\nbbb" } ,
} ,
} {
t . Run ( c . pattern , func ( t * testing . T ) {
parsed , err := syntax . Parse ( c . pattern , syntax . Perl )
require . NoError ( t , err )
matcher := stringMatcherFromRegexp ( parsed )
require . NotNil ( t , matcher )
re := regexp . MustCompile ( "^" + c . pattern + "$" )
// Pre-condition check: ensure it contains zeroOrOneCharacterStringMatcher.
numZeroOrOneMatchers := 0
visitStringMatcher ( matcher , func ( matcher StringMatcher ) {
if _ , ok := matcher . ( * zeroOrOneCharacterStringMatcher ) ; ok {
numZeroOrOneMatchers ++
}
} )
require . Equal ( t , c . expectedZeroOrOneMatchers , numZeroOrOneMatchers )
for _ , value := range c . expectedMatches {
2023-10-06 06:44:17 -07:00
require . Truef ( t , matcher . Matches ( value ) , "Value: %s" , value )
2023-09-29 05:12:41 -07:00
// Ensure the golang regexp engine would return the same.
2023-10-06 06:44:17 -07:00
require . Truef ( t , re . MatchString ( value ) , "Value: %s" , value )
2023-09-29 05:12:41 -07:00
}
for _ , value := range c . expectedNotMatches {
2023-10-06 06:44:17 -07:00
require . Falsef ( t , matcher . Matches ( value ) , "Value: %s" , value )
2023-09-29 05:12:41 -07:00
// Ensure the golang regexp engine would return the same.
2023-10-06 06:44:17 -07:00
require . Falsef ( t , re . MatchString ( value ) , "Value: %s" , value )
2023-09-29 05:12:41 -07:00
}
} )
}
}
2023-03-09 00:38:41 -08:00
func randString ( randGenerator * rand . Rand , length int ) string {
2023-03-02 08:20:52 -08:00
b := make ( [ ] rune , length )
2021-10-07 05:25:31 -07:00
for i := range b {
2023-03-09 00:38:41 -08:00
b [ i ] = asciiRunes [ randGenerator . Intn ( len ( asciiRunes ) ) ]
2021-10-07 05:25:31 -07:00
}
return string ( b )
}
2023-03-01 03:18:30 -08:00
2023-03-09 00:38:41 -08:00
func randStrings ( randGenerator * rand . Rand , many , length int ) [ ] string {
2023-03-02 08:20:52 -08:00
out := make ( [ ] string , 0 , many )
for i := 0 ; i < many ; i ++ {
2023-03-09 00:38:41 -08:00
out = append ( out , randString ( randGenerator , length ) )
2023-03-02 08:20:52 -08:00
}
return out
}
2023-03-01 03:18:30 -08:00
func FuzzFastRegexMatcher_WithStaticallyDefinedRegularExpressions ( f * testing . F ) {
// Create all matchers.
matchers := make ( [ ] * FastRegexMatcher , 0 , len ( regexes ) )
2023-03-31 23:35:35 -07:00
res := make ( [ ] * regexp . Regexp , 0 , len ( regexes ) )
2023-03-01 03:18:30 -08:00
for _ , re := range regexes {
m , err := NewFastRegexMatcher ( re )
require . NoError ( f , err )
2023-03-31 23:35:35 -07:00
r := regexp . MustCompile ( "^(?:" + re + ")$" )
2023-03-01 03:18:30 -08:00
matchers = append ( matchers , m )
2023-03-31 23:35:35 -07:00
res = append ( res , r )
2023-03-01 03:18:30 -08:00
}
// Add known values to seed corpus.
for _ , v := range values {
f . Add ( v )
}
f . Fuzz ( func ( t * testing . T , text string ) {
2023-03-31 23:35:35 -07:00
for i , m := range matchers {
require . Equalf ( t , res [ i ] . MatchString ( text ) , m . MatchString ( text ) , "regexp: %s text: %s" , res [ i ] . String ( ) , text )
2023-03-01 03:18:30 -08:00
}
} )
}
2023-03-01 05:50:26 -08:00
func FuzzFastRegexMatcher_WithFuzzyRegularExpressions ( f * testing . F ) {
for _ , re := range regexes {
for _ , text := range values {
f . Add ( re , text )
}
}
f . Fuzz ( func ( t * testing . T , re , text string ) {
m , err := NewFastRegexMatcher ( re )
if err != nil {
// Ignore invalid regexes.
return
}
2023-03-31 23:35:35 -07:00
reg , err := regexp . Compile ( "^(?:" + re + ")$" )
if err != nil {
// Ignore invalid regexes.
return
}
require . Equalf ( t , reg . MatchString ( text ) , m . MatchString ( text ) , "regexp: %s text: %s" , reg . String ( ) , text )
2023-03-01 05:50:26 -08:00
} )
}
2023-03-01 06:50:04 -08:00
// This test can be used to analyze real queries from Mimir logs. You can extract real queries with a regexp matcher
// running the following command:
//
// logcli --addr=XXX --username=YYY --password=ZZZ query '{namespace=~"(cortex|mimir).*",name="query-frontend"} |= "query stats" |= "=~" --limit=100000 > logs.txt
2023-11-02 13:09:38 -07:00
//
// against Loki.
2023-03-01 06:50:04 -08:00
func TestAnalyzeRealQueries ( t * testing . T ) {
2023-09-29 03:26:00 -07:00
t . Skip ( "Decomment this test only to manually analyze real queries" )
2023-03-01 06:50:04 -08:00
2023-03-31 01:27:43 -07:00
type labelValueInfo struct {
2023-11-01 06:19:50 -07:00
numMatchingQueries int
numShardedQueries int
numSplitQueries int
optimized bool
averageParsingTimeMillis float64
2023-03-31 01:27:43 -07:00
// Sorted list of timestamps when the queries have been received.
queryStartTimes [ ] time . Time
}
2023-09-29 03:48:55 -07:00
labelValueRE := regexp . MustCompile ( ` =~(?:\\"|')([^"']*)(?:\\"|') ` )
2023-03-31 01:27:43 -07:00
tsRE := regexp . MustCompile ( ` ts=([^ ]+) ` )
shardedQueriesRE := regexp . MustCompile ( ` sharded_queries=(\d+) ` )
splitQueriesRE := regexp . MustCompile ( ` split_queries=(\d+) ` )
labelValues := make ( map [ string ] * labelValueInfo )
2023-03-01 06:50:04 -08:00
// Read the logs file line-by-line, and find all values for regex label matchers.
2023-09-29 03:26:00 -07:00
readFile , err := os . Open ( "logs.txt" )
2023-03-01 06:50:04 -08:00
require . NoError ( t , err )
fileScanner := bufio . NewScanner ( readFile )
fileScanner . Split ( bufio . ScanLines )
2023-03-31 01:27:43 -07:00
numQueries := 0
2023-03-01 06:50:04 -08:00
for fileScanner . Scan ( ) {
line := fileScanner . Text ( )
matches := labelValueRE . FindAllStringSubmatch ( line , - 1 )
2023-03-31 01:27:43 -07:00
if len ( matches ) == 0 {
continue
}
// Look up query stats.
tsRaw := tsRE . FindStringSubmatch ( line )
shardedQueriesRaw := shardedQueriesRE . FindStringSubmatch ( line )
splitQueriesRaw := splitQueriesRE . FindStringSubmatch ( line )
shardedQueries := 0
splitQueries := 0
var ts time . Time
if len ( tsRaw ) > 0 {
ts , _ = time . Parse ( time . RFC3339Nano , tsRaw [ 1 ] )
}
if len ( shardedQueriesRaw ) > 0 {
shardedQueries , _ = strconv . Atoi ( shardedQueriesRaw [ 1 ] )
}
if len ( splitQueriesRaw ) > 0 {
splitQueries , _ = strconv . Atoi ( splitQueriesRaw [ 1 ] )
}
numQueries ++
2023-03-01 06:50:04 -08:00
for _ , match := range matches {
2023-03-31 01:27:43 -07:00
info := labelValues [ match [ 1 ] ]
if info == nil {
info = & labelValueInfo { }
labelValues [ match [ 1 ] ] = info
}
info . numMatchingQueries ++
info . numShardedQueries += shardedQueries
info . numSplitQueries += splitQueries
if ! ts . IsZero ( ) {
info . queryStartTimes = append ( info . queryStartTimes , ts )
}
2023-03-01 06:50:04 -08:00
}
}
2023-03-31 01:27:43 -07:00
// Sort query start times.
for _ , info := range labelValues {
sort . Slice ( info . queryStartTimes , func ( i , j int ) bool {
return info . queryStartTimes [ i ] . Before ( info . queryStartTimes [ j ] )
} )
}
2023-03-01 06:50:04 -08:00
require . NoError ( t , readFile . Close ( ) )
2023-03-31 01:27:43 -07:00
t . Logf ( "Found %d unique regexp matchers out of %d queries" , len ( labelValues ) , numQueries )
2023-03-01 06:50:04 -08:00
2023-03-31 01:27:43 -07:00
// Analyze each regexp matcher found.
2023-03-01 06:50:04 -08:00
numChecked := 0
numOptimized := 0
2023-03-31 01:27:43 -07:00
for re , info := range labelValues {
2023-03-01 06:50:04 -08:00
m , err := NewFastRegexMatcher ( re )
if err != nil {
// Ignore it, because we may have failed to extract the label matcher.
continue
}
numChecked ++
2023-03-31 01:27:43 -07:00
// Check if each regexp matcher is supported by our optimization.
2023-04-18 00:24:41 -07:00
if m . IsOptimized ( ) {
2023-03-01 06:50:04 -08:00
numOptimized ++
2023-03-31 01:27:43 -07:00
info . optimized = true
2023-03-01 06:50:04 -08:00
}
2023-03-31 01:27:43 -07:00
// Estimate the parsing complexity.
startTime := time . Now ( )
const numParsingRuns = 1000
for i := 0 ; i < numParsingRuns ; i ++ {
NewFastRegexMatcher ( re )
}
info . averageParsingTimeMillis = float64 ( time . Since ( startTime ) . Milliseconds ( ) ) / float64 ( numParsingRuns )
2023-03-01 06:50:04 -08:00
}
2023-03-31 01:27:43 -07:00
t . Logf ( "Found %d out of %d (%.2f%%) regexp matchers optimized by FastRegexMatcher" , numOptimized , numChecked , ( float64 ( numOptimized ) / float64 ( numChecked ) ) * 100 )
// Print some statistics.
for labelValue , info := range labelValues {
// Find the min/avg/max difference between query start times.
var (
minQueryStartTimeDiff time . Duration
maxQueryStartTimeDiff time . Duration
avgQueryStartTimeDiff time . Duration
sumQueryStartTime time . Duration
countQueryStartTime int
)
for i := 1 ; i < len ( info . queryStartTimes ) ; i ++ {
diff := info . queryStartTimes [ i ] . Sub ( info . queryStartTimes [ i - 1 ] )
sumQueryStartTime += diff
countQueryStartTime ++
if minQueryStartTimeDiff == 0 || diff < minQueryStartTimeDiff {
minQueryStartTimeDiff = diff
}
if diff > maxQueryStartTimeDiff {
maxQueryStartTimeDiff = diff
}
}
if countQueryStartTime > 0 {
avgQueryStartTimeDiff = sumQueryStartTime / time . Duration ( countQueryStartTime )
}
t . Logf ( "num queries: %d\t num split queries: %d\t num sharded queries: %d\t optimized: %t\t parsing time: %.0fms\t min/avg/max query start time diff (sec): %.2f/%.2f/%.2f regexp: %s" ,
info . numMatchingQueries , info . numSplitQueries , info . numShardedQueries , info . optimized , info . averageParsingTimeMillis ,
minQueryStartTimeDiff . Seconds ( ) , avgQueryStartTimeDiff . Seconds ( ) , maxQueryStartTimeDiff . Seconds ( ) , labelValue )
}
2023-03-01 06:50:04 -08:00
}
2023-03-02 08:20:52 -08:00
func TestOptimizeEqualStringMatchers ( t * testing . T ) {
tests := map [ string ] struct {
input StringMatcher
2023-03-31 23:35:35 -07:00
expectedValues [ ] string
2023-03-02 08:20:52 -08:00
expectedCaseSensitive bool
} {
"should skip optimization on orStringMatcher with containsStringMatcher" : {
input : orStringMatcher {
& equalStringMatcher { s : "FOO" , caseSensitive : true } ,
& containsStringMatcher { substrings : [ ] string { "a" , "b" , "c" } } ,
} ,
expectedValues : nil ,
} ,
"should run optimization on orStringMatcher with equalStringMatcher and same case sensitivity" : {
input : orStringMatcher {
& equalStringMatcher { s : "FOO" , caseSensitive : true } ,
& equalStringMatcher { s : "bar" , caseSensitive : true } ,
& equalStringMatcher { s : "baz" , caseSensitive : true } ,
} ,
2023-03-31 23:35:35 -07:00
expectedValues : [ ] string { "FOO" , "bar" , "baz" } ,
2023-03-02 08:20:52 -08:00
expectedCaseSensitive : true ,
} ,
"should skip optimization on orStringMatcher with equalStringMatcher but different case sensitivity" : {
input : orStringMatcher {
& equalStringMatcher { s : "FOO" , caseSensitive : true } ,
& equalStringMatcher { s : "bar" , caseSensitive : false } ,
& equalStringMatcher { s : "baz" , caseSensitive : true } ,
} ,
expectedValues : nil ,
} ,
"should run optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, and same case sensitivity" : {
input : orStringMatcher {
& equalStringMatcher { s : "FOO" , caseSensitive : true } ,
orStringMatcher {
& equalStringMatcher { s : "bar" , caseSensitive : true } ,
& equalStringMatcher { s : "xxx" , caseSensitive : true } ,
} ,
& equalStringMatcher { s : "baz" , caseSensitive : true } ,
} ,
2023-03-31 23:35:35 -07:00
expectedValues : [ ] string { "FOO" , "bar" , "xxx" , "baz" } ,
2023-03-02 08:20:52 -08:00
expectedCaseSensitive : true ,
} ,
"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity" : {
input : orStringMatcher {
& equalStringMatcher { s : "FOO" , caseSensitive : true } ,
orStringMatcher {
// Case sensitivity is different within items at the same level.
& equalStringMatcher { s : "bar" , caseSensitive : true } ,
& equalStringMatcher { s : "xxx" , caseSensitive : false } ,
} ,
& equalStringMatcher { s : "baz" , caseSensitive : true } ,
} ,
expectedValues : nil ,
} ,
"should skip optimization on orStringMatcher with nested orStringMatcher and equalStringMatcher, but different case sensitivity in the nested one" : {
input : orStringMatcher {
& equalStringMatcher { s : "FOO" , caseSensitive : true } ,
// Case sensitivity is different between the parent and child.
orStringMatcher {
& equalStringMatcher { s : "bar" , caseSensitive : false } ,
& equalStringMatcher { s : "xxx" , caseSensitive : false } ,
} ,
& equalStringMatcher { s : "baz" , caseSensitive : true } ,
} ,
expectedValues : nil ,
} ,
2023-03-31 23:35:35 -07:00
"should return unchanged values on few case insensitive matchers" : {
2023-03-02 08:20:52 -08:00
input : orStringMatcher {
& equalStringMatcher { s : "FOO" , caseSensitive : false } ,
orStringMatcher {
& equalStringMatcher { s : "bAr" , caseSensitive : false } ,
} ,
& equalStringMatcher { s : "baZ" , caseSensitive : false } ,
} ,
2023-03-31 23:35:35 -07:00
expectedValues : [ ] string { "FOO" , "bAr" , "baZ" } ,
2023-03-02 08:20:52 -08:00
expectedCaseSensitive : false ,
} ,
}
for testName , testData := range tests {
t . Run ( testName , func ( t * testing . T ) {
actualMatcher := optimizeEqualStringMatchers ( testData . input , 0 )
if testData . expectedValues == nil {
require . IsType ( t , testData . input , actualMatcher )
} else {
2023-03-31 23:35:35 -07:00
require . IsType ( t , & equalMultiStringSliceMatcher { } , actualMatcher )
require . Equal ( t , testData . expectedValues , actualMatcher . ( * equalMultiStringSliceMatcher ) . values )
require . Equal ( t , testData . expectedCaseSensitive , actualMatcher . ( * equalMultiStringSliceMatcher ) . caseSensitive )
}
} )
}
}
// TestNewEqualMultiStringMatcher fills a matcher created by
// newEqualMultiStringMatcher and inspects its concrete type and stored values.
// Cases that set expectedValuesList expect an *equalMultiStringSliceMatcher,
// cases that set expectedValuesMap expect an *equalMultiStringMapMatcher.
func TestNewEqualMultiStringMatcher(t *testing.T) {
	type testCase struct {
		values             []string
		caseSensitive      bool
		expectedValuesMap  map[string]struct{}
		expectedValuesList []string
	}

	cases := map[string]testCase{
		"few case sensitive values": {
			values:             []string{"a", "B"},
			caseSensitive:      true,
			expectedValuesList: []string{"a", "B"},
		},
		"few case insensitive values": {
			values:             []string{"a", "B"},
			caseSensitive:      false,
			expectedValuesList: []string{"a", "B"},
		},
		"many case sensitive values": {
			values:            []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
			caseSensitive:     true,
			expectedValuesMap: map[string]struct{}{"a": {}, "B": {}, "c": {}, "D": {}, "e": {}, "F": {}, "g": {}, "H": {}, "i": {}, "L": {}, "m": {}, "N": {}, "o": {}, "P": {}, "q": {}, "r": {}},
		},
		"many case insensitive values": {
			values:            []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
			caseSensitive:     false,
			expectedValuesMap: map[string]struct{}{"a": {}, "b": {}, "c": {}, "d": {}, "e": {}, "f": {}, "g": {}, "h": {}, "i": {}, "l": {}, "m": {}, "n": {}, "o": {}, "p": {}, "q": {}, "r": {}},
		},
	}

	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			// Build the matcher sized for the number of values, then add them all.
			built := newEqualMultiStringMatcher(tc.caseSensitive, len(tc.values))
			for idx := range tc.values {
				built.add(tc.values[idx])
			}

			// Map-backed expectation.
			if tc.expectedValuesMap != nil {
				require.IsType(t, &equalMultiStringMapMatcher{}, built)
				asMap := built.(*equalMultiStringMapMatcher)
				require.Equal(t, tc.expectedValuesMap, asMap.values)
				require.Equal(t, tc.caseSensitive, asMap.caseSensitive)
			}

			// Slice-backed expectation.
			if tc.expectedValuesList != nil {
				require.IsType(t, &equalMultiStringSliceMatcher{}, built)
				asSlice := built.(*equalMultiStringSliceMatcher)
				require.Equal(t, tc.expectedValuesList, asSlice.values)
				require.Equal(t, tc.caseSensitive, asSlice.caseSensitive)
			}
		})
	}
}
func TestEqualMultiStringMatcher_Matches ( t * testing . T ) {
tests := map [ string ] struct {
values [ ] string
caseSensitive bool
expectedMatches [ ] string
expectedNotMatches [ ] string
} {
"few case sensitive values" : {
values : [ ] string { "a" , "B" } ,
caseSensitive : true ,
expectedMatches : [ ] string { "a" , "B" } ,
expectedNotMatches : [ ] string { "A" , "b" } ,
} ,
"few case insensitive values" : {
values : [ ] string { "a" , "B" } ,
caseSensitive : false ,
expectedMatches : [ ] string { "a" , "A" , "b" , "B" } ,
expectedNotMatches : [ ] string { "c" , "C" } ,
} ,
"many case sensitive values" : {
values : [ ] string { "a" , "B" , "c" , "D" , "e" , "F" , "g" , "H" , "i" , "L" , "m" , "N" , "o" , "P" , "q" , "r" } ,
caseSensitive : true ,
expectedMatches : [ ] string { "a" , "B" } ,
expectedNotMatches : [ ] string { "A" , "b" } ,
} ,
"many case insensitive values" : {
values : [ ] string { "a" , "B" , "c" , "D" , "e" , "F" , "g" , "H" , "i" , "L" , "m" , "N" , "o" , "P" , "q" , "r" } ,
caseSensitive : false ,
expectedMatches : [ ] string { "a" , "A" , "b" , "B" } ,
expectedNotMatches : [ ] string { "x" , "X" } ,
} ,
}
for testName , testData := range tests {
t . Run ( testName , func ( t * testing . T ) {
matcher := newEqualMultiStringMatcher ( testData . caseSensitive , len ( testData . values ) )
for _ , v := range testData . values {
matcher . add ( v )
}
for _ , v := range testData . expectedMatches {
require . True ( t , matcher . Matches ( v ) , "value: %s" , v )
}
for _ , v := range testData . expectedNotMatches {
require . False ( t , matcher . Matches ( v ) , "value: %s" , v )
2023-03-02 08:20:52 -08:00
}
} )
}
}
2023-09-28 08:36:37 -07:00
func TestFindEqualStringMatchers ( t * testing . T ) {
type match struct {
s string
caseSensitive bool
}
// Utility to call findEqualStringMatchers() and collect all callback invocations.
findEqualStringMatchersAndCollectMatches := func ( input StringMatcher ) ( matches [ ] match , ok bool ) {
ok = findEqualStringMatchers ( input , func ( matcher * equalStringMatcher ) bool {
matches = append ( matches , match { matcher . s , matcher . caseSensitive } )
return true
} )
return
}
t . Run ( "empty matcher" , func ( t * testing . T ) {
actualMatches , actualOk := findEqualStringMatchersAndCollectMatches ( emptyStringMatcher { } )
2023-10-06 06:44:17 -07:00
require . False ( t , actualOk )
require . Empty ( t , actualMatches )
2023-09-28 08:36:37 -07:00
} )
t . Run ( "concat of literal matchers (case sensitive)" , func ( t * testing . T ) {
actualMatches , actualOk := findEqualStringMatchersAndCollectMatches (
orStringMatcher {
& equalStringMatcher { s : "test-1" , caseSensitive : true } ,
& equalStringMatcher { s : "test-2" , caseSensitive : true } ,
} ,
)
2023-10-06 06:44:17 -07:00
require . True ( t , actualOk )
require . Equal ( t , [ ] match { { "test-1" , true } , { "test-2" , true } } , actualMatches )
2023-09-28 08:36:37 -07:00
} )
t . Run ( "concat of literal matchers (case insensitive)" , func ( t * testing . T ) {
actualMatches , actualOk := findEqualStringMatchersAndCollectMatches (
orStringMatcher {
& equalStringMatcher { s : "test-1" , caseSensitive : false } ,
& equalStringMatcher { s : "test-2" , caseSensitive : false } ,
} ,
)
2023-10-06 06:44:17 -07:00
require . True ( t , actualOk )
require . Equal ( t , [ ] match { { "test-1" , false } , { "test-2" , false } } , actualMatches )
2023-09-28 08:36:37 -07:00
} )
t . Run ( "concat of literal matchers (mixed case)" , func ( t * testing . T ) {
actualMatches , actualOk := findEqualStringMatchersAndCollectMatches (
orStringMatcher {
& equalStringMatcher { s : "test-1" , caseSensitive : false } ,
& equalStringMatcher { s : "test-2" , caseSensitive : true } ,
} ,
)
2023-10-06 06:44:17 -07:00
require . True ( t , actualOk )
require . Equal ( t , [ ] match { { "test-1" , false } , { "test-2" , true } } , actualMatches )
2023-09-28 08:36:37 -07:00
} )
}
2023-03-02 08:20:52 -08:00
// This benchmark is used to find a good threshold to use to apply the optimization
2023-11-02 13:09:38 -07:00
// done by optimizeEqualStringMatchers().
2023-03-02 08:20:52 -08:00
func BenchmarkOptimizeEqualStringMatchers ( b * testing . B ) {
2023-03-09 00:38:41 -08:00
randGenerator := rand . New ( rand . NewSource ( time . Now ( ) . UnixNano ( ) ) )
2023-03-02 08:20:52 -08:00
// Generate variable lengths random texts to match against.
2023-03-09 00:38:41 -08:00
texts := append ( [ ] string { } , randStrings ( randGenerator , 10 , 10 ) ... )
texts = append ( texts , randStrings ( randGenerator , 5 , 30 ) ... )
texts = append ( texts , randStrings ( randGenerator , 1 , 100 ) ... )
2023-03-02 08:20:52 -08:00
for numAlternations := 2 ; numAlternations <= 256 ; numAlternations *= 2 {
for _ , caseSensitive := range [ ] bool { true , false } {
b . Run ( fmt . Sprintf ( "alternations: %d case sensitive: %t" , numAlternations , caseSensitive ) , func ( b * testing . B ) {
// Generate a regex with the expected number of alternations.
2023-03-09 00:38:41 -08:00
re := strings . Join ( randStrings ( randGenerator , numAlternations , 10 ) , "|" )
2023-03-02 08:20:52 -08:00
if ! caseSensitive {
re = "(?i:(" + re + "))"
}
parsed , err := syntax . Parse ( re , syntax . Perl )
require . NoError ( b , err )
unoptimized := stringMatcherFromRegexpInternal ( parsed )
require . IsType ( b , orStringMatcher { } , unoptimized )
optimized := optimizeEqualStringMatchers ( unoptimized , 0 )
2023-10-02 03:00:18 -07:00
if numAlternations < minEqualMultiStringMatcherMapThreshold {
require . IsType ( b , & equalMultiStringSliceMatcher { } , optimized )
} else {
require . IsType ( b , & equalMultiStringMapMatcher { } , optimized )
}
2023-03-02 08:20:52 -08:00
b . Run ( "without optimizeEqualStringMatchers()" , func ( b * testing . B ) {
for n := 0 ; n < b . N ; n ++ {
for _ , t := range texts {
unoptimized . Matches ( t )
}
}
} )
b . Run ( "with optimizeEqualStringMatchers()" , func ( b * testing . B ) {
for n := 0 ; n < b . N ; n ++ {
for _ , t := range texts {
optimized . Matches ( t )
}
}
} )
} )
}
}
}
2023-09-29 05:12:41 -07:00
func TestZeroOrOneCharacterStringMatcher ( t * testing . T ) {
matcher := & zeroOrOneCharacterStringMatcher { matchNL : true }
2023-10-06 06:44:17 -07:00
require . True ( t , matcher . Matches ( "" ) )
require . True ( t , matcher . Matches ( "x" ) )
require . True ( t , matcher . Matches ( "\n" ) )
require . False ( t , matcher . Matches ( "xx" ) )
require . False ( t , matcher . Matches ( "\n\n" ) )
2023-09-29 05:12:41 -07:00
matcher = & zeroOrOneCharacterStringMatcher { matchNL : false }
2023-10-06 06:44:17 -07:00
require . True ( t , matcher . Matches ( "" ) )
require . True ( t , matcher . Matches ( "x" ) )
require . False ( t , matcher . Matches ( "\n" ) )
require . False ( t , matcher . Matches ( "xx" ) )
require . False ( t , matcher . Matches ( "\n\n" ) )
2023-09-29 05:12:41 -07:00
}
2023-09-28 08:36:37 -07:00
func TestLiteralPrefixStringMatcher ( t * testing . T ) {
m := & literalPrefixStringMatcher { prefix : "mar" , prefixCaseSensitive : true , right : & emptyStringMatcher { } }
2023-10-06 06:44:17 -07:00
require . True ( t , m . Matches ( "mar" ) )
require . False ( t , m . Matches ( "marco" ) )
require . False ( t , m . Matches ( "ma" ) )
require . False ( t , m . Matches ( "mAr" ) )
2023-09-28 08:36:37 -07:00
m = & literalPrefixStringMatcher { prefix : "mar" , prefixCaseSensitive : false , right : & emptyStringMatcher { } }
2023-10-06 06:44:17 -07:00
require . True ( t , m . Matches ( "mar" ) )
require . False ( t , m . Matches ( "marco" ) )
require . False ( t , m . Matches ( "ma" ) )
require . True ( t , m . Matches ( "mAr" ) )
2023-09-28 08:36:37 -07:00
m = & literalPrefixStringMatcher { prefix : "mar" , prefixCaseSensitive : true , right : & equalStringMatcher { s : "co" , caseSensitive : false } }
2023-10-06 06:44:17 -07:00
require . True ( t , m . Matches ( "marco" ) )
require . True ( t , m . Matches ( "marCO" ) )
require . False ( t , m . Matches ( "MARco" ) )
require . False ( t , m . Matches ( "mar" ) )
require . False ( t , m . Matches ( "marcopracucci" ) )
2023-09-28 08:36:37 -07:00
}
func TestLiteralSuffixStringMatcher ( t * testing . T ) {
m := & literalSuffixStringMatcher { left : & emptyStringMatcher { } , suffix : "co" , suffixCaseSensitive : true }
2023-10-06 06:44:17 -07:00
require . True ( t , m . Matches ( "co" ) )
require . False ( t , m . Matches ( "marco" ) )
require . False ( t , m . Matches ( "coo" ) )
require . False ( t , m . Matches ( "Co" ) )
2023-09-28 08:36:37 -07:00
m = & literalSuffixStringMatcher { left : & emptyStringMatcher { } , suffix : "co" , suffixCaseSensitive : false }
2023-10-06 06:44:17 -07:00
require . True ( t , m . Matches ( "co" ) )
require . False ( t , m . Matches ( "marco" ) )
require . False ( t , m . Matches ( "coo" ) )
require . True ( t , m . Matches ( "Co" ) )
2023-09-28 08:36:37 -07:00
m = & literalSuffixStringMatcher { left : & equalStringMatcher { s : "mar" , caseSensitive : false } , suffix : "co" , suffixCaseSensitive : true }
2023-10-06 06:44:17 -07:00
require . True ( t , m . Matches ( "marco" ) )
require . True ( t , m . Matches ( "MARco" ) )
require . False ( t , m . Matches ( "marCO" ) )
require . False ( t , m . Matches ( "mar" ) )
require . False ( t , m . Matches ( "marcopracucci" ) )
2023-09-28 08:36:37 -07:00
m = & literalSuffixStringMatcher { left : & equalStringMatcher { s : "mar" , caseSensitive : false } , suffix : "co" , suffixCaseSensitive : false }
2023-10-06 06:44:17 -07:00
require . True ( t , m . Matches ( "marco" ) )
require . True ( t , m . Matches ( "MARco" ) )
require . True ( t , m . Matches ( "marCO" ) )
require . False ( t , m . Matches ( "mar" ) )
require . False ( t , m . Matches ( "marcopracucci" ) )
2023-09-28 08:36:37 -07:00
}
func TestHasPrefixCaseInsensitive ( t * testing . T ) {
2023-10-06 06:44:17 -07:00
require . True ( t , hasPrefixCaseInsensitive ( "marco" , "mar" ) )
require . True ( t , hasPrefixCaseInsensitive ( "mArco" , "mar" ) )
require . True ( t , hasPrefixCaseInsensitive ( "marco" , "MaR" ) )
require . True ( t , hasPrefixCaseInsensitive ( "marco" , "marco" ) )
require . True ( t , hasPrefixCaseInsensitive ( "mArco" , "marco" ) )
require . False ( t , hasPrefixCaseInsensitive ( "marco" , "a" ) )
require . False ( t , hasPrefixCaseInsensitive ( "marco" , "abcdefghi" ) )
2023-09-28 08:36:37 -07:00
}
func TestHasSuffixCaseInsensitive ( t * testing . T ) {
2023-10-06 06:44:17 -07:00
require . True ( t , hasSuffixCaseInsensitive ( "marco" , "rco" ) )
require . True ( t , hasSuffixCaseInsensitive ( "marco" , "RcO" ) )
require . True ( t , hasSuffixCaseInsensitive ( "marco" , "marco" ) )
require . False ( t , hasSuffixCaseInsensitive ( "marco" , "a" ) )
require . False ( t , hasSuffixCaseInsensitive ( "marco" , "abcdefghi" ) )
2023-09-28 08:36:37 -07:00
}
2023-03-02 08:20:52 -08:00
// getTestNameFromRegexp returns a short label derived from re: the input
// itself when it is at most 32 bytes long, otherwise a prefix of at most 32
// bytes.
//
// The prefix never ends in the middle of a multi-byte UTF-8 character, so a
// valid UTF-8 input always yields a valid UTF-8 result (which may then be
// up to 3 bytes shorter than the limit).
func getTestNameFromRegexp(re string) string {
	const maxLen = 32

	if len(re) <= maxLen {
		return re
	}

	// Cutting at a fixed byte offset can land inside a multi-byte sequence.
	// If the first dropped byte is a UTF-8 continuation byte (10xxxxxx), move
	// the cut back to the start of that character. A character has at most 3
	// continuation bytes, which bounds how far we need to back up.
	end := maxLen
	for end > maxLen-3 && re[end]&0xC0 == 0x80 {
		end--
	}
	return re[:end]
}
2023-09-28 08:36:37 -07:00
// generateRandomValues returns a set of random strings to match against,
// followed by one random string prefixed with "foo" and one random string
// suffixed with "foo". The generator is seeded with a constant so the values
// are the same on every run.
func generateRandomValues() []string {
	rng := rand.New(rand.NewSource(1))

	// Batches of random texts; each batch uses a different string length.
	batches := []struct {
		count  int
		length int
	}{
		{10, 10},
		{5, 30},
		{1, 100},
	}

	var values []string
	for _, batch := range batches {
		values = append(values, randStrings(rng, batch.count, batch.length)...)
	}

	// The generator is consumed in a fixed order: first the value starting
	// with "foo", then the value ending with it.
	withPrefix := "foo" + randString(rng, 50)
	withSuffix := randString(rng, 50) + "foo"

	return append(values, withPrefix, withSuffix)
}
// visitStringMatcher calls callback on matcher and then recurses into the
// matchers nested inside it, so a parent is always reported before its
// children.
func visitStringMatcher(matcher StringMatcher, callback func(matcher StringMatcher)) {
	callback(matcher)

	switch m := matcher.(type) {
	case orStringMatcher:
		for _, alternative := range m {
			visitStringMatcher(alternative, callback)
		}

	case *literalPrefixStringMatcher:
		visitStringMatcher(m.right, callback)

	case *literalSuffixStringMatcher:
		visitStringMatcher(m.left, callback)

	case *containsStringMatcher:
		// Either side may be unset; only descend into the ones present.
		if m.left != nil {
			visitStringMatcher(m.left, callback)
		}
		if m.right != nil {
			visitStringMatcher(m.right, callback)
		}

	case emptyStringMatcher,
		*equalStringMatcher,
		*equalMultiStringSliceMatcher,
		*equalMultiStringMapMatcher,
		anyStringWithoutNewlineMatcher,
		*anyNonEmptyStringMatcher,
		trueMatcher:
		// No nested matchers for the following ones: nothing more to visit.
	}
}