prometheus/model/textparse/promlex.l
Owen Williams a28d7865ad UTF-8: Add support for parsing UTF8 metric and label names
This adds support for the new grammar of `{"metric_name", "l1"="val"}` to promql and some of the exposition formats.
This grammar will also be valid for non-UTF-8 names.
UTF-8 names will not be considered valid unless model.NameValidationScheme is changed.

This does not update the go expfmt parser in text_parse.go, which will be addressed by https://github.com/prometheus/common/issues/554/.

Part of https://github.com/prometheus/prometheus/issues/13095

Signed-off-by: Owen Williams <owen.williams@grafana.com>
2024-02-15 14:34:37 -05:00
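For reference, the quoted-name grammar described in the commit message above accepts exposition lines such as the following (an illustrative sample with made-up metric and label names, accepted only once UTF-8 name validation is enabled via model.NameValidationScheme):

{"http.requests.total", "path.name"="/metrics"} 1027 1395066363000

The same series written with legacy-compatible names:

http_requests_total{path_name="/metrics"} 1027 1395066363000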

%{
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package textparse

import (
	"fmt"
)
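
// Lexer states. The current state is kept in l.state and selects which
// start-condition rules below apply (see the %yyt directive).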
const (
	sInit = iota
	sComment
	sMeta1
	sMeta2
	sLabels
	sLValue
	sValue
	sTimestamp
	sExemplar
	sEValue
	sETimestamp
)
// Lex is called by the parser generated by "go tool yacc" to obtain each
// token. The method is opened before the matching rules block and closed at
// the end of the file.
func (l *promlexer) Lex() token {
	if l.i >= len(l.b) {
		return tEOF
	}

	c := l.b[l.i]
	l.start = l.i
%}
D     [0-9]
L     [a-zA-Z_]
M     [a-zA-Z_:]
C     [^\n]

%x sComment sMeta1 sMeta2 sLabels sLValue sValue sTimestamp

%yyc c
%yyn c = l.next()
%yyt l.state

%%
\0                                    return tEOF
\n                                    l.state = sInit; return tLinebreak
<*>[ \t]+                             return tWhitespace

#[ \t]+                               l.state = sComment
#                                     return l.consumeComment()

<sComment>HELP[\t ]+                  l.state = sMeta1; return tHelp
<sComment>TYPE[\t ]+                  l.state = sMeta1; return tType
<sMeta1>\"(\\.|[^\\"])*\"             l.state = sMeta2; return tMName
<sMeta1>{M}({M}|{D})*                 l.state = sMeta2; return tMName
<sMeta2>{C}*                          l.state = sInit; return tText

{M}({M}|{D})*                         l.state = sValue; return tMName
<sValue>\{                            l.state = sLabels; return tBraceOpen
\{                                    l.state = sLabels; return tBraceOpen
<sLabels>{L}({L}|{D})*                return tLName
<sLabels>\"(\\.|[^\\"])*\"            l.state = sLabels; return tQString
<sLabels>\}                           l.state = sValue; return tBraceClose
<sLabels>=                            l.state = sLValue; return tEqual
<sLabels>,                            return tComma
<sLValue>\"(\\.|[^\\"])*\"            l.state = sLabels; return tLValue
<sValue>[^{ \t\n]+                    l.state = sTimestamp; return tValue
<sTimestamp>{D}+                      return tTimestamp
<sTimestamp>\n                        l.state = sInit; return tLinebreak
%%
	// Workaround to gobble up comments that started with a HELP or TYPE
	// prefix. We just consume all characters until we reach a newline.
	// This saves us from adding disproportionate complexity to the parser.
	if l.state == sComment {
		return l.consumeComment()
	}

	return tInvalid
}
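
// consumeComment scans to the end of the current line (or the end of the
// input) so that a full-line comment is returned as a single tComment token.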
func (l *promlexer) consumeComment() token {
	for c := l.cur(); ; c = l.next() {
		switch c {
		case 0:
			return tEOF
		case '\n':
			l.state = sInit
			return tComment
		}
	}
}
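
As a companion to the commit message above, the following is a minimal, illustrative Go sketch of how a consumer might opt in to UTF-8 names. It assumes the NameValidationScheme variable referenced in the commit message and a UTF8Validation value in github.com/prometheus/common/model; the exact identifiers and defaults may differ between versions.

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

func main() {
	// Assumption: the legacy scheme is the default and rejects names outside
	// the legacy character set; switching the package-level scheme opts in to
	// the UTF-8 grammar lexed by this file.
	model.NameValidationScheme = model.UTF8Validation

	// With UTF-8 validation enabled, dotted names should be reported as valid.
	fmt.Println(model.LabelName("label.with.dots").IsValid())
}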