pkg/textparse: refactor and add metadata handling

Extends the parser to allow retrieving metadata.
The lexer now yields proper tokens that are fed into a hand-written
parser on top.

Signed-off-by: Fabian Reinartz <freinartz@google.com>
This commit is contained in:
Fabian Reinartz 2018-05-14 16:19:53 -04:00
parent 7e376dfc89
commit 76a4a46cb0
5 changed files with 762 additions and 471 deletions

View file

@ -15,48 +15,38 @@
package textparse
import (
"fmt"
"math"
"strconv"
"unicode/utf8"
"github.com/prometheus/prometheus/pkg/value"
"fmt"
)
const (
lstateInit = iota
lstateName
lstateValue
lstateTimestamp
lstateLabels
lstateLName
lstateLEq
lstateLValue
lstateLValueIn
sInit = iota
sComment
sMeta1
sMeta2
sLabels
sLValue
sValue
sTimestamp
)
// Lex is called by the parser generated by "go tool yacc" to obtain each
// token. The method is opened before the matching rules block and closed at
// the end of the file.
func (l *lexer) Lex() int {
l.state = lstateInit
func (l *lexer) Lex() token {
if l.i >= len(l.b) {
return eof
return tEOF
}
c := l.b[l.i]
l.start = l.i
l.ts = nil
l.mstart = l.nextMstart
l.offsets = l.offsets[:0]
%}
D [0-9]
L [a-zA-Z_]
M [a-zA-Z_:]
C [^\n]
%x lstateName lstateValue lstateTimestamp lstateLabels lstateLName lstateLEq lstateLValue lstateLValueIn
%x sComment sMeta1 sMeta2 sLabels sLValue sValue sTimestamp
%yyc c
%yyn c = l.next()
@ -65,65 +55,46 @@ M [a-zA-Z_:]
%%
\0 return eof
#[^\r\n]*\n l.mstart = l.i
[\r\n \t]+ l.mstart = l.i
\0 return tEOF
\n l.state = sInit; return tLinebreak
<*>[ \t]+ return tWhitespace
{M}({M}|{D})* l.state = lstateName
l.offsets = append(l.offsets, l.i)
l.mend = l.i
#[ \t]+ l.state = sComment
# return l.consumeComment()
<sComment>HELP[\t ]+ l.state = sMeta1; return tHelp
<sComment>TYPE[\t ]+ l.state = sMeta1; return tType
<sMeta1>{M}({M}|{D})* l.state = sMeta2; return tMName
<sMeta2>{C}+ l.state = sInit; return tText
<lstateName>([ \t]*)\{ l.state = lstateLabels
<lstateName>[ \t]+ l.state = lstateValue
l.vstart = l.i
<lstateLabels>[ \t]+
<lstateLabels>,?\} l.state = lstateValue
l.mend = l.i
<lstateLabels>(,?[ \t]*) l.state = lstateLName
l.offsets = append(l.offsets, l.i)
<lstateLName>{L}({L}|{D})* l.state = lstateLEq
l.offsets = append(l.offsets, l.i)
<lstateLEq>[ \t]*= l.state = lstateLValue
<lstateLValue>[ \t]+
<lstateLValue>\" l.state = lstateLValueIn
l.offsets = append(l.offsets, l.i)
<lstateLValueIn>(\\.|[^\\"])*\" l.state = lstateLabels
if !utf8.Valid(l.b[l.offsets[len(l.offsets)-1]:l.i-1]) {
l.err = fmt.Errorf("invalid UTF-8 label value")
return -1
}
l.offsets = append(l.offsets, l.i-1)
<lstateValue>[ \t]+ l.vstart = l.i
<lstateValue>(NaN) l.val = math.Float64frombits(value.NormalNaN)
l.state = lstateTimestamp
<lstateValue>[^\n \t\r]+ // We don't parse strictly correct floats as the conversion
// repeats the effort anyway.
l.val, l.err = strconv.ParseFloat(yoloString(l.b[l.vstart:l.i]), 64)
if l.err != nil {
return -1
}
l.state = lstateTimestamp
<lstateTimestamp>[ \t]+ l.tstart = l.i
<lstateTimestamp>{D}+ ts, err := strconv.ParseInt(yoloString(l.b[l.tstart:l.i]), 10, 64)
if err != nil {
l.err = err
return -1
}
l.ts = &ts
<lstateTimestamp>[\r\n]+ l.nextMstart = l.i
return 1
<lstateTimestamp>\0 return 1
{M}({M}|{D})* l.state = sValue; return tMName
<sValue>\{ l.state = sLabels; return tBraceOpen
<sLabels>{L}({L}|{D})* return tLName
<sLabels>\} l.state = sValue; return tBraceClose
<sLabels>= l.state = sLValue; return tEqual
<sLabels>, return tComma
<sLValue>\"(\\.|[^\\"])*\" l.state = sLabels; return tLValue
<sValue>[^{ \t\n]+ l.state = sTimestamp; return tValue
<sTimestamp>{D}+ return tTimestamp
<sTimestamp>\n l.state = sInit; return tLinebreak
%%
l.err = fmt.Errorf("no token found")
return -1
// Workaround to gobble up comments that started with a HELP or TYPE
// prefix. We just consume all characters until we reach a newline.
// This saves us from adding disproportionate complexity to the parser.
if l.state == sComment {
return l.consumeComment()
}
return tInvalid
}
// consumeComment reads until the end of the current line (or the end of
// input) and reports everything seen as a single tComment token. On a
// newline the lexer state is reset to sInit so normal lexing resumes.
func (l *lexer) consumeComment() token {
	c := l.cur()
	for {
		if c == 0 {
			return tEOF
		}
		if c == '\n' {
			l.state = sInit
			return tComment
		}
		c = l.next()
	}
}

View file

@ -17,39 +17,28 @@ package textparse
import (
"fmt"
"math"
"strconv"
"unicode/utf8"
"github.com/prometheus/prometheus/pkg/value"
)
const (
lstateInit = iota
lstateName
lstateValue
lstateTimestamp
lstateLabels
lstateLName
lstateLEq
lstateLValue
lstateLValueIn
sInit = iota
sComment
sMeta1
sMeta2
sLabels
sLValue
sValue
sTimestamp
)
// Lex is called by the parser generated by "go tool yacc" to obtain each
// token. The method is opened before the matching rules block and closed at
// the end of the file.
func (l *lexer) Lex() int {
l.state = lstateInit
func (l *lexer) Lex() token {
if l.i >= len(l.b) {
return eof
return tEOF
}
c := l.b[l.i]
l.ts = nil
l.mstart = l.nextMstart
l.offsets = l.offsets[:0]
l.start = l.i
yystate0:
@ -58,22 +47,20 @@ yystate0:
panic(fmt.Errorf(`invalid start condition %d`, yyt))
case 0: // start condition: INITIAL
goto yystart1
case 1: // start condition: lstateName
goto yystart7
case 2: // start condition: lstateValue
goto yystart10
case 3: // start condition: lstateTimestamp
goto yystart16
case 4: // start condition: lstateLabels
case 1: // start condition: sComment
goto yystart8
case 2: // start condition: sMeta1
goto yystart19
case 3: // start condition: sMeta2
goto yystart21
case 5: // start condition: lstateLName
goto yystart26
case 6: // start condition: lstateLEq
goto yystart28
case 7: // start condition: lstateLValue
goto yystart31
case 8: // start condition: lstateLValueIn
goto yystart34
case 4: // start condition: sLabels
goto yystart24
case 5: // start condition: sLValue
goto yystart29
case 6: // start condition: sValue
goto yystart33
case 7: // start condition: sTimestamp
goto yystart36
}
goto yystate0 // silence unused label error
@ -85,10 +72,12 @@ yystart1:
default:
goto yyabort
case c == '#':
goto yystate4
goto yystate5
case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
goto yystate6
case c == '\t' || c == '\n' || c == '\r' || c == ' ':
goto yystate7
case c == '\n':
goto yystate4
case c == '\t' || c == ' ':
goto yystate3
case c == '\x00':
goto yystate2
@ -103,74 +92,71 @@ yystate3:
switch {
default:
goto yyrule3
case c == '\t' || c == '\n' || c == '\r' || c == ' ':
case c == '\t' || c == ' ':
goto yystate3
}
yystate4:
c = l.next()
switch {
default:
goto yyabort
case c == '\n':
goto yystate5
case c >= '\x01' && c <= '\t' || c == '\v' || c == '\f' || c >= '\x0e' && c <= 'ÿ':
goto yystate4
}
goto yyrule2
yystate5:
c = l.next()
goto yyrule2
switch {
default:
goto yyrule5
case c == '\t' || c == ' ':
goto yystate6
}
yystate6:
c = l.next()
switch {
default:
goto yyrule4
case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
case c == '\t' || c == ' ':
goto yystate6
}
goto yystate7 // silence unused label error
yystate7:
c = l.next()
yystart7:
switch {
default:
goto yyrule10
case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
goto yystate7
}
goto yystate8 // silence unused label error
yystate8:
c = l.next()
yystart8:
switch {
default:
goto yyabort
case c == '\t' || c == ' ':
goto yystate8
case c == '{':
case c == 'H':
goto yystate9
}
yystate8:
c = l.next()
switch {
default:
goto yyrule6
case c == 'T':
goto yystate14
case c == '\t' || c == ' ':
goto yystate8
case c == '{':
goto yystate9
goto yystate3
}
yystate9:
c = l.next()
goto yyrule5
goto yystate10 // silence unused label error
yystate10:
c = l.next()
yystart10:
switch {
default:
goto yyabort
case c == 'N':
goto yystate13
case c == '\t' || c == ' ':
goto yystate12
case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'M' || c >= 'O' && c <= 'ÿ':
case c == 'E':
goto yystate10
}
yystate10:
c = l.next()
switch {
default:
goto yyabort
case c == 'L':
goto yystate11
}
@ -178,96 +164,93 @@ yystate11:
c = l.next()
switch {
default:
goto yyrule17
case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
goto yystate11
goto yyabort
case c == 'P':
goto yystate12
}
yystate12:
c = l.next()
switch {
default:
goto yyrule15
goto yyabort
case c == '\t' || c == ' ':
goto yystate12
goto yystate13
}
yystate13:
c = l.next()
switch {
default:
goto yyrule17
case c == 'a':
goto yystate14
case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= '`' || c >= 'b' && c <= 'ÿ':
goto yystate11
goto yyrule6
case c == '\t' || c == ' ':
goto yystate13
}
yystate14:
c = l.next()
switch {
default:
goto yyrule17
case c == 'N':
goto yyabort
case c == 'Y':
goto yystate15
case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'M' || c >= 'O' && c <= 'ÿ':
goto yystate11
}
yystate15:
c = l.next()
switch {
default:
goto yyrule16
case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
goto yystate11
goto yyabort
case c == 'P':
goto yystate16
}
goto yystate16 // silence unused label error
yystate16:
c = l.next()
yystart16:
switch {
default:
goto yyabort
case c == '\n' || c == '\r':
goto yystate19
case c == '\t' || c == ' ':
goto yystate18
case c == '\x00':
case c == 'E':
goto yystate17
case c >= '0' && c <= '9':
goto yystate20
}
yystate17:
c = l.next()
goto yyrule21
switch {
default:
goto yyabort
case c == '\t' || c == ' ':
goto yystate18
}
yystate18:
c = l.next()
switch {
default:
goto yyrule18
goto yyrule7
case c == '\t' || c == ' ':
goto yystate18
}
goto yystate19 // silence unused label error
yystate19:
c = l.next()
yystart19:
switch {
default:
goto yyrule20
case c == '\n' || c == '\r':
goto yystate19
goto yyabort
case c == ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
goto yystate20
case c == '\t' || c == ' ':
goto yystate3
}
yystate20:
c = l.next()
switch {
default:
goto yyrule19
case c >= '0' && c <= '9':
goto yyrule8
case c >= '0' && c <= ':' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
goto yystate20
}
@ -277,21 +260,19 @@ yystate21:
yystart21:
switch {
default:
goto yyrule9
case c == ',':
goto yystate23
goto yyabort
case c == '\t' || c == ' ':
goto yystate23
case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
goto yystate22
case c == '}':
goto yystate25
}
yystate22:
c = l.next()
switch {
default:
goto yyrule7
case c == '\t' || c == ' ':
goto yyrule9
case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
goto yystate22
}
@ -299,269 +280,271 @@ yystate23:
c = l.next()
switch {
default:
goto yyrule9
goto yyrule3
case c == '\t' || c == ' ':
goto yystate24
case c == '}':
goto yystate25
goto yystate23
case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'ÿ':
goto yystate22
}
goto yystate24 // silence unused label error
yystate24:
c = l.next()
yystart24:
switch {
default:
goto yyrule9
goto yyabort
case c == ',':
goto yystate25
case c == '=':
goto yystate26
case c == '\t' || c == ' ':
goto yystate24
goto yystate3
case c == '}':
goto yystate28
case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
goto yystate27
}
yystate25:
c = l.next()
goto yyrule8
goto yyrule15
goto yystate26 // silence unused label error
yystate26:
c = l.next()
yystart26:
switch {
default:
goto yyabort
case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
goto yystate27
}
goto yyrule14
yystate27:
c = l.next()
switch {
default:
goto yyrule10
goto yyrule12
case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z':
goto yystate27
}
goto yystate28 // silence unused label error
yystate28:
c = l.next()
yystart28:
switch {
default:
goto yyabort
case c == '=':
goto yystate30
case c == '\t' || c == ' ':
goto yystate29
}
goto yyrule13
goto yystate29 // silence unused label error
yystate29:
c = l.next()
switch {
default:
goto yyabort
case c == '=':
goto yystate30
case c == '\t' || c == ' ':
goto yystate29
}
yystate30:
c = l.next()
goto yyrule11
goto yystate31 // silence unused label error
yystate31:
c = l.next()
yystart31:
yystart29:
switch {
default:
goto yyabort
case c == '"':
goto yystate33
goto yystate30
case c == '\t' || c == ' ':
goto yystate32
goto yystate3
}
yystate30:
c = l.next()
switch {
default:
goto yyabort
case c == '"':
goto yystate31
case c == '\\':
goto yystate32
case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
goto yystate30
}
yystate31:
c = l.next()
goto yyrule16
yystate32:
c = l.next()
switch {
default:
goto yyrule12
case c == '\t' || c == ' ':
goto yystate32
goto yyabort
case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
goto yystate30
}
goto yystate33 // silence unused label error
yystate33:
c = l.next()
goto yyrule13
goto yystate34 // silence unused label error
yystate34:
c = l.next()
yystart34:
yystart33:
switch {
default:
goto yyabort
case c == '"':
goto yystate36
case c == '\\':
goto yystate37
case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
case c == '\t' || c == ' ':
goto yystate3
case c == '{':
goto yystate35
case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'z' || c >= '|' && c <= 'ÿ':
goto yystate34
}
yystate34:
c = l.next()
switch {
default:
goto yyrule17
case c >= '\x01' && c <= '\b' || c >= '\v' && c <= '\x1f' || c >= '!' && c <= 'z' || c >= '|' && c <= 'ÿ':
goto yystate34
}
yystate35:
c = l.next()
goto yyrule11
goto yystate36 // silence unused label error
yystate36:
c = l.next()
yystart36:
switch {
default:
goto yyabort
case c == '"':
goto yystate36
case c == '\\':
case c == '\n':
goto yystate37
case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'ÿ':
goto yystate35
case c == '\t' || c == ' ':
goto yystate3
case c >= '0' && c <= '9':
goto yystate38
}
yystate36:
c = l.next()
goto yyrule14
yystate37:
c = l.next()
goto yyrule19
yystate38:
c = l.next()
switch {
default:
goto yyabort
case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ':
goto yystate35
goto yyrule18
case c >= '0' && c <= '9':
goto yystate38
}
yyrule1: // \0
{
return eof
return tEOF
}
yyrule2: // #[^\r\n]*\n
yyrule2: // \n
{
l.mstart = l.i
l.state = sInit
return tLinebreak
goto yystate0
}
yyrule3: // [\r\n \t]+
yyrule3: // [ \t]+
{
l.mstart = l.i
return tWhitespace
}
yyrule4: // #[ \t]+
{
l.state = sComment
goto yystate0
}
yyrule4: // {M}({M}|{D})*
yyrule5: // #
{
l.state = lstateName
l.offsets = append(l.offsets, l.i)
l.mend = l.i
return l.consumeComment()
}
yyrule6: // HELP[\t ]+
{
l.state = sMeta1
return tHelp
goto yystate0
}
yyrule5: // ([ \t]*)\{
yyrule7: // TYPE[\t ]+
{
l.state = lstateLabels
l.state = sMeta1
return tType
goto yystate0
}
yyrule6: // [ \t]+
yyrule8: // {M}({M}|{D})*
{
l.state = lstateValue
l.vstart = l.i
l.state = sMeta2
return tMName
goto yystate0
}
yyrule7: // [ \t]+
goto yystate0
yyrule8: // ,?\}
yyrule9: // {C}+
{
l.state = lstateValue
l.mend = l.i
l.state = sInit
return tText
goto yystate0
}
yyrule9: // (,?[ \t]*)
yyrule10: // {M}({M}|{D})*
{
l.state = lstateLName
l.offsets = append(l.offsets, l.i)
l.state = sValue
return tMName
goto yystate0
}
yyrule10: // {L}({L}|{D})*
yyrule11: // \{
{
l.state = lstateLEq
l.offsets = append(l.offsets, l.i)
l.state = sLabels
return tBraceOpen
goto yystate0
}
yyrule11: // [ \t]*=
yyrule12: // {L}({L}|{D})*
{
l.state = lstateLValue
return tLName
}
yyrule13: // \}
{
l.state = sValue
return tBraceClose
goto yystate0
}
yyrule12: // [ \t]+
goto yystate0
yyrule13: // \"
yyrule14: // =
{
l.state = lstateLValueIn
l.offsets = append(l.offsets, l.i)
l.state = sLValue
return tEqual
goto yystate0
}
yyrule14: // (\\.|[^\\"])*\"
yyrule15: // ,
{
l.state = lstateLabels
if !utf8.Valid(l.b[l.offsets[len(l.offsets)-1] : l.i-1]) {
l.err = fmt.Errorf("invalid UTF-8 label value")
return -1
}
l.offsets = append(l.offsets, l.i-1)
return tComma
}
yyrule16: // \"(\\.|[^\\"])*\"
{
l.state = sLabels
return tLValue
goto yystate0
}
yyrule15: // [ \t]+
yyrule17: // [^{ \t\n]+
{
l.vstart = l.i
l.state = sTimestamp
return tValue
goto yystate0
}
yyrule16: // (NaN)
yyrule18: // {D}+
{
l.val = math.Float64frombits(value.NormalNaN)
l.state = lstateTimestamp
return tTimestamp
}
yyrule19: // \n
{
l.state = sInit
return tLinebreak
goto yystate0
}
yyrule17: // [^\n \t\r]+
{
// We don't parse strictly correct floats as the conversion
// repeats the effort anyway.
l.val, l.err = strconv.ParseFloat(yoloString(l.b[l.vstart:l.i]), 64)
if l.err != nil {
return -1
}
l.state = lstateTimestamp
goto yystate0
}
yyrule18: // [ \t]+
{
l.tstart = l.i
goto yystate0
}
yyrule19: // {D}+
{
ts, err := strconv.ParseInt(yoloString(l.b[l.tstart:l.i]), 10, 64)
if err != nil {
l.err = err
return -1
}
l.ts = &ts
goto yystate0
}
yyrule20: // [\r\n]+
{
l.nextMstart = l.i
return 1
}
yyrule21: // \0
{
return 1
}
panic("unreachable")
goto yyabort // silence unused label error
yyabort: // no lexem recognized
l.err = fmt.Errorf("no token found")
return -1
// Workaround to gobble up comments that started with a HELP or TYPE
// prefix. We just consume all characters until we reach a newline.
// This saves us from adding disproportionate complexity to the parser.
if l.state == sComment {
return l.consumeComment()
}
return tInvalid
}
// consumeComment consumes all remaining characters on the current line and
// reports them as a single tComment token. Used as a catch-all for comment
// lines that the generated lexer rules did not fully classify.
func (l *lexer) consumeComment() token {
	for c := l.cur(); ; c = l.next() {
		switch c {
		case 0:
			// next() yields 0 once the end of the buffer is reached.
			return tEOF
		case '\n':
			// Reset to the initial state so the next line lexes normally.
			l.state = sInit
			return tComment
		}
	}
}

View file

@ -19,45 +19,115 @@ package textparse
import (
"errors"
"fmt"
"io"
"math"
"sort"
"strconv"
"strings"
"unicode/utf8"
"unsafe"
"github.com/prometheus/prometheus/pkg/value"
"github.com/prometheus/prometheus/pkg/labels"
)
type lexer struct {
b []byte
i int
vstart int
tstart int
err error
val float64
ts *int64
offsets []int
mstart, mend int
nextMstart int
b []byte
i int
start int
err error
state int
}
const eof = 0
// token identifies a lexical token handed from the lexer to the parser.
type token int

const (
	tInvalid   token = -1 // lexing error or unrecognized input
	tEOF       token = 0  // end of input
	tLinebreak token = iota // NOTE(review): iota is 2 at this spec, so tLinebreak == 2; the values below continue from there
	tWhitespace
	tHelp
	tType
	tText
	tComment
	tBlank
	tMName
	tBraceOpen
	tBraceClose
	tLName
	tLValue
	tComma
	tEqual
	tTimestamp
	tValue
)
// String returns a human-readable name for the token, as used in
// parser error messages.
func (t token) String() string {
	names := map[token]string{
		tInvalid:    "INVALID",
		tEOF:        "EOF",
		tLinebreak:  "LINEBREAK",
		tWhitespace: "WHITESPACE",
		tHelp:       "HELP",
		tType:       "TYPE",
		tText:       "TEXT",
		tComment:    "COMMENT",
		tBlank:      "BLANK",
		tMName:      "MNAME",
		tBraceOpen:  "BOPEN",
		tBraceClose: "BCLOSE",
		tLName:      "LNAME",
		tLValue:     "LVALUE",
		tEqual:      "EQUAL",
		tComma:      "COMMA",
		tTimestamp:  "TIMESTAMP",
		tValue:      "VALUE",
	}
	if s, ok := names[t]; ok {
		return s
	}
	return fmt.Sprintf("<invalid: %d>", t)
}
// buf returns the bytes of the token currently being lexed: the input
// from the token's recorded start offset up to the current position.
func (l *lexer) buf() []byte {
	return l.b[l.start:l.i]
}

// cur returns the byte at the lexer's current position without advancing.
func (l *lexer) cur() byte {
	return l.b[l.i]
}
// next advances the lexer to the next character.
func (l *lexer) next() byte {
l.i++
if l.i >= len(l.b) {
l.err = io.EOF
return eof
return byte(tEOF)
}
c := l.b[l.i]
// Consume null byte when encountered in label-value.
if c == eof && (l.state == lstateLValueIn || l.state == lstateLValue) {
return l.next()
// Lex struggles with null bytes. If we are in a label value, where
// they are allowed, consume them here immediately.
for l.b[l.i] == 0 && l.state == sLValue {
l.i++
}
return c
return l.b[l.i]
}
func (l *lexer) Error(es string) {
@ -67,43 +137,50 @@ func (l *lexer) Error(es string) {
// Parser parses samples from a byte slice of samples in the official
// Prometheus text exposition format.
type Parser struct {
l *lexer
err error
val float64
l *lexer
series []byte
text []byte
mtype MetricType
val float64
ts int64
hasTS bool
start int
offsets []int
}
// New returns a new parser of the byte slice.
func New(b []byte) *Parser {
return &Parser{l: &lexer{b: b}}
return &Parser{l: &lexer{b: append(b, '\n')}}
}
// Next advances the parser to the next sample. It returns false if no
// more samples were read or an error occurred.
func (p *Parser) Next() bool {
switch p.l.Lex() {
case -1, eof:
return false
case 1:
return true
}
panic("unexpected")
}
// At returns the bytes of the metric, the timestamp if set, and the value
// Series returns the bytes of the series, the timestamp if set, and the value
// of the current sample.
func (p *Parser) At() ([]byte, *int64, float64) {
return p.l.b[p.l.mstart:p.l.mend], p.l.ts, p.l.val
func (p *Parser) Series() ([]byte, *int64, float64) {
if p.hasTS {
return p.series, &p.ts, p.val
}
return p.series, nil, p.val
}
// Err returns the current error.
func (p *Parser) Err() error {
if p.err != nil {
return p.err
}
if p.l.err == io.EOF {
return nil
}
return p.l.err
// Help returns the metric name and help text in the current entry.
// Must only be called after Next returned a help entry.
// The returned byte slices become invalid after the next call to Next.
func (p *Parser) Help() ([]byte, []byte) {
	// offsets[0]:offsets[1] span the metric name, recorded by Next
	// for metadata entries.
	return p.l.b[p.offsets[0]:p.offsets[1]], p.text
}

// Type returns the metric name and type in the current entry.
// Must only be called after Next returned a type entry.
// The returned byte slices become invalid after the next call to Next.
func (p *Parser) Type() ([]byte, MetricType) {
	return p.l.b[p.offsets[0]:p.offsets[1]], p.mtype
}

// Comment returns the text of the current comment.
// Must only be called after Next returned a comment entry.
// The returned byte slice becomes invalid after the next call to Next.
func (p *Parser) Comment() []byte {
	return p.text
}
// Metric writes the labels of the current sample into the passed labels.
@ -111,33 +188,208 @@ func (p *Parser) Err() error {
func (p *Parser) Metric(l *labels.Labels) string {
// Allocate the full immutable string immediately, so we just
// have to create references on it below.
s := string(p.l.b[p.l.mstart:p.l.mend])
s := string(p.series)
*l = append(*l, labels.Label{
Name: labels.MetricName,
Value: s[:p.l.offsets[0]-p.l.mstart],
Value: s[:p.offsets[0]-p.start],
})
for i := 1; i < len(p.l.offsets); i += 4 {
a := p.l.offsets[i] - p.l.mstart
b := p.l.offsets[i+1] - p.l.mstart
c := p.l.offsets[i+2] - p.l.mstart
d := p.l.offsets[i+3] - p.l.mstart
for i := 1; i < len(p.offsets); i += 4 {
a := p.offsets[i] - p.start
b := p.offsets[i+1] - p.start
c := p.offsets[i+2] - p.start
d := p.offsets[i+3] - p.start
// Replacer causes allocations. Replace only when necessary.
if strings.IndexByte(s[c:d], byte('\\')) >= 0 {
*l = append(*l, labels.Label{Name: s[a:b], Value: replacer.Replace(s[c:d])})
continue
}
*l = append(*l, labels.Label{Name: s[a:b], Value: s[c:d]})
}
// Sort labels. We can skip the first entry since the metric name is
// already at the right place.
sort.Sort((*l)[1:])
return s
}
// nextToken pulls tokens from the lexer, discarding tWhitespace (runs of
// tabs and spaces) and handing the first significant token to the parser.
func (p *Parser) nextToken() token {
	tok := p.l.Lex()
	for tok == tWhitespace {
		tok = p.l.Lex()
	}
	return tok
}
// Entry represents the type of a parsed entry.
type Entry int

const (
	EntryInvalid Entry = -1 // parse error or end of input
	EntryType    Entry = 0  // "# TYPE ..." metadata line
	EntryHelp    Entry = 1  // "# HELP ..." metadata line
	EntrySeries  Entry = 2  // a sample line (series, optional timestamp, value)
	EntryComment Entry = 3  // any other "# ..." comment line
)

// MetricType represents metric type values.
type MetricType string

const (
	// NOTE(review): these are untyped string constants rather than typed
	// MetricType values; they convert implicitly where MetricType is expected.
	MetricTypeCounter   = "counter"
	MetricTypeGauge     = "gauge"
	MetricTypeHistogram = "histogram"
	MetricTypeSummary   = "summary"
	MetricTypeUntyped   = "untyped"
)
// parseError builds a parser error from a description of what was expected
// and the token that was actually encountered (rendered via token.String).
func parseError(exp string, got token) error {
	return fmt.Errorf("%s, got %q", exp, got)
}
// Next advances the parser to the next entry. It returns
// (EntryInvalid, io.EOF) when the input is exhausted, and
// (EntryInvalid, err) on a parse error. On success the returned Entry
// tells the caller which accessor (Series, Help, Type, Comment) is valid.
func (p *Parser) Next() (Entry, error) {
	var err error

	// Remember where this entry starts so series/label offsets can be
	// rebased against it later (see Metric).
	p.start = p.l.i
	p.offsets = p.offsets[:0]

	switch t := p.nextToken(); t {
	case tEOF:
		return EntryInvalid, io.EOF
	case tLinebreak:
		// Allow full blank lines.
		return p.Next()

	case tHelp, tType:
		// Both metadata forms share the shape: keyword, metric name, text.
		switch t := p.nextToken(); t {
		case tMName:
			// Record name boundaries for Help()/Type().
			p.offsets = append(p.offsets, p.l.start, p.l.i)
		default:
			// NOTE(review): message says HELP even when t was tType.
			return EntryInvalid, parseError("expected metric name after HELP", t)
		}
		switch t := p.nextToken(); t {
		case tText:
			// Strip the leading space the lexer includes in the text token.
			p.text = p.l.buf()[1:]
		default:
			return EntryInvalid, parseError("expected text in HELP", t)
		}
		// For TYPE entries the text must be one of the known metric types.
		if t == tType {
			switch s := yoloString(p.text); s {
			case "counter":
				p.mtype = MetricTypeCounter
			case "gauge":
				p.mtype = MetricTypeGauge
			case "histogram":
				p.mtype = MetricTypeHistogram
			case "summary":
				p.mtype = MetricTypeSummary
			case "untyped":
				p.mtype = MetricTypeUntyped
			default:
				return EntryInvalid, fmt.Errorf("invalid metric type %q", s)
			}
		}
		if t := p.nextToken(); t != tLinebreak {
			return EntryInvalid, parseError("linebreak expected after metadata", t)
		}
		switch t {
		case tHelp:
			return EntryHelp, nil
		case tType:
			return EntryType, nil
		}
	case tComment:
		p.text = p.l.buf()
		if t := p.nextToken(); t != tLinebreak {
			return EntryInvalid, parseError("linebreak expected after comment", t)
		}
		return EntryComment, nil

	case tMName:
		// A sample line: name, optional label set, value, optional timestamp.
		p.offsets = append(p.offsets, p.l.i)
		p.series = p.l.b[p.start:p.l.i]

		t2 := p.nextToken()
		if t2 == tBraceOpen {
			if err := p.parseLVals(); err != nil {
				return EntryInvalid, err
			}
			// Extend the series bytes to cover the closing brace.
			p.series = p.l.b[p.start:p.l.i]
			t2 = p.nextToken()
		}
		if t2 != tValue {
			// NOTE(review): reports t (always tMName here) rather than t2;
			// the tests pin this exact message — confirm before changing.
			return EntryInvalid, parseError("expected value after metric", t)
		}
		if p.val, err = strconv.ParseFloat(yoloString(p.l.buf()), 64); err != nil {
			return EntryInvalid, err
		}
		// Ensure canonical NaN value.
		if math.IsNaN(p.val) {
			p.val = math.Float64frombits(value.NormalNaN)
		}
		p.hasTS = false
		switch p.nextToken() {
		case tLinebreak:
			break
		case tTimestamp:
			p.hasTS = true
			if p.ts, err = strconv.ParseInt(yoloString(p.l.buf()), 10, 64); err != nil {
				return EntryInvalid, err
			}
			if t2 := p.nextToken(); t2 != tLinebreak {
				// NOTE(review): reports t rather than t2 — confirm intent.
				return EntryInvalid, parseError("expected next entry after timestamp", t)
			}
		default:
			return EntryInvalid, parseError("expected timestamp or new record", t)
		}
		return EntrySeries, nil

	default:
		err = fmt.Errorf("%q is not a valid start token", t)
	}
	return EntryInvalid, err
}
// parseLVals parses the label pairs inside the braces of a series line,
// appending the start/end offsets of each label name and (unquoted) label
// value to p.offsets. It returns nil on the closing brace, or the first
// parse error encountered.
func (p *Parser) parseLVals() error {
	t := p.nextToken()
	for {
		switch t {
		case tBraceClose:
			return nil
		case tLName:
			// Valid label name: fall out of the switch and parse "=", value.
		default:
			return parseError("expected label name", t)
		}
		p.offsets = append(p.offsets, p.l.start, p.l.i)

		if t := p.nextToken(); t != tEqual {
			return parseError("expected equal", t)
		}
		if t := p.nextToken(); t != tLValue {
			return parseError("expected label value", t)
		}
		if !utf8.Valid(p.l.buf()) {
			return fmt.Errorf("invalid UTF-8 label value")
		}
		// The lexer ensures the value string is quoted. Strip first
		// and last character.
		p.offsets = append(p.offsets, p.l.start+1, p.l.i-1)

		// Free trailing commas are allowed.
		if t = p.nextToken(); t == tComma {
			t = p.nextToken()
		}
	}
}
var replacer = strings.NewReplacer(
`\"`, `"`,
`\\`, `\`,

View file

@ -29,15 +29,19 @@ import (
func TestParse(t *testing.T) {
input := `# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 4.9351e-05
go_gc_duration_seconds{quantile="0.25",} 7.424100000000001e-05
go_gc_duration_seconds{quantile="0.5",a="b"} 8.3835e-05
go_gc_duration_seconds{quantile="0.8", a="b"} 8.3835e-05
go_gc_duration_seconds{ quantile="0.9", a="b"} 8.3835e-05
# Hrandom comment starting with prefix of HELP
#
# comment with escaped \n newline
# comment with escaped \ escape character
go_gc_duration_seconds{ quantile="1.0", a="b" } 8.3835e-05
go_gc_duration_seconds { quantile="1.0", a="b" } 8.3835e-05
go_gc_duration_seconds { quantile= "1.0", a= "b" } 8.3835e-05
go_gc_duration_seconds { quantile= "1.0", a= "b", } 8.3835e-05
go_gc_duration_seconds { quantile = "1.0", a = "b" } 8.3835e-05
go_gc_duration_seconds_count 99
some:aggregate:rate5m{a_b="c"} 1
@ -52,12 +56,21 @@ testmetric{label="\"bar\""} 1`
int64p := func(x int64) *int64 { return &x }
exp := []struct {
lset labels.Labels
m string
t *int64
v float64
lset labels.Labels
m string
t *int64
v float64
typ MetricType
help string
comment string
}{
{
m: "go_gc_duration_seconds",
help: "A summary of the GC invocation durations.",
}, {
m: "go_gc_duration_seconds",
typ: MetricTypeSummary,
}, {
m: `go_gc_duration_seconds{quantile="0"}`,
v: 4.9351e-05,
lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0"),
@ -77,6 +90,14 @@ testmetric{label="\"bar\""} 1`
m: `go_gc_duration_seconds{ quantile="0.9", a="b"}`,
v: 8.3835e-05,
lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.9", "a", "b"),
}, {
comment: "# Hrandom comment starting with prefix of HELP",
}, {
comment: "#",
}, {
comment: "# comment with escaped \\n newline",
}, {
comment: "# comment with escaped \\ escape character",
}, {
m: `go_gc_duration_seconds{ quantile="1.0", a="b" }`,
v: 8.3835e-05,
@ -86,7 +107,7 @@ testmetric{label="\"bar\""} 1`
v: 8.3835e-05,
lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "1.0", "a", "b"),
}, {
m: `go_gc_duration_seconds { quantile= "1.0", a= "b" }`,
m: `go_gc_duration_seconds { quantile= "1.0", a= "b", }`,
v: 8.3835e-05,
lset: labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "1.0", "a", "b"),
}, {
@ -101,6 +122,12 @@ testmetric{label="\"bar\""} 1`
m: `some:aggregate:rate5m{a_b="c"}`,
v: 1,
lset: labels.FromStrings("__name__", "some:aggregate:rate5m", "a_b", "c"),
}, {
m: "go_goroutines",
help: "Number of goroutines that currently exist.",
}, {
m: "go_goroutines",
typ: MetricTypeGauge,
}, {
m: `go_goroutines`,
v: 33,
@ -130,23 +157,42 @@ testmetric{label="\"bar\""} 1`
var res labels.Labels
for p.Next() {
m, ts, v := p.At()
for {
et, err := p.Next()
if err == io.EOF {
break
}
require.NoError(t, err)
p.Metric(&res)
switch et {
case EntrySeries:
m, ts, v := p.Series()
require.Equal(t, exp[i].m, string(m))
require.Equal(t, exp[i].t, ts)
require.Equal(t, exp[i].v, v)
require.Equal(t, exp[i].lset, res)
p.Metric(&res)
require.Equal(t, exp[i].m, string(m))
require.Equal(t, exp[i].t, ts)
require.Equal(t, exp[i].v, v)
require.Equal(t, exp[i].lset, res)
res = res[:0]
case EntryType:
m, typ := p.Type()
require.Equal(t, exp[i].m, string(m))
require.Equal(t, exp[i].typ, typ)
case EntryHelp:
m, h := p.Help()
require.Equal(t, exp[i].m, string(m))
require.Equal(t, exp[i].help, string(h))
case EntryComment:
require.Equal(t, exp[i].comment, string(p.Comment()))
}
i++
res = res[:0]
}
require.NoError(t, p.Err())
require.Equal(t, len(exp), i)
}
func TestParseErrors(t *testing.T) {
@ -156,19 +202,19 @@ func TestParseErrors(t *testing.T) {
}{
{
input: "a",
err: "no token found",
err: "expected value after metric, got \"MNAME\"",
},
{
input: "a{b='c'} 1\n",
err: "no token found",
err: "expected label value, got \"INVALID\"",
},
{
input: "a{b=\n",
err: "no token found",
err: "expected label value, got \"INVALID\"",
},
{
input: "a{\xff=\"foo\"} 1\n",
err: "no token found",
err: "expected label name, got \"INVALID\"",
},
{
input: "a{b=\"\xff\"} 1\n",
@ -180,20 +226,22 @@ func TestParseErrors(t *testing.T) {
},
{
input: "something_weird{problem=\"",
err: "no token found",
err: "expected label value, got \"INVALID\"",
},
{
input: "empty_label_name{=\"\"} 0",
err: "no token found",
err: "expected label name, got \"EQUAL\"",
},
}
for _, c := range cases {
for i, c := range cases {
p := New([]byte(c.input))
for p.Next() {
var err error
for err == nil {
_, err = p.Next()
}
require.NotNil(t, p.Err())
require.Equal(t, c.err, p.Err().Error())
require.NotNil(t, err)
require.Equal(t, c.err, err.Error(), "test %d", i)
}
}
@ -220,34 +268,36 @@ func TestNullByteHandling(t *testing.T) {
},
{
input: "a{b=\x00\"ssss\"} 1\n",
err: "no token found",
err: "expected label value, got \"INVALID\"",
},
{
input: "a{b=\"\x00",
err: "no token found",
err: "expected label value, got \"INVALID\"",
},
{
input: "a{b\x00=\"hiih\"} 1",
err: "no token found",
err: "expected equal, got \"INVALID\"",
},
{
input: "a\x00{b=\"ddd\"} 1",
err: "no token found",
err: "expected value after metric, got \"MNAME\"",
},
}
for _, c := range cases {
for i, c := range cases {
p := New([]byte(c.input))
for p.Next() {
var err error
for err == nil {
_, err = p.Next()
}
if c.err == "" {
require.NoError(t, p.Err())
require.Equal(t, io.EOF, err, "test %d", i)
continue
}
require.Error(t, p.Err())
require.Equal(t, c.err, p.Err().Error())
require.Error(t, err)
require.Equal(t, c.err, err.Error(), "test %d", i)
}
}
@ -274,13 +324,21 @@ func BenchmarkParse(b *testing.B) {
for i := 0; i < b.N; i += testdataSampleCount {
p := New(buf)
for p.Next() && i < b.N {
m, _, _ := p.At()
total += len(m)
i++
Outer:
for i < b.N {
t, err := p.Next()
switch t {
case EntryInvalid:
if err == io.EOF {
break Outer
}
b.Fatal(err)
case EntrySeries:
m, _, _ := p.Series()
total += len(m)
i++
}
}
require.NoError(b, p.Err())
}
_ = total
})
@ -294,16 +352,25 @@ func BenchmarkParse(b *testing.B) {
for i := 0; i < b.N; i += testdataSampleCount {
p := New(buf)
for p.Next() && i < b.N {
m, _, _ := p.At()
Outer:
for i < b.N {
t, err := p.Next()
switch t {
case EntryInvalid:
if err == io.EOF {
break Outer
}
b.Fatal(err)
case EntrySeries:
m, _, _ := p.Series()
res := make(labels.Labels, 0, 5)
p.Metric(&res)
res := make(labels.Labels, 0, 5)
p.Metric(&res)
total += len(m)
i++
total += len(m)
i++
}
}
require.NoError(b, p.Err())
}
_ = total
})
@ -318,16 +385,25 @@ func BenchmarkParse(b *testing.B) {
for i := 0; i < b.N; i += testdataSampleCount {
p := New(buf)
for p.Next() && i < b.N {
m, _, _ := p.At()
Outer:
for i < b.N {
t, err := p.Next()
switch t {
case EntryInvalid:
if err == io.EOF {
break Outer
}
b.Fatal(err)
case EntrySeries:
m, _, _ := p.Series()
p.Metric(&res)
p.Metric(&res)
total += len(m)
i++
res = res[:0]
total += len(m)
i++
res = res[:0]
}
}
require.NoError(b, p.Err())
}
_ = total
})
@ -361,7 +437,6 @@ func BenchmarkParse(b *testing.B) {
})
}
}
func BenchmarkGzip(b *testing.B) {
for _, fn := range []string{"testdata.txt", "testdata.nometa.txt"} {
b.Run(fn, func(b *testing.B) {

View file

@ -830,11 +830,21 @@ func (sl *scrapeLoop) append(b []byte, ts time.Time) (total, added int, err erro
var sampleLimitErr error
loop:
for p.Next() {
for {
var et textparse.Entry
if et, err = p.Next(); err != nil {
if err == io.EOF {
err = nil
}
break
}
if et != textparse.EntrySeries {
continue
}
total++
t := defTime
met, tp, v := p.At()
met, tp, v := p.Series()
if tp != nil {
t = *tp
}
@ -931,10 +941,10 @@ loop:
}
added++
}
if err == nil {
err = p.Err()
}
if sampleLimitErr != nil {
if err == nil {
err = sampleLimitErr
}
// We only want to increment this once per scrape, so this is Inc'd outside the loop.
targetScrapeSampleLimit.Inc()
}