Automatically select common strings to map from last tsdb block

This will populate the static mapping of strings to store as a single byte on startup.
We use the last TSDB block as the source of data: we iterate its index for each label and count how many time series reference each label name/value pair.

Signed-off-by: Lukasz Mierzwa <l.mierzwa@gmail.com>
This commit is contained in:
Lukasz Mierzwa 2025-02-18 12:06:22 +00:00
parent b07a131829
commit 24dd39bc57
4 changed files with 170 additions and 3 deletions

26
cmd/prometheus/labels.go Normal file
View file

@ -0,0 +1,26 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !stringlabels
package main
import (
"log/slog"
"github.com/prometheus/prometheus/tsdb"
)
// mapCommonLabelSymbols is the variant compiled when the stringlabels build
// tag is NOT set. It ignores both arguments, does nothing and always returns nil.
func mapCommonLabelSymbols(_ *tsdb.DB, _ *slog.Logger) error {
return nil
}

View file

@ -0,0 +1,137 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build stringlabels
package main
import (
"cmp"
"context"
"fmt"
"log/slog"
"slices"
"strings"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/tsdb"
"github.com/prometheus/prometheus/tsdb/index"
)
// countBlockSymbols reads the index of the given block and counts how many
// times each string (label name or label value) occurs in the labels of the
// block's time series.
//
// For every label name/value pair, the number of series in that pair's
// postings list is added to the counter of the name and to the counter of the
// value. The result is keyed by the string itself; a string used both as a
// name and as a value (or as a value of several names) accumulates into a
// single counter.
//
// The index reader is closed on every return path. If any step fails the
// returned map is nil. An error from Close is reported only when nothing else
// failed before it.
func countBlockSymbols(ctx context.Context, block *tsdb.Block) (counts map[string]int, retErr error) {
	ir, err := block.Index()
	if err != nil {
		return nil, err
	}
	// Always release the reader, including on the early error returns below.
	defer func() {
		if cerr := ir.Close(); cerr != nil && retErr == nil {
			counts, retErr = nil, cerr
		}
	}()

	labelNames, err := ir.LabelNames(ctx)
	if err != nil {
		return nil, err
	}

	counts = map[string]int{}
	for _, name := range labelNames {
		// Clone so the map key does not alias the string handed out by the
		// index reader.
		// NOTE(review): presumably reader strings may not stay valid after
		// Close — confirm against the index reader implementation.
		name = strings.Clone(name)

		values, err := ir.LabelValues(ctx, name)
		if err != nil {
			return nil, err
		}

		nameTotal := 0
		for _, value := range values {
			p, err := ir.Postings(ctx, name, value)
			if err != nil {
				return nil, err
			}
			refs, err := index.ExpandPostings(p)
			if err != nil {
				return nil, err
			}
			nameTotal += len(refs)
			counts[strings.Clone(value)] += len(refs)
		}
		// Adding the total here also registers a name that has no values
		// with a count of zero.
		counts[name] += nameTotal
	}
	return counts, nil
}
// labelCost pairs a string with the score used to rank it when choosing which
// strings to map.
type labelCost struct {
// name is the label name or label value string being ranked.
name string
// cost is (len(name) - 1) multiplied by the number of times the string
// occurs; entries with a higher cost are sorted first.
cost int
}
// selectBlockStringsToMap takes a block and returns the strings (label names
// and label values) that are worth mapping, based on how often they occur on
// the block's time series.
//
// Each string is scored as (len(string) - 1) * occurrences. The returned slice
// always starts with the empty string, followed by at most 255 strings ordered
// from the highest score to the lowest. Strings with equal scores are ordered
// lexicographically so the result is the same on every run for the same block.
//
// An error is returned, wrapped with the block ULID, if the block index can't
// be read.
func selectBlockStringsToMap(block *tsdb.Block) ([]string, error) {
	// A mapped string is referenced by a single byte, so the table can't hold
	// more than 256 entries, and the first one is reserved for "".
	const maxMapped = 256

	names, err := countBlockSymbols(context.Background(), block)
	if err != nil {
		return nil, fmt.Errorf("failed to build list of common strings in block %s: %w", block.Meta().ULID, err)
	}

	costs := make([]labelCost, 0, len(names))
	for name, count := range names {
		if name == "" {
			// The empty string is always stored first; adding it again
			// would only waste a slot.
			continue
		}
		costs = append(costs, labelCost{name: name, cost: (len(name) - 1) * count})
	}

	// Map iteration order is random and SortFunc is not stable, so ties must
	// be broken explicitly to get a reproducible selection.
	slices.SortFunc(costs, func(a, b labelCost) int {
		if c := cmp.Compare(b.cost, a.cost); c != 0 {
			return c
		}
		return cmp.Compare(a.name, b.name)
	})

	n := min(len(costs), maxMapped-1)
	mappedLabels := make([]string, 0, n+1)
	mappedLabels = append(mappedLabels, "") // We must always store empty string.
	for _, c := range costs[:n] {
		mappedLabels = append(mappedLabels, c.name)
	}
	return mappedLabels, nil
}
// mapCommonLabelSymbols picks the block with the greatest MaxTime from db,
// selects the most valuable strings from it and installs them as
// labels.MappedLabels.
//
// When db has no blocks it logs that fact and returns nil without touching
// labels.MappedLabels. An error is returned only if selecting strings from the
// chosen block fails.
//
// NOTE(review): decoding looks entries up in labels.MappedLabels by index, so
// anything encoded against the previous table would decode differently after
// this assignment — confirm no labels have been encoded before this runs
// (e.g. while opening the DB).
func mapCommonLabelSymbols(db *tsdb.DB, logger *slog.Logger) error {
	var newest *tsdb.Block
	for _, candidate := range db.Blocks() {
		// Keep the current pick unless the candidate ends strictly later.
		if newest != nil && candidate.MaxTime() <= newest.MaxTime() {
			continue
		}
		newest = candidate
	}
	if newest == nil {
		logger.Info("No tsdb blocks found, can't map common label strings")
		return nil
	}

	logger.Info("Finding most common label strings in last block", slog.String("block", newest.String()))

	common, err := selectBlockStringsToMap(newest)
	if err != nil {
		return err
	}

	logger.Info("Mapped common label strings", slog.Int("count", len(common)))
	labels.MappedLabels = common
	return nil
}

View file

@ -1242,6 +1242,10 @@ func main() {
return fmt.Errorf("opening storage failed: %w", err)
}
if err = mapCommonLabelSymbols(db, logger); err != nil {
logger.Warn("Failed to map common strings in labels", slog.Any("err", err))
}
switch fsType := prom_runtime.Statfs(localStoragePath); fsType {
case "NFS_SUPER_MAGIC":
logger.Warn("This filesystem is not supported and may lead to data corruption and data loss. Please carefully read https://prometheus.io/docs/prometheus/latest/storage/ to learn more about supported filesystems.", "fs_type", fsType)

View file

@ -25,7 +25,7 @@ import (
// List of labels that should be mapped to a single byte value.
// Obviously can't have more than 256 here.
var mappedLabels = []string{
var MappedLabels = []string{
// Empty string, this must be present here.
"",
// These label names are always present on every time series.
@ -144,13 +144,13 @@ func decodeString(data string, index int) (string, int) {
size, index, mapped = decodeSize(data, index)
if mapped {
b := data[index]
return mappedLabels[int(b)], index + size
return MappedLabels[int(b)], index + size
}
return data[index : index+size], index + size
}
func encodeShortString(s string) (int, byte) {
i := slices.Index(mappedLabels, s)
i := slices.Index(MappedLabels, s)
if i >= 0 {
return 0, byte(i)
}