node_exporter/collector/textfile.go

324 lines
8.8 KiB
Go
Raw Normal View History

2015-09-26 08:36:40 -07:00
// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !notextfile
// +build !notextfile
package collector
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
)
var (
textFileDirectory = kingpin.Flag("collector.textfile.directory", "Directory to read text files with metrics from.").Default("").String()
mtimeDesc = prometheus.NewDesc(
"node_textfile_mtime_seconds",
"Unixtime mtime of textfiles successfully read.",
[]string{"file"},
nil,
)
)
type textFileCollector struct {
path string
// Only set for testing to get predictable output.
mtime *float64
logger *slog.Logger
}
func init() {
registerCollector("textfile", defaultEnabled, NewTextFileCollector)
}
2017-02-28 08:44:53 -08:00
// NewTextFileCollector returns a new Collector exposing metrics read from files
// in the given textfile directory.
func NewTextFileCollector(logger *slog.Logger) (Collector, error) {
c := &textFileCollector{
path: *textFileDirectory,
logger: logger,
}
return c, nil
}
func convertMetricFamily(metricFamily *dto.MetricFamily, ch chan<- prometheus.Metric, logger *slog.Logger) {
var valType prometheus.ValueType
var val float64
allLabelNames := map[string]struct{}{}
for _, metric := range metricFamily.Metric {
labels := metric.GetLabel()
for _, label := range labels {
if _, ok := allLabelNames[label.GetName()]; !ok {
allLabelNames[label.GetName()] = struct{}{}
}
}
}
for _, metric := range metricFamily.Metric {
if metric.TimestampMs != nil {
logger.Warn("Ignoring unsupported custom timestamp on textfile collector metric", "metric", metric)
}
labels := metric.GetLabel()
var names []string
var values []string
for _, label := range labels {
names = append(names, label.GetName())
values = append(values, label.GetValue())
}
for k := range allLabelNames {
present := false
for _, name := range names {
if k == name {
present = true
break
}
}
if !present {
names = append(names, k)
values = append(values, "")
}
}
metricType := metricFamily.GetType()
switch metricType {
case dto.MetricType_COUNTER:
valType = prometheus.CounterValue
val = metric.Counter.GetValue()
case dto.MetricType_GAUGE:
valType = prometheus.GaugeValue
val = metric.Gauge.GetValue()
case dto.MetricType_UNTYPED:
valType = prometheus.UntypedValue
val = metric.Untyped.GetValue()
case dto.MetricType_SUMMARY:
quantiles := map[float64]float64{}
for _, q := range metric.Summary.Quantile {
quantiles[q.GetQuantile()] = q.GetValue()
}
ch <- prometheus.MustNewConstSummary(
prometheus.NewDesc(
*metricFamily.Name,
metricFamily.GetHelp(),
names, nil,
),
metric.Summary.GetSampleCount(),
metric.Summary.GetSampleSum(),
quantiles, values...,
)
case dto.MetricType_HISTOGRAM:
buckets := map[float64]uint64{}
for _, b := range metric.Histogram.Bucket {
buckets[b.GetUpperBound()] = b.GetCumulativeCount()
}
ch <- prometheus.MustNewConstHistogram(
prometheus.NewDesc(
*metricFamily.Name,
metricFamily.GetHelp(),
names, nil,
),
metric.Histogram.GetSampleCount(),
metric.Histogram.GetSampleSum(),
buckets, values...,
)
default:
panic("unknown metric type")
}
if metricType == dto.MetricType_GAUGE || metricType == dto.MetricType_COUNTER || metricType == dto.MetricType_UNTYPED {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
*metricFamily.Name,
metricFamily.GetHelp(),
names, nil,
),
valType, val, values...,
)
}
}
}
func (c *textFileCollector) exportMTimes(mtimes map[string]time.Time, ch chan<- prometheus.Metric) {
if len(mtimes) == 0 {
return
}
// Export the mtimes of the successful files.
// Sorting is needed for predictable output comparison in tests.
filepaths := make([]string, 0, len(mtimes))
for path := range mtimes {
filepaths = append(filepaths, path)
}
sort.Strings(filepaths)
for _, path := range filepaths {
mtime := float64(mtimes[path].UnixNano() / 1e9)
if c.mtime != nil {
mtime = *c.mtime
}
ch <- prometheus.MustNewConstMetric(mtimeDesc, prometheus.GaugeValue, mtime, path)
}
}
// Update implements the Collector interface.
func (c *textFileCollector) Update(ch chan<- prometheus.Metric) error {
// Iterate over files and accumulate their metrics, but also track any
// parsing errors so an error metric can be reported.
var errored bool
var parsedFamilies []*dto.MetricFamily
metricsNamesToFiles := map[string][]string{}
collector/textfile: Avoid inconsistent help-texts (#2962) Avoid metrics with inconsistent help-texts. The earlier behaviour has been preserved in the sense that the first encountered instance is still used to generate metrics, whereas the subsequent inconsistent ones are ignored along with a few peripheral changes. ``` # HELP node_scrape_collector_duration_seconds node_exporter: Duration of a collector scrape. #TYPE node_scrape_collector_duration_seconds gauge node_scrape_collector_duration_seconds{collector="textfile"} 0.0004005 # HELP node_scrape_collector_success node_exporter: Whether a collector succeeded. # TYPE node_scrape_collector_success gauge node_scrape_collector_success{collector="textfile"} 1 # HELP node_textfile_mtime_seconds Unixtime mtime of textfiles successfully read. # TYPE node_textfile_mtime_seconds gauge node_textfile_mtime_seconds{file="/Users/rexagod/repositories/misc/node_exporter/ne-bar.prom"} 1.710812009e+09 node_textfile_mtime_seconds{file="/Users/rexagod/repositories/misc/node_exporter/ne-foo.prom"} 1.710811982e+09 # HELP node_textfile_scrape_error 1 if there was an error opening or reading a file, 0 otherwise # TYPE node_textfile_scrape_error gauge node_textfile_scrape_error 1 # HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. # TYPE promhttp_metric_handler_errors_total counter promhttp_metric_handler_errors_total{cause="encoding"} 0 promhttp_metric_handler_errors_total{cause="gathering"} 0 # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. # TYPE promhttp_metric_handler_requests_in_flight gauge promhttp_metric_handler_requests_in_flight 1 # HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. # TYPE promhttp_metric_handler_requests_total counter promhttp_metric_handler_requests_total{code="200"} 0 promhttp_metric_handler_requests_total{code="500"} 0 promhttp_metric_handler_requests_total{code="503"} 0 # HELP tau_infrastructure_performing_maintenance_task At what timestamp a given task started or stopped, the last time it was run. # TYPE tau_infrastructure_performing_maintenance_task gauge tau_infrastructure_performing_maintenance_task{main_task="nightly",start_or_stop="start",sub_task="main"} 1.64728080198446e+09 ``` Fixes: #2317 Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
2024-03-23 22:43:03 -07:00
metricsNamesToHelpTexts := map[string][2]string{}
paths, err := filepath.Glob(c.path)
if err != nil || len(paths) == 0 {
// not glob or not accessible path either way assume single
// directory and let os.ReadDir handle it
paths = []string{c.path}
}
mtimes := make(map[string]time.Time)
for _, path := range paths {
files, err := os.ReadDir(path)
if err != nil && path != "" {
errored = true
c.logger.Error("failed to read textfile collector directory", "path", path, "err", err)
}
for _, f := range files {
metricsFilePath := filepath.Join(path, f.Name())
if !strings.HasSuffix(f.Name(), ".prom") {
continue
}
mtime, families, err := c.processFile(path, f.Name(), ch)
for _, mf := range families {
collector/textfile: Avoid inconsistent help-texts (#2962) Avoid metrics with inconsistent help-texts. The earlier behaviour has been preserved in the sense that the first encountered instance is still used to generate metrics, whereas the subsequent inconsistent ones are ignored along with a few peripheral changes. ``` # HELP node_scrape_collector_duration_seconds node_exporter: Duration of a collector scrape. #TYPE node_scrape_collector_duration_seconds gauge node_scrape_collector_duration_seconds{collector="textfile"} 0.0004005 # HELP node_scrape_collector_success node_exporter: Whether a collector succeeded. # TYPE node_scrape_collector_success gauge node_scrape_collector_success{collector="textfile"} 1 # HELP node_textfile_mtime_seconds Unixtime mtime of textfiles successfully read. # TYPE node_textfile_mtime_seconds gauge node_textfile_mtime_seconds{file="/Users/rexagod/repositories/misc/node_exporter/ne-bar.prom"} 1.710812009e+09 node_textfile_mtime_seconds{file="/Users/rexagod/repositories/misc/node_exporter/ne-foo.prom"} 1.710811982e+09 # HELP node_textfile_scrape_error 1 if there was an error opening or reading a file, 0 otherwise # TYPE node_textfile_scrape_error gauge node_textfile_scrape_error 1 # HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. # TYPE promhttp_metric_handler_errors_total counter promhttp_metric_handler_errors_total{cause="encoding"} 0 promhttp_metric_handler_errors_total{cause="gathering"} 0 # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. # TYPE promhttp_metric_handler_requests_in_flight gauge promhttp_metric_handler_requests_in_flight 1 # HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. # TYPE promhttp_metric_handler_requests_total counter promhttp_metric_handler_requests_total{code="200"} 0 promhttp_metric_handler_requests_total{code="500"} 0 promhttp_metric_handler_requests_total{code="503"} 0 # HELP tau_infrastructure_performing_maintenance_task At what timestamp a given task started or stopped, the last time it was run. # TYPE tau_infrastructure_performing_maintenance_task gauge tau_infrastructure_performing_maintenance_task{main_task="nightly",start_or_stop="start",sub_task="main"} 1.64728080198446e+09 ``` Fixes: #2317 Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
2024-03-23 22:43:03 -07:00
// Check for metrics with inconsistent help texts and take the first help text occurrence.
if helpTexts, seen := metricsNamesToHelpTexts[*mf.Name]; seen {
if mf.Help != nil && helpTexts[0] != *mf.Help || helpTexts[1] != "" {
metricsNamesToHelpTexts[*mf.Name] = [2]string{helpTexts[0], *mf.Help}
errored = true
c.logger.Error("inconsistent metric help text",
collector/textfile: Avoid inconsistent help-texts (#2962) Avoid metrics with inconsistent help-texts. The earlier behaviour has been preserved in the sense that the first encountered instance is still used to generate metrics, whereas the subsequent inconsistent ones are ignored along with a few peripheral changes. ``` # HELP node_scrape_collector_duration_seconds node_exporter: Duration of a collector scrape. #TYPE node_scrape_collector_duration_seconds gauge node_scrape_collector_duration_seconds{collector="textfile"} 0.0004005 # HELP node_scrape_collector_success node_exporter: Whether a collector succeeded. # TYPE node_scrape_collector_success gauge node_scrape_collector_success{collector="textfile"} 1 # HELP node_textfile_mtime_seconds Unixtime mtime of textfiles successfully read. # TYPE node_textfile_mtime_seconds gauge node_textfile_mtime_seconds{file="/Users/rexagod/repositories/misc/node_exporter/ne-bar.prom"} 1.710812009e+09 node_textfile_mtime_seconds{file="/Users/rexagod/repositories/misc/node_exporter/ne-foo.prom"} 1.710811982e+09 # HELP node_textfile_scrape_error 1 if there was an error opening or reading a file, 0 otherwise # TYPE node_textfile_scrape_error gauge node_textfile_scrape_error 1 # HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. # TYPE promhttp_metric_handler_errors_total counter promhttp_metric_handler_errors_total{cause="encoding"} 0 promhttp_metric_handler_errors_total{cause="gathering"} 0 # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. # TYPE promhttp_metric_handler_requests_in_flight gauge promhttp_metric_handler_requests_in_flight 1 # HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. # TYPE promhttp_metric_handler_requests_total counter promhttp_metric_handler_requests_total{code="200"} 0 promhttp_metric_handler_requests_total{code="500"} 0 promhttp_metric_handler_requests_total{code="503"} 0 # HELP tau_infrastructure_performing_maintenance_task At what timestamp a given task started or stopped, the last time it was run. # TYPE tau_infrastructure_performing_maintenance_task gauge tau_infrastructure_performing_maintenance_task{main_task="nightly",start_or_stop="start",sub_task="main"} 1.64728080198446e+09 ``` Fixes: #2317 Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
2024-03-23 22:43:03 -07:00
"metric", *mf.Name,
"original_help_text", helpTexts[0],
"new_help_text", *mf.Help,
// Only the first file path will be recorded in case of two or more inconsistent help texts.
"file", metricsNamesToFiles[*mf.Name][0])
continue
}
}
if mf.Help != nil {
metricsNamesToHelpTexts[*mf.Name] = [2]string{*mf.Help}
}
metricsNamesToFiles[*mf.Name] = append(metricsNamesToFiles[*mf.Name], metricsFilePath)
parsedFamilies = append(parsedFamilies, mf)
}
if err != nil {
errored = true
c.logger.Error("failed to collect textfile data", "file", f.Name(), "err", err)
continue
}
mtimes[metricsFilePath] = *mtime
}
}
for _, mf := range parsedFamilies {
if mf.Help == nil {
help := fmt.Sprintf("Metric read from %s", strings.Join(metricsNamesToFiles[*mf.Name], ", "))
mf.Help = &help
}
}
for _, mf := range parsedFamilies {
convertMetricFamily(mf, ch, c.logger)
}
c.exportMTimes(mtimes, ch)
// Export if there were errors.
var errVal float64
if errored {
errVal = 1.0
}
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
"node_textfile_scrape_error",
"1 if there was an error opening or reading a file, 0 otherwise",
nil, nil,
),
prometheus.GaugeValue, errVal,
)
return nil
}
// processFile processes a single file, returning its modification time on success.
func (c *textFileCollector) processFile(dir, name string, ch chan<- prometheus.Metric) (*time.Time, map[string]*dto.MetricFamily, error) {
path := filepath.Join(dir, name)
f, err := os.Open(path)
if err != nil {
return nil, nil, fmt.Errorf("failed to open textfile data file %q: %w", path, err)
}
defer f.Close()
var parser expfmt.TextParser
families, err := parser.TextToMetricFamilies(f)
if err != nil {
return nil, nil, fmt.Errorf("failed to parse textfile data from %q: %w", path, err)
}
if hasTimestamps(families) {
return nil, nil, fmt.Errorf("textfile %q contains unsupported client-side timestamps, skipping entire file", path)
}
// Only stat the file once it has been parsed and validated, so that
// a failure does not appear fresh.
stat, err := f.Stat()
if err != nil {
return nil, families, fmt.Errorf("failed to stat %q: %w", path, err)
}
t := stat.ModTime()
return &t, families, nil
}
// hasTimestamps returns true when metrics contain unsupported timestamps.
func hasTimestamps(parsedFamilies map[string]*dto.MetricFamily) bool {
for _, mf := range parsedFamilies {
for _, m := range mf.Metric {
if m.TimestampMs != nil {
return true
}
}
}
return false
}