diff --git a/cmd/promtool/main.go b/cmd/promtool/main.go index 96536e467..a8098bea1 100644 --- a/cmd/promtool/main.go +++ b/cmd/promtool/main.go @@ -18,6 +18,7 @@ import ( "context" "encoding/json" "fmt" + "io" "io/ioutil" "math" "net/http" @@ -27,6 +28,7 @@ import ( "sort" "strconv" "strings" + "text/tabwriter" "time" "github.com/go-kit/log" @@ -43,6 +45,9 @@ import ( "gopkg.in/alecthomas/kingpin.v2" yaml "gopkg.in/yaml.v2" + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery" "github.com/prometheus/prometheus/discovery/file" @@ -95,6 +100,7 @@ func main() { ).Required().ExistingFiles() checkMetricsCmd := checkCmd.Command("metrics", checkMetricsUsage) + checkMetricsExtended := checkCmd.Flag("extended", "Print extended information related to the cardinality of the metrics.").Bool() agentMode := checkConfigCmd.Flag("agent", "Check config file for Prometheus in Agent mode.").Bool() queryCmd := app.Command("query", "Run query against a Prometheus server.") @@ -228,7 +234,7 @@ func main() { os.Exit(CheckRules(*ruleFiles...)) case checkMetricsCmd.FullCommand(): - os.Exit(CheckMetrics()) + os.Exit(CheckMetrics(*checkMetricsExtended)) case queryInstantCmd.FullCommand(): os.Exit(QueryInstant(*queryInstantServer, *queryInstantExpr, *queryInstantTime, p)) @@ -629,8 +635,10 @@ $ curl -s http://localhost:9090/metrics | promtool check metrics `) // CheckMetrics performs a linting pass on input metrics. -func CheckMetrics() int { - l := promlint.New(os.Stdin) +func CheckMetrics(extended bool) int { + var buf bytes.Buffer + tee := io.TeeReader(os.Stdin, &buf) + l := promlint.New(tee) problems, err := l.Lint() if err != nil { fmt.Fprintln(os.Stderr, "error while linting:", err) @@ -645,9 +653,70 @@ func CheckMetrics() int { return lintErrExitCode } + if extended { + stats, total, err := checkMetricsExtended(&buf) + if err != nil { + fmt.Fprintln(os.Stderr, err) + return failureExitCode + } + w := tabwriter.NewWriter(os.Stdout, 4, 4, 4, ' ', tabwriter.TabIndent) + fmt.Fprintf(w, "Metric\tCardinality\tPercentage\t\n") + for _, stat := range stats { + fmt.Fprintf(w, "%s\t%d\t%.2f%%\t\n", stat.name, stat.cardinality, stat.percentage*100) + } + fmt.Fprintf(w, "Total\t%d\t%.f%%\t\n", total, 100.) + w.Flush() + } + return successExitCode } +type metricStat struct { + name string + cardinality int + percentage float64 +} + +func checkMetricsExtended(r io.Reader) ([]metricStat, int, error) { + p := expfmt.TextParser{} + metricFamilies, err := p.TextToMetricFamilies(r) + if err != nil { + return nil, 0, fmt.Errorf("error while parsing text to metric families: %w", err) + } + + var total int + stats := make([]metricStat, 0, len(metricFamilies)) + for _, mf := range metricFamilies { + var cardinality int + switch mf.GetType() { + case dto.MetricType_COUNTER, dto.MetricType_GAUGE, dto.MetricType_UNTYPED: + cardinality = len(mf.Metric) + case dto.MetricType_HISTOGRAM: + // Histogram metrics includes sum, count, buckets. + buckets := len(mf.Metric[0].Histogram.Bucket) + cardinality = len(mf.Metric) * (2 + buckets) + case dto.MetricType_SUMMARY: + // Summary metrics includes sum, count, quantiles. + quantiles := len(mf.Metric[0].Summary.Quantile) + cardinality = len(mf.Metric) * (2 + quantiles) + default: + cardinality = len(mf.Metric) + } + stats = append(stats, metricStat{name: mf.GetName(), cardinality: cardinality}) + total += cardinality + } + + for i := range stats { + stats[i].percentage = float64(stats[i].cardinality) / float64(total) + } + + sort.SliceStable(stats, func(i, j int) bool { + return stats[i].cardinality > stats[j].cardinality + }) + + return stats, total, nil +} + // QueryInstant performs an instant query against a Prometheus server. func QueryInstant(url *url.URL, query, evalTime string, p printer) int { if url.Scheme == "" { diff --git a/cmd/promtool/main_test.go b/cmd/promtool/main_test.go index 82b5323c6..d4773fc91 100644 --- a/cmd/promtool/main_test.go +++ b/cmd/promtool/main_test.go @@ -18,6 +18,7 @@ import ( "net/http" "net/http/httptest" "net/url" + "os" "runtime" "strings" "testing" @@ -322,3 +323,39 @@ func TestAuthorizationConfig(t *testing.T) { }) } } + +func TestCheckMetricsExtended(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("Skipping on windows") + } + + f, err := os.Open("testdata/metrics-test.prom") + require.NoError(t, err) + defer f.Close() + + stats, total, err := checkMetricsExtended(f) + require.NoError(t, err) + require.Equal(t, 27, total) + require.Equal(t, []metricStat{ + { + name: "prometheus_tsdb_compaction_chunk_size_bytes", + cardinality: 15, + percentage: float64(15) / float64(27), + }, + { + name: "go_gc_duration_seconds", + cardinality: 7, + percentage: float64(7) / float64(27), + }, + { + name: "net_conntrack_dialer_conn_attempted_total", + cardinality: 4, + percentage: float64(4) / float64(27), + }, + { + name: "go_info", + cardinality: 1, + percentage: float64(1) / float64(27), + }, + }, stats) +} diff --git a/cmd/promtool/testdata/metrics-test.prom b/cmd/promtool/testdata/metrics-test.prom new file mode 100644 index 000000000..bb4c81afe --- /dev/null +++ b/cmd/promtool/testdata/metrics-test.prom @@ -0,0 +1,35 @@ +# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# TYPE go_gc_duration_seconds summary +go_gc_duration_seconds{quantile="0"} 2.391e-05 +go_gc_duration_seconds{quantile="0.25"} 9.4402e-05 +go_gc_duration_seconds{quantile="0.5"} 0.000118953 +go_gc_duration_seconds{quantile="0.75"} 0.000145884 +go_gc_duration_seconds{quantile="1"} 0.005201208 +go_gc_duration_seconds_sum 0.036134048 +go_gc_duration_seconds_count 232 +# HELP prometheus_tsdb_compaction_chunk_size_bytes Final size of chunks on their first compaction +# TYPE prometheus_tsdb_compaction_chunk_size_bytes histogram +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="32"} 662 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="48"} 1460 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="72"} 2266 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="108"} 3958 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="162"} 4861 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="243"} 5721 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="364.5"} 10493 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="546.75"} 12464 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="820.125"} 13254 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="1230.1875"} 13699 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="1845.28125"} 13806 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="2767.921875"} 13852 +prometheus_tsdb_compaction_chunk_size_bytes_bucket{le="+Inf"} 13867 +prometheus_tsdb_compaction_chunk_size_bytes_sum 3.886707e+06 +prometheus_tsdb_compaction_chunk_size_bytes_count 13867 +# HELP net_conntrack_dialer_conn_attempted_total Total number of connections attempted by the given dialer a given name. +# TYPE net_conntrack_dialer_conn_attempted_total counter +net_conntrack_dialer_conn_attempted_total{dialer_name="blackbox"} 5210 +net_conntrack_dialer_conn_attempted_total{dialer_name="default"} 0 +net_conntrack_dialer_conn_attempted_total{dialer_name="node"} 21 +net_conntrack_dialer_conn_attempted_total{dialer_name="prometheus"} 21 +# HELP go_info Information about the Go environment. +# TYPE go_info gauge +go_info{version="go1.17"} 1