From afde4707c52ccce477c5dd10cdd2af05c641369c Mon Sep 17 00:00:00 2001 From: Jan Horstmann Date: Mon, 4 Nov 2024 09:06:52 +0100 Subject: [PATCH 1/5] Update mixin dashboard Update and rewrite the mixin dashboards to use the grafonnet ([1]) library. Grafana has deprecated angular plugins ([2]) as used by grafonnet-lib ([3]) with removal pending for grafana version 12. Additionally grafonnet-lib is deprecated/unmaintained in favor of grafonnet. Therefore the mixin dashboards have been updated to use grafonnet. Closes: https://github.com/prometheus/prometheus/issues/14404 [1] https://github.com/grafana/grafonnet [2] https://grafana.com/docs/grafana/latest/developers/angular_deprecation/ [3] https://github.com/grafana/grafonnet-lib Signed-off-by: Jan Horstmann --- .../prometheus-mixin/dashboards.libsonnet | 1185 +++++++++++------ .../prometheus-mixin/jsonnetfile.json | 15 +- 2 files changed, 789 insertions(+), 411 deletions(-) diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet index 2bdd168cc9..22b8c92e6e 100644 --- a/documentation/prometheus-mixin/dashboards.libsonnet +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -1,438 +1,825 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet'; +local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; local dashboard = grafana.dashboard; -local row = grafana.row; -local singlestat = grafana.singlestat; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; -local tablePanel = grafana.tablePanel; -local template = grafana.template; +local prometheus = grafana.query.prometheus; +local variable = dashboard.variable; +local panel = grafana.panel; +local row = panel.row; + { grafanaDashboards+:: { + + local panelTimeSeriesStdOptions = + {} + + 
panel.timeSeries.queryOptions.withDatasource('prometheus', '$datasource') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + panel.timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + panel.timeSeries.options.tooltip.withMode('multi') + , + + local panelTimeSeriesStacking = + {} + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(100) + + panel.timeSeries.fieldConfig.defaults.custom.withLineWidth(0) + + panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + , + 'prometheus.json': + local showMultiCluster = $._config.showMultiCluster; - local dashboard = g.dashboard( - '%(prefix)sOverview' % $._config.grafanaPrometheus - ); - local templatedDashboard = if showMultiCluster then - dashboard - .addMultiTemplate('cluster', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, $._config.clusterLabel) - .addMultiTemplate('job', 'prometheus_build_info{cluster=~"$cluster"}', 'job') - .addMultiTemplate('instance', 'prometheus_build_info{cluster=~"$cluster", job=~"$job"}', 'instance') - else - dashboard - .addMultiTemplate('job', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, 'job') - .addMultiTemplate('instance', 'prometheus_build_info{job=~"$job"}', 'instance'); - templatedDashboard - .addRow( - g.row('Prometheus Stats') - .addPanel( - g.panel('Prometheus Stats') + - g.tablePanel(if showMultiCluster then [ - 'count by (cluster, job, instance, version) (prometheus_build_info{cluster=~"$cluster", job=~"$job", instance=~"$instance"})', - 'max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~"$cluster", job=~"$job", instance=~"$instance"})', - ] else [ - 'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})', - 'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})', - ], { - cluster: { alias: if showMultiCluster then 'Cluster' else '' }, - job: { alias: 'Job' }, - instance: { 
alias: 'Instance' }, - version: { alias: 'Version' }, - 'Value #A': { alias: 'Count', type: 'hidden' }, - 'Value #B': { alias: 'Uptime', type: 'number', unit: 's' }, - }) - ) - ) - .addRow( - g.row('Discovery') - .addPanel( - g.panel('Target Sync') + - g.queryPanel(if showMultiCluster then 'sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[5m])) by (cluster, job, scrape_job, instance) * 1e3' - else 'sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', - if showMultiCluster then '{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}' - else '{{scrape_job}}') + - { yaxes: g.yaxes('ms') } - ) - .addPanel( - g.panel('Targets') + - g.queryPanel(if showMultiCluster then 'sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~"$cluster", job=~"$job",instance=~"$instance"})' - else 'sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', - if showMultiCluster then '{{cluster}}:{{job}}:{{instance}}' - else 'Targets') + - g.stack - ) - ) - .addRow( - g.row('Retrieval') - .addPanel( - g.panel('Average Scrape Interval Duration') + - g.queryPanel(if showMultiCluster then 'rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) * 1e3' - else 'rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', - if showMultiCluster then '{{cluster}}:{{job}}:{{instance}} {{interval}} configured' - else '{{interval}} configured') + - { yaxes: g.yaxes('ms') } - ) - .addPanel( - g.panel('Scrape failures') + - g.queryPanel(if showMultiCluster then [ - 'sum by (cluster, job, instance) 
(rate(prometheus_target_scrapes_exceeded_body_size_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - ] else [ - 'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))', - ], if showMultiCluster then [ - 'exceeded body size limit: {{cluster}} {{job}} {{instance}}', - 'exceeded sample limit: {{cluster}} {{job}} {{instance}}', - 'duplicate timestamp: {{cluster}} {{job}} {{instance}}', - 'out of bounds: {{cluster}} {{job}} {{instance}}', - 'out of order: {{cluster}} {{job}} {{instance}}', - ] else [ - 'exceeded body size limit: {{job}}', - 'exceeded sample limit: {{job}}', - 'duplicate timestamp: {{job}}', - 'out of bounds: {{job}}', - 'out of order: {{job}}', - ]) + - g.stack - ) - .addPanel( - g.panel('Appended Samples') + - g.queryPanel(if showMultiCluster then 'rate(prometheus_tsdb_head_samples_appended_total{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m])' - else 
'rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', - if showMultiCluster then '{{cluster}} {{job}} {{instance}}' - else '{{job}} {{instance}}') + - g.stack - ) - ) - .addRow( - g.row('Storage') - .addPanel( - g.panel('Head Series') + - g.queryPanel(if showMultiCluster then 'prometheus_tsdb_head_series{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' - else 'prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', - if showMultiCluster then '{{cluster}} {{job}} {{instance}} head series' - else '{{job}} {{instance}} head series') + - g.stack - ) - .addPanel( - g.panel('Head Chunks') + - g.queryPanel(if showMultiCluster then 'prometheus_tsdb_head_chunks{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' - else 'prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', - if showMultiCluster then '{{cluster}} {{job}} {{instance}} head chunks' - else '{{job}} {{instance}} head chunks') + - g.stack - ) - ) - .addRow( - g.row('Query') - .addPanel( - g.panel('Query Rate') + - g.queryPanel(if showMultiCluster then 'rate(prometheus_engine_query_duration_seconds_count{cluster=~"$cluster",job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])' - else 'rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', - if showMultiCluster then '{{cluster}} {{job}} {{instance}}' - else '{{job}} {{instance}}') + - g.stack, - ) - .addPanel( - g.panel('Stage Duration') + - g.queryPanel(if showMultiCluster then 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster", job=~"$job",instance=~"$instance"}) * 1e3' - else 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', - if showMultiCluster then '{{slice}}' - else '{{slice}}') + - { yaxes: g.yaxes('ms') } + - g.stack, - ) - ) + { - tags: $._config.grafanaPrometheus.tags, - refresh: $._config.grafanaPrometheus.refresh, 
- }, + + local datasourceVariable = + variable.datasource.new('datasource', 'prometheus') + + variable.datasource.generalOptions.withLabel('Data source') + + variable.datasource.generalOptions.withCurrent('default') + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local clusterVariable = + variable.query.new('cluster') + + variable.query.generalOptions.withLabel('cluster') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.+') + + variable.query.selectionOptions.withMulti(true) + + variable.query.generalOptions.withCurrent('$__all') + + variable.query.queryTypes.withLabelValues($._config.clusterLabel, metric='prometheus_build_info{%(prometheusSelector)s}' % $._config) + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local jobVariable = + variable.query.new('job') + + variable.query.generalOptions.withLabel('job') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.+') + + variable.query.selectionOptions.withMulti(true) + + if showMultiCluster then + variable.query.queryTypes.withLabelValues('job', metric='prometheus_build_info{cluster=~"$cluster"}') + else + variable.query.queryTypes.withLabelValues('job', metric='prometheus_build_info{%(prometheusSelector)s}' % $._config) + ; + + local instanceVariable = + variable.query.new('instance') + + variable.query.generalOptions.withLabel('instance') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.+') + + variable.query.selectionOptions.withMulti(true) + 
+ if showMultiCluster then + variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{cluster=~"$cluster", job=~"$job"}') + else + variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{job=~"$job"}') + ; + + local prometheusStats = + panel.table.new('Prometheus Stats') + + panel.table.queryOptions.withDatasource('prometheus', '$datasource') + + panel.table.standardOptions.withUnit('short') + + panel.table.standardOptions.withDecimals(2) + + panel.table.standardOptions.withDisplayName('') + + panel.table.standardOptions.withOverrides([ + panel.table.standardOptions.override.byName.new('Time') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Time') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('custom.hidden', 'true'), + panel.table.standardOptions.override.byName.new('cluster') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2) + + if showMultiCluster then panel.table.standardOptions.override.byName.withProperty('displayName', 'Cluster') else {}, + panel.table.standardOptions.override.byName.new('job') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2) + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Job'), + panel.table.standardOptions.override.byName.new('instance') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Instance') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + 
panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2), + panel.table.standardOptions.override.byName.new('version') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Version') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2), + panel.table.standardOptions.override.byName.new('Value #A') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Count') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2) + + panel.table.standardOptions.override.byName.withProperty('custom.hidden', 'true'), + panel.table.standardOptions.override.byName.new('Value #B') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Uptime') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 's'), + ]) + + if showMultiCluster then + panel.table.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'count by (cluster, job, instance, version) (prometheus_build_info{cluster=~"$cluster", job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + prometheus.new( + '$datasource', + 'max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~"$cluster", job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + ]) + else + panel.table.queryOptions.withTargets([ + 
prometheus.new( + '$datasource', + 'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + prometheus.new( + '$datasource', + 'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + ]) + ; + + local targetSync = + panel.timeSeries.new('Target Sync') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panel.timeSeries.standardOptions.withUnit('ms') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[5m])) by (cluster, job, scrape_job, instance) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{scrape_job}}'), + ]) + ; + + local targets = + panel.timeSeries.new('Targets') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~"$cluster", job=~"$job",instance=~"$instance"})' + ) + + 
prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('Targets'), + ]) + ; + + local averageScrapeIntervalDuration = + panel.timeSeries.new('Average Scrape Interval Duration') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panel.timeSeries.standardOptions.withUnit('ms') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}} {{interval}} configured'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{interval}} configured'), + ]) + ; + + local scrapeFailures = + panel.timeSeries.new('Scrape failures') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') /* targets below are rate() of failure counters, i.e. events per second, not durations */ + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) 
(rate(prometheus_target_scrapes_exceeded_body_size_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded body size limit: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded sample limit: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('duplicate timestamp: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of bounds: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of order: {{cluster}} {{job}} {{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded body size limit: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))' + ) + + 
prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded sample limit: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('duplicate timestamp: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of bounds: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of order: {{job}}'), + ]) + ; + + local appendedSamples = + panel.timeSeries.new('Appended Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_tsdb_head_samples_appended_total{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}}'), + ]) + ; + + local headSeries = + panel.timeSeries.new('Head Series') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if 
showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_series{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}} head series'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}} head series'), + ]) + ; + + local headChunks = + panel.timeSeries.new('Head Chunks') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_chunks{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}} head chunks'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}} head chunks'), + ]) + ; + + local queryRate = + panel.timeSeries.new('Query Rate') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_engine_query_duration_seconds_count{cluster=~"$cluster",job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])' 
+ ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}}'), + ]) + ; + + local stageDuration = + panel.timeSeries.new('Stage Duration') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('ms') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster", job=~"$job",instance=~"$instance"}) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{slice}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{slice}}'), + ]) + ; + + dashboard.new('%(prefix)sOverview' % $._config.grafanaPrometheus) + + dashboard.time.withFrom('now-1h') + + dashboard.withTags($._config.grafanaPrometheus.tags) + + dashboard.withRefresh($._config.grafanaPrometheus.refresh) /* dashboard auto-refresh interval, as the pre-grafonnet code set through the top-level refresh field */ + + dashboard.withVariables(std.prune([ + datasourceVariable, + if showMultiCluster then clusterVariable, + jobVariable, + instanceVariable, + ])) + + dashboard.withPanels( + grafana.util.grid.makeGrid([ + row.new('Prometheus Stats') + + row.withPanels([ + prometheusStats, + ]), + ], panelWidth=24, panelHeight=7) + + + grafana.util.grid.makeGrid([ + row.new('Discovery') + + 
row.withPanels([ + targetSync, + targets, + ]), + ], panelWidth=12, panelHeight=7, startY=8) + + + grafana.util.grid.makeGrid([ + row.new('Retrieval') + + row.withPanels([ + averageScrapeIntervalDuration, + scrapeFailures, + appendedSamples, + ]), + ], panelWidth=8, panelHeight=7, startY=16) + + + grafana.util.grid.makeGrid([ + row.new('Storage') + + row.withPanels([ + headSeries, + headChunks, + ]), + row.new('Query') + + row.withPanels([ + queryRate, + stageDuration, + ]), + ], panelWidth=12, panelHeight=7, startY=24) + ), // Remote write specific dashboard. 'prometheus-remote-write.json': + + local datasourceVariable = + variable.datasource.new('datasource', 'prometheus') + + variable.datasource.generalOptions.withCurrent('default') + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local clusterVariable = + variable.query.new('cluster') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.selectionOptions.withIncludeAll(true) + + variable.query.generalOptions.withCurrent('$__all') + + variable.query.queryTypes.withLabelValues($._config.clusterLabel, metric='prometheus_build_info') + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local instanceVariable = + variable.query.new('instance') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.selectionOptions.withIncludeAll(true) + + variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{cluster=~"$cluster"}') + ; + + local urlVariable = + variable.query.new('url') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.selectionOptions.withIncludeAll(true) + + variable.query.queryTypes.withLabelValues('url', metric='prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}') + ; + local timestampComparison 
= - graphPanel.new( - 'Highest Timestamp In vs. Highest Timestamp Sent', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - ||| - ( - prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - - - ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0) - ) - |||, - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}', - )); + panel.timeSeries.new('Highest Timestamp In vs. Highest Timestamp Sent') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + ( + prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} + - + ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0) + ) + ||| + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local timestampComparisonRate = - graphPanel.new( - 'Rate[5m]', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - ||| - clamp_min( - rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - - - ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) - , 0) - |||, - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}', - )); + panel.timeSeries.new('Rate[5m]') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + 
clamp_min( + rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) + - + ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) + , 0) + ||| + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local samplesRate = - graphPanel.new( - 'Rate, in vs. succeeded or dropped [5m]', - datasource='$datasource', - span=12, - ) - .addTarget(prometheus.target( - ||| - rate( - prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - - - ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) - - - (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) - |||, - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Rate, in vs. 
succeeded or dropped [5m]') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + rate( + prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m]) + - + ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) + - + (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) + ||| + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local currentShards = - graphPanel.new( - 'Current Shards', - datasource='$datasource', - span=12, - min_span=6, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Current Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local maxShards = - graphPanel.new( - 'Max Shards', - datasource='$datasource', - span=4, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_max{cluster=~"$cluster", 
instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Max Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local minShards = - graphPanel.new( - 'Min Shards', - datasource='$datasource', - span=4, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Min Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local desiredShards = - graphPanel.new( - 'Desired Shards', - datasource='$datasource', - span=4, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Desired Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + 
prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local shardsCapacity = - graphPanel.new( - 'Shard Capacity', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); - + panel.timeSeries.new('Shard Capacity') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local pendingSamples = - graphPanel.new( - 'Pending Samples', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Pending Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local walSegment = - graphPanel.new( 
- 'TSDB Current Segment', - datasource='$datasource', - span=6, - formatY1='none', - ) - .addTarget(prometheus.target( - 'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}' - )); + panel.timeSeries.new('TSDB Current Segment') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.standardOptions.withUnit('none') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}}'), + ]); local queueSegment = - graphPanel.new( - 'Remote Write Current Segment', - datasource='$datasource', - span=6, - formatY1='none', - ) - .addTarget(prometheus.target( - 'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}} {{consumer}}' - )); + panel.timeSeries.new('Remote Write Current Segment') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.standardOptions.withUnit('none') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{consumer}}'), + ]); local droppedSamples = - graphPanel.new( - 'Dropped Samples', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or 
rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Dropped Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local failedSamples = - graphPanel.new( - 'Failed Samples', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Failed Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), 
+ ]); local retriedSamples = - graphPanel.new( - 'Retried Samples', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Retried Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local enqueueRetries = - graphPanel.new( - 'Enqueue Retries', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Enqueue Retries') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + 
prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); - dashboard.new( - title='%(prefix)sRemote Write' % $._config.grafanaPrometheus, - editable=true - ) - .addTemplate( - { - hide: 0, - label: null, - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addTemplate( - template.new( - 'cluster', - '$datasource', - 'label_values(prometheus_build_info, cluster)' % $._config, - refresh='time', - current={ - selected: true, - text: 'All', - value: '$__all', - }, - includeAll=true, - ) - ) - .addTemplate( - template.new( - 'instance', - '$datasource', - 'label_values(prometheus_build_info{cluster=~"$cluster"}, instance)' % $._config, - refresh='time', - current={ - selected: true, - text: 'All', - value: '$__all', - }, - includeAll=true, - ) - ) - .addTemplate( - template.new( - 'url', - '$datasource', - 'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)' % $._config, - refresh='time', - includeAll=true, - ) - ) - .addRow( - row.new('Timestamps') - .addPanel(timestampComparison) - .addPanel(timestampComparisonRate) - ) - .addRow( - row.new('Samples') - .addPanel(samplesRate) - ) - .addRow( - row.new( - 'Shards' - ) - .addPanel(currentShards) - .addPanel(maxShards) - .addPanel(minShards) - .addPanel(desiredShards) - ) - .addRow( - row.new('Shard Details') - .addPanel(shardsCapacity) - .addPanel(pendingSamples) - ) - .addRow( - row.new('Segments') - .addPanel(walSegment) - .addPanel(queueSegment) - ) - .addRow( - row.new('Misc. 
Rates') - .addPanel(droppedSamples) - .addPanel(failedSamples) - .addPanel(retriedSamples) - .addPanel(enqueueRetries) - ) + { - tags: $._config.grafanaPrometheus.tags, - refresh: $._config.grafanaPrometheus.refresh, - }, + dashboard.new('%(prefix)sRemote Write' % $._config.grafanaPrometheus) + + dashboard.time.withFrom('now-1h') + + dashboard.withTags($._config.grafanaPrometheus.tags) + + dashboard.timepicker.withRefreshIntervals($._config.grafanaPrometheus.refresh) + + dashboard.withVariables([ + datasourceVariable, + clusterVariable, + instanceVariable, + urlVariable, + ]) + + dashboard.withPanels( + grafana.util.grid.makeGrid([ + row.new('Timestamps') + + row.withPanels([ + timestampComparison, + timestampComparisonRate, + ]), + ], panelWidth=12, panelHeight=7) + + + grafana.util.grid.makeGrid([ + row.new('Samples') + + row.withPanels([ + samplesRate + + panel.timeSeries.gridPos.withW(24), + ]), + row.new('Shards'), + ], panelWidth=24, panelHeight=7, startY=8) + + + grafana.util.grid.wrapPanels([ + currentShards + + panel.timeSeries.gridPos.withW(24), + maxShards, + minShards, + desiredShards, + ], panelWidth=8, panelHeight=7, startY=16) + + + grafana.util.grid.makeGrid([ + row.new('Shard Details') + + row.withPanels([ + shardsCapacity, + pendingSamples, + ]), + row.new('Segments') + + row.withPanels([ + walSegment, + queueSegment, + ]), + ], panelWidth=12, panelHeight=7, startY=24) + + + grafana.util.grid.makeGrid([ + row.new('Misc. 
Rates') + + row.withPanels([ + droppedSamples, + failedSamples, + retriedSamples, + enqueueRetries, + ]), + ], panelWidth=6, panelHeight=7, startY=40) + ), }, } diff --git a/documentation/prometheus-mixin/jsonnetfile.json b/documentation/prometheus-mixin/jsonnetfile.json index 1c64fd0151..2d56d91245 100644 --- a/documentation/prometheus-mixin/jsonnetfile.json +++ b/documentation/prometheus-mixin/jsonnetfile.json @@ -4,20 +4,11 @@ { "source": { "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" } }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "grafana-builder" - } - }, - "version": "master" + "version": "main" } ], "legacyImports": false From 9d6f88cb7300eb8c006942f8ce2560eca4e553c7 Mon Sep 17 00:00:00 2001 From: Fiona Liao Date: Thu, 9 Jan 2025 09:29:57 +0000 Subject: [PATCH 2/5] Add additional tests for operators over incompatible nhcb (#15787) * Add additional tests for operators over incompatible nhcb Signed-off-by: Fiona Liao --- .../testdata/native_histograms.test | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/promql/promqltest/testdata/native_histograms.test b/promql/promqltest/testdata/native_histograms.test index 6be298cf7d..414619d5cd 100644 --- a/promql/promqltest/testdata/native_histograms.test +++ b/promql/promqltest/testdata/native_histograms.test @@ -1128,6 +1128,39 @@ eval_warn range from 0 to 12m step 6m sum(metric) eval_warn range from 0 to 12m step 6m avg(metric) {} _ {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} _ +# Test incompatible schemas with additional aggregation operators +eval range from 0 to 12m step 6m count(metric) + {} 2 2 3 + +eval range from 0 to 12m step 6m group(metric) + {} 1 1 1 + +eval range from 0 to 12m step 6m count(limitk(1, metric)) + {} 1 1 1 + +eval range from 0 to 12m 
step 6m limitk(3, metric) + metric{series="1"} _ {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} + metric{series="2"} {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} _ {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} + metric{series="3"} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} + +eval range from 0 to 12m step 6m limit_ratio(1, metric) + metric{series="1"} _ {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} + metric{series="2"} {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} _ {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} + metric{series="3"} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} + +# Test incompatible schemas with and/or +eval range from 0 to 12m step 6m metric{series="1"} and ignoring(series) metric{series="2"} + metric{series="1"} _ _ {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} + +eval range from 0 to 12m step 6m metric{series="1"} or ignoring(series) metric{series="2"} + metric{series="1"} _ {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} + metric{series="2"} {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} _ _ + +# Test incompatible schemas with arithmetic binary operators +eval_warn range from 0 to 12m step 6m metric{series="2"} + ignoring (series) metric{series="3"} + +eval_warn range from 0 to 12m step 6m metric{series="2"} - ignoring (series) metric{series="3"} + clear load 1m From b3e30d52cecd2cb6b34c9ca74633fa6c13da0d24 Mon Sep 17 00:00:00 2001 From: Neeraj Gartia 
<80708727+NeerajGartia21@users.noreply.github.com> Date: Thu, 9 Jan 2025 21:08:42 +0530 Subject: [PATCH 3/5] [BUGFIX] PromQL: Fix `<aggr>_over_time` functions with histograms (#15711) fix aggr_over_time with histograms Signed-off-by: Neeraj Gartia --------- Signed-off-by: Neeraj Gartia --- promql/functions.go | 75 +++++++++++++--------- promql/promqltest/testdata/functions.test | 78 +++++++++++++++++++---- 2 files changed, 111 insertions(+), 42 deletions(-) diff --git a/promql/functions.go b/promql/functions.go index 5f31a3db18..2d809571d4 100644 --- a/promql/functions.go +++ b/promql/functions.go @@ -691,9 +691,15 @@ func funcLastOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNod // === mad_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) === func funcMadOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { - if len(vals[0].(Matrix)[0].Floats) == 0 { + samples := vals[0].(Matrix)[0] + var annos annotations.Annotations + if len(samples.Floats) == 0 { return enh.Out, nil } + if len(samples.Histograms) > 0 { + metricName := samples.Metric.Get(labels.MetricName) + annos.Add(annotations.NewHistogramIgnoredInMixedRangeInfo(metricName, args[0].PositionRange())) + } return aggrOverTime(vals, enh, func(s Series) float64 { values := make(vectorByValueHeap, 0, len(s.Floats)) for _, f := range s.Floats { @@ -705,18 +711,20 @@ func funcMadOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode values = append(values, Sample{F: math.Abs(f.F - median)}) } return quantile(0.5, values) - }), nil + }), annos } // === max_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) === func funcMaxOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { - if len(vals[0].(Matrix)[0].Floats) == 0 { - // TODO(beorn7): The passed values only contain - // histograms. max_over_time ignores histograms for now.
If - // there are only histograms, we have to return without adding - // anything to enh.Out. + samples := vals[0].(Matrix)[0] + var annos annotations.Annotations + if len(samples.Floats) == 0 { return enh.Out, nil } + if len(samples.Histograms) > 0 { + metricName := samples.Metric.Get(labels.MetricName) + annos.Add(annotations.NewHistogramIgnoredInMixedRangeInfo(metricName, args[0].PositionRange())) + } return aggrOverTime(vals, enh, func(s Series) float64 { maxVal := s.Floats[0].F for _, f := range s.Floats { @@ -725,18 +733,20 @@ func funcMaxOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode } } return maxVal - }), nil + }), annos } // === min_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) === func funcMinOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { - if len(vals[0].(Matrix)[0].Floats) == 0 { - // TODO(beorn7): The passed values only contain - // histograms. min_over_time ignores histograms for now. If - // there are only histograms, we have to return without adding - // anything to enh.Out. 
+ samples := vals[0].(Matrix)[0] + var annos annotations.Annotations + if len(samples.Floats) == 0 { return enh.Out, nil } + if len(samples.Histograms) > 0 { + metricName := samples.Metric.Get(labels.MetricName) + annos.Add(annotations.NewHistogramIgnoredInMixedRangeInfo(metricName, args[0].PositionRange())) + } return aggrOverTime(vals, enh, func(s Series) float64 { minVal := s.Floats[0].F for _, f := range s.Floats { @@ -745,7 +755,7 @@ func funcMinOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode } } return minVal - }), nil + }), annos } // === sum_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) === @@ -794,10 +804,6 @@ func funcQuantileOverTime(vals []parser.Value, args parser.Expressions, enh *Eva q := vals[0].(Vector)[0].F el := vals[1].(Matrix)[0] if len(el.Floats) == 0 { - // TODO(beorn7): The passed values only contain - // histograms. quantile_over_time ignores histograms for now. If - // there are only histograms, we have to return without adding - // anything to enh.Out. 
return enh.Out, nil } @@ -805,7 +811,10 @@ func funcQuantileOverTime(vals []parser.Value, args parser.Expressions, enh *Eva if math.IsNaN(q) || q < 0 || q > 1 { annos.Add(annotations.NewInvalidQuantileWarning(q, args[0].PositionRange())) } - + if len(el.Histograms) > 0 { + metricName := el.Metric.Get(labels.MetricName) + annos.Add(annotations.NewHistogramIgnoredInAggregationInfo(metricName, args[0].PositionRange())) + } values := make(vectorByValueHeap, 0, len(el.Floats)) for _, f := range el.Floats { values = append(values, Sample{F: f.F}) @@ -815,13 +824,15 @@ func funcQuantileOverTime(vals []parser.Value, args parser.Expressions, enh *Eva // === stddev_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) === func funcStddevOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { - if len(vals[0].(Matrix)[0].Floats) == 0 { - // TODO(beorn7): The passed values only contain - // histograms. stddev_over_time ignores histograms for now. If - // there are only histograms, we have to return without adding - // anything to enh.Out. 
+ samples := vals[0].(Matrix)[0] + var annos annotations.Annotations + if len(samples.Floats) == 0 { return enh.Out, nil } + if len(samples.Histograms) > 0 { + metricName := samples.Metric.Get(labels.MetricName) + annos.Add(annotations.NewHistogramIgnoredInMixedRangeInfo(metricName, args[0].PositionRange())) + } return aggrOverTime(vals, enh, func(s Series) float64 { var count float64 var mean, cMean float64 @@ -833,18 +844,20 @@ func funcStddevOverTime(vals []parser.Value, args parser.Expressions, enh *EvalN aux, cAux = kahanSumInc(delta*(f.F-(mean+cMean)), aux, cAux) } return math.Sqrt((aux + cAux) / count) - }), nil + }), annos } // === stdvar_over_time(Matrix parser.ValueTypeMatrix) (Vector, Annotations) === func funcStdvarOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { - if len(vals[0].(Matrix)[0].Floats) == 0 { - // TODO(beorn7): The passed values only contain - // histograms. stdvar_over_time ignores histograms for now. If - // there are only histograms, we have to return without adding - // anything to enh.Out. 
+ samples := vals[0].(Matrix)[0] + var annos annotations.Annotations + if len(samples.Floats) == 0 { return enh.Out, nil } + if len(samples.Histograms) > 0 { + metricName := samples.Metric.Get(labels.MetricName) + annos.Add(annotations.NewHistogramIgnoredInMixedRangeInfo(metricName, args[0].PositionRange())) + } return aggrOverTime(vals, enh, func(s Series) float64 { var count float64 var mean, cMean float64 @@ -856,7 +869,7 @@ func funcStdvarOverTime(vals []parser.Value, args parser.Expressions, enh *EvalN aux, cAux = kahanSumInc(delta*(f.F-(mean+cMean)), aux, cAux) } return (aux + cAux) / count - }), nil + }), annos } // === absent(Vector parser.ValueTypeVector) (Vector, Annotations) === diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test index 6d2ade3abc..7fc636450f 100644 --- a/promql/promqltest/testdata/functions.test +++ b/promql/promqltest/testdata/functions.test @@ -929,35 +929,58 @@ eval instant at 1m avg_over_time(metric[2m]) # Tests for stddev_over_time and stdvar_over_time. clear load 10s - metric 0 8 8 2 3 + metric 0 8 8 2 3 + metric_histogram{type="only_histogram"} {{schema:1 sum:2 count:3}}x5 + metric_histogram{type="mix"} 1 1 1 {{schema:1 sum:2 count:3}} {{schema:1 sum:2 count:3}} eval instant at 1m stdvar_over_time(metric[2m]) - {} 10.56 + {} 10.56 eval instant at 1m stddev_over_time(metric[2m]) - {} 3.249615 + {} 3.249615 eval instant at 1m stddev_over_time((metric[2m])) - {} 3.249615 + {} 3.249615 + +# Tests for stddev_over_time and stdvar_over_time with histograms. +eval instant at 1m stddev_over_time(metric_histogram{type="only_histogram"}[2m]) + #empty + +eval_info instant at 1m stddev_over_time(metric_histogram{type="mix"}[2m]) + {type="mix"} 0 + +eval instant at 1m stdvar_over_time(metric_histogram{type="only_histogram"}[2m]) + #empty + +eval_info instant at 1m stdvar_over_time(metric_histogram{type="mix"}[2m]) + {type="mix"} 0 # Tests for stddev_over_time and stdvar_over_time #4927. 
clear load 10s - metric 1.5990505637277868 1.5990505637277868 1.5990505637277868 + metric 1.5990505637277868 1.5990505637277868 1.5990505637277868 eval instant at 1m stdvar_over_time(metric[1m]) - {} 0 + {} 0 eval instant at 1m stddev_over_time(metric[1m]) - {} 0 + {} 0 # Tests for mad_over_time. clear load 10s - metric 4 6 2 1 999 1 2 + metric 4 6 2 1 999 1 2 + metric_histogram{type="only_histogram"} {{schema:1 sum:2 count:3}}x5 + metric_histogram{type="mix"} 1 1 1 {{schema:1 sum:2 count:3}} {{schema:1 sum:2 count:3}} eval instant at 70s mad_over_time(metric[70s]) - {} 1 + {} 1 + +eval instant at 70s mad_over_time(metric_histogram{type="only_histogram"}[70s]) + #empty + +eval_info instant at 70s mad_over_time(metric_histogram{type="mix"}[70s]) + {type="mix"} 0 # Tests for quantile_over_time clear @@ -966,6 +989,8 @@ load 10s data{test="two samples"} 0 1 data{test="three samples"} 0 1 2 data{test="uneven samples"} 0 1 4 + data_histogram{test="only histogram samples"} {{schema:0 sum:1 count:2}}x4 + data_histogram{test="mix samples"} 0 1 2 {{schema:0 sum:1 count:2}}x2 eval instant at 1m quantile_over_time(0, data[2m]) {test="two samples"} 0 @@ -1007,6 +1032,12 @@ eval_warn instant at 1m (quantile_over_time(2, (data[2m]))) {test="three samples"} +Inf {test="uneven samples"} +Inf +eval instant at 1m quantile_over_time(0.5, data_histogram{test="only histogram samples"}[2m]) + #empty + +eval_info instant at 1m quantile_over_time(0.5, data_histogram{test="mix samples"}[2m]) + {test="mix samples"} 1 + clear # Test time-related functions. 
@@ -1120,15 +1151,17 @@ load 5m eval_fail instant at 0m changes({__name__=~'testmetric1|testmetric2'}[5m]) -# Tests for *_over_time clear +# Tests for *_over_time load 10s data{type="numbers"} 2 0 3 data{type="some_nan"} 2 0 NaN data{type="some_nan2"} 2 NaN 1 data{type="some_nan3"} NaN 0 1 data{type="only_nan"} NaN NaN NaN + data_histogram{type="only_histogram"} {{schema:0 sum:1 count:2}} {{schema:0 sum:2 count:3}} {{schema:0 sum:3 count:4}} + data_histogram{type="mix_samples"} 0 1 {{schema:0 sum:1 count:2}} {{schema:0 sum:2 count:3}} eval instant at 1m min_over_time(data[2m]) {type="numbers"} 0 @@ -1137,6 +1170,12 @@ eval instant at 1m min_over_time(data[2m]) {type="some_nan3"} 0 {type="only_nan"} NaN +eval instant at 1m min_over_time(data_histogram{type="only_histogram"}[2m]) + #empty + +eval_info instant at 1m min_over_time(data_histogram{type="mix_samples"}[2m]) + {type="mix_samples"} 0 + eval instant at 1m max_over_time(data[2m]) {type="numbers"} 3 {type="some_nan"} 2 @@ -1144,12 +1183,29 @@ eval instant at 1m max_over_time(data[2m]) {type="some_nan3"} 1 {type="only_nan"} NaN -eval instant at 1m last_over_time(data[2m]) +eval instant at 1m max_over_time(data_histogram{type="only_histogram"}[2m]) + #empty + +eval_info instant at 1m max_over_time(data_histogram{type="mix_samples"}[2m]) + {type="mix_samples"} 1 + +eval instant at 1m last_over_time({__name__=~"data(_histogram)?"}[2m]) data{type="numbers"} 3 data{type="some_nan"} NaN data{type="some_nan2"} 1 data{type="some_nan3"} 1 data{type="only_nan"} NaN + data_histogram{type="only_histogram"} {{schema:0 sum:3 count:4}} + data_histogram{type="mix_samples"} {{schema:0 sum:2 count:3}} + +eval instant at 1m count_over_time({__name__=~"data(_histogram)?"}[2m]) + {type="numbers"} 3 + {type="some_nan"} 3 + {type="some_nan2"} 3 + {type="some_nan3"} 3 + {type="only_nan"} 3 + {type="only_histogram"} 3 + {type="mix_samples"} 4 clear From 6339989e25102f37a57030f4338b4850c8c5b30e Mon Sep 17 00:00:00 2001 From: Vandit Singh 
<107131545+Vandit1604@users.noreply.github.com> Date: Thu, 9 Jan 2025 21:57:39 +0530 Subject: [PATCH 4/5] web/api: Add a limit parameter to /query and /query_range (#15552) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add limit param to query and rangeQuery --------- Signed-off-by: Vandit Singh Signed-off-by: Vandit Singh <107131545+Vandit1604@users.noreply.github.com> Co-authored-by: Björn Rabenstein --- docs/querying/api.md | 2 + web/api/v1/api.go | 53 +++++++++- web/api/v1/api_test.go | 228 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 280 insertions(+), 3 deletions(-) diff --git a/docs/querying/api.md b/docs/querying/api.md index f1e7129303..e3f97886dc 100644 --- a/docs/querying/api.md +++ b/docs/querying/api.md @@ -86,6 +86,7 @@ URL query parameters: - `time=`: Evaluation timestamp. Optional. - `timeout=`: Evaluation timeout. Optional. Defaults to and is capped by the value of the `-query.timeout` flag. +- `limit=`: Maximum number of returned series. Doesn’t affect scalars or strings but truncates the number of series for matrices and vectors. Optional. 0 means disabled. The current server time is used if the `time` parameter is omitted. @@ -154,6 +155,7 @@ URL query parameters: - `step=`: Query resolution step width in `duration` format or float number of seconds. - `timeout=`: Evaluation timeout. Optional. Defaults to and is capped by the value of the `-query.timeout` flag. +- `limit=`: Maximum number of returned series. Optional. 0 means disabled. You can URL-encode these parameters directly in the request body by using the `POST` method and `Content-Type: application/x-www-form-urlencoded` header. 
This is useful when specifying a large diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 6e9c589087..4903f925cc 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -438,6 +438,10 @@ func (api *API) options(*http.Request) apiFuncResult { } func (api *API) query(r *http.Request) (result apiFuncResult) { + limit, err := parseLimitParam(r.FormValue("limit")) + if err != nil { + return invalidParamError(err, "limit") + } ts, err := parseTimeParam(r, "time", api.now()) if err != nil { return invalidParamError(err, "time") @@ -479,6 +483,15 @@ func (api *API) query(r *http.Request) (result apiFuncResult) { return apiFuncResult{nil, returnAPIError(res.Err), res.Warnings, qry.Close} } + warnings := res.Warnings + if limit > 0 { + var isTruncated bool + + res, isTruncated = truncateResults(res, limit) + if isTruncated { + warnings = warnings.Add(errors.New("results truncated due to limit")) + } + } // Optional stats field in response if parameter "stats" is not empty. sr := api.statsRenderer if sr == nil { @@ -490,7 +503,7 @@ func (api *API) query(r *http.Request) (result apiFuncResult) { ResultType: res.Value.Type(), Result: res.Value, Stats: qs, - }, nil, res.Warnings, qry.Close} + }, nil, warnings, qry.Close} } func (api *API) formatQuery(r *http.Request) (result apiFuncResult) { @@ -526,6 +539,10 @@ func extractQueryOpts(r *http.Request) (promql.QueryOpts, error) { } func (api *API) queryRange(r *http.Request) (result apiFuncResult) { + limit, err := parseLimitParam(r.FormValue("limit")) + if err != nil { + return invalidParamError(err, "limit") + } start, err := parseTime(r.FormValue("start")) if err != nil { return invalidParamError(err, "start") @@ -590,6 +607,16 @@ func (api *API) queryRange(r *http.Request) (result apiFuncResult) { return apiFuncResult{nil, returnAPIError(res.Err), res.Warnings, qry.Close} } + warnings := res.Warnings + if limit > 0 { + var isTruncated bool + + res, isTruncated = truncateResults(res, limit) + if isTruncated { + warnings 
= warnings.Add(errors.New("results truncated due to limit")) + } + } + // Optional stats field in response if parameter "stats" is not empty. sr := api.statsRenderer if sr == nil { @@ -601,7 +628,7 @@ func (api *API) queryRange(r *http.Request) (result apiFuncResult) { ResultType: res.Value.Type(), Result: res.Value, Stats: qs, - }, nil, res.Warnings, qry.Close} + }, nil, warnings, qry.Close} } func (api *API) queryExemplars(r *http.Request) apiFuncResult { @@ -2102,3 +2129,25 @@ func toHintLimit(limit int) int { } return limit } + +// truncateResults truncates result for queryRange() and query(). +// No truncation for other types(Scalars or Strings). +func truncateResults(result *promql.Result, limit int) (*promql.Result, bool) { + isTruncated := false + + switch v := result.Value.(type) { + case promql.Matrix: + if len(v) > limit { + result.Value = v[:limit] + isTruncated = true + } + case promql.Vector: + if len(v) > limit { + result.Value = v[:limit] + isTruncated = true + } + } + + // Return the modified result. Unchanged for other types. + return result, isTruncated +} diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index 175ed2e0f0..e6ca43508b 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -1164,6 +1164,49 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E }, }, }, + // Only matrix and vector responses are limited/truncated. String and scalar responses aren't truncated. + { + endpoint: api.query, + query: url.Values{ + "query": []string{"2"}, + "time": []string{"123.4"}, + "limit": []string{"1"}, + }, + response: &QueryData{ + ResultType: parser.ValueTypeScalar, + Result: promql.Scalar{ + V: 2, + T: timestamp.FromTime(start.Add(123*time.Second + 400*time.Millisecond)), + }, + }, + warningsCount: 0, + }, + // When limit = 0, limit is disabled. 
+ { + endpoint: api.query, + query: url.Values{ + "query": []string{"2"}, + "time": []string{"123.4"}, + "limit": []string{"0"}, + }, + response: &QueryData{ + ResultType: parser.ValueTypeScalar, + Result: promql.Scalar{ + V: 2, + T: timestamp.FromTime(start.Add(123*time.Second + 400*time.Millisecond)), + }, + }, + warningsCount: 0, + }, + { + endpoint: api.query, + query: url.Values{ + "query": []string{"2"}, + "time": []string{"123.4"}, + "limit": []string{"-1"}, + }, + errType: errorBadData, + }, { endpoint: api.query, query: url.Values{ @@ -1205,6 +1248,179 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E }, }, }, + { + endpoint: api.query, + query: url.Values{ + "query": []string{ + `label_replace(vector(42), "foo", "bar", "", "") or label_replace(vector(3.1415), "dings", "bums", "", "")`, + }, + "time": []string{"123.4"}, + "limit": []string{"2"}, + }, + warningsCount: 0, + responseAsJSON: `{ + "resultType": "vector", + "result": [ + { + "metric": { + "foo": "bar" + }, + "value": [123.4, "42"] + }, + { + "metric": { + "dings": "bums" + }, + "value": [123.4, "3.1415"] + } + ] + }`, + }, + { + endpoint: api.query, + query: url.Values{ + "query": []string{ + `label_replace(vector(42), "foo", "bar", "", "") or label_replace(vector(3.1415), "dings", "bums", "", "")`, + }, + "time": []string{"123.4"}, + "limit": []string{"1"}, + }, + warningsCount: 1, + responseAsJSON: `{ + "resultType": "vector", + "result": [ + { + "metric": { + "foo": "bar" + }, + "value": [123.4, "42"] + } + ] + }`, + }, + { + endpoint: api.query, + query: url.Values{ + "query": []string{ + `label_replace(vector(42), "foo", "bar", "", "") or label_replace(vector(3.1415), "dings", "bums", "", "")`, + }, + "time": []string{"123.4"}, + "limit": []string{"0"}, + }, + responseAsJSON: `{ + "resultType": "vector", + "result": [ + { + "metric": { + "foo": "bar" + }, + "value": [123.4, "42"] + }, + { + "metric": { + "dings": "bums" + }, + "value": [123.4, "3.1415"] + } 
+ ] + }`, + warningsCount: 0, + }, + // limit=0 means no limit. + { + endpoint: api.queryRange, + query: url.Values{ + "query": []string{ + `label_replace(vector(42), "foo", "bar", "", "") or label_replace(vector(3.1415), "dings", "bums", "", "")`, + }, + "start": []string{"0"}, + "end": []string{"2"}, + "step": []string{"1"}, + "limit": []string{"0"}, + }, + response: &QueryData{ + ResultType: parser.ValueTypeMatrix, + Result: promql.Matrix{ + promql.Series{ + Metric: labels.FromMap(map[string]string{"dings": "bums"}), + Floats: []promql.FPoint{ + {F: 3.1415, T: timestamp.FromTime(start)}, + {F: 3.1415, T: timestamp.FromTime(start.Add(1 * time.Second))}, + {F: 3.1415, T: timestamp.FromTime(start.Add(2 * time.Second))}, + }, + }, + promql.Series{ + Metric: labels.FromMap(map[string]string{"foo": "bar"}), + Floats: []promql.FPoint{ + {F: 42, T: timestamp.FromTime(start)}, + {F: 42, T: timestamp.FromTime(start.Add(1 * time.Second))}, + {F: 42, T: timestamp.FromTime(start.Add(2 * time.Second))}, + }, + }, + }, + }, + warningsCount: 0, + }, + { + endpoint: api.queryRange, + query: url.Values{ + "query": []string{ + `label_replace(vector(42), "foo", "bar", "", "") or label_replace(vector(3.1415), "dings", "bums", "", "")`, + }, + "start": []string{"0"}, + "end": []string{"2"}, + "step": []string{"1"}, + "limit": []string{"1"}, + }, + response: &QueryData{ + ResultType: parser.ValueTypeMatrix, + Result: promql.Matrix{ + promql.Series{ + Metric: labels.FromMap(map[string]string{"dings": "bums"}), + Floats: []promql.FPoint{ + {F: 3.1415, T: timestamp.FromTime(start)}, + {F: 3.1415, T: timestamp.FromTime(start.Add(1 * time.Second))}, + {F: 3.1415, T: timestamp.FromTime(start.Add(2 * time.Second))}, + }, + }, + }, + }, + warningsCount: 1, + }, + { + endpoint: api.queryRange, + query: url.Values{ + "query": []string{ + `label_replace(vector(42), "foo", "bar", "", "") or label_replace(vector(3.1415), "dings", "bums", "", "")`, + }, + "start": []string{"0"}, + "end": 
[]string{"2"}, + "step": []string{"1"}, + "limit": []string{"2"}, + }, + response: &QueryData{ + ResultType: parser.ValueTypeMatrix, + Result: promql.Matrix{ + promql.Series{ + Metric: labels.FromMap(map[string]string{"dings": "bums"}), + Floats: []promql.FPoint{ + {F: 3.1415, T: timestamp.FromTime(start)}, + {F: 3.1415, T: timestamp.FromTime(start.Add(1 * time.Second))}, + {F: 3.1415, T: timestamp.FromTime(start.Add(2 * time.Second))}, + }, + }, + promql.Series{ + Metric: labels.FromMap(map[string]string{"foo": "bar"}), + Floats: []promql.FPoint{ + {F: 42, T: timestamp.FromTime(start)}, + {F: 42, T: timestamp.FromTime(start.Add(1 * time.Second))}, + {F: 42, T: timestamp.FromTime(start.Add(2 * time.Second))}, + }, + }, + }, + }, + warningsCount: 0, + }, { endpoint: api.queryRange, query: url.Values{ @@ -1222,7 +1438,6 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E {F: 1, T: timestamp.FromTime(start.Add(1 * time.Second))}, {F: 2, T: timestamp.FromTime(start.Add(2 * time.Second))}, }, - // No Metric returned - use zero value for comparison. }, }, }, @@ -1235,6 +1450,17 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E }, responseAsJSON: `{"resultType":"vector","result":[]}`, }, + { + endpoint: api.queryRange, + query: url.Values{ + "query": []string{"bottomk(2, notExists)"}, + "start": []string{"0"}, + "end": []string{"2"}, + "step": []string{"1"}, + "limit": []string{"-1"}, + }, + errType: errorBadData, + }, // Test empty matrix result { endpoint: api.queryRange, From f030894c2cd124923030bd2cfa8a7b91a00544d6 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Thu, 9 Jan 2025 17:51:26 +0100 Subject: [PATCH 5/5] Fix issues raised by staticcheck (#15722) Fix issues raised by staticcheck We are not enabling staticcheck explicitly, though, because it has too many false positives. 
--------- Signed-off-by: Arve Knudsen --- cmd/prometheus/main.go | 6 +++--- cmd/promtool/main.go | 9 ++++----- promql/promqltest/test.go | 4 ++-- scrape/target.go | 4 ++-- storage/remote/metadata_watcher.go | 2 +- storage/remote/queue_manager.go | 2 +- tsdb/wlog/watcher.go | 2 +- web/api/v1/api.go | 2 +- web/api/v1/api_test.go | 2 +- 9 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 06f46f8d72..03c20dc52d 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -534,7 +534,7 @@ func main() { _, err := a.Parse(os.Args[1:]) if err != nil { - fmt.Fprintln(os.Stderr, fmt.Errorf("Error parsing command line arguments: %w", err)) + fmt.Fprintf(os.Stderr, "Error parsing command line arguments: %s\n", err) a.Usage(os.Args[1:]) os.Exit(2) } @@ -548,7 +548,7 @@ func main() { notifs.AddNotification(notifications.StartingUp) if err := cfg.setFeatureListOptions(logger); err != nil { - fmt.Fprintln(os.Stderr, fmt.Errorf("Error parsing feature list: %w", err)) + fmt.Fprintf(os.Stderr, "Error parsing feature list: %s\n", err) os.Exit(1) } @@ -1742,7 +1742,7 @@ func (s *readyStorage) WALReplayStatus() (tsdb.WALReplayStatus, error) { } // ErrNotReady is returned if the underlying scrape manager is not ready yet. -var ErrNotReady = errors.New("Scrape manager not ready") +var ErrNotReady = errors.New("scrape manager not ready") // ReadyScrapeManager allows a scrape manager to be retrieved. Even if it's set at a later point in time. 
type readyScrapeManager struct { diff --git a/cmd/promtool/main.go b/cmd/promtool/main.go index b52fe7cdbb..62a1d4f906 100644 --- a/cmd/promtool/main.go +++ b/cmd/promtool/main.go @@ -36,7 +36,7 @@ import ( "github.com/prometheus/client_golang/api" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil/promlint" - config_util "github.com/prometheus/common/config" + "github.com/prometheus/common/expfmt" "github.com/prometheus/common/model" "github.com/prometheus/common/promslog" "github.com/prometheus/common/version" @@ -45,7 +45,6 @@ import ( dto "github.com/prometheus/client_model/go" promconfig "github.com/prometheus/common/config" - "github.com/prometheus/common/expfmt" "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery" @@ -312,12 +311,12 @@ func main() { kingpin.Fatalf("Cannot set base auth in the server URL and use a http.config.file at the same time") } var err error - httpConfig, _, err := config_util.LoadHTTPConfigFile(httpConfigFilePath) + httpConfig, _, err := promconfig.LoadHTTPConfigFile(httpConfigFilePath) if err != nil { kingpin.Fatalf("Failed to load HTTP config file: %v", err) } - httpRoundTripper, err = promconfig.NewRoundTripperFromConfig(*httpConfig, "promtool", config_util.WithUserAgent("promtool/"+version.Version)) + httpRoundTripper, err = promconfig.NewRoundTripperFromConfig(*httpConfig, "promtool", promconfig.WithUserAgent("promtool/"+version.Version)) if err != nil { kingpin.Fatalf("Failed to create a new HTTP round tripper: %v", err) } @@ -702,7 +701,7 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin return ruleFiles, nil } -func checkTLSConfig(tlsConfig config_util.TLSConfig, checkSyntaxOnly bool) error { +func checkTLSConfig(tlsConfig promconfig.TLSConfig, checkSyntaxOnly bool) error { if len(tlsConfig.CertFile) > 0 && len(tlsConfig.KeyFile) == 0 { return fmt.Errorf("client cert file %q specified without client key 
file", tlsConfig.CertFile) } diff --git a/promql/promqltest/test.go b/promql/promqltest/test.go index 518164827a..5e0d9083cb 100644 --- a/promql/promqltest/test.go +++ b/promql/promqltest/test.go @@ -1419,8 +1419,8 @@ func (ll *LazyLoader) appendTill(ts int64) error { // WithSamplesTill loads the samples till given timestamp and executes the given function. func (ll *LazyLoader) WithSamplesTill(ts time.Time, fn func(error)) { - tsMilli := ts.Sub(time.Unix(0, 0).UTC()) / time.Millisecond - fn(ll.appendTill(int64(tsMilli))) + till := ts.Sub(time.Unix(0, 0).UTC()) / time.Millisecond + fn(ll.appendTill(int64(till))) } // QueryEngine returns the LazyLoader's query engine. diff --git a/scrape/target.go b/scrape/target.go index d05866f863..22cde01c05 100644 --- a/scrape/target.go +++ b/scrape/target.go @@ -295,12 +295,12 @@ func (t *Target) intervalAndTimeout(defaultInterval, defaultDuration time.Durati intervalLabel := t.labels.Get(model.ScrapeIntervalLabel) interval, err := model.ParseDuration(intervalLabel) if err != nil { - return defaultInterval, defaultDuration, fmt.Errorf("Error parsing interval label %q: %w", intervalLabel, err) + return defaultInterval, defaultDuration, fmt.Errorf("error parsing interval label %q: %w", intervalLabel, err) } timeoutLabel := t.labels.Get(model.ScrapeTimeoutLabel) timeout, err := model.ParseDuration(timeoutLabel) if err != nil { - return defaultInterval, defaultDuration, fmt.Errorf("Error parsing timeout label %q: %w", timeoutLabel, err) + return defaultInterval, defaultDuration, fmt.Errorf("error parsing timeout label %q: %w", timeoutLabel, err) } return time.Duration(interval), time.Duration(timeout), nil diff --git a/storage/remote/metadata_watcher.go b/storage/remote/metadata_watcher.go index 9306dcb4c2..d7f376c96a 100644 --- a/storage/remote/metadata_watcher.go +++ b/storage/remote/metadata_watcher.go @@ -38,7 +38,7 @@ type Watchable interface { type noopScrapeManager struct{} func (noop *noopScrapeManager) Get() 
(*scrape.Manager, error) { - return nil, errors.New("Scrape manager not ready") + return nil, errors.New("scrape manager not ready") } // MetadataWatcher watches the Scrape Manager for a given WriteMetadataTo. diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index 475c126eff..4b966059f6 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -2119,7 +2119,7 @@ func compressPayload(tmpbuf *[]byte, inp []byte, enc Compression) (compressed [] } return compressed, nil default: - return compressed, fmt.Errorf("Unknown compression scheme [%v]", enc) + return compressed, fmt.Errorf("unknown compression scheme [%v]", enc) } } diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index 6f1bc1df35..ca74a9ceaf 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -679,7 +679,7 @@ func (w *Watcher) readCheckpoint(checkpointDir string, readFn segmentReadFn) err // Ensure we read the whole contents of every segment in the checkpoint dir. 
segs, err := listSegments(checkpointDir) if err != nil { - return fmt.Errorf("Unable to get segments checkpoint dir: %w", err) + return fmt.Errorf("unable to get segments checkpoint dir: %w", err) } for _, segRef := range segs { size, err := getSegmentSize(checkpointDir, segRef.index) diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 4903f925cc..ea7d5c5fe4 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -2043,7 +2043,7 @@ func parseTimeParam(r *http.Request, paramName string, defaultValue time.Time) ( } result, err := parseTime(val) if err != nil { - return time.Time{}, fmt.Errorf("Invalid time value for '%s': %w", paramName, err) + return time.Time{}, fmt.Errorf("invalid time value for '%s': %w", paramName, err) } return result, nil } diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index e6ca43508b..37227d849d 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -4186,7 +4186,7 @@ func TestParseTimeParam(t *testing.T) { asTime: time.Time{}, asError: func() error { _, err := parseTime("baz") - return fmt.Errorf("Invalid time value for '%s': %w", "foo", err) + return fmt.Errorf("invalid time value for '%s': %w", "foo", err) }, }, },