From afde4707c52ccce477c5dd10cdd2af05c641369c Mon Sep 17 00:00:00 2001 From: Jan Horstmann Date: Mon, 4 Nov 2024 09:06:52 +0100 Subject: [PATCH] Update mixin dashboard Update and rewrite the mixin dashboards to use the grafonnet ([1]) library. Grafana has deprecated angular plugins ([2]) as used by grafonnet-lib ([3]) with removal pending for grafana version 12. Additionally grafonnet-lib is deprecated/unmaintained in favor of grafonnet. Therefore the mixin dashboards have been updated to use grafonnet. Closes: https://github.com/prometheus/prometheus/issues/14404 [1] https://github.com/grafana/grafonnet [2] https://grafana.com/docs/grafana/latest/developers/angular_deprecation/ [3] https://github.com/grafana/grafonnet-lib Signed-off-by: Jan Horstmann --- .../prometheus-mixin/dashboards.libsonnet | 1185 +++++++++++------ .../prometheus-mixin/jsonnetfile.json | 15 +- 2 files changed, 789 insertions(+), 411 deletions(-) diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet index 2bdd168cc..22b8c92e6 100644 --- a/documentation/prometheus-mixin/dashboards.libsonnet +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -1,438 +1,825 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet'; +local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; local dashboard = grafana.dashboard; -local row = grafana.row; -local singlestat = grafana.singlestat; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; -local tablePanel = grafana.tablePanel; -local template = grafana.template; +local prometheus = grafana.query.prometheus; +local variable = dashboard.variable; +local panel = grafana.panel; +local row = panel.row; + { grafanaDashboards+:: { + + local panelTimeSeriesStdOptions = + {} + + 
panel.timeSeries.queryOptions.withDatasource('prometheus', '$datasource') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + panel.timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + panel.timeSeries.options.tooltip.withMode('multi') + , + + local panelTimeSeriesStacking = + {} + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(100) + + panel.timeSeries.fieldConfig.defaults.custom.withLineWidth(0) + + panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + , + 'prometheus.json': + local showMultiCluster = $._config.showMultiCluster; - local dashboard = g.dashboard( - '%(prefix)sOverview' % $._config.grafanaPrometheus - ); - local templatedDashboard = if showMultiCluster then - dashboard - .addMultiTemplate('cluster', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, $._config.clusterLabel) - .addMultiTemplate('job', 'prometheus_build_info{cluster=~"$cluster"}', 'job') - .addMultiTemplate('instance', 'prometheus_build_info{cluster=~"$cluster", job=~"$job"}', 'instance') - else - dashboard - .addMultiTemplate('job', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, 'job') - .addMultiTemplate('instance', 'prometheus_build_info{job=~"$job"}', 'instance'); - templatedDashboard - .addRow( - g.row('Prometheus Stats') - .addPanel( - g.panel('Prometheus Stats') + - g.tablePanel(if showMultiCluster then [ - 'count by (cluster, job, instance, version) (prometheus_build_info{cluster=~"$cluster", job=~"$job", instance=~"$instance"})', - 'max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~"$cluster", job=~"$job", instance=~"$instance"})', - ] else [ - 'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})', - 'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})', - ], { - cluster: { alias: if showMultiCluster then 'Cluster' else '' }, - job: { alias: 'Job' }, - instance: { 
alias: 'Instance' }, - version: { alias: 'Version' }, - 'Value #A': { alias: 'Count', type: 'hidden' }, - 'Value #B': { alias: 'Uptime', type: 'number', unit: 's' }, - }) - ) - ) - .addRow( - g.row('Discovery') - .addPanel( - g.panel('Target Sync') + - g.queryPanel(if showMultiCluster then 'sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[5m])) by (cluster, job, scrape_job, instance) * 1e3' - else 'sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', - if showMultiCluster then '{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}' - else '{{scrape_job}}') + - { yaxes: g.yaxes('ms') } - ) - .addPanel( - g.panel('Targets') + - g.queryPanel(if showMultiCluster then 'sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~"$cluster", job=~"$job",instance=~"$instance"})' - else 'sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', - if showMultiCluster then '{{cluster}}:{{job}}:{{instance}}' - else 'Targets') + - g.stack - ) - ) - .addRow( - g.row('Retrieval') - .addPanel( - g.panel('Average Scrape Interval Duration') + - g.queryPanel(if showMultiCluster then 'rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) * 1e3' - else 'rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', - if showMultiCluster then '{{cluster}}:{{job}}:{{instance}} {{interval}} configured' - else '{{interval}} configured') + - { yaxes: g.yaxes('ms') } - ) - .addPanel( - g.panel('Scrape failures') + - g.queryPanel(if showMultiCluster then [ - 'sum by (cluster, job, instance) 
(rate(prometheus_target_scrapes_exceeded_body_size_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))', - ] else [ - 'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))', - 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))', - ], if showMultiCluster then [ - 'exceeded body size limit: {{cluster}} {{job}} {{instance}}', - 'exceeded sample limit: {{cluster}} {{job}} {{instance}}', - 'duplicate timestamp: {{cluster}} {{job}} {{instance}}', - 'out of bounds: {{cluster}} {{job}} {{instance}}', - 'out of order: {{cluster}} {{job}} {{instance}}', - ] else [ - 'exceeded body size limit: {{job}}', - 'exceeded sample limit: {{job}}', - 'duplicate timestamp: {{job}}', - 'out of bounds: {{job}}', - 'out of order: {{job}}', - ]) + - g.stack - ) - .addPanel( - g.panel('Appended Samples') + - g.queryPanel(if showMultiCluster then 'rate(prometheus_tsdb_head_samples_appended_total{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m])' - else 
'rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', - if showMultiCluster then '{{cluster}} {{job}} {{instance}}' - else '{{job}} {{instance}}') + - g.stack - ) - ) - .addRow( - g.row('Storage') - .addPanel( - g.panel('Head Series') + - g.queryPanel(if showMultiCluster then 'prometheus_tsdb_head_series{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' - else 'prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', - if showMultiCluster then '{{cluster}} {{job}} {{instance}} head series' - else '{{job}} {{instance}} head series') + - g.stack - ) - .addPanel( - g.panel('Head Chunks') + - g.queryPanel(if showMultiCluster then 'prometheus_tsdb_head_chunks{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' - else 'prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', - if showMultiCluster then '{{cluster}} {{job}} {{instance}} head chunks' - else '{{job}} {{instance}} head chunks') + - g.stack - ) - ) - .addRow( - g.row('Query') - .addPanel( - g.panel('Query Rate') + - g.queryPanel(if showMultiCluster then 'rate(prometheus_engine_query_duration_seconds_count{cluster=~"$cluster",job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])' - else 'rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', - if showMultiCluster then '{{cluster}} {{job}} {{instance}}' - else '{{job}} {{instance}}') + - g.stack, - ) - .addPanel( - g.panel('Stage Duration') + - g.queryPanel(if showMultiCluster then 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster", job=~"$job",instance=~"$instance"}) * 1e3' - else 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', - if showMultiCluster then '{{slice}}' - else '{{slice}}') + - { yaxes: g.yaxes('ms') } + - g.stack, - ) - ) + { - tags: $._config.grafanaPrometheus.tags, - refresh: $._config.grafanaPrometheus.refresh, 
- }, + + local datasourceVariable = + variable.datasource.new('datasource', 'prometheus') + + variable.datasource.generalOptions.withLabel('Data source') + + variable.datasource.generalOptions.withCurrent('default') + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local clusterVariable = + variable.query.new('cluster') + + variable.query.generalOptions.withLabel('cluster') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.+') + + variable.query.selectionOptions.withMulti(true) + + variable.query.generalOptions.withCurrent('$__all') + + variable.query.queryTypes.withLabelValues($._config.clusterLabel, metric='prometheus_build_info{%(prometheusSelector)s}' % $._config) + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local jobVariable = + variable.query.new('job') + + variable.query.generalOptions.withLabel('job') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.+') + + variable.query.selectionOptions.withMulti(true) + + if showMultiCluster then + variable.query.queryTypes.withLabelValues('job', metric='prometheus_build_info{cluster=~"$cluster"}') + else + variable.query.queryTypes.withLabelValues('job', metric='prometheus_build_info{%(prometheusSelector)s}' % $._config) + ; + + local instanceVariable = + variable.query.new('instance') + + variable.query.generalOptions.withLabel('instance') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.+') + + variable.query.selectionOptions.withMulti(true) + 
+ if showMultiCluster then + variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{cluster=~"$cluster", job=~"$job"}') + else + variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{job=~"$job"}') + ; + + local prometheusStats = + panel.table.new('Prometheus Stats') + + panel.table.queryOptions.withDatasource('prometheus', '$datasource') + + panel.table.standardOptions.withUnit('short') + + panel.table.standardOptions.withDecimals(2) + + panel.table.standardOptions.withDisplayName('') + + panel.table.standardOptions.withOverrides([ + panel.table.standardOptions.override.byName.new('Time') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Time') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('custom.hidden', 'true'), + panel.table.standardOptions.override.byName.new('cluster') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2) + + if showMultiCluster then panel.table.standardOptions.override.byName.withProperty('displayName', 'Cluster') else {}, + panel.table.standardOptions.override.byName.new('job') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2) + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Job'), + panel.table.standardOptions.override.byName.new('instance') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Instance') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + 
panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2), + panel.table.standardOptions.override.byName.new('version') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Version') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2), + panel.table.standardOptions.override.byName.new('Value #A') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Count') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 'short') + + panel.table.standardOptions.override.byName.withProperty('decimals', 2) + + panel.table.standardOptions.override.byName.withProperty('custom.hidden', 'true'), + panel.table.standardOptions.override.byName.new('Value #B') + + panel.table.standardOptions.override.byName.withProperty('displayName', 'Uptime') + + panel.table.standardOptions.override.byName.withProperty('custom.align', null) + + panel.table.standardOptions.override.byName.withProperty('unit', 's'), + ]) + + if showMultiCluster then + panel.table.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'count by (cluster, job, instance, version) (prometheus_build_info{cluster=~"$cluster", job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + prometheus.new( + '$datasource', + 'max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~"$cluster", job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + ]) + else + panel.table.queryOptions.withTargets([ + 
prometheus.new( + '$datasource', + 'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + prometheus.new( + '$datasource', + 'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})' + ) + + prometheus.withFormat('table') + + prometheus.withInstant(true) + + prometheus.withLegendFormat(''), + ]) + ; + + local targetSync = + panel.timeSeries.new('Target Sync') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panel.timeSeries.standardOptions.withUnit('ms') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[5m])) by (cluster, job, scrape_job, instance) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{scrape_job}}'), + ]) + ; + + local targets = + panel.timeSeries.new('Targets') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~"$cluster", job=~"$job",instance=~"$instance"})' + ) + + 
prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('Targets'), + ]) + ; + + local averageScrapeIntervalDuration = + panel.timeSeries.new('Average Scrape Interval Duration') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panel.timeSeries.standardOptions.withUnit('ms') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}} {{interval}} configured'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{interval}} configured'), + ]) + ; + + local scrapeFailures = + panel.timeSeries.new('Scrape failures') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + /* the targets are rate(..._total[1m]) counters, i.e. failures per second and not durations, so no 'ms' unit; the panel this replaces used the default axis */ panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) 
(rate(prometheus_target_scrapes_exceeded_body_size_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded body size limit: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded sample limit: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('duplicate timestamp: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of bounds: {{cluster}} {{job}} {{instance}}'), + prometheus.new( + '$datasource', + 'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of order: {{cluster}} {{job}} {{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded body size limit: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))' + ) + + 
prometheus.withFormat('time_series') + + prometheus.withLegendFormat('exceeded sample limit: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('duplicate timestamp: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of bounds: {{job}}'), + prometheus.new( + '$datasource', + 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('out of order: {{job}}'), + ]) + ; + + local appendedSamples = + panel.timeSeries.new('Appended Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_tsdb_head_samples_appended_total{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}}'), + ]) + ; + + local headSeries = + panel.timeSeries.new('Head Series') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if 
showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_series{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}} head series'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}} head series'), + ]) + ; + + local headChunks = + panel.timeSeries.new('Head Chunks') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_chunks{cluster=~"$cluster",job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}} head chunks'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}} head chunks'), + ]) + ; + + local queryRate = + panel.timeSeries.new('Query Rate') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('short') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_engine_query_duration_seconds_count{cluster=~"$cluster",job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])' 
+ ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{job}} {{instance}}'), + ]) + ; + + local stageDuration = + panel.timeSeries.new('Stage Duration') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withSort('desc') + + panel.timeSeries.standardOptions.withMin(0) + + panelTimeSeriesStacking + + panel.timeSeries.standardOptions.withUnit('ms') + + if showMultiCluster then + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster", job=~"$job",instance=~"$instance"}) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{slice}}'), + ]) + else + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3' + ) + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{slice}}'), + ]) + ; + + dashboard.new('%(prefix)sOverview' % $._config.grafanaPrometheus) + + dashboard.time.withFrom('now-1h') + + dashboard.withTags($._config.grafanaPrometheus.tags) + + /* the configured value is the dashboard auto-refresh interval: the code this replaces assigned it to the top-level refresh field, so it must not be turned into the refresh-picker dropdown choices */ dashboard.withRefresh($._config.grafanaPrometheus.refresh) + + dashboard.withVariables(std.prune([ + datasourceVariable, + if showMultiCluster then clusterVariable, + jobVariable, + instanceVariable, + ])) + + dashboard.withPanels( + grafana.util.grid.makeGrid([ + row.new('Prometheus Stats') + + row.withPanels([ + prometheusStats, + ]), + ], panelWidth=24, panelHeight=7) + + + grafana.util.grid.makeGrid([ + row.new('Discovery') + + 
row.withPanels([ + targetSync, + targets, + ]), + ], panelWidth=12, panelHeight=7, startY=8) + + + grafana.util.grid.makeGrid([ + row.new('Retrieval') + + row.withPanels([ + averageScrapeIntervalDuration, + scrapeFailures, + appendedSamples, + ]), + ], panelWidth=8, panelHeight=7, startY=16) + + + grafana.util.grid.makeGrid([ + row.new('Storage') + + row.withPanels([ + headSeries, + headChunks, + ]), + row.new('Query') + + row.withPanels([ + queryRate, + stageDuration, + ]), + ], panelWidth=12, panelHeight=7, startY=24) + ), // Remote write specific dashboard. 'prometheus-remote-write.json': + + local datasourceVariable = + variable.datasource.new('datasource', 'prometheus') + + variable.datasource.generalOptions.withCurrent('default') + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local clusterVariable = + variable.query.new('cluster') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.selectionOptions.withIncludeAll(true) + + variable.query.generalOptions.withCurrent('$__all') + + variable.query.queryTypes.withLabelValues($._config.clusterLabel, metric='prometheus_build_info') + + variable.datasource.generalOptions.showOnDashboard.withLabelAndValue() + ; + + local instanceVariable = + variable.query.new('instance') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.selectionOptions.withIncludeAll(true) + + variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{cluster=~"$cluster"}') + ; + + local urlVariable = + variable.query.new('url') + + variable.query.withDatasourceFromVariable(datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.selectionOptions.withIncludeAll(true) + + variable.query.queryTypes.withLabelValues('url', metric='prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}') + ; + local timestampComparison 
= - graphPanel.new( - 'Highest Timestamp In vs. Highest Timestamp Sent', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - ||| - ( - prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - - - ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0) - ) - |||, - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}', - )); + panel.timeSeries.new('Highest Timestamp In vs. Highest Timestamp Sent') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + ( + prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} + - + ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0) + ) + ||| + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local timestampComparisonRate = - graphPanel.new( - 'Rate[5m]', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - ||| - clamp_min( - rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - - - ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) - , 0) - |||, - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}', - )); + panel.timeSeries.new('Rate[5m]') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + 
clamp_min( + rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) + - + ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) + , 0) + ||| + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local samplesRate = - graphPanel.new( - 'Rate, in vs. succeeded or dropped [5m]', - datasource='$datasource', - span=12, - ) - .addTarget(prometheus.target( - ||| - rate( - prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - - - ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) - - - (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) - |||, - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Rate, in vs. 
succeeded or dropped [5m]') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + rate( + prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m]) + - + ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) + - + (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])) + ||| + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local currentShards = - graphPanel.new( - 'Current Shards', - datasource='$datasource', - span=12, - min_span=6, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Current Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local maxShards = - graphPanel.new( - 'Max Shards', - datasource='$datasource', - span=4, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_max{cluster=~"$cluster", 
instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Max Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local minShards = - graphPanel.new( - 'Min Shards', - datasource='$datasource', - span=4, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Min Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local desiredShards = - graphPanel.new( - 'Desired Shards', - datasource='$datasource', - span=4, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Desired Shards') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + 
prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local shardsCapacity = - graphPanel.new( - 'Shard Capacity', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); - + panel.timeSeries.new('Shard Capacity') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local pendingSamples = - graphPanel.new( - 'Pending Samples', - datasource='$datasource', - span=6, - ) - .addTarget(prometheus.target( - 'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Pending Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.standardOptions.withUnit('short') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local walSegment = - graphPanel.new( 
- 'TSDB Current Segment', - datasource='$datasource', - span=6, - formatY1='none', - ) - .addTarget(prometheus.target( - 'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}' - )); + panel.timeSeries.new('TSDB Current Segment') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.standardOptions.withUnit('none') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}}'), + ]); local queueSegment = - graphPanel.new( - 'Remote Write Current Segment', - datasource='$datasource', - span=6, - formatY1='none', - ) - .addTarget(prometheus.target( - 'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}} {{consumer}}' - )); + panel.timeSeries.new('Remote Write Current Segment') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.standardOptions.withUnit('none') + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{consumer}}'), + ]); local droppedSamples = - graphPanel.new( - 'Dropped Samples', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or 
rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Dropped Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local failedSamples = - graphPanel.new( - 'Failed Samples', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Failed Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), 
+ ]); local retriedSamples = - graphPanel.new( - 'Retried Samples', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Retried Samples') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); local enqueueRetries = - graphPanel.new( - 'Enqueue Retries', - datasource='$datasource', - span=3, - ) - .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])', - legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}' - )); + panel.timeSeries.new('Enqueue Retries') + + panelTimeSeriesStdOptions + + panel.timeSeries.options.tooltip.withMode('single') + + panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0) + + panel.timeSeries.queryOptions.withTargets([ + prometheus.new( + '$datasource', + 'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])' + ) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + 
prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'), + ]); - dashboard.new( - title='%(prefix)sRemote Write' % $._config.grafanaPrometheus, - editable=true - ) - .addTemplate( - { - hide: 0, - label: null, - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addTemplate( - template.new( - 'cluster', - '$datasource', - 'label_values(prometheus_build_info, cluster)' % $._config, - refresh='time', - current={ - selected: true, - text: 'All', - value: '$__all', - }, - includeAll=true, - ) - ) - .addTemplate( - template.new( - 'instance', - '$datasource', - 'label_values(prometheus_build_info{cluster=~"$cluster"}, instance)' % $._config, - refresh='time', - current={ - selected: true, - text: 'All', - value: '$__all', - }, - includeAll=true, - ) - ) - .addTemplate( - template.new( - 'url', - '$datasource', - 'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)' % $._config, - refresh='time', - includeAll=true, - ) - ) - .addRow( - row.new('Timestamps') - .addPanel(timestampComparison) - .addPanel(timestampComparisonRate) - ) - .addRow( - row.new('Samples') - .addPanel(samplesRate) - ) - .addRow( - row.new( - 'Shards' - ) - .addPanel(currentShards) - .addPanel(maxShards) - .addPanel(minShards) - .addPanel(desiredShards) - ) - .addRow( - row.new('Shard Details') - .addPanel(shardsCapacity) - .addPanel(pendingSamples) - ) - .addRow( - row.new('Segments') - .addPanel(walSegment) - .addPanel(queueSegment) - ) - .addRow( - row.new('Misc. 
Rates') - .addPanel(droppedSamples) - .addPanel(failedSamples) - .addPanel(retriedSamples) - .addPanel(enqueueRetries) - ) + { - tags: $._config.grafanaPrometheus.tags, - refresh: $._config.grafanaPrometheus.refresh, - }, + dashboard.new('%(prefix)sRemote Write' % $._config.grafanaPrometheus) + + dashboard.time.withFrom('now-1h') + + dashboard.withTags($._config.grafanaPrometheus.tags) + + dashboard.timepicker.withRefreshIntervals($._config.grafanaPrometheus.refresh) + + dashboard.withVariables([ + datasourceVariable, + clusterVariable, + instanceVariable, + urlVariable, + ]) + + dashboard.withPanels( + grafana.util.grid.makeGrid([ + row.new('Timestamps') + + row.withPanels([ + timestampComparison, + timestampComparisonRate, + ]), + ], panelWidth=12, panelHeight=7) + + + grafana.util.grid.makeGrid([ + row.new('Samples') + + row.withPanels([ + samplesRate + + panel.timeSeries.gridPos.withW(24), + ]), + row.new('Shards'), + ], panelWidth=24, panelHeight=7, startY=8) + + + grafana.util.grid.wrapPanels([ + currentShards + + panel.timeSeries.gridPos.withW(24), + maxShards, + minShards, + desiredShards, + ], panelWidth=8, panelHeight=7, startY=16) + + + grafana.util.grid.makeGrid([ + row.new('Shard Details') + + row.withPanels([ + shardsCapacity, + pendingSamples, + ]), + row.new('Segments') + + row.withPanels([ + walSegment, + queueSegment, + ]), + ], panelWidth=12, panelHeight=7, startY=24) + + + grafana.util.grid.makeGrid([ + row.new('Misc. 
Rates') + + row.withPanels([ + droppedSamples, + failedSamples, + retriedSamples, + enqueueRetries, + ]), + ], panelWidth=6, panelHeight=7, startY=40) + ), }, } diff --git a/documentation/prometheus-mixin/jsonnetfile.json b/documentation/prometheus-mixin/jsonnetfile.json index 1c64fd015..2d56d9124 100644 --- a/documentation/prometheus-mixin/jsonnetfile.json +++ b/documentation/prometheus-mixin/jsonnetfile.json @@ -4,20 +4,11 @@ { "source": { "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" } }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "grafana-builder" - } - }, - "version": "master" + "version": "main" } ], "legacyImports": false