prometheus/documentation/prometheus-mixin/dashboards.libsonnet
Jan Horstmann afde4707c5 Update mixin dashboard
Update and rewrite the mixin dashboards to use the grafonnet ([1])
library.
Grafana has deprecated Angular plugins ([2]), which grafonnet-lib ([3])
is built on, and plans to remove them in Grafana version 12.
Additionally, grafonnet-lib itself is deprecated and unmaintained in
favor of grafonnet.
The mixin dashboards have therefore been updated to use grafonnet.

Closes: https://github.com/prometheus/prometheus/issues/14404

[1] https://github.com/grafana/grafonnet
[2] https://grafana.com/docs/grafana/latest/developers/angular_deprecation/
[3] https://github.com/grafana/grafonnet-lib

Signed-off-by: Jan Horstmann <horstmann@osism.tech>
2024-12-13 16:32:45 +01:00
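
In practice the rewrite swaps grafonnet-lib's constructor-argument style for grafonnet's chained builder functions. A minimal sketch of the difference, assuming both libraries are vendored on the jsonnet search path (the grafonnet-lib call is only illustrative; the grafonnet calls mirror ones used in the file below):

{
  // grafonnet-lib (deprecated): panel options are constructor arguments.
  old: (import 'grafonnet/grafana.libsonnet').graphPanel.new('Head Series', datasource='$datasource'),

  // grafonnet: panel options are added via chained builder functions.
  new:
    local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
    local panel = grafana.panel;
    panel.timeSeries.new('Head Series')
    + panel.timeSeries.queryOptions.withDatasource('prometheus', '$datasource')
    + panel.timeSeries.standardOptions.withMin(0),
}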


local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local dashboard = grafana.dashboard;
local prometheus = grafana.query.prometheus;
local variable = dashboard.variable;
local panel = grafana.panel;
local row = panel.row;
{
grafanaDashboards+:: {
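// Shared time-series panel options: common datasource, fill/point/tooltip defaults, plus an optional normal-stacking mix-in.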
local panelTimeSeriesStdOptions =
{}
+ panel.timeSeries.queryOptions.withDatasource('prometheus', '$datasource')
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
+ panel.timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+ panel.timeSeries.options.tooltip.withMode('multi')
,
local panelTimeSeriesStacking =
{}
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(100)
+ panel.timeSeries.fieldConfig.defaults.custom.withLineWidth(0)
+ panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal')
,
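// Prometheus overview dashboard.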
'prometheus.json':
local showMultiCluster = $._config.showMultiCluster;
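// Template variables; the cluster variable is only added to the dashboard when showMultiCluster is set.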
local datasourceVariable =
variable.datasource.new('datasource', 'prometheus')
+ variable.datasource.generalOptions.withLabel('Data source')
+ variable.datasource.generalOptions.withCurrent('default')
+ variable.datasource.generalOptions.showOnDashboard.withLabelAndValue()
;
local clusterVariable =
variable.query.new('cluster')
+ variable.query.generalOptions.withLabel('cluster')
+ variable.query.withDatasourceFromVariable(datasourceVariable)
+ variable.query.refresh.onTime()
+ variable.query.withSort(type='alphabetical', asc=false)
+ variable.query.selectionOptions.withIncludeAll(true, '.+')
+ variable.query.selectionOptions.withMulti(true)
+ variable.query.generalOptions.withCurrent('$__all')
+ variable.query.queryTypes.withLabelValues($._config.clusterLabel, metric='prometheus_build_info{%(prometheusSelector)s}' % $._config)
+ variable.datasource.generalOptions.showOnDashboard.withLabelAndValue()
;
local jobVariable =
variable.query.new('job')
+ variable.query.generalOptions.withLabel('job')
+ variable.query.withDatasourceFromVariable(datasourceVariable)
+ variable.query.refresh.onTime()
+ variable.query.withSort(type='alphabetical', asc=false)
+ variable.query.selectionOptions.withIncludeAll(true, '.+')
+ variable.query.selectionOptions.withMulti(true)
+ if showMultiCluster then
variable.query.queryTypes.withLabelValues('job', metric='prometheus_build_info{cluster=~"$cluster"}')
else
variable.query.queryTypes.withLabelValues('job', metric='prometheus_build_info{%(prometheusSelector)s}' % $._config)
;
local instanceVariable =
variable.query.new('instance')
+ variable.query.generalOptions.withLabel('instance')
+ variable.query.withDatasourceFromVariable(datasourceVariable)
+ variable.query.refresh.onTime()
+ variable.query.withSort(type='alphabetical', asc=false)
+ variable.query.selectionOptions.withIncludeAll(true, '.+')
+ variable.query.selectionOptions.withMulti(true)
+ if showMultiCluster then
variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{cluster=~"$cluster", job=~"$job"}')
else
variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{job=~"$job"}')
;
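// Stats table listing build versions and uptime per job/instance (and per cluster when showMultiCluster is set).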
local prometheusStats =
panel.table.new('Prometheus Stats')
+ panel.table.queryOptions.withDatasource('prometheus', '$datasource')
+ panel.table.standardOptions.withUnit('short')
+ panel.table.standardOptions.withDecimals(2)
+ panel.table.standardOptions.withDisplayName('')
+ panel.table.standardOptions.withOverrides([
panel.table.standardOptions.override.byName.new('Time')
+ panel.table.standardOptions.override.byName.withProperty('displayName', 'Time')
+ panel.table.standardOptions.override.byName.withProperty('custom.align', null)
+ panel.table.standardOptions.override.byName.withProperty('custom.hidden', 'true'),
panel.table.standardOptions.override.byName.new('cluster')
+ panel.table.standardOptions.override.byName.withProperty('custom.align', null)
+ panel.table.standardOptions.override.byName.withProperty('unit', 'short')
+ panel.table.standardOptions.override.byName.withProperty('decimals', 2)
+ if showMultiCluster then panel.table.standardOptions.override.byName.withProperty('displayName', 'Cluster') else {},
panel.table.standardOptions.override.byName.new('job')
+ panel.table.standardOptions.override.byName.withProperty('custom.align', null)
+ panel.table.standardOptions.override.byName.withProperty('unit', 'short')
+ panel.table.standardOptions.override.byName.withProperty('decimals', 2)
+ panel.table.standardOptions.override.byName.withProperty('displayName', 'Job'),
panel.table.standardOptions.override.byName.new('instance')
+ panel.table.standardOptions.override.byName.withProperty('displayName', 'Instance')
+ panel.table.standardOptions.override.byName.withProperty('custom.align', null)
+ panel.table.standardOptions.override.byName.withProperty('unit', 'short')
+ panel.table.standardOptions.override.byName.withProperty('decimals', 2),
panel.table.standardOptions.override.byName.new('version')
+ panel.table.standardOptions.override.byName.withProperty('displayName', 'Version')
+ panel.table.standardOptions.override.byName.withProperty('custom.align', null)
+ panel.table.standardOptions.override.byName.withProperty('unit', 'short')
+ panel.table.standardOptions.override.byName.withProperty('decimals', 2),
panel.table.standardOptions.override.byName.new('Value #A')
+ panel.table.standardOptions.override.byName.withProperty('displayName', 'Count')
+ panel.table.standardOptions.override.byName.withProperty('custom.align', null)
+ panel.table.standardOptions.override.byName.withProperty('unit', 'short')
+ panel.table.standardOptions.override.byName.withProperty('decimals', 2)
+ panel.table.standardOptions.override.byName.withProperty('custom.hidden', 'true'),
panel.table.standardOptions.override.byName.new('Value #B')
+ panel.table.standardOptions.override.byName.withProperty('displayName', 'Uptime')
+ panel.table.standardOptions.override.byName.withProperty('custom.align', null)
+ panel.table.standardOptions.override.byName.withProperty('unit', 's'),
])
+ if showMultiCluster then
panel.table.queryOptions.withTargets([
prometheus.new(
'$datasource',
'count by (cluster, job, instance, version) (prometheus_build_info{cluster=~"$cluster", job=~"$job", instance=~"$instance"})'
)
+ prometheus.withFormat('table')
+ prometheus.withInstant(true)
+ prometheus.withLegendFormat(''),
prometheus.new(
'$datasource',
'max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~"$cluster", job=~"$job", instance=~"$instance"})'
)
+ prometheus.withFormat('table')
+ prometheus.withInstant(true)
+ prometheus.withLegendFormat(''),
])
else
panel.table.queryOptions.withTargets([
prometheus.new(
'$datasource',
'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})'
)
+ prometheus.withFormat('table')
+ prometheus.withInstant(true)
+ prometheus.withLegendFormat(''),
prometheus.new(
'$datasource',
'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})'
)
+ prometheus.withFormat('table')
+ prometheus.withInstant(true)
+ prometheus.withLegendFormat(''),
])
;
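// Time-series panels; each one switches its queries and legend formats on showMultiCluster.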
local targetSync =
panel.timeSeries.new('Target Sync')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panel.timeSeries.standardOptions.withUnit('ms')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[5m])) by (cluster, job, scrape_job, instance) * 1e3'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{scrape_job}}'),
])
;
local targets =
panel.timeSeries.new('Targets')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panelTimeSeriesStacking
+ panel.timeSeries.standardOptions.withUnit('short')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~"$cluster", job=~"$job",instance=~"$instance"})'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}}'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('Targets'),
])
;
local averageScrapeIntervalDuration =
panel.timeSeries.new('Average Scrape Interval Duration')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panel.timeSeries.standardOptions.withUnit('ms')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) * 1e3'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{cluster}}:{{job}}:{{instance}} {{interval}} configured'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{interval}} configured'),
])
;
local scrapeFailures =
panel.timeSeries.new('Scrape failures')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panelTimeSeriesStacking
+ panel.timeSeries.standardOptions.withUnit('ms')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('exceeded body size limit: {{cluster}} {{job}} {{instance}}'),
prometheus.new(
'$datasource',
'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('exceeded sample limit: {{cluster}} {{job}} {{instance}}'),
prometheus.new(
'$datasource',
'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('duplicate timestamp: {{cluster}} {{job}} {{instance}}'),
prometheus.new(
'$datasource',
'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('out of bounds: {{cluster}} {{job}} {{instance}}'),
prometheus.new(
'$datasource',
'sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('out of order: {{cluster}} {{job}} {{instance}}'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('exceeded body size limit: {{job}}'),
prometheus.new(
'$datasource',
'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('exceeded sample limit: {{job}}'),
prometheus.new(
'$datasource',
'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('duplicate timestamp: {{job}}'),
prometheus.new(
'$datasource',
'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('out of bounds: {{job}}'),
prometheus.new(
'$datasource',
'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('out of order: {{job}}'),
])
;
local appendedSamples =
panel.timeSeries.new('Appended Samples')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panelTimeSeriesStacking
+ panel.timeSeries.standardOptions.withUnit('short')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_tsdb_head_samples_appended_total{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}}'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{job}} {{instance}}'),
])
;
local headSeries =
panel.timeSeries.new('Head Series')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panelTimeSeriesStacking
+ panel.timeSeries.standardOptions.withUnit('short')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_tsdb_head_series{cluster=~"$cluster",job=~"$job",instance=~"$instance"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}} head series'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{job}} {{instance}} head series'),
])
;
local headChunks =
panel.timeSeries.new('Head Chunks')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panelTimeSeriesStacking
+ panel.timeSeries.standardOptions.withUnit('short')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_tsdb_head_chunks{cluster=~"$cluster",job=~"$job",instance=~"$instance"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}} head chunks'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{job}} {{instance}} head chunks'),
])
;
local queryRate =
panel.timeSeries.new('Query Rate')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panelTimeSeriesStacking
+ panel.timeSeries.standardOptions.withUnit('short')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_engine_query_duration_seconds_count{cluster=~"$cluster",job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{cluster}} {{job}} {{instance}}'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{job}} {{instance}}'),
])
;
local stageDuration =
panel.timeSeries.new('Stage Duration')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withSort('desc')
+ panel.timeSeries.standardOptions.withMin(0)
+ panelTimeSeriesStacking
+ panel.timeSeries.standardOptions.withUnit('ms')
+ if showMultiCluster then
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster", job=~"$job",instance=~"$instance"}) * 1e3'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{slice}}'),
])
else
panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3'
)
+ prometheus.withFormat('time_series')
+ prometheus.withLegendFormat('{{slice}}'),
])
;
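// Assemble the overview dashboard: variables plus rows laid out with the grid helpers.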
dashboard.new('%(prefix)sOverview' % $._config.grafanaPrometheus)
+ dashboard.time.withFrom('now-1h')
+ dashboard.withTags($._config.grafanaPrometheus.tags)
+ dashboard.timepicker.withRefreshIntervals($._config.grafanaPrometheus.refresh)
+ dashboard.withVariables(std.prune([
datasourceVariable,
if showMultiCluster then clusterVariable,
jobVariable,
instanceVariable,
]))
+ dashboard.withPanels(
grafana.util.grid.makeGrid([
row.new('Prometheus Stats')
+ row.withPanels([
prometheusStats,
]),
], panelWidth=24, panelHeight=7)
+
grafana.util.grid.makeGrid([
row.new('Discovery')
+ row.withPanels([
targetSync,
targets,
]),
], panelWidth=12, panelHeight=7, startY=8)
+
grafana.util.grid.makeGrid([
row.new('Retrieval')
+ row.withPanels([
averageScrapeIntervalDuration,
scrapeFailures,
appendedSamples,
]),
], panelWidth=8, panelHeight=7, startY=16)
+
grafana.util.grid.makeGrid([
row.new('Storage')
+ row.withPanels([
headSeries,
headChunks,
]),
row.new('Query')
+ row.withPanels([
queryRate,
stageDuration,
]),
], panelWidth=12, panelHeight=7, startY=24)
),
// Remote write specific dashboard.
'prometheus-remote-write.json':
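// Template variables: datasource, cluster, instance and remote write url.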
local datasourceVariable =
variable.datasource.new('datasource', 'prometheus')
+ variable.datasource.generalOptions.withCurrent('default')
+ variable.datasource.generalOptions.showOnDashboard.withLabelAndValue()
;
local clusterVariable =
variable.query.new('cluster')
+ variable.query.withDatasourceFromVariable(datasourceVariable)
+ variable.query.refresh.onTime()
+ variable.query.selectionOptions.withIncludeAll(true)
+ variable.query.generalOptions.withCurrent('$__all')
+ variable.query.queryTypes.withLabelValues($._config.clusterLabel, metric='prometheus_build_info')
+ variable.datasource.generalOptions.showOnDashboard.withLabelAndValue()
;
local instanceVariable =
variable.query.new('instance')
+ variable.query.withDatasourceFromVariable(datasourceVariable)
+ variable.query.refresh.onTime()
+ variable.query.selectionOptions.withIncludeAll(true)
+ variable.query.queryTypes.withLabelValues('instance', metric='prometheus_build_info{cluster=~"$cluster"}')
;
local urlVariable =
variable.query.new('url')
+ variable.query.withDatasourceFromVariable(datasourceVariable)
+ variable.query.refresh.onTime()
+ variable.query.selectionOptions.withIncludeAll(true)
+ variable.query.queryTypes.withLabelValues('url', metric='prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}')
;
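// Panels covering remote write delay (highest timestamp in vs. sent), sample rates, shard counts, WAL/queue segment progress, and dropped/failed/retried samples.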
local timestampComparison =
panel.timeSeries.new('Highest Timestamp In vs. Highest Timestamp Sent')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
|||
(
prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}
-
ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0)
)
|||
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local timestampComparisonRate =
panel.timeSeries.new('Rate[5m]')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
|||
clamp_min(
rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
-
ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])
, 0)
|||
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local samplesRate =
panel.timeSeries.new('Rate, in vs. succeeded or dropped [5m]')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
|||
rate(
prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
-
ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
-
(rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
|||
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local currentShards =
panel.timeSeries.new('Current Shards')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local maxShards =
panel.timeSeries.new('Max Shards')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance", url=~"$url"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local minShards =
panel.timeSeries.new('Min Shards')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local desiredShards =
panel.timeSeries.new('Desired Shards')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local shardsCapacity =
panel.timeSeries.new('Shard Capacity')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local pendingSamples =
panel.timeSeries.new('Pending Samples')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.standardOptions.withUnit('short')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local walSegment =
panel.timeSeries.new('TSDB Current Segment')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withMode('single')
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0)
+ panel.timeSeries.standardOptions.withUnit('none')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}}'),
]);
local queueSegment =
panel.timeSeries.new('Remote Write Current Segment')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withMode('single')
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0)
+ panel.timeSeries.standardOptions.withUnit('none')
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{consumer}}'),
]);
local droppedSamples =
panel.timeSeries.new('Dropped Samples')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withMode('single')
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0)
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local failedSamples =
panel.timeSeries.new('Failed Samples')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withMode('single')
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0)
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local retriedSamples =
panel.timeSeries.new('Retried Samples')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withMode('single')
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0)
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
local enqueueRetries =
panel.timeSeries.new('Enqueue Retries')
+ panelTimeSeriesStdOptions
+ panel.timeSeries.options.tooltip.withMode('single')
+ panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0)
+ panel.timeSeries.queryOptions.withTargets([
prometheus.new(
'$datasource',
'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])'
)
+ prometheus.withFormat('time_series')
+ prometheus.withIntervalFactor(2)
+ prometheus.withLegendFormat('{{cluster}}:{{instance}} {{remote_name}}:{{url}}'),
]);
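// Assemble the remote write dashboard rows.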
dashboard.new('%(prefix)sRemote Write' % $._config.grafanaPrometheus)
+ dashboard.time.withFrom('now-1h')
+ dashboard.withTags($._config.grafanaPrometheus.tags)
+ dashboard.timepicker.withRefreshIntervals($._config.grafanaPrometheus.refresh)
+ dashboard.withVariables([
datasourceVariable,
clusterVariable,
instanceVariable,
urlVariable,
])
+ dashboard.withPanels(
grafana.util.grid.makeGrid([
row.new('Timestamps')
+ row.withPanels([
timestampComparison,
timestampComparisonRate,
]),
], panelWidth=12, panelHeight=7)
+
grafana.util.grid.makeGrid([
row.new('Samples')
+ row.withPanels([
samplesRate
+ panel.timeSeries.gridPos.withW(24),
]),
row.new('Shards'),
], panelWidth=24, panelHeight=7, startY=8)
+
grafana.util.grid.wrapPanels([
currentShards
+ panel.timeSeries.gridPos.withW(24),
maxShards,
minShards,
desiredShards,
], panelWidth=8, panelHeight=7, startY=16)
+
grafana.util.grid.makeGrid([
row.new('Shard Details')
+ row.withPanels([
shardsCapacity,
pendingSamples,
]),
row.new('Segments')
+ row.withPanels([
walSegment,
queueSegment,
]),
], panelWidth=12, panelHeight=7, startY=24)
+
grafana.util.grid.makeGrid([
row.new('Misc. Rates')
+ row.withPanels([
droppedSamples,
failedSamples,
retriedSamples,
enqueueRetries,
]),
], panelWidth=6, panelHeight=7, startY=40)
),
},
}