2020-10-20 02:42:30 -07:00
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet';
2019-10-17 15:40:58 -07:00
local dashboard = grafana.dashboard;
local row = grafana.row;
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local template = grafana.template;
2018-08-07 05:14:00 -07:00
{
2019-07-11 06:30:57 -07:00
grafanaDashboards+:: {
2018-08-07 05:14:00 -07:00
// Prometheus overview dashboard, built with the grafana-builder helpers (`g`).
// The title prefix, tags and refresh interval come from
// $._config.grafanaPrometheus. Every panel query is scoped by the `job` and
// `instance` multi-select template variables declared below.
'prometheus.json':
  g.dashboard(
    '%(prefix)sOverview' % $._config.grafanaPrometheus
  )
  .addMultiTemplate('job', 'prometheus_build_info', 'job')
  .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
  .addRow(
    g.row('Prometheus Stats')
    .addPanel(
      g.panel('Prometheus Stats') +
      // Query A supplies the job/instance/version columns (its value is
      // hidden); query B is time() minus the process start time.
      g.tablePanel([
        'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
        'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
      ], {
        job: { alias: 'Job' },
        instance: { alias: 'Instance' },
        version: { alias: 'Version' },
        'Value #A': { alias: 'Count', type: 'hidden' },
        'Value #B': { alias: 'Uptime' },
      })
    )
  )
  .addRow(
    g.row('Discovery')
    .addPanel(
      g.panel('Target Sync') +
      // `* 1e3` converts seconds to the milliseconds shown on the y-axis.
      g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
      { yaxes: g.yaxes('ms') }
    )
    .addPanel(
      g.panel('Targets') +
      g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
      g.stack
    )
  )
  .addRow(
    g.row('Retrieval')
    .addPanel(
      g.panel('Average Scrape Interval Duration') +
      g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
      { yaxes: g.yaxes('ms') }
    )
    .addPanel(
      g.panel('Scrape failures') +
      // Each failure counter carries the same job/instance selector as the
      // other panels so that the dashboard variables also filter this panel.
      g.queryPanel([
        'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total{job=~"$job",instance=~"$instance"}[1m]))',
        'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{job=~"$job",instance=~"$instance"}[1m]))',
        'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"$job",instance=~"$instance"}[1m]))',
        'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{job=~"$job",instance=~"$instance"}[1m]))',
        'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"$job",instance=~"$instance"}[1m]))',
      ], [
        'exceeded body size limit: {{job}}',
        'exceeded sample limit: {{job}}',
        'duplicate timestamp: {{job}}',
        'out of bounds: {{job}}',
        'out of order: {{job}}',
      ]) +
      g.stack
    )
    .addPanel(
      g.panel('Appended Samples') +
      g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
      g.stack
    )
  )
  .addRow(
    g.row('Storage')
    .addPanel(
      g.panel('Head Series') +
      g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
      g.stack
    )
    .addPanel(
      g.panel('Head Chunks') +
      g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
      g.stack
    )
  )
  .addRow(
    g.row('Query')
    .addPanel(
      g.panel('Query Rate') +
      g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
      g.stack,
    )
    .addPanel(
      g.panel('Stage Duration') +
      g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
      { yaxes: g.yaxes('ms') } +
      g.stack,
    )
  ) + {
    // Dashboard-level settings taken from the mixin configuration.
    tags: $._config.grafanaPrometheus.tags,
    refresh: $._config.grafanaPrometheus.refresh,
  },
2019-06-26 07:23:09 -07:00
// Remote write specific dashboard.
2019-06-17 15:02:42 -07:00
'prometheus-remote-write.json':
2020-08-25 06:59:41 -07:00
// Graph of the highest ingested timestamp minus the highest timestamp sent by
// each remote-write queue. The ingested-side metric is matched with
// `ignoring(remote_name, url)`, so only the sent side is filtered by the
// dashboard's `url` template variable. `!= 0` excludes sent-timestamp series
// whose value is 0.
local timestampComparison =
  graphPanel.new(
    'Highest Timestamp In vs. Highest Timestamp Sent',
    datasource='$datasource',
    span=6,
  )
  .addTarget(prometheus.target(
    |||
      (
        prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}
      -
        ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0)
      )
    |||,
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
  ));
2020-08-25 06:59:41 -07:00
// Graph of the 5m rate of the highest ingested timestamp minus the 5m rate of
// each queue's highest sent timestamp, clamped so it never goes below 0.
// The per-queue (sent) side is filtered by the dashboard's `url` template
// variable; the ingested side is matched with `ignoring(remote_name, url)`.
local timestampComparisonRate =
  graphPanel.new(
    'Rate[5m]',
    datasource='$datasource',
    span=6,
  )
  .addTarget(prometheus.target(
    |||
      clamp_min(
        rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
      -
        ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])
      , 0)
    |||,
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
  ));
// Graph of the 5m rate of samples in, minus the rate of succeeded samples,
// minus the rate of dropped samples, per remote-write queue.
// Each `a or b` pair queries two spellings of the same counter name.
// NOTE(review): presumably these are the old and new metric names across
// Prometheus versions — confirm.
// The per-queue terms are filtered by the dashboard's `url` template variable;
// the samples-in metric is matched with `ignoring(remote_name, url)`.
local samplesRate =
  graphPanel.new(
    'Rate, in vs. succeeded or dropped [5m]',
    datasource='$datasource',
    span=12,
  )
  .addTarget(prometheus.target(
    |||
      rate(
        prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
      -
        ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
      -
        (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
    |||,
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
2019-11-18 19:58:07 -08:00
// Full-width graph of prometheus_remote_storage_shards per queue, filtered by
// the dashboard's `cluster`, `instance` and `url` template variables.
local currentShards =
  graphPanel.new(
    'Current Shards',
    datasource='$datasource',
    span=12,
    min_span=6,
  )
  .addTarget(prometheus.target(
    'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
// Graph of prometheus_remote_storage_shards_max per queue, filtered by the
// dashboard's `cluster`, `instance` and `url` template variables.
local maxShards =
  graphPanel.new(
    'Max Shards',
    datasource='$datasource',
    span=4,
  )
  .addTarget(prometheus.target(
    'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
// Graph of prometheus_remote_storage_shards_min per queue, filtered by the
// dashboard's `cluster`, `instance` and `url` template variables.
local minShards =
  graphPanel.new(
    'Min Shards',
    datasource='$datasource',
    span=4,
  )
  .addTarget(prometheus.target(
    'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
// Graph of prometheus_remote_storage_shards_desired per queue, filtered by the
// dashboard's `cluster`, `instance` and `url` template variables.
local desiredShards =
  graphPanel.new(
    'Desired Shards',
    datasource='$datasource',
    span=4,
  )
  .addTarget(prometheus.target(
    'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
2019-10-17 15:40:58 -07:00
// Graph of prometheus_remote_storage_shard_capacity per queue, filtered by the
// dashboard's `cluster`, `instance` and `url` template variables.
local shardsCapacity =
  graphPanel.new(
    'Shard Capacity',
    datasource='$datasource',
    span=6,
  )
  .addTarget(prometheus.target(
    'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
2020-08-25 06:59:41 -07:00
2019-10-17 15:40:58 -07:00
// Graph of pending samples per queue. The `or` falls back to a second spelling
// of the metric name when the first returns nothing.
// NOTE(review): presumably old vs. new metric names — confirm.
// Both alternatives are filtered by the dashboard's `cluster`, `instance` and
// `url` template variables.
local pendingSamples =
  graphPanel.new(
    'Pending Samples',
    datasource='$datasource',
    span=6,
  )
  .addTarget(prometheus.target(
    'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
2020-08-25 06:59:41 -07:00
// Half-width graph of prometheus_tsdb_wal_segment_current for the selected
// cluster/instance. The Y axis uses the 'none' format.
local walSegment =
  local segmentQuery = prometheus.target(
    'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}',
    legendFormat='{{cluster}}:{{instance}}'
  );
  graphPanel.new(
    'TSDB Current Segment',
    span=6,
    formatY1='none',
    datasource='$datasource',
  ).addTarget(segmentQuery);
2020-08-25 06:59:41 -07:00
// Half-width graph of prometheus_wal_watcher_current_segment for the selected
// cluster/instance, one series per `consumer` label. The Y axis uses the
// 'none' format.
local queueSegment =
  local watcherQuery = prometheus.target(
    'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}',
    legendFormat='{{cluster}}:{{instance}} {{consumer}}'
  );
  graphPanel.new(
    'Remote Write Current Segment',
    span=6,
    formatY1='none',
    datasource='$datasource',
  ).addTarget(watcherQuery);
// Graph of the 5m rate of dropped samples per queue. The `or` falls back to a
// second spelling of the counter name when the first returns nothing.
// NOTE(review): presumably old vs. new metric names — confirm.
// Both alternatives are filtered by the dashboard's `cluster`, `instance` and
// `url` template variables.
local droppedSamples =
  graphPanel.new(
    'Dropped Samples',
    datasource='$datasource',
    span=3,
  )
  .addTarget(prometheus.target(
    'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
// Graph of the 5m rate of failed samples per queue. The `or` falls back to a
// second spelling of the counter name when the first returns nothing.
// NOTE(review): presumably old vs. new metric names — confirm.
// Both alternatives are filtered by the dashboard's `cluster`, `instance` and
// `url` template variables.
local failedSamples =
  graphPanel.new(
    'Failed Samples',
    datasource='$datasource',
    span=3,
  )
  .addTarget(prometheus.target(
    'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
// Graph of the 5m rate of retried samples per queue. The `or` falls back to a
// second spelling of the counter name when the first returns nothing.
// NOTE(review): presumably old vs. new metric names — confirm.
// Both alternatives are filtered by the dashboard's `cluster`, `instance` and
// `url` template variables.
local retriedSamples =
  graphPanel.new(
    'Retried Samples',
    datasource='$datasource',
    span=3,
  )
  .addTarget(prometheus.target(
    'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
// Graph of the 5m rate of prometheus_remote_storage_enqueue_retries_total per
// queue, filtered by the dashboard's `cluster`, `instance` and `url` template
// variables.
local enqueueRetries =
  graphPanel.new(
    'Enqueue Retries',
    datasource='$datasource',
    span=3,
  )
  .addTarget(prometheus.target(
    'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  ));
2020-12-16 09:49:06 -08:00
// Assemble the remote-write dashboard from the panels bound above.
// Template variables: `datasource` (Prometheus datasource picker), then
// `instance`, `cluster` and `url`, each populated by a label_values() query
// with an "All" option. The title prefix, tags and refresh interval come from
// $._config.grafanaPrometheus.
dashboard.new(
  title='%(prefix)sRemote Write' % $._config.grafanaPrometheus,
  editable=true
)
.addTemplate(
  // Datasource picker, written as a raw template object.
  {
    hide: 0,
    label: null,
    name: 'datasource',
    options: [],
    query: 'prometheus',
    refresh: 1,
    regex: '',
    type: 'datasource',
  },
)
.addTemplate(
  template.new(
    'instance',
    '$datasource',
    // Plain string: the query has no %(...) placeholders, so it is not
    // passed through the format operator.
    'label_values(prometheus_build_info, instance)',
    refresh='time',
    current={
      selected: true,
      text: 'All',
      value: '$__all',
    },
    includeAll=true,
  )
)
.addTemplate(
  template.new(
    'cluster',
    '$datasource',
    'label_values(kube_pod_container_info{image=~".*prometheus.*"}, cluster)',
    refresh='time',
    current={
      selected: true,
      text: 'All',
      value: '$__all',
    },
    includeAll=true,
  )
)
.addTemplate(
  template.new(
    'url',
    '$datasource',
    // Options are narrowed by the cluster/instance selections above.
    'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)',
    refresh='time',
    includeAll=true,
  )
)
.addRow(
  row.new('Timestamps')
  .addPanel(timestampComparison)
  .addPanel(timestampComparisonRate)
)
.addRow(
  row.new('Samples')
  .addPanel(samplesRate)
)
.addRow(
  row.new(
    'Shards'
  )
  .addPanel(currentShards)
  .addPanel(maxShards)
  .addPanel(minShards)
  .addPanel(desiredShards)
)
.addRow(
  row.new('Shard Details')
  .addPanel(shardsCapacity)
  .addPanel(pendingSamples)
)
.addRow(
  row.new('Segments')
  .addPanel(walSegment)
  .addPanel(queueSegment)
)
.addRow(
  row.new('Misc. Rates')
  .addPanel(droppedSamples)
  .addPanel(failedSamples)
  .addPanel(retriedSamples)
  .addPanel(enqueueRetries)
) + {
  // Dashboard-level settings taken from the mixin configuration.
  tags: $._config.grafanaPrometheus.tags,
  refresh: $._config.grafanaPrometheus.refresh,
},
2019-06-26 07:23:09 -07:00
},
2018-08-07 05:14:00 -07:00
}