Commit 5627559d authored by Ben Kochie, committed by John Skarbek

Update Prometheus metamon

* Remove obsolete Prometheus 1.x local storage alerts.
* Simplify queries.
* Add an alert for rule group evaluation taking longer than 70% of the
interval.
* Update the slow rule documentation.
parent 12f771ff
@@ -2,7 +2,7 @@ groups:
- name: prometheus-metamon.rules
rules:
- alert: PrometheusUnreachable
expr: up{job=~"prometheus.*", fqdn !~ ".*(gprd|gstg|gce.gitlab-runners|ops).*"} == 0
expr: up{job=~"prometheus.*"} == 0
for: 10m
labels:
pager: pagerduty
@@ -14,7 +14,7 @@ groups:
runbook: troubleshooting/prometheus-is-down.md
title: '{{$labels.job}} is unreachable'
- alert: PrometheusManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus.*", fqdn !~ ".*gprd.*"}[30m]) > 3
expr: changes(process_start_time_seconds{job=~"prometheus.*"}[30m]) > 3
for: 30m
labels:
pager: pagerduty
@@ -26,7 +26,9 @@ groups:
runbook: troubleshooting/prometheus-is-down.md
title: '{{$labels.job}} is restarting frequently'
- alert: PrometheusManyFileSDReadErrors
expr: rate(prometheus_sd_file_read_errors_total{job=~"prometheus.*"}[5m]) / rate(prometheus_sd_file_scan_duration_seconds_count{job=~"prometheus.*"}[5m])
expr: >
rate(prometheus_sd_file_read_errors_total[5m]) /
rate(prometheus_sd_file_scan_duration_seconds_count[5m])
* 100 > 5
for: 10m
labels:
@@ -38,8 +40,7 @@ groups:
runbook: troubleshooting/prometheus-file-sd-errors.md
title: '{{$labels.job}} has many file-SD read errors'
- alert: PrometheusRuleEvaluationSlow
expr: prometheus_evaluator_duration_seconds{job=~"prometheus.*",quantile="0.9"}
> 60
expr: prometheus_rule_evaluation_duration_seconds{quantile="0.9"} > 60
for: 10m
labels:
service: prometheus
@@ -49,56 +50,8 @@ groups:
latency of {{$value}}s completing rule evaluation cycles.'
runbook: troubleshooting/prometheus-slow-rule-eval.md
title: '{{$labels.job}} is evaluating rules too slowly'
- alert: PrometheusCheckpointingSlow
expr: avg_over_time(prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}[15m])
> prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000
for: 5m
labels:
service: prometheus
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} needs {{$value}}s on average
for each checkpoint.'
runbook: troubleshooting/prometheus-indexing-backlog.md
title: '{{$labels.job}} is checkpointing too slowly'
- alert: PrometheusIndexingBacklog
expr: prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"} / prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
* 100 > 10
for: 30m
labels:
service: prometheus
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} is backlogging on the
indexing queue for more than 30m. Queue is currently {{$value | printf `%.0f`}}%
full.'
runbook: troubleshooting/prometheus-indexing-backlog.md
title: '{{$labels.job}} is backlogging on the indexing queue'
- alert: PrometheusNotIngestingSamples
expr: rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m])
== 0
for: 5m
labels:
service: prometheus
severity: critical
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has not ingested any samples
in the last 10 minutes.'
runbook: troubleshooting/prometheus-not-ingesting.md
title: '{{$labels.job}} is not ingesting samples'
- alert: PrometheusPersistErrors
expr: rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m])
> 0
labels:
service: prometheus
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has encountered {{$value}}
persist errors per second in the last 10 minutes.'
runbook: troubleshooting/prometheus-persist-errors.md
title: '{{$labels.job}} has persist errors'
- alert: PrometheusNotificationsBacklog
expr: prometheus_notifications_queue_length{job=~"prometheus.*", fqdn !~ ".*gprd.*"} > 0
expr: prometheus_notifications_queue_length > 0
for: 10m
labels:
pager: pagerduty
@@ -111,7 +64,7 @@ groups:
runbook: troubleshooting/prometheus-notifications-backlog.md
title: '{{$labels.job}} is backlogging on the notifications queue'
- alert: PrometheusScrapingSlowly
expr: prometheus_target_interval_length_seconds{interval!~".*m.*",job=~"prometheus.*",quantile="0.9"}
expr: prometheus_target_interval_length_seconds{interval!~".*m.*",quantile="0.9"}
> 2 * 60
for: 10m
labels:
@@ -123,58 +76,8 @@ groups:
pool.'
runbook: troubleshooting/prometheus-slow-scrapes.md
title: '{{$labels.job}} is scraping targets slowly'
- alert: PrometheusStorageInconsistent
expr: prometheus_local_storage_inconsistencies_total{job=~"prometheus.*"} > 0
labels:
service: prometheus
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has detected a storage
inconsistency. A server restart is needed to initiate recovery.'
runbook: troubleshooting/prometheus-storage-inconsistent.md
title: '{{$labels.job}} has an inconsistent storage'
- alert: PrometheusPersistencePressureTooHigh
expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
> 0.8 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
3600 * 24) > 1
for: 30m
labels:
service: prometheus
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} is approaching critical
persistence pressure. Throttled ingestion expected within the next 24h.'
runbook: troubleshooting/prometheus-persistence-pressure-high.md
title: '{{$labels.job}} can not keep up persisting'
- alert: PrometheusPersistencePressureTooHigh
expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
> 0.85 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
3600 * 2) > 1
for: 30m
labels:
service: prometheus
severity: critical
annotations:
description: '{{$labels.job}} at {{$labels.instance}} is approaching critical
persistence pressure. Throttled ingestion expected within the next 2h.'
runbook: troubleshooting/prometheus-persistence-pressure-high.md
title: '{{$labels.job}} can not keep up persisting'
- alert: PrometheusSeriesMaintenanceStalled
expr: prometheus_local_storage_memory_series{job=~"prometheus.*"} / ON(job, instance)
rate(prometheus_local_storage_series_ops_total{job=~"prometheus.*",type="maintenance_in_memory"}[5m])
/ 3600 > 24 and ON(job, instance) prometheus_local_storage_rushed_mode == 1
for: 1h
labels:
service: prometheus
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} is maintaining memory
time series so slowly that it will take {{$value | printf `%.0f`}}h to complete
a full cycle. This will lead to persistence falling behind.'
runbook: troubleshooting/prometheus-slow-series-maintenance.md
title: '{{$labels.job}} is maintaining memory time series too slowly'
- alert: PrometheusInvalidConfigFile
expr: prometheus_config_last_reload_successful{job=~"prometheus.*", fqdn !~ ".*gprd.*"} == 0
expr: prometheus_config_last_reload_successful == 0
for: 30m
labels:
pager: pagerduty
@@ -185,15 +88,16 @@ groups:
is invalid and was therefore not reloaded.
runbook: troubleshooting/prometheus-invalid-config.md
title: '{{$labels.job}} has an invalid config'
- alert: PrometheusOutOfOrderSamplesDiscarded
expr: increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m])
> 0
for: 1h
- alert: PrometheusSlowRuleEvaluation
expr: >
(prometheus_rule_group_last_duration_seconds /
prometheus_rule_group_interval_seconds) * 100 > 70
for: 30m
labels:
service: prometheus
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has discarded {{$value}}
out-of-order samples over the last hour.'
runbook: troubleshooting/prometheus-out-of-order.md
title: '{{$labels.job}} is discarding out-of-order samples'
description: '{{$labels.job}} at {{$labels.instance}} rule group {{$labels.rule_group}}
is taking more than 70% of its evaluation interval over the last 30 minutes.'
runbook: troubleshooting/prometheus-slow-rule-eval.md
title: 'Prometheus has slow rule evaluations'
@@ -2,8 +2,8 @@
## Symptoms
Rule-based metrics are appearing with a lag, or not at all, because
Prometheus's rule evaluator takes a long time to complete a cycle.
Rule evaluations are executed in sequence on a per rule file/group basis.
Slow server performance or expensive rules can cause them to take too long to complete.
## Possible checks
@@ -12,11 +12,13 @@ developed over time. Did it recently increase by a lot? Perhaps the rule
evaluation got slower due to more time series. Check for a recent increase
in time series: `prometheus_local_storage_memory_series`.
Perhaps the Prometheus server is overloaded by other things or in general,
there might be too many expensive rules configured.
Perhaps the Prometheus server is overloaded by other things, or simply does not have enough memory or IO resources.
The rules may also be expensive, looking over a large number of metrics and/or samples.
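
As a quick check (a sketch built on the same Prometheus 2.x rule-group metrics that the `PrometheusSlowRuleEvaluation` alert uses), this query lists the rule groups that spend the largest share of their evaluation interval on a single evaluation:

```
# Percentage of the evaluation interval consumed by the last evaluation, per rule group.
topk(10,
  prometheus_rule_group_last_duration_seconds
  / prometheus_rule_group_interval_seconds
  * 100
)
```

Groups sitting above 70 here are the ones that will trigger the `PrometheusSlowRuleEvaluation` alert.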
## Resolution
Reduce the load on the Prometheus server by either reducing the number of
handled time series, the number of rules, rates of queries, or other causes
of load.
\ No newline at end of file
Reduce the load on the Prometheus server by:
* Reducing the number of rules in each rule group, so that the groups can be evaluated in parallel (see the sketch after this list).
* Reducing the number of series, or the number of samples, required to evaluate a rule.
* Increasing the memory or other node resources to speed up evaluations.
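
As a minimal sketch of the first option (the group names, recording rule names, and the `http_requests_total` metric are illustrative assumptions, not part of this repository), splitting one large group into two smaller ones lets Prometheus evaluate the groups concurrently, while the rules inside each group still run in sequence:

```
# Hypothetical example: two smaller groups instead of one large one.
# Prometheus evaluates groups concurrently, but rules within a group sequentially.
groups:
  - name: expensive-recordings-a
    interval: 1m
    rules:
      - record: instance:http_requests:rate5m
        expr: sum by (instance) (rate(http_requests_total[5m]))
  - name: expensive-recordings-b
    interval: 1m
    rules:
      - record: instance:http_errors:rate5m
        expr: sum by (instance) (rate(http_requests_total{status=~"5.."}[5m]))
```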