Unverified Commit b031e37a authored by Ben Kochie
Browse files

Split up Gitaly rules into separate groups

Split the Gitaly rule groups to reduce execution time per group.
parent 5627559d
groups:
- name: Gitaly
- name: Gitaly grpc handled
rules:
# Per-second rate of grpc_server_handled_total over a 1m window, summed with
# the fqdn, instance and grpc_code labels aggregated away.
- record: gitaly:grpc_server_handled_total:rate1m
expr: sum(rate(grpc_server_handled_total[1m])) without (fqdn, instance, grpc_code)
......@@ -15,6 +15,51 @@ groups:
expr: >
sum without (grpc_code, grpc_method, grpc_service, grpc_type)
(rate(grpc_server_handled_total{grpc_code!="OK"}[1m]))
# Fires when the recorded 1m error rate, for environments matching "g?prd"
# (i.e. "prd" or "gprd"), stays above 2 for 5 minutes.
# Labelled channel=gitaly, severity=critical.
# NOTE(review): the description templates grpc_code and grpc_method; confirm
# the error_rate1m recording rule (not fully visible here) keeps both labels.
- alert: GitalyErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} > 2
for: 5m
labels:
channel: gitaly
severity: critical
annotations:
description: Gitaly {{$labels.grpc_code}} error rate for the last 5 minutes is over 2 for {{$labels.grpc_method}}.
Check Gitaly logs and consider disabling that method.
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
# Fires when the recorded per-instance 1m error rate, for environments
# matching "g?prd", stays above 5 for 5 minutes.
# Labelled channel=gitaly, severity=critical.
# NOTE(review): the description templates the instance label; confirm the
# instance_error_rate1m recording rule keeps it.
- alert: GitalyInstanceErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:instance_error_rate1m{environment=~"g?prd"}
> 5
for: 5m
labels:
channel: gitaly
severity: critical
annotations:
description: Gitaly error rate for the last 5 minutes is over 5 on {{$labels.instance}}.
Check Gitaly logs and consider disabling it on that host.
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
# Fires when the recorded 1m error rate stays, for 5 minutes, above the 12h
# average error rate plus two times the 12h standard deviation of the 1m
# error rate. All three selectors are limited to environments matching "g?prd".
# Labelled channel=gitaly, severity=warn.
# NOTE(review): the dashboard link hard-codes var-environment=prd while the
# expr also matches "gprd" — confirm the link is right for both.
- alert: GitalyMethodErrorRateOutlier
expr: >
gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} >
(
gitaly:grpc_server_handled_total:error_avg_rate12h{environment=~"g?prd"}
+
(2 * gitaly:grpc_server_handled_total:error_rate1m_stddev_over_time12h{environment=~"g?prd"})
)
for: 5m
labels:
channel: gitaly
severity: warn
annotations:
description: >
The {{$labels.grpc_code}} error rate on {{ $labels.grpc_method }} is outside normal
values over a 12 hour period (95% confidence).
dashboard: "https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?var-method={{ $labels.grpc_method }}&var-environment=prd"
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly: Error rate on {{ $labels.grpc_method }} is unusually high compared with a 12 hour average'
- name: Gitaly grpc buckets
rules:
- record: gitaly:grpc_server_handling_seconds_bucket:rate1m
expr: >
sum without (instance, grpc_service, grpc_type)
......@@ -51,32 +96,30 @@ groups:
expr: >
sum without (grpc_method, grpc_type, grpc_service, grpc_code)
(rate(grpc_server_handled_total{grpc_code!="OK"}[1m]))
# Sum of gitaly_rate_limiting_queued, keeping only the environment,
# grpc_method and job labels.
- record: gitaly:gitaly_rate_limiting_queued:grpc_method
expr: sum(gitaly_rate_limiting_queued) by (environment, grpc_method, job)
# Alerts
# Fires when the recorded 1m error rate, for environments matching "g?prd"
# (i.e. "prd" or "gprd"), stays above 2 for 5 minutes.
# Labelled channel=gitaly, severity=critical.
# NOTE(review): the description templates grpc_code and grpc_method; confirm
# the error_rate1m recording rule (not fully visible here) keeps both labels.
- alert: GitalyErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} > 2
for: 5m
labels:
channel: gitaly
severity: critical
annotations:
description: Gitaly {{$labels.grpc_code}} error rate for the last 5 minutes is over 2 for {{$labels.grpc_method}}.
Check Gitaly logs and consider disabling that method.
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
- alert: GitalyInstanceErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:instance_error_rate1m{environment=~"g?prd"}
> 5
- alert: GitalyLatencyOutlier
expr: avg(gitaly:grpc_server_handling_seconds:avg5m{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"})
BY (grpc_method) > ON(grpc_method) GROUP_LEFT() (avg(gitaly:grpc_server_handling_seconds:avg24h{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"})
BY (grpc_method) + 2 * avg(gitaly:grpc_server_handling_seconds:avg5m_stddev_over_time24h)
BY (grpc_method))
for: 5m
labels:
channel: gitaly
severity: critical
severity: warn
annotations:
description: Gitaly error rate for the last 5 minutes is over 5 on {{$labels.instance}}.
Check Gitaly logs and consider disabling it on that host.
# The GitalyLatencyOutlier expr compares the 5m average handling time with the
# 24h average plus two 24h standard deviations, so this text describes latency
# over 24 hours (it previously said "error rate" and "12 hour period").
description: The latency on the {{ $labels.grpc_method }} endpoint is outside
normal values over a 24 hour period (95% confidence). Check https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?from=now-1h&to=now&orgId=1&var-method={{
$labels.grpc_method }}&var-tier=stor&var-type=gitaly&var-environment=prd&refresh=5m
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
title: 'Gitaly: Latency on the Gitaly {{ $labels.grpc_method }} is unusually
high compared with a 24 hour average'
# Rule group containing only the rate-limiting queue recording rule.
- name: Gitaly rate limiting
rules:
# Sum of gitaly_rate_limiting_queued, keeping only the environment,
# grpc_method and job labels.
- record: gitaly:gitaly_rate_limiting_queued:grpc_method
expr: sum(gitaly_rate_limiting_queued) by (environment, grpc_method, job)
- name: Gitaly misc alerts
rules:
- alert: GitalyFileServerDown
expr: up{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"} == 0
for: 1m
......@@ -132,38 +175,3 @@ groups:
for details of versions deployed across the fleet.
runbook: troubleshooting/gitaly-version-mismatch.md
title: 'Gitaly: multiple versions of Gitaly are currently running in production'
# Fires when the recorded 1m error rate stays, for 5 minutes, above the 12h
# average error rate plus two times the 12h standard deviation of the 1m
# error rate. All three selectors are limited to environments matching "g?prd".
# Labelled channel=gitaly, severity=warn.
# NOTE(review): the dashboard link hard-codes var-environment=prd while the
# expr also matches "gprd" — confirm the link is right for both.
- alert: GitalyMethodErrorRateOutlier
expr: >
gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} >
(
gitaly:grpc_server_handled_total:error_avg_rate12h{environment=~"g?prd"}
+
(2 * gitaly:grpc_server_handled_total:error_rate1m_stddev_over_time12h{environment=~"g?prd"})
)
for: 5m
labels:
channel: gitaly
severity: warn
annotations:
description: >
The {{$labels.grpc_code}} error rate on {{ $labels.grpc_method }} is outside normal
values over a 12 hour period (95% confidence).
dashboard: "https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?var-method={{ $labels.grpc_method }}&var-environment=prd"
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly: Error rate on {{ $labels.grpc_method }} is unusually high compared with a 12 hour average'
# Fires when, per grpc_method, the average of the recorded 5m handling-time
# series stays, for 5 minutes, above the average of the 24h series plus two
# times the average 24h standard deviation of the 5m series.
# Labelled channel=gitaly, severity=warn.
# The description now says "latency" and "24 hour period" to match the expr
# and the title; it previously carried error-rate / 12 hour wording.
# NOTE(review): the stddev selector has no environment/job/tier/type matchers,
# unlike the other two selectors — confirm that is intentional.
# NOTE(review): the runbook points at the error-rate page; confirm whether a
# latency-specific runbook exists.
- alert: GitalyLatencyOutlier
expr: avg(gitaly:grpc_server_handling_seconds:avg5m{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"})
BY (grpc_method) > ON(grpc_method) GROUP_LEFT() (avg(gitaly:grpc_server_handling_seconds:avg24h{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"})
BY (grpc_method) + 2 * avg(gitaly:grpc_server_handling_seconds:avg5m_stddev_over_time24h)
BY (grpc_method))
for: 5m
labels:
channel: gitaly
severity: warn
annotations:
description: The latency on the {{ $labels.grpc_method }} endpoint is outside
normal values over a 24 hour period (95% confidence). Check https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?from=now-1h&to=now&orgId=1&var-method={{
$labels.grpc_method }}&var-tier=stor&var-type=gitaly&var-environment=prd&refresh=5m
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly: Latency on the Gitaly {{ $labels.grpc_method }} is unusually
high compared with a 24 hour average'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment