Split up Gitaly rules into separate groups

Split the Gitaly rule groups to reduce execution time per group.
parent 5627559d
groups:
- name: Gitaly
- name: Gitaly grpc handled
rules:
- record: gitaly:grpc_server_handled_total:rate1m
expr: sum(rate(grpc_server_handled_total[1m])) without (fqdn, instance, grpc_code)
......@@ -15,6 +15,51 @@ groups:
expr: >
sum without (grpc_code, grpc_method, grpc_service, grpc_type)
(rate(grpc_server_handled_total{grpc_code!="OK"}[1m]))
# Alert: the recorded series gitaly:grpc_server_handled_total:error_rate1m,
# restricted to environments matching the regex g?prd, has been above 2 for
# 5 minutes. Labelled channel=gitaly, severity=critical.
# The description template reads the grpc_code and grpc_method labels of the
# firing series; the title renders the current value with two decimals.
- alert: GitalyErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} > 2
for: 5m
labels:
channel: gitaly
severity: critical
annotations:
description: Gitaly {{$labels.grpc_code}} error rate for the last 5 minutes is over 2 for {{$labels.grpc_method}}.
Check Gitaly logs and consider disabling that method.
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
# Alert: the recorded series
# gitaly:grpc_server_handled_total:instance_error_rate1m, restricted to
# environments matching the regex g?prd, has been above 5 for 5 minutes.
# Labelled channel=gitaly, severity=critical.
# The description template reads the instance label of the firing series.
- alert: GitalyInstanceErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:instance_error_rate1m{environment=~"g?prd"}
> 5
for: 5m
labels:
channel: gitaly
severity: critical
annotations:
description: Gitaly error rate for the last 5 minutes is over 5 on {{$labels.instance}}.
Check Gitaly logs and consider disabling it on that host.
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
# Alert: for 5 minutes, error_rate1m has exceeded
#   error_avg_rate12h + 2 * error_rate1m_stddev_over_time12h
# (all three are gitaly:grpc_server_handled_total:* recorded series, each
# restricted to environments matching the regex g?prd).
# Labelled channel=gitaly, severity=warn. The annotations read the grpc_code
# and grpc_method labels; the dashboard link hard-codes var-environment=prd.
- alert: GitalyMethodErrorRateOutlier
expr: >
gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} >
(
gitaly:grpc_server_handled_total:error_avg_rate12h{environment=~"g?prd"}
+
(2 * gitaly:grpc_server_handled_total:error_rate1m_stddev_over_time12h{environment=~"g?prd"})
)
for: 5m
labels:
channel: gitaly
severity: warn
annotations:
description: >
The {{$labels.grpc_code}} error rate on {{ $labels.grpc_method }} is outside normal
values over a 12 hour period (95% confidence).
dashboard: "https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?var-method={{ $labels.grpc_method }}&var-environment=prd"
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly: Error rate on {{ $labels.grpc_method }} is unusually high compared with a 12 hour average'
- name: Gitaly grpc buckets
rules:
- record: gitaly:grpc_server_handling_seconds_bucket:rate1m
expr: >
sum without (instance, grpc_service, grpc_type)
......@@ -51,32 +96,30 @@ groups:
expr: >
sum without (grpc_method, grpc_type, grpc_service, grpc_code)
(rate(grpc_server_handled_total{grpc_code!="OK"}[1m]))
# Recording rule: sum of gitaly_rate_limiting_queued, keeping only the
# environment, grpc_method and job labels.
- record: gitaly:gitaly_rate_limiting_queued:grpc_method
expr: sum(gitaly_rate_limiting_queued) by (environment, grpc_method, job)
# Alerts
- alert: GitalyErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} > 2
for: 5m
labels:
channel: gitaly
severity: critical
annotations:
description: Gitaly {{$labels.grpc_code}} error rate for the last 5 minutes is over 2 for {{$labels.grpc_method}}.
Check Gitaly logs and consider disabling that method.
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
- alert: GitalyInstanceErrorRateTooHigh
expr: gitaly:grpc_server_handled_total:instance_error_rate1m{environment=~"g?prd"}
> 5
- alert: GitalyLatencyOutlier
expr: avg(gitaly:grpc_server_handling_seconds:avg5m{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"})
BY (grpc_method) > ON(grpc_method) GROUP_LEFT() (avg(gitaly:grpc_server_handling_seconds:avg24h{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"})
BY (grpc_method) + 2 * avg(gitaly:grpc_server_handling_seconds:avg5m_stddev_over_time24h)
BY (grpc_method))
for: 5m
labels:
channel: gitaly
severity: critical
severity: warn
annotations:
description: Gitaly error rate for the last 5 minutes is over 5 on {{$labels.instance}}.
Check Gitaly logs and consider disabling it on that host.
description: The error rate on the {{ $labels.grpc_method }} endpoint is outside
normal values over a 12 hour period (95% confidence). Check https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?from=now-1h&to=now&orgId=1&var-method={{
$labels.grpc_method }}&var-tier=stor&var-type=gitaly&var-environment=prd&refresh=5m
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly error rate is too high: {{$value | printf "%.2f" }}'
title: 'Gitaly: Latency on the Gitaly {{ $labels.grpc_method }} is unusually
high compared with a 24 hour average'
# Rule group containing a single recording rule for rate-limiting queue depth.
- name: Gitaly rate limiting
rules:
# Sum of gitaly_rate_limiting_queued, keeping only the environment,
# grpc_method and job labels.
- record: gitaly:gitaly_rate_limiting_queued:grpc_method
expr: sum(gitaly_rate_limiting_queued) by (environment, grpc_method, job)
- name: Gitaly misc alerts
rules:
- alert: GitalyFileServerDown
expr: up{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"} == 0
for: 1m
......@@ -132,38 +175,3 @@ groups:
for details of versions deployed across the fleet.
runbook: troubleshooting/gitaly-version-mismatch.md
title: 'Gitaly: multiple versions of Gitaly are currently running in production'
# Alert: for 5 minutes, error_rate1m has exceeded
#   error_avg_rate12h + 2 * error_rate1m_stddev_over_time12h
# (all three are gitaly:grpc_server_handled_total:* recorded series, each
# restricted to environments matching the regex g?prd).
# Labelled channel=gitaly, severity=warn. The annotations read the grpc_code
# and grpc_method labels; the dashboard link hard-codes var-environment=prd.
- alert: GitalyMethodErrorRateOutlier
expr: >
gitaly:grpc_server_handled_total:error_rate1m{environment=~"g?prd"} >
(
gitaly:grpc_server_handled_total:error_avg_rate12h{environment=~"g?prd"}
+
(2 * gitaly:grpc_server_handled_total:error_rate1m_stddev_over_time12h{environment=~"g?prd"})
)
for: 5m
labels:
channel: gitaly
severity: warn
annotations:
description: >
The {{$labels.grpc_code}} error rate on {{ $labels.grpc_method }} is outside normal
values over a 12 hour period (95% confidence).
dashboard: "https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?var-method={{ $labels.grpc_method }}&var-environment=prd"
runbook: troubleshooting/gitaly-error-rate.md
title: 'Gitaly: Error rate on {{ $labels.grpc_method }} is unusually high compared with a 12 hour average'
# Alert: for 5 minutes, the per-grpc_method average of
# gitaly:grpc_server_handling_seconds:avg5m has exceeded the per-method
# average of ...:avg24h plus two times the per-method average of
# ...:avg5m_stddev_over_time24h. Labelled channel=gitaly, severity=warn.
#
# The description previously read "error rate ... 12 hour period", copied
# from the error-rate outlier alert; it now matches what the expression and
# the title actually measure (latency against a 24 hour baseline).
#
# NOTE(review): the avg5m_stddev_over_time24h operand carries no label
# selector, unlike the other two operands, so it is averaged over every series
# sharing the grpc_method. Confirm whether it should use the same
# environment/job/tier/type filter.
# NOTE(review): the runbook still points at the error-rate page; confirm
# whether a latency-specific runbook exists.
- alert: GitalyLatencyOutlier
  expr: >
    avg(gitaly:grpc_server_handling_seconds:avg5m{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"}) BY (grpc_method)
    > ON(grpc_method) GROUP_LEFT()
    (
      avg(gitaly:grpc_server_handling_seconds:avg24h{environment=~"g?prd",job="gitaly",tier="stor",type="gitaly"}) BY (grpc_method)
      +
      2 * avg(gitaly:grpc_server_handling_seconds:avg5m_stddev_over_time24h) BY (grpc_method)
    )
  for: 5m
  labels:
    channel: gitaly
    severity: warn
  annotations:
    description: >-
      The latency on the {{ $labels.grpc_method }} endpoint is outside
      normal values over a 24 hour period (95% confidence). Check
      https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?from=now-1h&to=now&orgId=1&var-method={{ $labels.grpc_method }}&var-tier=stor&var-type=gitaly&var-environment=prd&refresh=5m
    runbook: troubleshooting/gitaly-error-rate.md
    title: 'Gitaly: Latency on the Gitaly {{ $labels.grpc_method }} is unusually high compared with a 24 hour average'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment