Commit 93da2ebd authored by Andrew Newdigate

Merge branch 'master' of gitlab.com:gitlab-com/runbooks into general_component_node_apdex_scores

parents cd92f52f 723aa629
# Prometheus rule group "recording.rules": each rule evaluates `expr` and
# stores the result under the series name given by `record`.
groups:
- name: recording.rules
rules:
# Per-second irate of haproxy_server_http_responses_total over a 1m window,
# summed with the fqdn and instance labels dropped.
- record: backend_code:haproxy_server_http_responses_total:irate1m
expr: sum(irate(haproxy_server_http_responses_total[1m])) WITHOUT (fqdn, instance)
# Same aggregation applied to haproxy_frontend_bytes_in_total.
- record: job_frontend:haproxy_frontend_bytes_in_total:irate1m
expr: sum(irate(haproxy_frontend_bytes_in_total[1m])) WITHOUT (fqdn, instance)
# Same aggregation applied to haproxy_frontend_request_errors_total.
- record: job_frontend:haproxy_frontend_request_errors_total:irate1m
expr: sum(irate(haproxy_frontend_request_errors_total[1m])) WITHOUT (fqdn, instance)
# Same aggregation applied to haproxy_backend_response_errors_total.
- record: job_backend:haproxy_backend_response_errors_total:irate1m
expr: sum(irate(haproxy_backend_response_errors_total[1m])) WITHOUT (fqdn, instance)
# Per-series rate of process_cpu_seconds_total over a 1m window; no
# aggregation is applied, so all source labels are kept.
# NOTE(review): unlike the other record names here, this one has no
# aggregation-level prefix before the metric name - confirm that is intended.
- record: process_cpu_seconds_total:rate1m
expr: rate(process_cpu_seconds_total[1m])
# Prometheus rule group "Generic process" holding a single recording rule.
groups:
- name: Generic process
rules:
# Per-series rate of process_cpu_seconds_total over a 1m window, recorded
# without aggregation under an "instance:"-prefixed name.
- record: instance:process_cpu_seconds_total:rate1m
expr: rate(process_cpu_seconds_total[1m])
This diff is collapsed.
# Prometheus rule group "haproxy.rules". Every recording rule below follows
# one pattern: irate of an HAProxy counter over a 1m window, summed with the
# fqdn and instance labels dropped. Each `expr` is a folded (`>`) scalar, so
# its lines are joined into a single PromQL expression.
groups:
- name: haproxy.rules
rules:
# Server HTTP responses.
- record: backend_code:haproxy_server_http_responses_total:irate1m
expr: >
sum without (fqdn, instance) (
irate(haproxy_server_http_responses_total[1m])
)
# Frontend HTTP responses.
- record: frontend_code:haproxy_frontend_http_responses_total:irate1m
expr: >
sum without (fqdn, instance) (
irate(haproxy_frontend_http_responses_total[1m])
)
# Frontend bytes in.
- record: job_frontend:haproxy_frontend_bytes_in_total:irate1m
expr: >
sum without (fqdn, instance) (
irate(haproxy_frontend_bytes_in_total[1m])
)
# Frontend bytes out.
- record: job_frontend:haproxy_frontend_bytes_out_total:irate1m
expr: >
sum without (fqdn, instance) (
irate(haproxy_frontend_bytes_out_total[1m])
)
# Backend response errors.
- record: job_backend:haproxy_backend_response_errors_total:irate1m
expr: >
sum without (fqdn, instance) (
irate(haproxy_backend_response_errors_total[1m])
)
# Frontend request errors.
- record: job_frontend:haproxy_frontend_request_errors_total:irate1m
expr: >
sum without (fqdn, instance) (
irate(haproxy_frontend_request_errors_total[1m])
)
- alert: HighWebErrorRate
expr: sum(backend_code:haproxy_server_http_responses_total:irate1m{backend="web",code="5xx",tier="lb"})
- sum(backend_code:haproxy_server_http_responses_total:irate1m{backend="web",code!="5xx",tier="lb"})
......
# Prometheus rule group "CPU rules", with a group-level `interval` of 1m.
# NOTE(review): several record names below appear twice, once computed from
# node_cpu and once from node_cpu_seconds_total. This may be old and new diff
# lines shown together, or a deliberate metric-name migration - confirm
# against the actual rule file before relying on this listing.
groups:
- name: CPU rules
interval: 1m
rules:
# The count of CPUs per node, useful for getting CPU time as a percent of total.
# Computed by counting the mode="idle" series with the cpu and mode labels dropped.
- record: instance:node_cpus:count
expr: count(node_cpu{mode="idle"}) WITHOUT (cpu, mode)
- record: instance:node_cpus:count
expr: count(node_cpu_seconds_total{mode="idle"}) WITHOUT (cpu, mode)
# 5m rate of all non-idle modes, summed with the mode label dropped.
- record: instance_cpu:node_cpu_not_idle:rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (mode)
- record: instance_cpu:node_cpu_not_idle:rate5m
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) WITHOUT (mode)
# 5m rate summed with the cpu label dropped (the mode label is kept).
- record: instance_mode:node_cpu:rate5m
expr: sum(rate(node_cpu[5m])) WITHOUT (cpu)
- record: instance_mode:node_cpu:rate5m
expr: sum(rate(node_cpu_seconds_total[5m])) WITHOUT (cpu)
# Non-idle rate summed across modes, divided by the recorded CPU count.
- record: instance:node_cpu_in_use:ratio
expr: sum(instance_mode:node_cpu:rate5m{mode!="idle"}) WITHOUT (mode) / instance:node_cpus:count
# Alert condition: in-use ratio above 0.95 where environment matches prd or cny.
- alert: ExtremelyHighCPU
expr: instance:node_cpu_in_use:ratio{environment=~"prd|cny"} > 0.95
# NOTE(review): a second `expr` key follows directly. It looks like the body
# of an instance:node_cpus:count recording rule whose `- record:` line is not
# shown here (probably diff interleaving) - confirm; duplicate keys in one
# mapping are not valid YAML.
expr: >
count without (cpu, mode) (
node_cpu_seconds_total{mode="idle"}
)
# CPU in use by CPU.
# 1m rate of all non-idle modes, summed with the mode label dropped.
- record: instance_cpu:node_cpu_seconds_not_idle:rate1m
expr: >
sum without (mode) (
rate(node_cpu_seconds_total{mode!="idle"}[1m])
)
# CPU in use by mode.
# 5m rate summed with the cpu label dropped.
- record: instance_mode:node_cpu_seconds:rate5m
expr: >
sum without (cpu) (
rate(node_cpu_seconds_total[5m])
)
# CPU in use ratio.
# Non-idle 5m rate summed across modes, divided by instance:node_cpus:count.
- record: instance:node_cpu_utilization:ratio
expr: >
sum without (mode) (
instance_mode:node_cpu_seconds:rate5m{mode!="idle"}
) / instance:node_cpus:count
# CPU Alerts
- alert: HighCPU
expr: instance:node_cpu_utilization:ratio > 0.95
for: 2h
labels:
pager: pagerduty
......@@ -27,30 +39,17 @@ groups:
runbook: troubleshooting
title: CPU use percent is extremely high on {{ if $labels.fqdn }}{{ $labels.fqdn
}}{{ else }}{{ $labels.instance }}{{ end }} for the past 2 hours.
# Alert condition: instance:node_cpu_in_use:ratio above 0.8 for series whose
# environment label matches prd or cny, held for 2h; labelled severity
# critical. The annotations name the host by fqdn when that label is set,
# otherwise by instance.
# NOTE(review): the description says "extremely high" while the title says
# "high" - the description wording looks copied from a stricter alert;
# confirm which text is intended for this threshold.
- alert: HighCPU
expr: instance:node_cpu_in_use:ratio{environment=~"prd|cny"} > 0.8
for: 2h
labels:
severity: critical
annotations:
description: CPU use percent is extremely high on {{ if $labels.fqdn }}{{ $labels.fqdn
}}{{ else }}{{ $labels.instance }}{{ end }} for the past 2 hours.
runbook: troubleshooting
title: CPU use percent is high on {{ if $labels.fqdn }}{{ $labels.fqdn }}{{
else }}{{ $labels.instance }}{{ end }} for the past 2 hours.
# Alert condition: the percent5m series (environment prd or cny) is at or
# above the percent1h series plus two times the percent_stddev_over_time1h
# series, with that threshold capped at 1 by clamp_max. The two sides are
# matched on job, fqdn and environment, and the condition must hold for 10m.
# Labelled severity warn.
# NOTE(review): the instance:node_cpu_in_use:percent5m, :percent1h and
# :percent_stddev_over_time1h series are not defined in the rules visible
# here - presumably recorded elsewhere; confirm they still exist.
- alert: CPUOutlierDetectionOnPrd
expr: instance:node_cpu_in_use:percent5m{environment=~"prd|cny"} >= ON(job, fqdn,
environment) GROUP_LEFT() (clamp_max(instance:node_cpu_in_use:percent1h + 2
* instance:node_cpu_in_use:percent_stddev_over_time1h, 1))
for: 10m
labels:
severity: warn
annotations:
description: The CPU usage on {{ if $labels.fqdn }}{{ $labels.fqdn }}{{ else
}}{{ $labels.instance }}{{ end }} is outside normal values over a 1h period
runbook: troubleshooting
title: CPU use percent is unusually high compared with the rate of the last
hour
# Rule group "Old CPU rules": recording rules under the older record names
# (node_cpu_not_idle, node_cpu, node_cpu_in_use), all computed from
# node_cpu_seconds_total.
# NOTE(review): presumably kept so that existing consumers of these names
# keep working - confirm before removing.
- name: Old CPU rules
rules:
# CPU count: number of mode="idle" series with the cpu and mode labels dropped.
- record: instance:node_cpus:count
expr: count(node_cpu_seconds_total{mode="idle"}) WITHOUT (cpu, mode)
# 5m rate of all non-idle modes, summed with the mode label dropped.
- record: instance_cpu:node_cpu_not_idle:rate5m
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) WITHOUT (mode)
# 5m rate summed with the cpu label dropped.
- record: instance_mode:node_cpu:rate5m
expr: sum(rate(node_cpu_seconds_total[5m])) WITHOUT (cpu)
# Non-idle rate summed across modes, divided by the recorded CPU count.
- record: instance:node_cpu_in_use:ratio
expr: sum(instance_mode:node_cpu:rate5m{mode!="idle"}) WITHOUT (mode) / instance:node_cpus:count
- name: Node filesystem rules
rules:
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment