Add some alerts to discover invalid behavior of shared runners

parent e70c63df
## Pending builds for projects with shared runners enabled
# Evaluates the per-second rate of change of ci_builds_total over a 2h window,
# restricted to pending builds of projects with shared runners enabled as
# reported by instance "db1", and takes the maximum across all matching series.
# The alert fires once that maximum has stayed above 5 for 30 minutes.
# NOTE(review): rate() is meant for monotonically increasing counters. If
# ci_builds_total{status="pending"} is a gauge of the current queue size,
# deriv() or a plain threshold on the value may be what is intended - confirm.
ALERT number_of_pending_builds_for_project_with_shared_runners_is_too_high
IF max(rate(ci_builds_total{shared_runners_enabled_projects="1",status="pending",instance="db1"}[2h])) > 5
FOR 30m
LABELS {severity="critical", channel="infrastructure"}
ANNOTATIONS {
title="Rate of pending builds for projects with shared runners is too high: {{$value}}",
description="Rate of pending builds for projects with shared runners is increasing and is very high for more than 30 minutes. This may suggest problems with auto-scaling provider or Runner stability. You should check Runner's logs. Check http://performance.gitlab.net/dashboard/db/ci.",
}
......@@ -7,3 +7,23 @@ ALERT machines_creations_rate_is_too_high
title="Machines creation rate for runners is too high: {{$value}}",
description="Machines creation rate for the last 20 minutes is over 5. This may by a symptom of problems with auto-scaling provider. Check http://performance.gitlab.net/dashboard/db/ci.",
}
## Shared runners machines creation rate
# Sums the per-second rate (1m window) of ci_docker_machines{type="created"}
# across every series of the "shared-runners" job and fires when that total
# has stayed below 0.1 for 5 minutes.
# sum() without a "by" clause already yields at most one sample with no
# labels, so no outer aggregation is needed around it.
# NOTE(review): a 1m rate window needs at least two scrapes inside it;
# confirm the scrape interval of the shared-runners job is well below 1m.
ALERT machines_creation_rate_for_shared_runners_is_too_small
IF sum(rate(ci_docker_machines{type="created",job="shared-runners"}[1m])) < 0.1
FOR 5m
LABELS {severity="critical", channel="infrastructure"}
ANNOTATIONS {
title="Machines creation rate for shared runners is too low: {{$value}}",
description="Machines creation rate for shared runners for the last 5 minutes is less than 0.1. This may suggest problems with auto-scaling provider. Check http://performance.gitlab.net/dashboard/db/ci.",
}
## Shared runners machines usage rate
# Sums the per-second rate (1m window) of ci_docker_machines{type="used"}
# across every series of the "shared-runners" job and fires when that total
# has stayed below 0.1 for 5 minutes.
# sum() without a "by" clause already yields at most one sample with no
# labels, so no outer aggregation is needed around it.
# NOTE(review): a 1m rate window needs at least two scrapes inside it;
# confirm the scrape interval of the shared-runners job is well below 1m.
ALERT machines_usage_rate_for_shared_runners_is_too_small
IF sum(rate(ci_docker_machines{type="used",job="shared-runners"}[1m])) < 0.1
FOR 5m
LABELS {severity="critical", channel="infrastructure"}
ANNOTATIONS {
title="Machines usage rate for shared runners is too low: {{$value}}",
description="Machines usage rate for shared runners for the last 5 minutes is less than 0.1. This may suggest problems with auto-scaling provider. Check http://performance.gitlab.net/dashboard/db/ci.",
}
## Runners manager builds
# Sums the per-second rate (1m window) of ci_runner_builds across every series
# of the "shared-runners" job and fires when that total has been exactly 0
# for 5 minutes.
# sum() without a "by" clause already yields at most one sample with no
# labels, so no outer aggregation is needed around it.
# When no matching series exists at all (e.g. the targets are not scraped),
# the comparison returns no result and this alert cannot fire.
# NOTE(review): consider a companion absent(ci_runner_builds{job="shared-runners"})
# alert if that case is not already covered elsewhere.
ALERT no_builds_on_shared_runners
IF sum(rate(ci_runner_builds{job="shared-runners"}[1m])) == 0
FOR 5m
LABELS {severity="critical", channel="infrastructure"}
ANNOTATIONS {
title="Number of builds running on shared runners is too low: {{$value}}",
description="Number of builds running on shared runners for the last 5 minutes is 0. This may suggest problems with auto-scaling provider or Runner stability. You should check Runner's logs. Check http://performance.gitlab.net/dashboard/db/ci.",
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment