Commit a98f3bcb authored by Andreas Brandl

Adapt alerts to DR replica "type" labels.

`type = "postgres"`: normal postgres box, part of HA
`type = "postgres-archive"`: archive replica
`type = "postgres-delayed"`: delayed replica

TODO: Once postgres-01-db-gprd is back to its normal role, remove the
fqdn-based alert filters and the deprecated archive-replica alert.
parent e650c58f
......@@ -14,7 +14,7 @@ groups:
runbook: troubleshooting/postgresql.md#availability
title: Postgres seems to be consuming transaction IDs very slowly
- alert: PostgreSQL_XLOGConsumptionTooLow
expr: rate(pg_xlog_position_bytes{fqdn !="postgres-01-db-gprd.c.gitlab-production.internal"}[1m]) < 200000
expr: rate(pg_xlog_position_bytes{type = "postgres", fqdn !="postgres-01-db-gprd.c.gitlab-production.internal"}[1m]) < 200000
for: 2m
labels:
pager: pagerduty
......@@ -26,7 +26,7 @@ groups:
runbook: troubleshooting/postgresql.md#availability
title: Postgres seems to be consuming XLOG very slowly
- alert: PostgreSQL_CommitRateTooLow
expr: rate(pg_stat_database_xact_commit{datname="gitlabhq_production",fqdn !="postgres-01-db-gprd.c.gitlab-production.internal"}[1m])
expr: rate(pg_stat_database_xact_commit{datname="gitlabhq_production", type = "postgres", fqdn !="postgres-01-db-gprd.c.gitlab-production.internal"}[1m])
< 1000
for: 5m
labels:
......@@ -97,7 +97,7 @@ groups:
runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
title: PostgreSQL replication has stopped
- alert: PostgreSQL_ReplicationLagTooLarge
expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica{fqdn != "postgres-01-db-gprd.c.gitlab-production.internal"} == 1)
expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica{type = "postgres", fqdn != "postgres-01-db-gprd.c.gitlab-production.internal"} == 1)
for: 5m
labels:
pager: pagerduty
......@@ -108,7 +108,8 @@ groups:
$value | humanizeDuration }}
runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
title: Postgres Replication lag is over 2 minutes
- alert: PostgreSQL_ReplicationLagTooLarge_ArchiveReplica
- alert: PostgreSQL_ReplicationLagTooLarge_ArchiveReplica_deprecated
# (TODO: deprecated, remove once postgres-01-db-gprd has changed its role)
expr: (pg_replication_lag > 43200) and ON(instance) (pg_replication_is_replica{fqdn="postgres-01-db-gprd.c.gitlab-production.internal"} == 1)
for: 5m
labels:
......@@ -120,9 +121,21 @@ groups:
$value | humanizeDuration }}
runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
title: Postgres Replication lag is over 12 hours on archive recovery replica
# Warns when a replica labelled type="postgres-archive" reports a
# pg_replication_lag above 43200 seconds (12 hours, per the title below).
- alert: PostgreSQL_ReplicationLagTooLarge_ArchiveReplica
# The right-hand side only narrows the match to instances that are replicas
# with the "postgres-archive" type label; the alert value is the lag itself.
expr: (pg_replication_lag > 43200) and ON(instance) (pg_replication_is_replica{type = "postgres-archive"} == 1)
# The condition must hold continuously for 5 minutes before the alert fires.
for: 5m
labels:
pager: pagerduty
severity: warn
channel: database
annotations:
# $value is the lag from the expression, rendered as a readable duration.
description: Replication lag on server {{$labels.instance}} is currently {{
$value | humanizeDuration }}
runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
title: Postgres Replication lag is over 12 hours on archive recovery replica
- alert: PostgreSQL_ReplicationLagBytesTooLarge
expr: (pg_xlog_position_bytes and pg_replication_is_replica == 0) - ON(environment)
GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica{fqdn != "postgres-01-db-gprd.c.gitlab-production.internal"}
GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica{type = "postgres", fqdn != "postgres-01-db-gprd.c.gitlab-production.internal"}
== 1) > 1e+09
for: 5m
labels:
......@@ -241,7 +254,7 @@ groups:
title: "Split Brain: too many postgres databases in environment {{$labels.environment}}
in read-write (primary) mode"
- alert: PostgreSQL_SplitBrain_Replicas
expr: count(count(pg_stat_wal_receiver_status >= 0) BY (environment, upstream_host))
expr: count(count(pg_stat_wal_receiver_status{type="postgres"} >= 0) BY (environment, upstream_host))
BY (environment) > 1
for: 1m
labels:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment