# postgresqls.yml
groups:
- name: postgresql.rules
  rules:
  - alert: PostgreSQL_XIDConsumptionTooLow
    expr: rate(pg_txid_current[1m]) < 5
    for: 1m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: TXID/s is {{ $value | printf "%.1f" }} on {{$labels.instance}}
        which is unusually low. Perhaps the application is unable to connect
      runbook: troubleshooting/postgresql.md#availability
      title: Postgres seems to be consuming transaction IDs very slowly
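  # 200000 bytes/s is roughly 200 kB/s of WAL generation; postgres-01-db-gprd is
  # excluded here since it is covered by the archive-recovery replica alerts
  # further down.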
  - alert: PostgreSQL_XLOGConsumptionTooLow
    expr: rate(pg_xlog_position_bytes{type = "postgres", fqdn !="postgres-01-db-gprd.c.gitlab-production.internal"}[1m]) < 200000
    for: 2m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: XLOG throughput is {{ $value | humanize1024 }}B/s on {{$labels.instance}}
        which is unusually low. Perhaps the application is unable to connect
      runbook: troubleshooting/postgresql.md#availability
      title: Postgres seems to be consuming XLOG very slowly
  - alert: PostgreSQL_CommitRateTooLow
    expr: rate(pg_stat_database_xact_commit{datname="gitlabhq_production", type = "postgres", fqdn !="postgres-01-db-gprd.c.gitlab-production.internal"}[1m])
      < 1000
    for: 5m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: Commits/s on {{$labels.instance}} database {{$labels.datname}}
        is {{$value | printf "%.0f" }} which is implausibly low. Perhaps the application
        is unable to connect
      runbook: troubleshooting/postgresql.md#availability
      title: Postgres seems to be processing very few transactions
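  # Sums connection counts per host and joins them on fqdn against that host's own
  # pg_settings_max_connections, so the alert fires above roughly 95% of the
  # configured limit (e.g. about 475 connections if max_connections were 500, an
  # illustrative value).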
  - alert: PostgreSQL_ConnectionsTooHigh
    expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections
      * 0.95
    for: 5m
    labels:
      severity: warn
      channel: database
    annotations:
      runbook: troubleshooting/postgresql.md#connections
      title: Postgres has {{$value}} connections on {{$labels.fqdn}} which is close
        to the maximum
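  # Ratio of rolled-back to committed transactions over five minutes, matched per
  # instance and database; a ratio above 0.1 (roughly one abort per ten commits)
  # usually points at application errors rather than load.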
  - alert: PostgreSQL_RollbackRateTooHigh
    expr: rate(pg_stat_database_xact_rollback{datname="gitlabhq_production"}[5m])
      / ON(instance, datname) rate(pg_stat_database_xact_commit{datname="gitlabhq_production"}[5m])
      > 0.1
    for: 5m
    labels:
      severity: warn
      channel: database
    annotations:
      description: Ratio of transactions being aborted compared to committed is {{
        $value | printf "%.2f" }} on {{$labels.instance}}
      runbook: troubleshooting/postgresql.md#errors
      title: Postgres transaction rollback rate is high
  - alert: PostgreSQL_StatementTimeout_Errors
    expr: rate(postgresql_errors_total{type="statement_timeout"}[5m]) > 3
    for: 5m
    labels:
      severity: warn
      pager: pagerduty
      channel: database
    annotations:
      description: Database {{$labels.fqdn}} has {{ $value | printf "%.1f" }} statement
        timeouts per second
      runbook: troubleshooting/postgresql.md#errors
      title: Postgres transactions showing high rate of statement timeouts
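  # rate() yields errors per second, so the * 60 expresses the threshold as errors
  # per minute; more than five detected deadlocks per minute fires the alert.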
  - alert: PostgreSQL_Deadlock_Errors
    expr: rate(postgresql_errors_total{type="deadlock_detected"}[1m]) * 60 > 5
    labels:
      severity: warn
      channel: database
    annotations:
      description: Database {{$labels.fqdn}} has {{ $value | printf "%.1f" }} deadlock
        errors per minute
      runbook: troubleshooting/postgresql.md#errors
      title: Postgres detected deadlocks
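  # A sample only differs from itself when its value is NaN, so this self-comparison
  # appears to be a NaN check: it fires when the exporter reports a null xlog
  # location diff instead of a number, i.e. replication has stopped.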
  - alert: PostgreSQL_ReplicationStopped
    expr: pg_stat_replication_pg_xlog_location_diff{job="gitlab-cluster-db"} != pg_stat_replication_pg_xlog_location_diff{job="gitlab-cluster-db"}
    for: 5m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: PostgreSQL replication has stopped on {{$labels.instance}}.
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: PostgreSQL replication has stopped
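  # The "and ON(instance) ... == 1" clause restricts the check to hosts currently
  # acting as replicas; 120 seconds matches the two-minute threshold in the title.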
  - alert: PostgreSQL_ReplicationLagTooLarge
    expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica{type = "postgres", fqdn != "postgres-01-db-gprd.c.gitlab-production.internal"} == 1)
    for: 5m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: Replication lag on server {{$labels.instance}} is currently {{
        $value | humanizeDuration }}
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: Postgres Replication lag is over 2 minutes
  - alert: PostgreSQL_ReplicationLagTooLarge_ArchiveReplica_deprecated
    # (TODO: deprecated, remove once postgres-01-db-gprd has changed its role)
    expr: (pg_replication_lag > 43200) and ON(instance) (pg_replication_is_replica{fqdn="postgres-01-db-gprd.c.gitlab-production.internal"} == 1)
    for: 5m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: Replication lag on server {{$labels.instance}} is currently {{
        $value | humanizeDuration }}
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: Postgres Replication lag is over 12 hours on archive recovery replica
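  # 43200 seconds is 12 hours, the lag budget for the archive-recovery and delayed
  # replicas below; ordinary replicas use the much tighter 120 second limit above.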
  - alert: PostgreSQL_ReplicationLagTooLarge_ArchiveReplica
    expr: (pg_replication_lag > 43200) and ON(instance) (pg_replication_is_replica{type = "postgres-archive"} == 1)
    for: 5m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: Replication lag on server {{$labels.instance}} is currently {{
        $value | humanizeDuration }}
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: Postgres Replication lag is over 12 hours on archive recovery replica
  - alert: PostgreSQL_ReplicationLagTooLarge_DelayedReplica
    expr: (pg_replication_lag > 43200) and ON(instance) (pg_replication_is_replica{type = "postgres-delayed"} == 1)
    for: 5m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: Replication lag on server {{$labels.instance}} is currently {{
        $value | humanizeDuration }}
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: Postgres Replication lag is over 12 hours on delayed replica
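  # Primary WAL position minus each replica's position in the same environment
  # (GROUP_RIGHT pairs the single primary with the many replicas); a gap above
  # 1e+09 bytes (~1 GB) fires regardless of what the time-based lag metric shows.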
  - alert: PostgreSQL_ReplicationLagBytesTooLarge
    expr: (pg_xlog_position_bytes and pg_replication_is_replica == 0) - ON(environment)
      GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica{type = "postgres", fqdn != "postgres-01-db-gprd.c.gitlab-production.internal"}
      == 1) > 1e+09
    for: 5m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: Replication lag on server {{$labels.instance}} is currently {{
        $value | humanize1024}}B
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: Postgres Replication lag (in bytes) is high
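  # 36700160 bytes/s is 35 MiB/s of WAL generation; the ON(instance) join limits
  # the check to the primary (pg_replication_is_replica == 0).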
  - alert: PostgreSQL_XLOGConsumptionTooHigh
    expr: rate(pg_xlog_position_bytes[2m]) > 36700160 and ON(instance) (pg_replication_is_replica == 0)
    for: 10m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: XLOG is being generated at a rate of {{ $value | humanize1024 }}B/s
        on {{$labels.instance}}
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: Postgres is generating XLOG too fast, expect this to cause replication
        lag
  - alert: PostgreSQL_UnusedReplicationSlot
    expr: pg_replication_slots_active == 0
    for: 30m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: Unused {{$labels.slot_type}} slot "{{$labels.slot_name}}" on {{$labels.fqdn}}
      runbook: "troubleshooting/postgres.md#replication_slots"
      title: Unused Replication Slots for {{$labels.fqdn}}
  - alert: PostgreSQL_ReplicaStaleXmin
    expr: pg_replication_slots_xmin_age > 20000
    for: 30m
    labels:
      severity: critical
      channel: database
      pager: pagerduty
    annotations:
      title: |
        PostgreSQL replication slot {{$labels.slot_name}} on {{$labels.fqdn}} is
        falling behind.
      description: |
        The replication slot {{$labels.slot_name}} on {{$labels.fqdn}} is using
        a minimum transaction ID that is {{$value | humanize}} transactions old.
        This can cause an increase in dead tuples on the primary. This can be
        caused by long-running queries or transactions, or unused replication
        slots on a secondary.
      runbook: troubleshooting/postgres.md#tables-with-a-large-amount-of-dead-tuples
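  # pg_stat_activity_max_tx_duration is reported in seconds, so 3600 flags any
  # transaction that has been open for more than an hour.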
  - alert: PostgreSQL_LongLivedTransaction
    expr: pg_stat_activity_max_tx_duration > 3600
    labels:
      severity: warn
      channel: database
    annotations:
      description: |
        Postgres server {{$labels.fqdn}} has a transaction currently in
        state "{{$labels.state}}" that started {{ $value | humanizeDuration }}
        ago
      runbook: troubleshooting/postgres.md#tables-with-a-large-amount-of-dead-tuples
      title: "There's a long-lived Postgres transaction"
  - alert: PostgreSQL_DBHeavyLoad
    expr: node_load1{type="postgres"} > 200
    for: 5m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: |
        "Really high load in the database for the last minute; there are {{
          query 'sum(pg_slow_queries_total)' }} slow queries, {{ query
          'sum(pg_blocked_queries_total)' }} blocked queries, and {{ query
          'sum(pg_locks_count{datname=\'gitlabhq_production\'})' }} locks.
          Check http://dashboards.gitlab.net/dashboard/db/postgres-stats and
          http://dashboards.gitlab.net/dashboard/db/postgres-queries to get
          more data."
      runbook: "troubleshooting/postgres.md#load"
      title: 'High load in database {{ $labels.fqdn }}: {{$value}}'
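  # Dead-tuple ratio per table: dead / (live + dead), considered only for tables
  # with more than 10000 dead tuples and more than 1000 tuples overall, and only on
  # the primary (the trailing "unless ... == 1" drops replicas); 0.05 is a 5% ratio.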
  - alert: PostgreSQL_TooManyDeadTuples
    expr: >
      (
        (pg_stat_user_tables_n_dead_tup > 10000)
        /
        (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup > 1000)
      ) >= 0.05
      unless ON(instance) (pg_replication_is_replica == 1)
    for: 30m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: "The dead tuple ratio of {{$labels.relname}} is greater than 5%"
      table: "{{$labels.relname}}"
      dead_tuples: "{{$value}}"
      runbook: "troubleshooting/postgres.md#tables-with-a-large-amount-of-dead-tuples"
      title: PostgreSQL dead tuples is too large
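  # Exactly one writable (non-replica) node is expected per environment; any other
  # count fires, whether zero primaries or more than one.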
  - alert: PostgreSQL_SplitBrain
    expr: count(pg_replication_is_replica == 0) BY (environment) != 1
    for: 1m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: 'The following {{$value}} databases are in read-write
      (primary) mode: {{range query
        "pg_replication_is_replica{environment=''$labels.environment''} ==
        0.0"}} {{.labels.fqdn}} {{end}}'
      runbook: "troubleshooting/postgres.md#repmgr"
      title: "Split Brain: too many postgres databases in environment {{$labels.environment}}
        in read-write (primary) mode"
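  # Counts how many distinct upstream_host values the WAL receivers in an
  # environment report; more than one means the replicas disagree about which node
  # is the primary.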
  - alert: PostgreSQL_SplitBrain_Replicas
    expr: count(count(pg_stat_wal_receiver_status{type="postgres"} >= 0) BY (environment, upstream_host))
      BY (environment) > 1
    for: 1m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: 'In environment {{$labels.environment}} there are {{$value}} different
        upstream primaries: {{range query "pg_stat_wal_receiver_status{environment=''$labels.environment''}
        >= 0"}}"{{.labels.fqdn}}" thinks "{{.labels.upstream_host}}" is the primary;
        {{end}} '
      runbook: troubleshooting/postgres.md#repmgr
      title: Replicas have different upstream primary databases

  - alert: PostgreSQL_ServiceDown
    expr: pg_up == 0
    for: 1m
    labels:
      channel: database
      pager: pagerduty
      severity: critical
    annotations:
      description: |
        {{$labels.fqdn}} postgres service appears down
      runbook: "troubleshooting/postgres.md"
      title: |
        {{$labels.fqdn}} postgres service appears down

  - alert: PostgreSQL_RoleChange
    expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
    labels:
      severity: info
      channel: database
    annotations:
      description: Database on {{$labels.fqdn}} changed role to {{if eq $value 1.0}}
        *replica* {{else}} *primary* {{end}}
      title: Postgres Database replica promotion occurred in environment "{{$labels.environment}}"
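  # Compares every pg_settings_* metric against its own value 10 minutes earlier to
  # detect configuration changes. The long alternation is a hand-written negative
  # match (RE2 has no lookahead): it selects every pg_settings_* name except
  # pg_settings_transaction_read_only, presumably because that flips whenever a
  # node changes between primary and replica roles.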
  - alert: PostgreSQL_ConfigurationChange
    expr: '{__name__=~"pg_settings_.*"} != ON(__name__, fqdn) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"}
      OFFSET 10m'
    labels:
      severity: info
      channel: database
    annotations:
      description: Database on {{$labels.fqdn}} setting is now {{$labels.__name__}}={{$value}}
      title: Postgres Database configuration change has occurred on "{{$labels.fqdn}}"
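  # 1 / 60 is one scrape per minute; a lower rate means the exporter is being
  # scraped (or is answering) far less often than expected.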
  - alert: PostgreSQL_TooFewPrometheusScrapes
    expr: rate(pg_exporter_scrapes_total[1m]) < 1 / 60
    for: 5m
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: '{{$labels.fqdn}} is showing only {{$value}} scrapes per second,
        which should normally be above 0.2'
      title: PostgreSQL Exporter not being scraped
  - alert: PostgreSQL_ExporterErrors
    expr: pg_exporter_last_scrape_error{environment=~"gprd|gstg"} == 1
    for: 1h
    labels:
      pager: pagerduty
      severity: critical
      channel: database
    annotations:
      description: This may indicate postgres_exporter is not running or a buggy query
        in query.yaml on {{$labels.fqdn}}
      runbook: troubleshooting/prometheus-exporter-scrape-errors.md
      title: Postgres exporter is showing errors for the last hour
  - alert: PostgreSQL_DiskUtilizationMaxedOut
    expr: rate(node_disk_io_time_seconds_total{device=~"dm-(0|1)",tier="db",type="postgres"}[1m])
      / 10 > 97
    for: 5m
    labels:
      severity: warn
      channel: database
    annotations:
      description: Server {{$labels.fqdn}} disk {{$labels.device}} utilization is
        {{$value | printf "%.1f"}}%
      runbook: troubleshooting/postgres.md#replication-is-lagging-or-has-stopped
      title: Postgres server disk utilization over 95% for over a minute. Expect
        this to cause replication lag or user visible latency
  - alert: PostgreSQL_PGBouncer_Errors
    expr: rate(pgbouncer_errors_count{environment=~"gprd|gstg", errmsg!="server conn crashed?"}[10m]) > 5
    for: 5m
    labels:
      severity: warn
      channel: database
    annotations:
      message: "{{$labels.errmsg}}"
      description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console
      runbook: troubleshooting/postgres.md#pgbouncer-errors
      title: PGBouncer is logging errors
  - alert: PostgreSQL_PGBouncer_maxclient_conn
    expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
    labels:
      severity: critical
      channel: database
    annotations:
      description: The number of client connections has reached max_client_conn.
        Consider raising this parameter or investigating the source of the unusually large number of connections
      runbook: troubleshooting/postgres.md#pgbouncer-errors
      title: PGBouncer is logging "no more connections allowed" errors
  - alert: PostgreSQL_too_many_slow_queries
    expr: rate(pg_slow_queries{environment=~"gprd"}[1m]) * 60 > 100
    for: 5m
    labels:
      pager: pagerduty
      severity: warn
      channel: database
    annotations:
      description: >
        Database {{ $labels.fqdn }} has {{ $value | printf "%.1f" }} slow
        queries. This may be the result of badly written SQL queries, an
        overloaded database, or a performance regression introduced in a new
        release.
      title: There are too many slow SQL queries
  - alert: PostgreSQL_SSLCompressionActive
    expr: sum(pg_stat_ssl_compression{environment=~".*prd"}) BY (fqdn) > 0
    for: 10m
    labels:
      severity: warn
      channel: database
    annotations:
      description: >
        Database {{ $labels.fqdn }} has {{ $value | printf "%.1f" }}
        connections with SSL compression enabled. This may add significant
        jitter in replication delay. Replicas should turn off SSL compression
        via `sslcompression=0` in `recovery.conf`.
      title: Clients are using SSL compression