...
 
Commits (5)
......@@ -20,6 +20,7 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "4"
grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
# Ops Rate
- alert: gitlab_component_opsrate_missing_series
......@@ -40,6 +41,7 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "42"
grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
# Apdex
- alert: gitlab_component_apdex_missing_series
......@@ -60,6 +62,7 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "19"
grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
# Error Rate
- alert: gitlab_component_error_missing_series
......@@ -80,3 +83,4 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "27"
grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
groups:
- name: service_availability.rules
rules:
# Availability below 2 sigma
- alert: service_availability_out_of_bounds_lower_2sigma_5m
# Availability below 3 sigma
- alert: service_availability_out_of_bounds_lower_5m
expr: |
gitlab_service_availability:ratio
<
gitlab_service_availability:ratio:avg_over_time_1w - 2.5 * gitlab_service_availability:ratio:stddev_over_time_1w
gitlab_service_availability:ratio:avg_over_time_1w - 3 * gitlab_service_availability:ratio:stddev_over_time_1w
for: 5m
labels:
rules_domain: general
......@@ -14,7 +14,7 @@ groups:
severity: warn
period: 5m
bound: lower
threshold_sigma: "2.5"
threshold_sigma: "3"
annotations:
description: |
The ratio of services that are available to serve the `{{ $labels.type }}` service
......@@ -27,15 +27,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "2"
grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-availability.md"
# Operation rate above 2 sigma
- alert: service_ops_out_of_bounds_upper_2sigma_5m
# Operation rate above 3 sigma
# (named after the metric and bound it checks: ops rate, upper bound)
- alert: service_ops_out_of_bounds_upper_5m
expr: |
gitlab_service_ops:rate
>
gitlab_service_ops:rate:avg_over_time_1w + 2.5 * gitlab_service_ops:rate:stddev_over_time_1w
gitlab_service_ops:rate:avg_over_time_1w + 3 * gitlab_service_ops:rate:stddev_over_time_1w
for: 5m
labels:
rules_domain: general
......@@ -43,7 +44,7 @@ groups:
severity: warn
period: 5m
bound: upper
threshold_sigma: "2.5"
threshold_sigma: "3"
annotations:
description: |
The `{{ $labels.type }}` service is receiving more requests than normal.
......@@ -57,15 +58,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "12"
grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-ops-rate.md"
# Operation rate below 2 sigma
- alert: service_ops_out_of_bounds_lower_2sigma_5m
# Operation rate below 3 sigma
- alert: service_ops_out_of_bounds_lower_5m
expr: |
gitlab_service_ops:rate
<
gitlab_service_ops:rate:avg_over_time_1w - 2.5 * gitlab_service_ops:rate:stddev_over_time_1w
gitlab_service_ops:rate:avg_over_time_1w - 3 * gitlab_service_ops:rate:stddev_over_time_1w
for: 5m
labels:
rules_domain: general
......@@ -73,7 +75,7 @@ groups:
severity: warn
period: 5m
bound: lower
threshold_sigma: "2.5"
threshold_sigma: "3"
annotations:
description: |
The `{{ $labels.type }}` service is receiving fewer requests than normal.
......@@ -86,15 +88,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "12"
grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-ops-rate.md"
# Apdex lower than 2 sigma
- alert: service_apdex_out_of_bounds_lower_2sigma_5m
# Apdex lower than 3 sigma
# (named after the metric and bound it checks: apdex ratio, lower bound)
- alert: service_apdex_out_of_bounds_lower_5m
expr: |
gitlab_service_apdex:ratio
<
gitlab_service_apdex:ratio:avg_over_time_1w - 2.5 * gitlab_service_apdex:ratio:stddev_over_time_1w
gitlab_service_apdex:ratio:avg_over_time_1w - 3 * gitlab_service_apdex:ratio:stddev_over_time_1w
for: 5m
labels:
rules_domain: general
......@@ -102,7 +105,7 @@ groups:
severity: warn
period: 5m
bound: lower
threshold_sigma: "2.5"
threshold_sigma: "3"
annotations:
description: |
The `{{ $labels.type }}` service is operating at a slower rate than normal.
......@@ -116,15 +119,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "16"
grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-apdex.md"
# Error rate exceeds 2 sigma
- alert: service_errors_out_of_bounds_upper_2sigma_5m
# Error rate exceeds 3 sigma
- alert: service_errors_out_of_bounds_upper_5m
expr: |
gitlab_service_errors:rate
>
gitlab_service_errors:rate:avg_over_time_1w + 2.5 * gitlab_service_errors:rate:stddev_over_time_1w
gitlab_service_errors:rate:avg_over_time_1w + 3 * gitlab_service_errors:rate:stddev_over_time_1w
for: 5m
labels:
rules_domain: general
......@@ -132,7 +136,7 @@ groups:
severity: warn
period: 5m
bound: upper
threshold_sigma: "2.5"
threshold_sigma: "3"
annotations:
description: |
The `{{ $labels.type }}` service is generating more errors than normal.
......@@ -145,5 +149,6 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "24"
grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-errors.md"
groups:
- name: postgresql.rules
rules:
# Fires when the sum of `consul_health_service_status` over the
# `service:postgresql` check series with status "passing" equals zero.
# Labelled `pager: pagerduty`, `severity: critical`, `channel: database`.
# NOTE(review): there is no `for:` clause, so a single evaluation is enough
# to fire. Also, `sum()` over an empty selector returns no sample, so this
# alert presumably stays silent if the metric itself goes missing -- confirm
# whether a separate absent()-style rule covers that case.
- alert: NoPostgresMasterDetectedByConsul
expr: |
sum(consul_health_service_status{check="service:postgresql", tier="inf", status="passing"}) == 0
labels:
pager: pagerduty
severity: critical
channel: database
annotations:
description: |
No postgresql master is passing the consul check. If there were a
failover, no server is available to populate the pgbouncer
configuration. Check: https://dashboards.gitlab.net/d/a988f2tmz/consul?panelId=23&fullscreen&orgId=1
runbook: troubleshooting/postgres.md
title: No Postgresql Master detected by Consul
- alert: PostgresSQL_XIDConsumptionTooLow
expr: rate(pg_txid_current[1m]) < 5
for: 1m
......@@ -285,7 +300,7 @@ groups:
- alert: PostgreSQL_RoleChange
expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
labels:
severity: info
severity: warn
channel: database
annotations:
description: Database on {{$labels.fqdn}} changed role to {{if eq $value 1.0}}
......@@ -295,7 +310,7 @@ groups:
expr: '{__name__=~"pg_settings_.*"} != ON(__name__, fqdn) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"}
OFFSET 10m'
labels:
severity: info
severity: warn
channel: database
annotations:
description: Database on {{$labels.fqdn}} setting now {{$labels.__name__}}={{$value}}
......
......@@ -15,6 +15,8 @@ ykpersonalize -m86
This setting lets us use the Yubikey as both a SmartCard and an OTP device
at the same time.
**Note:** The above command is not necessary on a YubiKey 5 (and won't work)
## Changing the Default PIN Entries on the Yubikey PIV Card
By default the user PIN is `123456` and the ADMIN PIN is `12345678`. Keep this
in mind when changing the PINs, as you will be asked for the current PIN.
......@@ -175,6 +177,8 @@ uid John Rando <rando@gitlab.com>
Now that we have a master key, a good practice is to generate a revocation
certificate in the event that we lose the password or the key is compromised.
**Note:** In some versions you do not see the key id in the gpg output. You can use your email here.
```
> gpg --gen-revoke FAEFD83E > /Volumes/GitLab/gpg_config/FAEFD83E-revocation-certificate.asc
......@@ -310,6 +314,10 @@ sub 4096R/DE86E396 created: 2017-08-25 expires: 2018-08-25 usage: A
## Backup and Publish your Public Key
```
> gpg --armor --export FAEFD83E > /Volumes/GitLab/gpg_config/FAEFD83E.asc
```
If your gpg version does not output the key id, use the full fingerprint instead.
```
> gpg --keyserver hkps://hkps.pool.sks-keyservers.net --send-key FAEFD83E
```
......@@ -327,12 +335,12 @@ you've imported.
Or in a fresh terminal we can:
```
> gpg2 --import-key /Volumes/GitLab/gpg_config/FAEFD83E.asc
> gpg --import /Volumes/GitLab/gpg_config/FAEFD83E.asc
gpg: key FAEFD83E: public key imported
gpg: Total number processed: 1
gpg: imported: 1
> gpg2 --edit-key FAEFD83E
> gpg --edit-key FAEFD83E
Secret subkeys are available.
pub 4096R/FAEFD83E created: 2017-08-25 expires: 2018-08-25 usage: C
......
......@@ -23,12 +23,5 @@ groups:
== 1)
- record: postgres:databases
expr: count(pg_exporter_scrapes_total) BY (environment)
- record: postgres:up
expr: (count(pg_replication_is_replica == 1 unless ON(fqdn) ALERTS{alertname=~"PostgreSQL_.*",alertstate="firing",severity!="info"})
BY (environment) >= 2 and ON(environment) count(pg_replication_is_replica ==
0 unless ON(fqdn) ALERTS{alertname=~"PostgreSQL_.*",alertstate="firing",severity!="info"})
BY (environment) == 1 unless ON(environment) count(ALERTS{alertname=~"PostgreSQL_.*",alertstate="firing",fqdn="",severity!="info"})
BY (environment) > 0) > BOOL 0 or ON(environment) postgres:databases == BOOL
0
- record: postgres:pg_stat_seq_scans:rate2m
expr: rate(pg_stat_user_tables_seq_tup_read[2m])
......@@ -20,7 +20,7 @@ def validate_rule(alert_file_path, rule)
LOGGER.warn "#{alert_file_path}: #{alert}: Rules should contain a `description` annotation" unless annotations["description"]
raise StandardError, " #{alert}: rules must contain a `severity` label" unless labels["severity"]
raise StandardError, " #{alert}: rules contains an invalid `severity` label: #{labels["severity"]}" unless ["info", "warn", "error", "critical"].include?(labels["severity"])
raise StandardError, " #{alert}: rules contains an invalid `severity` label: #{labels["severity"]}" unless ["warn", "error", "critical"].include?(labels["severity"])
if labels["pager"]
raise StandardError, " #{alert}: rules contains an invalid `pager` label: #{labels["pager"]}" unless labels["pager"] == "pagerduty"
......