...
 
Commits (5)
...@@ -20,6 +20,7 @@ groups: ...@@ -20,6 +20,7 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data" grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "4" grafana_panel_id: "4"
grafana_variables: "environment,type,component" grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
# Ops Rate # Ops Rate
- alert: gitlab_component_opsrate_missing_series - alert: gitlab_component_opsrate_missing_series
...@@ -40,6 +41,7 @@ groups: ...@@ -40,6 +41,7 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data" grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "42" grafana_panel_id: "42"
grafana_variables: "environment,type,component" grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
# Apdex # Apdex
- alert: gitlab_component_apdex_missing_series - alert: gitlab_component_apdex_missing_series
...@@ -60,6 +62,7 @@ groups: ...@@ -60,6 +62,7 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data" grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "19" grafana_panel_id: "19"
grafana_variables: "environment,type,component" grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
# Error Rate # Error Rate
- alert: gitlab_component_error_missing_series - alert: gitlab_component_error_missing_series
...@@ -80,3 +83,4 @@ groups: ...@@ -80,3 +83,4 @@ groups:
grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data" grafana_dashboard_id: "Z4W91Zbmk/general-triage-missing-component-series-data"
grafana_panel_id: "27" grafana_panel_id: "27"
grafana_variables: "environment,type,component" grafana_variables: "environment,type,component"
grafana_min_zoom_hours: 24
groups: groups:
- name: service_availability.rules - name: service_availability.rules
rules: rules:
# Availability below 2 sigma # Availability below 3 sigma
- alert: service_availability_out_of_bounds_lower_2sigma_5m - alert: service_availability_out_of_bounds_lower_5m
expr: | expr: |
gitlab_service_availability:ratio gitlab_service_availability:ratio
< <
gitlab_service_availability:ratio:avg_over_time_1w - 2.5 * gitlab_service_availability:ratio:stddev_over_time_1w gitlab_service_availability:ratio:avg_over_time_1w - 3 * gitlab_service_availability:ratio:stddev_over_time_1w
for: 5m for: 5m
labels: labels:
rules_domain: general rules_domain: general
...@@ -14,7 +14,7 @@ groups: ...@@ -14,7 +14,7 @@ groups:
severity: warn severity: warn
period: 5m period: 5m
bound: lower bound: lower
threshold_sigma: "2.5" threshold_sigma: "3"
annotations: annotations:
description: | description: |
The ratio of services that are available to serve the `{{ $labels.type }}` service The ratio of services that are available to serve the `{{ $labels.type }}` service
...@@ -27,15 +27,16 @@ groups: ...@@ -27,15 +27,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service" grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "2" grafana_panel_id: "2"
grafana_variables: "environment,type" grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition" link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-availability.md" link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-availability.md"
# Operation rate above 2 sigma # Operation rate above 3 sigma
- alert: service_ops_out_of_bounds_upper_2sigma_5m - alert: service_ops_out_of_bounds_upper_5m
expr: | expr: |
gitlab_service_ops:rate gitlab_service_ops:rate
> >
gitlab_service_ops:rate:avg_over_time_1w + 2.5 * gitlab_service_ops:rate:stddev_over_time_1w gitlab_service_ops:rate:avg_over_time_1w + 3 * gitlab_service_ops:rate:stddev_over_time_1w
for: 5m for: 5m
labels: labels:
rules_domain: general rules_domain: general
...@@ -43,7 +44,7 @@ groups: ...@@ -43,7 +44,7 @@ groups:
severity: warn severity: warn
period: 5m period: 5m
bound: upper bound: upper
threshold_sigma: "2.5" threshold_sigma: "3"
annotations: annotations:
description: | description: |
The `{{ $labels.type }}` service is receiving more requests than normal. The `{{ $labels.type }}` service is receiving more requests than normal.
...@@ -57,15 +58,16 @@ groups: ...@@ -57,15 +58,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service" grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "12" grafana_panel_id: "12"
grafana_variables: "environment,type" grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition" link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-ops-rate.md" link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-ops-rate.md"
# Operation rate below 2 sigma # Operation rate below 3 sigma
- alert: service_ops_out_of_bounds_lower_2sigma_5m - alert: service_ops_out_of_bounds_lower_5m
expr: | expr: |
gitlab_service_ops:rate gitlab_service_ops:rate
< <
gitlab_service_ops:rate:avg_over_time_1w - 2.5 * gitlab_service_ops:rate:stddev_over_time_1w gitlab_service_ops:rate:avg_over_time_1w - 3 * gitlab_service_ops:rate:stddev_over_time_1w
for: 5m for: 5m
labels: labels:
rules_domain: general rules_domain: general
...@@ -73,7 +75,7 @@ groups: ...@@ -73,7 +75,7 @@ groups:
severity: warn severity: warn
period: 5m period: 5m
bound: lower bound: lower
threshold_sigma: "2.5" threshold_sigma: "3"
annotations: annotations:
description: | description: |
The `{{ $labels.type }}` service is receiving fewer requests than normal. The `{{ $labels.type }}` service is receiving fewer requests than normal.
...@@ -86,15 +88,16 @@ groups: ...@@ -86,15 +88,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service" grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "12" grafana_panel_id: "12"
grafana_variables: "environment,type" grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition" link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-ops-rate.md" link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-ops-rate.md"
# Apdex lower than 2 sigma # Apdex lower than 3 sigma
- alert: service_apdex_out_of_bounds_lower_2sigma_5m - alert: service_apdex_out_of_bounds_lower_5m
expr: | expr: |
gitlab_service_apdex:ratio gitlab_service_apdex:ratio
< <
gitlab_service_apdex:ratio:avg_over_time_1w - 2.5 * gitlab_service_apdex:ratio:stddev_over_time_1w gitlab_service_apdex:ratio:avg_over_time_1w - 3 * gitlab_service_apdex:ratio:stddev_over_time_1w
for: 5m for: 5m
labels: labels:
rules_domain: general rules_domain: general
...@@ -102,7 +105,7 @@ groups: ...@@ -102,7 +105,7 @@ groups:
severity: warn severity: warn
period: 5m period: 5m
bound: lower bound: lower
threshold_sigma: "2.5" threshold_sigma: "3"
annotations: annotations:
description: | description: |
The `{{ $labels.type }}` service is operating at a slower rate than normal. The `{{ $labels.type }}` service is operating at a slower rate than normal.
...@@ -116,15 +119,16 @@ groups: ...@@ -116,15 +119,16 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service" grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "16" grafana_panel_id: "16"
grafana_variables: "environment,type" grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition" link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-apdex.md" link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-apdex.md"
# Error rate exceeds 2 sigma # Error rate exceeds 3 sigma
- alert: service_errors_out_of_bounds_upper_2sigma_5m - alert: service_errors_out_of_bounds_upper_5m
expr: | expr: |
gitlab_service_errors:rate gitlab_service_errors:rate
> >
gitlab_service_errors:rate:avg_over_time_1w + 2.5 * gitlab_service_errors:rate:stddev_over_time_1w gitlab_service_errors:rate:avg_over_time_1w + 3 * gitlab_service_errors:rate:stddev_over_time_1w
for: 5m for: 5m
labels: labels:
rules_domain: general rules_domain: general
...@@ -132,7 +136,7 @@ groups: ...@@ -132,7 +136,7 @@ groups:
severity: warn severity: warn
period: 5m period: 5m
bound: upper bound: upper
threshold_sigma: "2.5" threshold_sigma: "3"
annotations: annotations:
description: | description: |
The `{{ $labels.type }}` service is generating more errors than normal. The `{{ $labels.type }}` service is generating more errors than normal.
...@@ -145,5 +149,6 @@ groups: ...@@ -145,5 +149,6 @@ groups:
grafana_dashboard_id: "WOtyonOiz/general-triage-service" grafana_dashboard_id: "WOtyonOiz/general-triage-service"
grafana_panel_id: "24" grafana_panel_id: "24"
grafana_variables: "environment,type" grafana_variables: "environment,type"
grafana_min_zoom_hours: 12
link1_title: "Definition" link1_title: "Definition"
link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-errors.md" link1_url: "https://gitlab.com/gitlab-com/runbooks/blob/master/troubleshooting/definition-service-errors.md"
groups: groups:
- name: postgresql.rules - name: postgresql.rules
rules: rules:
- alert: NoPostgresMasterDetectedByConsul
expr: |
sum(consul_health_service_status{check="service:postgresql", tier="inf", status="passing"}) == 0
labels:
pager: pagerduty
severity: critical
channel: database
annotations:
description: |
No postgresql master is passing the consul check. If there were a
failover, no server is available to populate the pgbouncer
configuration. Check: https://dashboards.gitlab.net/d/a988f2tmz/consul?panelId=23&fullscreen&orgId=1
runbook: troubleshooting/postgres.md
title: No Postgresql Master detected by Consul
- alert: PostgresSQL_XIDConsumptionTooLow - alert: PostgresSQL_XIDConsumptionTooLow
expr: rate(pg_txid_current[1m]) < 5 expr: rate(pg_txid_current[1m]) < 5
for: 1m for: 1m
...@@ -285,7 +300,7 @@ groups: ...@@ -285,7 +300,7 @@ groups:
- alert: PostgreSQL_RoleChange - alert: PostgreSQL_RoleChange
expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0 expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
labels: labels:
severity: info severity: warn
channel: database channel: database
annotations: annotations:
description: Database on {{$labels.fqdn}} changed role to {{if eq $value 1.0}} description: Database on {{$labels.fqdn}} changed role to {{if eq $value 1.0}}
...@@ -295,7 +310,7 @@ groups: ...@@ -295,7 +310,7 @@ groups:
expr: '{__name__=~"pg_settings_.*"} != ON(__name__, fqdn) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} expr: '{__name__=~"pg_settings_.*"} != ON(__name__, fqdn) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"}
OFFSET 10m' OFFSET 10m'
labels: labels:
severity: info severity: warn
channel: database channel: database
annotations: annotations:
description: Database on {{$labels.fqdn}} setting now {{$labels.__name__}}={{$value}} description: Database on {{$labels.fqdn}} setting now {{$labels.__name__}}={{$value}}
......
...@@ -15,6 +15,8 @@ ykpersonalize -m86 ...@@ -15,6 +15,8 @@ ykpersonalize -m86
This setting lets us use the Yubikey as both a SmartCard and an OTP device This setting lets us use the Yubikey as both a SmartCard and an OTP device
at the same time. at the same time.
**Note:** The above command is not necessary on a YubiKey 5 (and will not work on it).
## Changing the Default PIN Entries on the Yubikey PIV Card ## Changing the Default PIN Entries on the Yubikey PIV Card
By default the user PIN is `123456` and the ADMIN PIN is `12345678`, keep this By default the user PIN is `123456` and the ADMIN PIN is `12345678`, keep this
in mind when changing the PINS when it asks for the current PIN in mind when changing the PINS when it asks for the current PIN
...@@ -175,6 +177,8 @@ uid John Rando <rando@gitlab.com> ...@@ -175,6 +177,8 @@ uid John Rando <rando@gitlab.com>
Now that we have a master key, a good practice is to generate a revocation Now that we have a master key, a good practice is to generate a revocation
certificate in the event that we lose the password or the key is compromised. certificate in the event that we lose the password or the key is compromised.
**Note:** Some versions of gpg do not show the key ID in their output. In that case, you can use your email address in place of the key ID.
``` ```
> gpg --gen-revoke FAEFD83E > /Volumes/GitLab/gpg_config/FAEFD83E-revocation-certificate.asc > gpg --gen-revoke FAEFD83E > /Volumes/GitLab/gpg_config/FAEFD83E-revocation-certificate.asc
...@@ -310,6 +314,10 @@ sub 4096R/DE86E396 created: 2017-08-25 expires: 2018-08-25 usage: A ...@@ -310,6 +314,10 @@ sub 4096R/DE86E396 created: 2017-08-25 expires: 2018-08-25 usage: A
## Backup and Publish your Public Key ## Backup and Publish your Public Key
``` ```
> gpg --armor --export FAEFD83E > /Volumes/GitLab/gpg_config/FAEFD83E.asc > gpg --armor --export FAEFD83E > /Volumes/GitLab/gpg_config/FAEFD83E.asc
```
If your gpg version does not output the key ID, use the full fingerprint instead.
```
> gpg --keyserver hkps://hkps.pool.sks-keyservers.net --send-key FAEFD83E > gpg --keyserver hkps://hkps.pool.sks-keyservers.net --send-key FAEFD83E
``` ```
...@@ -327,12 +335,12 @@ you've imported. ...@@ -327,12 +335,12 @@ you've imported.
Or in a fresh terminal we can: Or in a fresh terminal we can:
``` ```
> gpg2 --import-key /Volumes/GitLab/gpg_config/FAEFD83E.asc > gpg --import /Volumes/GitLab/gpg_config/FAEFD83E.asc
gpg: key FAEFD83E: public key imported gpg: key FAEFD83E: public key imported
gpg: Total number processed: 1 gpg: Total number processed: 1
gpg: imported: 1 gpg: imported: 1
> gpg2 --edit-key FAEFD83E > gpg --edit-key FAEFD83E
Secret subkeys are available. Secret subkeys are available.
pub 4096R/FAEFD83E created: 2017-08-25 expires: 2018-08-25 usage: C pub 4096R/FAEFD83E created: 2017-08-25 expires: 2018-08-25 usage: C
......
...@@ -23,12 +23,5 @@ groups: ...@@ -23,12 +23,5 @@ groups:
== 1) == 1)
- record: postgres:databases - record: postgres:databases
expr: count(pg_exporter_scrapes_total) BY (environment) expr: count(pg_exporter_scrapes_total) BY (environment)
- record: postgres:up - record: postgres:pg_stat_seq_scans:rate2m
expr: (count(pg_replication_is_replica == 1 unless ON(fqdn) ALERTS{alertname=~"PostgreSQL_.*",alertstate="firing",severity!="info"})
BY (environment) >= 2 and ON(environment) count(pg_replication_is_replica ==
0 unless ON(fqdn) ALERTS{alertname=~"PostgreSQL_.*",alertstate="firing",severity!="info"})
BY (environment) == 1 unless ON(environment) count(ALERTS{alertname=~"PostgreSQL_.*",alertstate="firing",fqdn="",severity!="info"})
BY (environment) > 0) > BOOL 0 or ON(environment) postgres:databases == BOOL
0
- record: postgres:pg_stat_seq_scans:rate2m
expr: rate(pg_stat_user_tables_seq_tup_read[2m]) expr: rate(pg_stat_user_tables_seq_tup_read[2m])
...@@ -20,7 +20,7 @@ def validate_rule(alert_file_path, rule) ...@@ -20,7 +20,7 @@ def validate_rule(alert_file_path, rule)
LOGGER.warn "#{alert_file_path}: #{alert}: Rules should contain a `description` annotation" unless annotations["description"] LOGGER.warn "#{alert_file_path}: #{alert}: Rules should contain a `description` annotation" unless annotations["description"]
raise StandardError, " #{alert}: rules must contain a `severity` label" unless labels["severity"] raise StandardError, " #{alert}: rules must contain a `severity` label" unless labels["severity"]
raise StandardError, " #{alert}: rules contains an invalid `severity` label: #{labels["severity"]}" unless ["info", "warn", "error", "critical"].include?(labels["severity"]) raise StandardError, " #{alert}: rules contains an invalid `severity` label: #{labels["severity"]}" unless ["warn", "error", "critical"].include?(labels["severity"])
if labels["pager"] if labels["pager"]
raise StandardError, " #{alert}: rules contains an invalid `pager` label: #{labels["pager"]}" unless labels["pager"] == "pagerduty" raise StandardError, " #{alert}: rules contains an invalid `pager` label: #{labels["pager"]}" unless labels["pager"] == "pagerduty"
......