Commit 62273024 authored by Matteo Melli's avatar Matteo Melli
Browse files

Update failover.md

parent 9a2eafab
Pipeline #88496 passed with stage
in 21 seconds
......@@ -339,87 +339,36 @@ state of the secondary to converge.
* Staging: `postgres-01.db.gstg.gitlab.com`
* Production: `postgres-01-db-gprd.c.gitlab-production.internal`
```shell
# Runbook bootstrap: derive the environment-specific Azure/GCP host names
# used by all failover snippets below. Exports:
#   N_OF_HOSTS, AZURE_HOST_PREFIX/SUFFIX, GCP_HOST_PREFIX/SUFFIX,
#   AZURE_MASTER, GCP_MASTER_CANDIDATE
shopt -s expand_aliases
alias ssh_remote="ssh " # TODO(NikolayS) change to "knife ..."
# TODO(NikolayS) move globals to the top / init stage
export GITLAB_ENV="staging"
if [[ "$GITLAB_ENV" == "production" ]]; then
  export N_OF_HOSTS=4
  export AZURE_HOST_PREFIX="postgres-0"
  export AZURE_HOST_SUFFIX=".db.prd.gitlab.com"
  export GCP_HOST_PREFIX="postgres-0"
  export GCP_HOST_SUFFIX="-db-gprd.c.gitlab-production.internal"
  export AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
  export GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}1${GCP_HOST_SUFFIX}"
elif [[ "$GITLAB_ENV" == "staging" ]]; then
  export N_OF_HOSTS=2
  # NOTE: staging Azure hosts have no dash in the name (postgres0X, not postgres-0X).
  export AZURE_HOST_PREFIX="postgres0"
  export AZURE_HOST_SUFFIX=".db.stg.gitlab.com"
  export GCP_HOST_PREFIX="postgres-0"
  export GCP_HOST_SUFFIX="-db-gstg.c.gitlab-staging-1.internal"
  export AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
  export GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}2${GCP_HOST_SUFFIX}"
else
  >&2 echo "\$GITLAB_ENV is not correctly defined ($GITLAB_ENV). Stop all work and exit (enter anything to proceed)."
  # BUG FIX: use read -r so a backslash in the operator's input is not mangled.
  read -r smth
  exit 1
fi
#######################################
# Check that GCP "main" replica is not lagging too much
# Globals:
# AZURE_MASTER
# GCP_MASTER_CANDIDATE
# Arguments:
# None
# Returns:
# None
#######################################
handle_gcp_replication_delay() { # put definition to the top
# Purpose: block until a freshly written tombstone row on the Azure master is
# visible on the GCP master candidate, i.e. until replication has converged.
# Globals read: AZURE_MASTER, GCP_MASTER_CANDIDATE, GITLAB_ENV.
max_rep_delay=10 # TODO(NikolayS) is 10s ok? Double-check after turning SR on
# NOTE(review): max_rep_delay is assigned but never referenced below — confirm intent.
# "create database if not exists" is not supported in Postgres,
# so to make the following action idempotent and not depending on the pre-actions,
# we better re-create tombstone DB and table from scratch
# NOTE(review): the here-doc below appears to contain interleaved diff/markdown
# text (a bullet line and ``` fences) from the rendered diff view — verify the
# intended remote command before running any of this.
ssh_remote "$AZURE_MASTER" $(cat << EOF
cd /tmp;
sudo -u gitlab-psql gitlab-psql postgres \
-c "drop database if exists tombstone";
sudo -u gitlab-psql gitlab-psql postgres \
-c "create database tombstone";
sudo -u gitlab-psql gitlab-psql tombstone \
* Create tombstone database and table
```shell
ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
-c "drop database if exists tombstone; create database tombstone"
ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone \
-c "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)"
EOF
)
# Unique marker: timestamp plus environment name.
tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${GITLAB_ENV}"
ssh_remote "$AZURE_MASTER" \
"cd /tmp; sudo -u gitlab-psql gitlab-psql tombstone -c \"insert into tombstone(note) values('${tombstone_msg}') returning *\""
# wait until the change is propagated
# Poll every 3 seconds until the marker row is visible on the GCP candidate.
while [[ true ]]; do
find_new_msg=$(
ssh_remote "$GCP_MASTER_CANDIDATE" \
"cd /tmp; sudo gitlab-psql -Atd tombstone -c \"select created_at from tombstone where note = '$tombstone_msg'\""
)
if [[ -z ${find_new_msg+x} ]] || [[ "$find_new_msg" == "" ]]; then
# Row not visible yet: report the current replay lag and retry.
gcp_cur_rep_delay=$(
ssh_remote "$GCP_MASTER_CANDIDATE" \
"cd /tmp; sudo gitlab-psql -Atd postgres -c 'select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'"
)
echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE. The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
sleep 3
else
echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE, continue."
break
fi
done
}
```
handle_gcp_replication_delay
```
* Insert a tombstone record and check that replication lag is under 10s. Once streaming replication (SR) is enabled, the 10s threshold can likely be lowered — double-check after turning SR on.
```shell
# Insert a uniquely named tombstone row on the Azure master, then poll the GCP
# master candidate until the row has replicated (replication has converged).
# NOTE(review): this snippet interpolates ${ENVIRONMENT}, while the init
# snippet earlier exports GITLAB_ENV — confirm which variable is actually set.
tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${ENVIRONMENT}"
ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone -c "insert into tombstone(note) values('${tombstone_msg}') returning *"
# wait until the change is propagated
while true
do
find_new_msg="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -Atd tombstone -c "select created_at from tombstone where note = '$tombstone_msg'")"
if [[ -z "${find_new_msg+x}" ]] || [[ "$find_new_msg" == "" ]]
then
# BUG FIX: the original lacked the trailing backslash on the next line, so the
# command substitution ran `ssh_remote "$GCP_MASTER_CANDIDATE"` (an interactive
# ssh session) and then `sudo gitlab-psql ...` LOCALLY as two separate commands.
gcp_cur_rep_delay="$(ssh_remote "$GCP_MASTER_CANDIDATE" \
sudo gitlab-psql -Atd postgres -c "select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))")"
echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
sleep 3
else
echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
break
fi
done
```
1. [ ] 🐺 {+ Coordinator +}: Now disable all sidekiq-cron jobs on the secondary
* In a dedicated rails console on the **secondary**:
* `loop { Sidekiq::Cron::Job.all.map(&:disable!); sleep 1 }`
......@@ -469,119 +418,207 @@ of errors while it is being promoted.
- [ ] `altssh.gitlab.com A 35.190.168.187`
- [ ] `*.gitlab.io A 35.185.44.232`
- **DO NOT** change `gitlab.io`.
1. [ ] 🐘 {+ Database-Wrangler +}: Disable chef on all nodes and shut down repmgr
1. [ ] 🐘 {+ Database-Wrangler +}: Disable chef on all nodes and shut down consul agents and repmgrd
```shell
#######################################
# Stop or start chef, consul, and repmgr
# Globals:
# GITLAB_ENV, AZURE_MASTER, AZURE_HOST_PREFIX,
# GCP_HOST_PREFIX, GCP_HOST_SUFFIX, N_OF_HOSTS
# Arguments:
# None
# Returns:
# None
#######################################
db_switch_chef_consul_repmgr_state() {
if [[ -z "${GITLAB_ENV+x}" ]]; then
>&2 echo "\$GITLAB_ENV is empty, cannot proceed."
return
fi
echo "We're about to stop disable chef and stop repmgr on $GITLAB_ENV environment. To proceed, type '$GITLAB_ENV':"
read proceed_cmd
if [[ "$proceed_cmd" != "$GITLAB_ENV" ]]; then
>&2 echo "Stop."
return
fi
# chef
for i in $(seq 1 $N_OF_HOSTS); do
for host in "${AZURE_HOST_PREFIX}$i${AZURE_HOST_SUFFIX}" "${GCP_HOST_PREFIX}$i${GCP_HOST_SUFFIX}"; do
echo "${1}ing chef on $host"
# WARNING: the following line modifies the state
if [[ "$1" == "stop" ]]; then
ssh_remote "$host" "sudo service chef-client stop"
ssh_remote "$host" "sudo mv /etc/chef /etc/chef.migration"
elif [[ "$1" == "start" ]]; then
ssh_remote "$host" "sudo mv /etc/chef.migration /etc/chef"
ssh_remote "$host" "sudo service chef-client start"
else
>&2 echo "1st argument must be either \"stop\" or \"start\" (provided: $1)."
return
fi
done
#Chef
# Stop chef-client everywhere and park its config directory so a stray run
# cannot revert the migration changes.
for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"; do
  echo "Stopping chef on $host"
  ssh_remote "$host" sudo service chef-client stop
  ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration
done
#Consul
# Stop the consul agent on every pgbouncer and database node in both providers.
# The original's four identical loops are folded into one; iteration order
# (Azure pgbouncers, GCP pgbouncers, Azure hosts, GCP hosts) is preserved.
for host in "${AZURE_PGBOUNCERS[@]}" "${GCP_PGBOUNCERS[@]}" "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"; do
  echo "Stopping consul on $host"
  ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul
done
#Repmgr
# Stop repmgrd: first every GCP node, then every Azure standby, and finally
# the Azure master itself, so no failover is triggered mid-way.
for host in "${GCP_HOSTS[@]}"; do
  echo "Stopping repmgrd on $host"
  ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
done
for host in "${AZURE_HOSTS[@]}"; do
  if [ "$AZURE_MASTER" == "$host" ]; then
    continue
  fi
  echo "Stopping repmgrd on $host"
  ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
done
# BUG FIX: the original message said "Stopping consul" but the command stops
# repmgrd on the Azure master.
echo "Stopping repmgrd on $AZURE_MASTER"
ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
# Reset repmgr failover state
ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr -c \
"TRUNCATE repmgr_gitlab_cluster.repl_nodes"
```
1. [ ] 🐘 {+ Database-Wrangler +}: Convert the current master (Azure) to a standby.
# consul
# TODO(NikolayS) commands to stop consul see https://gitlab.com/gitlab-com/migration/issues/718#note_90342057
# repmgr
# WARNING: the following line modifies the state
ssh_remote "$AZURE_MASTER" "sudo sv $1 /opt/gitlab/sv/repmgrd"
}
* Convert the current master (Azure) to a headless standby (empty `primary_conninfo` and `restore_command`).
db_switch_chef_consul_repmgr_state stop
```shell
# Convert the current Azure master into a headless standby: a minimal
# recovery.conf with no primary_conninfo / restore_command, so on restart it
# enters recovery but follows nobody.
# BUG FIX(review): the original wrote to /var/lib/opt/gitlab/postgresql/data,
# but every other snippet in this runbook uses /var/opt/gitlab/postgresql/data
# — confirm the actual data directory on the host before running.
echo "standby_mode = 'on'
recovery_target_timeline = 'latest'" | \
ssh_remote "$AZURE_MASTER" sudo tee /var/opt/gitlab/postgresql/data/recovery.conf
# NOTE(review): other commands in this runbook run Postgres as gitlab-psql —
# verify postgres:postgres is the correct owner here.
ssh_remote "$AZURE_MASTER" sudo chown postgres:postgres /var/opt/gitlab/postgresql/data/recovery.conf
ssh_remote "$AZURE_MASTER" sudo chmod 600 /var/opt/gitlab/postgresql/data/recovery.conf
# runit's `sv stop` sends SIGTERM (smart shutdown: waits for client connections
# to close) instead of SIGINT (fast shutdown: terminates connections). This
# workaround stops PostgreSQL faster and cleanly: issue `sv stop` with a small
# (1s) timeout first — so runit will not restart the service — and if that
# times out, send SIGINT via `sv int` and then wait up to 60s for the stop.
ssh_remote "$AZURE_MASTER" sudo sv -W 1 stop /opt/gitlab/sv/postgres \
|| (ssh_remote "$AZURE_MASTER" sudo sv int /opt/gitlab/sv/postgres \
&& ssh_remote "$AZURE_MASTER" sudo sv -W 60 stop /opt/gitlab/sv/postgres)
```
1. [ ] 🐘 {+ Database-Wrangler +}: Forbid writes to the current master (Azure) – do not allow any connections except replication and administrative (local via socket) ones.
* Wait for the GCP master candidate and the previous Azure master (now a standby) to reach the same LSN
```shell
# Lock out application traffic on every Azure node: replace pg_hba.conf so only
# local (peer) and replication connections are allowed, reload, then kill any
# existing application backends.
for i in $(seq 1 $N_OF_HOSTS); do
host="${AZURE_HOST_PREFIX}$i${AZURE_HOST_SUFFIX}"
ssh_remote "$host" "sudo -u gitlab-psql sh -c \"echo 'local all all peer map=gitlab' > /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp\""
ssh_remote "$host" "sudo -u gitlab-psql sh -c \"echo 'host replication gitlab_repmgr 0.0.0.0/0 md5' >> /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp\""
# Keep a backup of the original pg_hba.conf so access can be restored later.
ssh_remote "$host" "sudo -u gitlab-psql cp /var/opt/gitlab/postgresql/data/pg_hba.conf /var/opt/gitlab/postgresql/data/pg_hba.conf.backup"
ssh_remote "$host" "sudo -u gitlab-psql mv /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp /var/opt/gitlab/postgresql/data/pg_hba.conf"
ssh_remote "$host" "cd /tmp; sudo -u gitlab-psql gitlab-psql -c 'select pg_reload_conf()'"
ssh_remote "$host" "cd /tmp; sudo -u gitlab-psql gitlab-psql -c \
\"select pg_terminate_backend(pid) from pg_stat_activity where datname = 'gitlabhq_production' and pid <> pg_backend_pid()\""
done
# Then we wait for the GCP master candidate and previous Azure master (now standby) to have same LSN
while true
do
azure_master_lsn="$(ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
-t -A -c "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;")"
gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \
-t -A -c "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;")"
if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]
then
echo "GCP and Azure have same LSN: $azure_master_lsn"
# BUG FIX: `return` is only valid inside a function or a sourced script; in a
# pasted snippet it errors out. `break` ends the wait loop as intended.
break
fi
echo "Wait GCP and Azure have same LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
sleep 3
done
```
1. [ ] 🐘 {+ Database-Wrangler +}: Perform regular switchover to the main replica on GCP
```shell
# WARNING WARNING WARNING here switchover happens!
# NOTE(review): the two commands below appear to be old/new versions of the
# same promote step interleaved by the diff view — running both would attempt
# to promote twice. Pick exactly one before executing.
ssh_remote "$GCP_MASTER_CANDIDATE" \
"/opt/gitlab/embedded/bin/pg_ctl -D /var/opt/gitlab/postgresql/data promote"
# NOTE(review): this variant runs pg_ctl as gitlab-psql but points at
# /var/lib/opt/gitlab/postgresql/data, while the command above (and the rest of
# this runbook) uses /var/opt/gitlab/postgresql/data — confirm the data dir.
ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl \
promote -D /var/lib/opt/gitlab/postgresql/data
```
TODO --- more checks --- (below this line DB steps are not reworked // NikolayS)
- [ ] Confirm `gitlab-ctl repmgr cluster show` reflects the desired state
- [ ] Confirm pgbouncer node in GCP (Password is in 1password)
1. [ ] 🐘 {+ Database-Wrangler +}: Check the database is now read-write
```shell
$ gitlab-ctl pgb-console
...
pgbouncer# SHOW DATABASES;
# You want to see lines like
gitlabhq_production | PRIMARY_IP_HERE | 5432 | gitlabhq_production | | 100 | 5 | | 0 | 0
gitlabhq_production_sidekiq | PRIMARY_IP_HERE | 5432 | gitlabhq_production | | 150 | 5 | | 0 | 0
...
pgbouncer# SHOW SERVERS;
# You want to see lines like
S | gitlab | gitlabhq_production | idle | PRIMARY_IP | 5432 | PGBOUNCER_IP | 54714 | 2018-05-11 20:59:11 | 2018-05-11 20:59:12 | 0x718ff0 | | 19430 |
```
1. [ ] 🐘 {+ Database-Wrangler +}: In case automated failover does not occur, perform a manual failover
- [ ] Promote the desired primary
* Connect to the newly promoted primary in GCP (The result should be `f`)
```shell
sudo gitlab-psql -d gitlabhq_production -c "select * from pg_is_in_recovery();"
```
```shell
$ knife ssh "fqdn:DESIRED_PRIMARY" "gitlab-ctl repmgr standby promote"
```
- [ ] Instruct the remaining standby nodes to follow the new primary
1. [ ] 🐘 {+ Database-Wrangler +}: Start repmgrd on GCP
```shell
#Repmgrd
# Re-register the GCP cluster with repmgr — master first, then each standby —
# and afterwards start repmgrd in the same order.
echo "Register $GCP_MASTER_CANDIDATE as master with repmgr"
ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-ctl repmgr register master
for host in "${GCP_HOSTS[@]}"; do
  # Skip the node just registered as master.
  [ "$host" == "$GCP_MASTER_CANDIDATE" ] && continue
  echo "Register $host as standby with repmgr"
  ssh_remote "$host" sudo gitlab-ctl repmgr register standby
done
echo "Starting repmgrd on $GCP_MASTER_CANDIDATE"
ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
for host in "${GCP_HOSTS[@]}"; do
  [ "$host" == "$GCP_MASTER_CANDIDATE" ] && continue
  echo "Starting repmgrd on $host"
  ssh_remote "$host" sudo sv start /opt/gitlab/sv/repmgrd
done
```
1. [ ] 🐘 {+ Database-Wrangler +}: Check repmgr master on GCP
```shell
# Show pgbouncer state on each GCP pgbouncer node; the operator confirms each
# host with [y] or aborts the whole check.
for host in "${GCP_PGBOUNCERS[@]}"
do
echo "Check pgbouncer on $host"
ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW DATABASES"
ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW SERVERS"
read -s -N 1 -p "Press [y] to continue, any other key to abort." key
if [ "$key" != "y" ]
then
# BUG FIX: `return` is only valid inside a function or a sourced script; in a
# pasted snippet it errors instead of aborting. `exit 1` aborts as intended.
exit 1
fi
done
```
1. [ ] 🐘 {+ Database-Wrangler +}: Start consul on GCP
```shell
#Consul
# Bring the consul agents back up on all GCP nodes: database hosts first,
# then the pgbouncer nodes (same order as before, folded into one loop).
for host in "${GCP_HOSTS[@]}" "${GCP_PGBOUNCERS[@]}"; do
  echo "Starting consul agent on $host"
  ssh_remote "$host" sudo sv start /opt/gitlab/sv/consul
done
```
1. [ ] 🐘 {+ Database-Wrangler +}: Check pgbouncer is connecting on GCP
```shell
# Verify pgbouncer is connecting on each GCP pgbouncer node; the operator
# confirms each host with [y] or aborts the whole check.
for host in "${GCP_PGBOUNCERS[@]}"
do
echo "Check pgbouncer on $host"
ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW DATABASES"
ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW SERVERS"
read -s -N 1 -p "Press [y] to continue, any other key to abort." key
if [ "$key" != "y" ]
then
# BUG FIX: `return` is only valid inside a function or a sourced script; in a
# pasted snippet it errors instead of aborting. `exit 1` aborts as intended.
exit 1
fi
done
```
```shell
$ knife ssh "role:gstg-base-db-postgres AND NOT fqdn:DESIRED_PRIMARY" "gitlab-ctl repmgr standby follow DESIRED_PRIMARY"
```
*Note*: This will fail on the WAL-E node
1. [ ] 🐘 {+ Database-Wrangler +}: Check the database is now read-write
* Connect to the newly promoted primary in GCP
* `sudo gitlab-psql -d gitlabhq_production -c "select * from pg_is_in_recovery();"`
* The result should be `f`
1. [ ] 🔪 {+ Chef-Runner +}: Update the chef configuration according to
* Staging: https://dev.gitlab.org/cookbooks/chef-repo/merge_requests/1989
* Production: https://dev.gitlab.org/cookbooks/chef-repo/merge_requests/2218
1. [ ] 🔪 {+ Chef-Runner +}: Run `chef-client` on every node to ensure Chef changes are applied and all Geo secondary services are stopped
* **STAGING** `knife ssh roles:gstg-base 'sudo chef-client > /tmp/chef-client-log-$(date +%s).txt 2>&1 || echo FAILED'`
* **PRODUCTION** **UNTESTED** `knife ssh roles:gprd-base 'sudo chef-client > /tmp/chef-client-log-$(date +%s).txt 2>&1 || echo FAILED'`
1. [ ] 🔪 {+ Chef-Runner +}: Ensure that `gitlab.rb` has the correct `external_url` on all hosts
* Staging: `knife ssh roles:gstg-base 'sudo cat /etc/gitlab/gitlab.rb 2>/dev/null | grep external_url' | sort -k 2`
* Production: `knife ssh roles:gprd-base 'sudo cat /etc/gitlab/gitlab.rb 2>/dev/null | grep external_url' | sort -k 2`
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment