Commit e38302db authored by Matteo Melli's avatar Matteo Melli

Fixes, failback procedure and notes in README.md

parent 4207bb78
Pipeline #88922 failed with stage
in 13 seconds
......@@ -188,3 +188,13 @@ Then carry out the following steps:
1. **Verify `/opt/gitlab-migration/bin/verify-failover-config`**: You should receive a message indicating success
1. **Set up the workflow issues**: Run `/opt/gitlab-migration/bin/start-failover-procedure.sh`. This will set up several issues in the issue tracker for performing the checks, failover, tests, etc.
* Any variables in the template in the format `__VARIABLE__` will be substituted with their values from the `bin/source_vars` file, saving manual effort.
### Migration scripts
1. Prepare the file `env_<environment>`, which points the environment variables to the correct hosts.
1. Step scripts are listed in the file `steps_<role>` in order of execution (to define failback steps, just add the `_failback` suffix to the role).
1. To run the runbook script menu use the `migration` script:
```shell
bash bin/migration <environment> <role>
```
......@@ -14,7 +14,7 @@ export GITLAB_ENV=$ENVIRONMENT
# shellcheck source=/dev/null
source "${BASE}/source_vars"
source "${BASE}/env_${ENVIRONMENT}" # That is, .env_staging or .env_production (test also supported)
source "${BASE}/utilities"
source "${BASE}/migration_utilities"
source "${BASE}/steps_${ROLE}"
#Check all steps have a script
......
......@@ -8,5 +8,5 @@ source "${BASE}/env_${ENVIRONMENT}"
echo "Checking repmgr state for host $AZURE_MASTER"
echo
echo "select * from repmgr_gitlab_cluster.repl_nodes" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr
echo
......@@ -14,7 +14,7 @@ do
if [ "$host" != "$AZURE_MASTER" ]
then
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 't'
then
>&2 echo "Host $host is not standby"
......@@ -22,30 +22,30 @@ do
fi
else
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 'f'
then
>&2 echo "Host $host is not master"
all_ok=false
fi
if ! echo "select count(1) from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q '4'
then
>&2 echo "Host $host is not replicated by 4 nodes:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
for slave_host in "${AZURE_SLAVES[@]}" "$GCP_MASTER_CANDIDATE"
do
if ! echo "select client_addr||'-'||state from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming"
then
>&2 echo "Host $host is not correclty replicated by host $slave_host:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
done
......@@ -60,7 +60,7 @@ do
if [ "$host" != "$GCP_MASTER_CANDIDATE" ]
then
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 't'
then
>&2 echo "Host $host is not standby"
......@@ -68,30 +68,30 @@ do
fi
else
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 't'
then
>&2 echo "Host $host is not standby"
all_ok=false
fi
if ! echo "select count(1) from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q '3'
then
>&2 echo "Host $host is not replicated by 3 nodes:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
for slave_host in "${GCP_SLAVES[@]}"
do
if ! echo "select client_addr||'-'||state from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming"
then
>&2 echo "Host $host is not correclty replicated by host $slave_host:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
done
......
......@@ -7,6 +7,6 @@ source "${BASE}/env_${ENVIRONMENT}"
echo "Create tombstone database and table if not already existing"
echo "drop database if exists tombstone; create database tombstone" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
echo "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d tombstone
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d tombstone
......@@ -6,4 +6,4 @@ set -eu
source "${BASE}/env_${ENVIRONMENT}"
echo "Convert postgres on $AZURE_MASTER as standby of $GCP_MASTER_CANDIDATE"
ssh_remote "$AZURE_MASTER" gitlab-ctl repmgr standby follow "$GCP_MASTER_CANDIDATE"
ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-ctl repmgr standby follow "$GCP_MASTER_CANDIDATE"
#!/bin/bash
# Failback step: on every Azure and GCP host, stop the chef-client service and
# move /etc/chef aside to /etc/chef.migration.
# NOTE(review): presumably this keeps chef from reconfiguring the nodes while
# the failback is in progress - confirm.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# Azure hosts are handled first, then GCP hosts.
all_hosts=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")

for node in "${all_hosts[@]}"
do
  echo "Stopping chef on $node"
  ssh_remote "$node" sudo service chef-client stop
  ssh_remote "$node" sudo mv /etc/chef /etc/chef.migration
done
\ No newline at end of file
#!/bin/bash
# Failback step: stop the consul service (via `sv stop`) on the Azure and GCP
# pgbouncer hosts first, then on the Azure and GCP hosts.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# Stop consul on each host passed as an argument, in the order given.
stop_consul_on() {
  local node
  for node in "$@"
  do
    echo "Stopping consul on $node"
    ssh_remote "$node" sudo sv stop /opt/gitlab/sv/consul
  done
}

# One call per host group so the groups are processed strictly one after
# another.
stop_consul_on "${AZURE_PGBOUNCERS[@]}"
stop_consul_on "${GCP_PGBOUNCERS[@]}"
stop_consul_on "${AZURE_HOSTS[@]}"
stop_consul_on "${GCP_HOSTS[@]}"
#!/bin/bash
# Failback step: stop repmgrd (via `sv stop`) on all Azure hosts, then on all
# GCP hosts except the master candidate, and finally on the GCP master
# candidate itself.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# Stop repmgrd on the single host given as $1.
stop_repmgrd() {
  echo "Stopping repmgrd on $1"
  ssh_remote "$1" sudo sv stop /opt/gitlab/sv/repmgrd
}

for node in "${AZURE_HOSTS[@]}"
do
  stop_repmgrd "$node"
done

for node in "${GCP_HOSTS[@]}"
do
  # The master candidate is deliberately handled last, after this loop.
  if [ "$GCP_MASTER_CANDIDATE" != "$node" ]
  then
    stop_repmgrd "$node"
  fi
done

stop_repmgrd "$GCP_MASTER_CANDIDATE"
#!/bin/bash
# Failback step: empty the repmgr node table
# (repmgr_gitlab_cluster.repl_nodes in database gitlab_repmgr) on the GCP
# master candidate.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

reset_sql="TRUNCATE repmgr_gitlab_cluster.repl_nodes"

# ON_ERROR_STOP=1 makes psql stop on the first SQL error.
printf '%s\n' "$reset_sql" \
  | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr
#!/bin/bash
# Failback step: copy recovery.done to recovery.conf in the PostgreSQL data
# directory of the GCP master candidate, then restart PostgreSQL there.
# NOTE(review): presumably the restored recovery.conf makes the candidate come
# back up in recovery (standby) mode - confirm against the file's contents.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"
echo "Coping recovery.done to recovery.conf on $GCP_MASTER_CANDIDATE"
ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql cp /var/opt/gitlab/postgresql/data/recovery.done /var/opt/gitlab/postgresql/data/recovery.conf
echo "Restarting postgres on $GCP_MASTER_CANDIDATE"
# Restart sequence, evaluated as a single &&/|| list:
#   1. `sv -w 1 stop`: ask for a stop, waiting 1 second for it to take effect;
#   2. only if that fails: `sv int` (interrupt signal to the service) followed
#      by `sv -w 60 stop`, waiting up to 60 seconds;
#   3. only if the stop (by either route) succeeded: `sv -w 60 start`.
# This list is the last command of the script, so its status is the script's
# exit status.
(ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 1 stop /opt/gitlab/sv/postgresql \
|| (ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv int /opt/gitlab/sv/postgresql \
&& ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 60 stop /opt/gitlab/sv/postgresql)) \
&& ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 60 start /opt/gitlab/sv/postgresql
#!/bin/bash
# Failback check: succeed (exit 0) when pg_is_in_recovery() on the GCP master
# candidate prints a value containing 't', otherwise fail (exit 1) with a
# message on stderr.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

recovery_query="select pg_is_in_recovery()"

# -t -A: tuples only, unaligned, so the output is just the boolean value.
if ! echo "$recovery_query" \
  | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \
  | grep -q 't'
then
  >&2 echo "$GCP_MASTER_CANDIDATE is NOT standby"
  exit 1
fi

echo "$GCP_MASTER_CANDIDATE is standby"
exit 0
#!/bin/bash
# Failback check: compare the current WAL position (LSN) of the Azure master
# and of the GCP master candidate.
#
# Exit status: 0 when both LSNs are non-empty and identical, 1 otherwise.
# Failure messages go to stderr, like the other check scripts.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# On a node in recovery report the last replayed location, otherwise the
# current write location.
lsn_query="select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;"

# Print the LSN reported by the host given as $1 (tuples only, unaligned).
current_lsn() {
  echo "$lsn_query" \
    | ssh_remote "$1" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A
}

azure_master_lsn="$(current_lsn "$AZURE_MASTER")"
gcp_master_candidate_lsn="$(current_lsn "$GCP_MASTER_CANDIDATE")"

# A NULL result is printed as an empty string; two empty strings must not be
# mistaken for "same LSN".
if [ -z "$azure_master_lsn" ] || [ -z "$gcp_master_candidate_lsn" ]
then
  >&2 echo "Could not read LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
  exit 1
fi

if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]
then
  echo "GCP and Azure have same LSN: $azure_master_lsn"
  exit 0
fi

>&2 echo "GCP and Azure have NOT same LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
exit 1
#!/bin/bash
# Failback step: run `pg_ctl promote` as gitlab-psql on the Azure master.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

pg_ctl_bin=/opt/gitlab/embedded/bin/pg_ctl
pg_data_dir=/var/opt/gitlab/postgresql/data

# WARNING WARNING WARNING: this is the point where the switchback happens!
ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql "$pg_ctl_bin" promote -D "$pg_data_dir"
#!/bin/bash
# Failback check: succeed (exit 0) when pg_is_in_recovery() on the Azure
# master prints a value containing 'f', otherwise fail (exit 1) with a message
# on stderr.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

recovery_query="select pg_is_in_recovery()"

# -t -A: tuples only, unaligned, so the output is just the boolean value.
if ! echo "$recovery_query" \
  | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \
  | grep -q 'f'
then
  >&2 echo "$AZURE_MASTER is NOT master"
  exit 1
fi

echo "$AZURE_MASTER is master"
exit 0
#!/bin/bash
# Failback step: re-register the cluster with repmgr and start repmgrd.
#
# Order of operations:
#   1. register the Azure master as repmgr master;
#   2. register every other GCP and Azure host as repmgr standby;
#   3. start repmgrd on the Azure master;
#   4. start repmgrd on every other GCP and Azure host.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

echo "Register $AZURE_MASTER as master with repmgr"
ssh_remote "$AZURE_MASTER" sudo gitlab-ctl repmgr master register

for host in "${GCP_HOSTS[@]}" "${AZURE_HOSTS[@]}"
do
  # The master was registered above; everything else is a standby.
  if [ "$AZURE_MASTER" == "$host" ]
  then
    continue
  fi
  echo "Register $host as standby with repmgr"
  ssh_remote "$host" sudo gitlab-ctl repmgr standby register
done

# The message names the host repmgrd is actually started on (it previously
# named the GCP master candidate).
echo "Starting repmgrd on $AZURE_MASTER"
ssh_remote "$AZURE_MASTER" sudo sv start /opt/gitlab/sv/repmgrd

for host in "${GCP_HOSTS[@]}" "${AZURE_HOSTS[@]}"
do
  # repmgrd on the master was started above.
  if [ "$AZURE_MASTER" == "$host" ]
  then
    continue
  fi
  echo "Starting repmgrd on $host"
  ssh_remote "$host" sudo sv start /opt/gitlab/sv/repmgrd
done
#!/bin/bash
# Failback check: run `gitlab-ctl repmgr-check-master` as gitlab-consul on the
# Azure master. Exit 0 when it succeeds, otherwise exit 1 with a message on
# stderr.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

echo "Checking state of $AZURE_MASTER"

# stderr of the check is discarded on purpose; only its exit status is used.
if ! ssh_remote "$AZURE_MASTER" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null
then
  >&2 echo "$AZURE_MASTER is not repmgr master"
  exit 1
fi

echo "$AZURE_MASTER is repmgr master"
exit 0
#!/bin/bash
# Failback step: start the consul service (via `sv start`) on every GCP host,
# then on every Azure host.
# NOTE(review): 041-disable-consul.sh also stops consul on the *_PGBOUNCERS
# hosts; they are not started again here - confirm whether that is intended.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

consul_hosts=("${GCP_HOSTS[@]}" "${AZURE_HOSTS[@]}")

for node in "${consul_hosts[@]}"
do
  echo "Starting consul agent on $node"
  ssh_remote "$node" sudo sv start /opt/gitlab/sv/consul
done
#!/bin/bash
# Failback check (interactive): for every GCP and Azure pgbouncer host, print
# the pgbouncer console output of SHOW DATABASES and SHOW SERVERS, then wait
# for the operator to confirm with a single key press. Any key other than "y"
# aborts with exit status 1.
set -eu

# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

for bouncer in "${GCP_PGBOUNCERS[@]}" "${AZURE_PGBOUNCERS[@]}"
do
  echo "Check pgbouncer on $bouncer"
  for console_command in "SHOW DATABASES" "SHOW SERVERS"
  do
    echo "$console_command" | ssh_remote "$bouncer" gitlab-ctl pgb-console
  done

  # Read exactly one character, without echoing it.
  read -r -s -N 1 -p "Press [y] to continue, any other key to abort." answer
  [ "$answer" = "y" ] || exit 1
done
......@@ -13,11 +13,11 @@ file:scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh
file:scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh
file:scripts/02_failover/060_go/p04/060-perform-gcp-candidate-master-promote.sh
file:scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh
file:scripts/02_failover/060_go/p04/062-convert-azure-master-to-gcp-standby.sh
file:scripts/02_failover/060_go/p04/070-enable-automatic-failover-on-gcp-only.sh
file:scripts/02_failover/060_go/p04/071-check-repmgr-master.sh
file:scripts/02_failover/060_go/p04/072-enable-consul-on-gcp-only.sh
file:scripts/02_failover/060_go/p04/073-check-pgbouncer-node-in-gcp.sh
file:scripts/02_failover/060_go/p04/074-convert-azure-master-to-standby-of-gcp.sh
function:restore-could-not-change-directory-to-message
)
......
# Ordered list of failback steps, in order of execution.
# Each entry is either `file:<path>` (a step script) or `function:<name>`
# (a shell function).
# NOTE(review): how the two prefixes are dispatched is decided by the
# `migration` runner, which is not visible here - confirm there.
export steps=(
function:get-rid-of-could-not-change-directory-to-message
# 04x: disable chef, consul and automatic failover; reset the repmgr state.
file:scripts/04_failback/040-disable-chef.sh
file:scripts/04_failback/041-disable-consul.sh
file:scripts/04_failback/042-disable-automatic-failover.sh
file:scripts/04_failback/043-reset-automatic-failover-state.sh
# 05x: turn the GCP master into a standby and verify it.
file:scripts/04_failback/050-convert-cgp-master-to-standby-of-azure.sh
file:scripts/04_failback/051-check-gcp-master-is-standby.sh
file:scripts/04_failback/052-check-azure-nodes-has-same-gcp-lsn.sh
# 06x: promote the Azure master and verify it.
file:scripts/04_failback/060-perform-azure-master-promote.sh
file:scripts/04_failback/061-check-azure-master-is-master.sh
# 07x: re-enable automatic failover and consul; check pgbouncer.
file:scripts/04_failback/070-enable-automatic-failover.sh
file:scripts/04_failback/071-check-repmgr-master.sh
file:scripts/04_failback/072-enable-consul.sh
file:scripts/04_failback/073-check-pgbouncer-node-in-azure.sh
function:restore-could-not-change-directory-to-message
)
# Make the remote login user's home directory traversable by others
# (chmod o+x "$HOME") on every Azure and GCP host.
# NOTE(review): presumably this silences the "could not change directory to"
# warning printed when commands are run as another user via sudo -u - confirm.
# Globals:  AZURE_HOSTS, GCP_HOSTS (read)
# Outputs:  one progress line per host on stdout
# Returns:  status of the last ssh_remote call
function get-rid-of-could-not-change-directory-to-message(){
  # Keep the loop variable out of the caller's scope.
  local host
  for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"
  do
    echo "$host: chmod o+x \$HOME"
    # The nested quoting defers the expansion of $HOME to the remote side.
    ssh_remote "$host" bash -c '"chmod o+x \"$HOME\""'
  done
}
# Remove the "others" traverse permission from the remote login user's home
# directory (chmod o-x "$HOME") on every Azure and GCP host, i.e. the inverse
# of `chmod o+x "$HOME"`.
# Globals:  AZURE_HOSTS, GCP_HOSTS (read)
# Outputs:  one progress line per host on stdout
# Returns:  status of the last ssh_remote call
function restore-could-not-change-directory-to-message(){
  # Keep the loop variable out of the caller's scope.
  local host
  for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"
  do
    echo "$host: chmod o-x \$HOME"
    # The nested quoting defers the expansion of $HOME to the remote side.
    ssh_remote "$host" bash -c '"chmod o-x \"$HOME\""'
  done
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment