Commit 1c722d75 authored by Matteo Melli's avatar Matteo Melli
Browse files

Fixes, failback procedure and notes in README.md

parent 4207bb78
Pipeline #88923 failed with stage
in 19 seconds
......@@ -188,3 +188,13 @@ Then carry out the following steps:
1. **Verify `/opt/gitlab-migration/bin/verify-failover-config`**: You should receive a message indicating success
1. **Set up the workflow issues**: Run `/opt/gitlab-migration/bin/start-failover-procedure.sh`. This will set up several issues in the issue tracker for performing the checks, failover, tests, etc.
* Any variables in the template in the format `__VARIABLE__` will be substituted with their values from the `bin/source_vars` file, saving manual effort.
### Migration scripts
1. Prepare the file `env_<environment>`, setting its environment variables to point to the correct hosts.
1. Step scripts are listed in the file `steps_<role>` in order of execution. (To define failback steps, add the `_failback` suffix to the role.)
1. To run the runbook script menu use the `migration` script:
```shell
bash bin/migration <environment> <role>
```
......@@ -14,7 +14,7 @@ export GITLAB_ENV=$ENVIRONMENT
# shellcheck source=/dev/null
source "${BASE}/source_vars"
source "${BASE}/env_${ENVIRONMENT}" # That is, env_staging or env_production (test also supported)
source "${BASE}/utilities"
source "${BASE}/migration_utilities"
source "${BASE}/steps_${ROLE}"
#Check all steps have a script
......
......@@ -8,5 +8,5 @@ source "${BASE}/env_${ENVIRONMENT}"
echo "Checking repmgr state for host $AZURE_MASTER"
echo
echo "select * from repmgr_gitlab_cluster.repl_nodes" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr
echo
......@@ -14,7 +14,7 @@ do
if [ "$host" != "$AZURE_MASTER" ]
then
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 't'
then
>&2 echo "Host $host is not standby"
......@@ -22,30 +22,30 @@ do
fi
else
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 'f'
then
>&2 echo "Host $host is not master"
all_ok=false
fi
if ! echo "select count(1) from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q '4'
then
>&2 echo "Host $host is not replicated by 4 nodes:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
for slave_host in "${AZURE_SLAVES[@]}" "$GCP_MASTER_CANDIDATE"
do
if ! echo "select client_addr||'-'||state from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming"
then
>&2 echo "Host $host is not correclty replicated by host $slave_host:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
done
......@@ -60,7 +60,7 @@ do
if [ "$host" != "$GCP_MASTER_CANDIDATE" ]
then
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 't'
then
>&2 echo "Host $host is not standby"
......@@ -68,30 +68,30 @@ do
fi
else
if ! echo "select pg_is_in_recovery()" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q 't'
then
>&2 echo "Host $host is not standby"
all_ok=false
fi
if ! echo "select count(1) from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q '3'
then
>&2 echo "Host $host is not replicated by 3 nodes:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
for slave_host in "${GCP_SLAVES[@]}"
do
if ! echo "select client_addr||'-'||state from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \
| grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming"
then
>&2 echo "Host $host is not correclty replicated by host $slave_host:"
echo "select * from pg_stat_replication" \
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres
| >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
all_ok=false
fi
done
......
......@@ -7,6 +7,6 @@ source "${BASE}/env_${ENVIRONMENT}"
echo "Create tombstone database and table if not already existing"
echo "drop database if exists tombstone; create database tombstone" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres
echo "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d tombstone
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d tombstone
......@@ -8,16 +8,16 @@ source "${BASE}/env_${ENVIRONMENT}"
tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${ENVIRONMENT}"
echo "Insert '$tombstone_msg' into tombstone"
echo "insert into tombstone(note) values('${tombstone_msg}') returning *" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d tombstone
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d tombstone
# wait until the change is propagated
while true
do
find_new_msg="$(echo "select created_at from tombstone where note = '$tombstone_msg'" \
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -A -t -d tombstone)"
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -v ON_ERROR_STOP=1 -A -t -d tombstone)"
if [[ -z "${find_new_msg+x}" ]] || [[ "$find_new_msg" == "" ]]
then
gcp_cur_rep_delay="$(echo "select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))" \
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -A -t -d postgres)"
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -v ON_ERROR_STOP=1 -A -t -d postgres)"
echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
sleep 3
else
......
......@@ -6,4 +6,4 @@ set -eu
source "${BASE}/env_${ENVIRONMENT}"
echo "TRUNCATE repmgr_gitlab_cluster.repl_nodes" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr
......@@ -9,7 +9,7 @@ echo "Updating recovery.conf on $AZURE_MASTER"
echo "standby_mode = 'on'
recovery_target_timeline = 'latest'" | \
ssh_remote "$AZURE_MASTER" sudo tee /var/opt/gitlab/postgresql/data/recovery.conf > /dev/null
# Hand ownership of the freshly written recovery.conf to gitlab-psql.
# NOTE: "-v ON_ERROR_STOP=1" is a psql option and must not be passed to chown:
# chown would read -v as "verbose" and ON_ERROR_STOP=1 as a (missing) file,
# fail, and abort this step under `set -e`.
ssh_remote "$AZURE_MASTER" sudo chown gitlab-psql:gitlab-psql /var/opt/gitlab/postgresql/data/recovery.conf
ssh_remote "$AZURE_MASTER" sudo chmod 600 /var/opt/gitlab/postgresql/data/recovery.conf
echo "Restarting postgres on $AZURE_MASTER"
(ssh_remote "$AZURE_MASTER" sudo sv -w 1 stop /opt/gitlab/sv/postgresql \
......
......@@ -6,7 +6,7 @@ set -eu
source "${BASE}/env_${ENVIRONMENT}"
if echo "select pg_is_in_recovery()" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres -t -A \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \
| grep -q 't'
then
echo "$AZURE_MASTER is standby"
......
......@@ -8,11 +8,11 @@ source "${BASE}/env_${ENVIRONMENT}"
azure_master_lsn="$(echo "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres -t -A)"
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A)"
gcp_master_candidate_lsn="$(echo "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;" \
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -d postgres -t -A)"
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A)"
if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]
then
echo "GCP and Azure have same LSN: $azure_master_lsn"
......
......@@ -6,7 +6,7 @@ set -eu
source "${BASE}/env_${ENVIRONMENT}"
if echo "select pg_is_in_recovery()" \
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -d postgres -t -A \
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \
| grep -q 'f'
then
echo "$GCP_MASTER_CANDIDATE is master"
......
......@@ -6,4 +6,4 @@ set -eu
source "${BASE}/env_${ENVIRONMENT}"
echo "Convert postgres on $AZURE_MASTER as standby of $GCP_MASTER_CANDIDATE"
ssh_remote "$AZURE_MASTER" gitlab-ctl repmgr standby follow "$GCP_MASTER_CANDIDATE"
ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-ctl repmgr standby follow "$GCP_MASTER_CANDIDATE"
#!/bin/bash
# Migration step: on every Azure and GCP host, stop the chef-client service
# and move /etc/chef aside to /etc/chef.migration.
# Reads BASE and ENVIRONMENT; AZURE_HOSTS / GCP_HOSTS are presumably provided
# by the sourced env file and ssh_remote by the calling runner - confirm.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"
do
  printf '%s\n' "Stopping chef on $host"
  # Stop the service first, then rename its configuration directory.
  ssh_remote "$host" sudo service chef-client stop
  ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration
done
\ No newline at end of file
#!/bin/bash
# Migration step: stop the consul runit service on the Azure pgbouncers, the
# GCP pgbouncers, the Azure hosts and the GCP hosts, in that order.
# Reads BASE and ENVIRONMENT; the host arrays are presumably provided by the
# sourced env file and ssh_remote by the calling runner - confirm.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# Stop consul on each host passed as an argument, in the order given.
stop_consul() {
  local host
  for host in "$@"
  do
    echo "Stopping consul on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul
  done
}

# One call per group so each array is only expanded when its turn comes.
stop_consul "${AZURE_PGBOUNCERS[@]}"
stop_consul "${GCP_PGBOUNCERS[@]}"
stop_consul "${AZURE_HOSTS[@]}"
stop_consul "${GCP_HOSTS[@]}"
#!/bin/bash
# Migration step: stop the repmgrd runit service on all Azure hosts, then on
# all GCP hosts except the master candidate, and finally on the GCP master
# candidate itself.
# Reads BASE and ENVIRONMENT; the host variables are presumably provided by
# the sourced env file and ssh_remote by the calling runner - confirm.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# Announce and stop repmgrd on the single host given as $1.
stop_repmgrd() {
  echo "Stopping repmgrd on $1"
  ssh_remote "$1" sudo sv stop /opt/gitlab/sv/repmgrd
}

for host in "${AZURE_HOSTS[@]}"
do
  stop_repmgrd "$host"
done

for host in "${GCP_HOSTS[@]}"
do
  # The master candidate is handled last, after every other GCP host.
  if [[ "$GCP_MASTER_CANDIDATE" != "$host" ]]
  then
    stop_repmgrd "$host"
  fi
done

stop_repmgrd "$GCP_MASTER_CANDIDATE"
#!/bin/bash
# Migration step: empty the repmgr_gitlab_cluster.repl_nodes table in the
# gitlab_repmgr database on the GCP master candidate.
# Reads BASE, ENVIRONMENT and GCP_MASTER_CANDIDATE; ssh_remote is presumably
# provided by the calling runner - confirm.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

truncate_sql="TRUNCATE repmgr_gitlab_cluster.repl_nodes"
# ON_ERROR_STOP=1 makes psql exit non-zero if the statement fails.
psql_cmd=(sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr)

printf '%s\n' "$truncate_sql" \
  | ssh_remote "$GCP_MASTER_CANDIDATE" "${psql_cmd[@]}"
#!/bin/bash
# Migration step: on the GCP master candidate, copy recovery.done back to
# recovery.conf in the PostgreSQL data directory and restart PostgreSQL.
# Reads BASE, ENVIRONMENT and GCP_MASTER_CANDIDATE; ssh_remote is not defined
# in this script (presumably supplied by the calling runner - confirm).
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"
echo "Coping recovery.done to recovery.conf on $GCP_MASTER_CANDIDATE"
# The copy runs as gitlab-psql, so the new recovery.conf is created by that user.
# NOTE(review): presumably this puts the node back into standby/recovery mode
# on the next start - confirm against the failback runbook.
ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql cp /var/opt/gitlab/postgresql/data/recovery.done /var/opt/gitlab/postgresql/data/recovery.conf
echo "Restarting postgres on $GCP_MASTER_CANDIDATE"
# Restart sequence:
#   1. try `sv -w 1 stop`;
#   2. only if that fails, send `sv int` and then retry the stop with `-w 60`;
#   3. only if the service was stopped by (1) or (2), run `sv -w 60 start`.
# The whole chain is one &&/|| list, so a failed stop skips the start and the
# list's non-zero status becomes the script's exit status.
(ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 1 stop /opt/gitlab/sv/postgresql \
|| (ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv int /opt/gitlab/sv/postgresql \
&& ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 60 stop /opt/gitlab/sv/postgresql)) \
&& ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 60 start /opt/gitlab/sv/postgresql
#!/bin/bash
# Migration check: succeed (exit 0) when pg_is_in_recovery() on the GCP master
# candidate prints output containing 't', otherwise report on stderr and exit 1.
# Reads BASE, ENVIRONMENT and GCP_MASTER_CANDIDATE; ssh_remote is presumably
# provided by the calling runner - confirm.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# -t -A makes psql print just the bare value, which grep then inspects.
if ! echo "select pg_is_in_recovery()" \
  | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \
  | grep -q 't'
then
  >&2 echo "$GCP_MASTER_CANDIDATE is NOT standby"
  exit 1
fi

echo "$GCP_MASTER_CANDIDATE is standby"
exit 0
#!/bin/bash
# Migration check: compare the current WAL position (LSN) of the Azure master
# with that of the GCP master candidate.
# Exits 0 when both LSNs are non-empty and equal, 1 otherwise.
# Reads BASE, ENVIRONMENT, AZURE_MASTER and GCP_MASTER_CANDIDATE; ssh_remote is
# presumably provided by the calling runner - confirm.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# Replay location on a standby, current write location on a primary.
lsn_query="select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;"

# Print the LSN reported by the PostgreSQL instance on host $1.
current_lsn() {
  echo "$lsn_query" \
    | ssh_remote "$1" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A
}

azure_master_lsn="$(current_lsn "$AZURE_MASTER")"
gcp_master_candidate_lsn="$(current_lsn "$GCP_MASTER_CANDIDATE")"

# Two empty answers would compare equal and be reported as "same LSN";
# treat a missing LSN on either side as a failed check instead.
if [[ -z "$azure_master_lsn" || -z "$gcp_master_candidate_lsn" ]]
then
  >&2 echo "Could not read LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
  exit 1
fi

if [[ "$azure_master_lsn" != "$gcp_master_candidate_lsn" ]]
then
  echo "GCP and Azure have NOT same LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
  exit 1
fi

echo "GCP and Azure have same LSN: $azure_master_lsn"
exit 0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment