Commit 3eb7c748 authored by Matteo Melli's avatar Matteo Melli

Debug and fixes of pre-flight and failover scripts

parent 7fa82c21
Pipeline #88800 failed with stage
in 15 seconds
......@@ -389,12 +389,13 @@ of errors while it is being promoted.
* Disable automatic failover on all nodes. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/042-disable-automatic-failover.sh`
* Reset automatic failover state. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/043-reset-automatic-failover-state.sh`
1. [ ] 🐘 {+ Database-Wrangler +}: Convert the current master (Azure) to a standby.
* Convert the current master (Azure) to a standby pointing to the candidate master on GCP. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/050-convert-azure-master-to-standby.sh`
* Check the database is now read-only `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh`
* Wait for the GCP master candidate and previous Azure master (now standby) to have same LSN `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh`
* Convert the current master (Azure) to a headless standby (a standby that does not replicate from anything). `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/050-convert-azure-master-to-headless-standby.sh`
* Check the database is now read-only. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh`
* Wait for the GCP master candidate and previous Azure master (now standby) to have same LSN. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh`
1. [ ] 🐘 {+ Database-Wrangler +}: Perform regular switchover to the main replica on GCP
* Perform GCP candidate promote. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/060-perform-gcp-candidate-master-promote.sh`
* Check the database is now read-write. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh`
* Convert the old master (Azure) to a standby pointing to candidate master on GCP. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/062-convert-azure-master-to-gcp-standby.sh`
1. [ ] 🐘 {+ Database-Wrangler +}: Start repmgrd and consul agents on GCP
* Enable automatic failover on GCP. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/070-enable-automatic-failover-on-gcp-only.sh`
* Check repmgr master on GCP. `/opt/gitlab-migration/bin/scripts/02_failover/060_go/p04/071-check-repmgr-master.sh`
......
......@@ -5,10 +5,8 @@ set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"
for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"
do
echo "Checking repmgr state for host $host"
echo
ssh_remote "$host" sudo -u gitlab-psql gitlab-ctl repmgr cluster show
echo
done
echo "Checking repmgr state for host $AZURE_MASTER"
echo
echo "select * from repmgr_gitlab_cluster.repl_nodes" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr
echo
......@@ -41,7 +41,7 @@ do
do
if ! echo "select client_addr||'-'||state from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| grep -q "$(host "$slave_host"|cut -d ' ' -f 4)-streaming"
| grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming"
then
>&2 echo "Host $host is not correclty replicated by host $slave_host:"
echo "select * from pg_stat_replication" \
......@@ -86,8 +86,8 @@ do
for slave_host in "${GCP_SLAVES[@]}"
do
if ! echo "select client_addr||'-'||state from pg_stat_replication" \
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -A -t \
| grep -q "$(host "$slave_host"|cut -d ' ' -f 4)-streaming"
| ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \
| grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming"
then
>&2 echo "Host $host is not correclty replicated by host $slave_host:"
echo "select * from pg_stat_replication" \
......
#!/bin/bash
# Turn the current Azure master into a "headless" standby: the recovery.conf
# written here enables standby mode but names no upstream, so the node stops
# accepting writes without replicating from anything.
# Requires BASE and ENVIRONMENT in the environment; the sourced file is
# expected to provide AZURE_MASTER and ssh_remote.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

recovery_conf=/var/opt/gitlab/postgresql/data/recovery.conf
pg_service=/opt/gitlab/sv/postgresql

echo "Updating recovery.conf on $AZURE_MASTER"
# Quoted delimiter: the two settings are shipped verbatim.
ssh_remote "$AZURE_MASTER" sudo tee "$recovery_conf" > /dev/null <<'EOF'
standby_mode = 'on'
recovery_target_timeline = 'latest'
EOF
ssh_remote "$AZURE_MASTER" sudo chown gitlab-psql:gitlab-psql "$recovery_conf"
ssh_remote "$AZURE_MASTER" sudo chmod 600 "$recovery_conf"

echo "Restarting postgres on $AZURE_MASTER"
# Try a quick stop first; if that does not finish within a second, send an
# interrupt and allow up to 60s. Only start again once the stop succeeded.
{ ssh_remote "$AZURE_MASTER" sudo sv -w 1 stop "$pg_service" \
    || { ssh_remote "$AZURE_MASTER" sudo sv int "$pg_service" \
      && ssh_remote "$AZURE_MASTER" sudo sv -w 60 stop "$pg_service"; }; } \
  && ssh_remote "$AZURE_MASTER" sudo sv -w 60 start "$pg_service"
#!/bin/bash
# Write a recovery.conf on the current Azure master that makes it a standby of
# the GCP master candidate, then stop PostgreSQL on that node.
# Requires BASE and ENVIRONMENT in the environment; the sourced file is
# expected to provide AZURE_MASTER, GCP_MASTER_CANDIDATE,
# GITLAB_REPMGR_PASSWORD and ssh_remote.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"

# Data directory, file owner and runit service directory are aligned with the
# values the sibling failover scripts use for the same node
# (/var/opt/gitlab/postgresql/data, gitlab-psql, /opt/gitlab/sv/postgresql).
recovery_conf=/var/opt/gitlab/postgresql/data/recovery.conf
pg_service=/opt/gitlab/sv/postgresql

echo "standby_mode = 'on'
primary_conninfo = 'user=gitlab_repmgr host=''$GCP_MASTER_CANDIDATE'' password=$GITLAB_REPMGR_PASSWORD port=5432 fallback_application_name=repmgr sslmode=prefer sslcompression=1 application_name=''$AZURE_MASTER'''
primary_slot_name = secondary_azureprd
restore_command = '/usr/bin/envdir /etc/wal-e.d/env /opt/wal-e/bin/wal-e wal-fetch -p 32 \"%f\" \"%p\"'
recovery_target_timeline = 'latest'" | \
  ssh_remote "$AZURE_MASTER" sudo tee "$recovery_conf" > /dev/null
ssh_remote "$AZURE_MASTER" sudo chown gitlab-psql:gitlab-psql "$recovery_conf"
# The file embeds the replication password, so keep it owner-only.
ssh_remote "$AZURE_MASTER" sudo chmod 600 "$recovery_conf"

# sv's wait option is lowercase -w; -W is not an sv option.
# NOTE(review): PostgreSQL is left stopped here, whereas the headless-standby
# variant starts it again after the stop - confirm which is intended.
ssh_remote "$AZURE_MASTER" sudo sv -w 1 stop "$pg_service" \
  || (ssh_remote "$AZURE_MASTER" sudo sv int "$pg_service" \
    && ssh_remote "$AZURE_MASTER" sudo sv -w 60 stop "$pg_service")
......@@ -5,23 +5,18 @@ set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"
# shellcheck disable=SC2030,SC2031,SC2036,SC2116
azure_master_lsn="$(echo "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;")" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres -t -A
# shellcheck disable=SC2030,SC2031,SC2036,SC2116
else pg_current_xlog_location() end;" \
| ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres -t -A)"
gcp_master_candidate_lsn="$(echo "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;" \
| ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -d postgres -t -A)"
# shellcheck disable=SC2030,SC2031
if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]
then
# shellcheck disable=SC2030,SC2031
echo "GCP and Azure have same LSN: $azure_master_lsn"
exit 0
fi
# shellcheck disable=SC2030,SC2031
echo "GCP and Azure have NOT same LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
exit 1
......@@ -7,4 +7,4 @@ source "${BASE}/env_${ENVIRONMENT}"
# WARNING WARNING WARNING here switchover happens!
ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl \
promote -D /var/lib/opt/gitlab/postgresql/data
promote -D /var/opt/gitlab/postgresql/data
#!/bin/bash
# Make the former Azure master follow the GCP master candidate by running
# "gitlab-ctl repmgr standby follow" on the Azure node.
# Requires BASE and ENVIRONMENT in the environment; the sourced file is
# expected to provide AZURE_MASTER, GCP_MASTER_CANDIDATE and ssh_remote.
set -eu
# shellcheck source=/dev/null
source "${BASE}/env_${ENVIRONMENT}"
# The whole message is one quoted string; a stray quote here previously left
# an unterminated string that swallowed the next command.
echo "Convert postgres on $AZURE_MASTER as standby of $GCP_MASTER_CANDIDATE"
# NOTE(review): other gitlab-ctl repmgr calls in this migration run through
# sudo - confirm whether this one needs it as well.
ssh_remote "$AZURE_MASTER" gitlab-ctl repmgr standby follow "$GCP_MASTER_CANDIDATE"
......@@ -6,7 +6,7 @@ set -eu
source "${BASE}/env_${ENVIRONMENT}"
echo "Register $GCP_MASTER_CANDIDATE as master with repmgr"
ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-ctl repmgr register master
ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-ctl repmgr master register
for host in "${GCP_HOSTS[@]}"
do
if [ "$GCP_MASTER_CANDIDATE" == "$host" ]
......@@ -14,7 +14,7 @@ do
continue;
fi
echo "Register $host as standby with repmgr"
ssh_remote "$host" sudo gitlab-ctl repmgr register standby
ssh_remote "$host" sudo gitlab-ctl repmgr standby register
done
echo "Starting repmgrd on $GCP_MASTER_CANDIDATE"
......
......@@ -6,8 +6,7 @@ set -eu
source "${BASE}/env_${ENVIRONMENT}"
# chef
for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"; do
echo "Starting chef-client on $host"
ssh_remote "$host" sudo mv /etc/chef.migration /etc/chef
ssh_remote "$host" sudo service chef-client start
for host in "${GCP_HOSTS[@]}"; do
echo "Starting consul agent on $host"
ssh_remote "$host" sudo sv start /opt/gitlab/sv/consul
done
......@@ -8,11 +8,12 @@ file:scripts/02_failover/060_go/p04/040-disable-chef.sh
file:scripts/02_failover/060_go/p04/041-disable-consul.sh
file:scripts/02_failover/060_go/p04/042-disable-automatic-failover.sh
file:scripts/02_failover/060_go/p04/043-reset-automatic-failover-state.sh
file:scripts/02_failover/060_go/p04/050-convert-azure-master-to-standby.sh
file:scripts/02_failover/060_go/p04/050-convert-azure-master-to-headless-standby.sh
file:scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh
file:scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh
file:scripts/02_failover/060_go/p04/060-perform-gcp-candidate-master-promote.sh
file:scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh
file:scripts/02_failover/060_go/p04/062-convert-azure-master-to-gcp-standby.sh
file:scripts/02_failover/060_go/p04/070-enable-automatic-failover-on-gcp-only.sh
file:scripts/02_failover/060_go/p04/071-check-repmgr-master.sh
file:scripts/02_failover/060_go/p04/072-enable-consul-on-gcp-only.sh
......@@ -21,238 +22,17 @@ function:restore-could-not-change-directory-to-message
)
# Add the "other" execute bit to $HOME of the login user on every Azure and
# GCP database host (going by the function name, this silences the
# "could not change directory to" message of commands run as another user).
# Globals:   AZURE_HOSTS, GCP_HOSTS (arrays, read)
# Returns:   status of the last ssh_remote call
function get-rid-of-could-not-change-directory-to-message(){
  # Expand both arrays completely; a bare "$AZURE_HOSTS" is only element 0.
  for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"
  do
    echo "$host: chmod o+x \$HOME"
    # The command is wrapped in an extra pair of double quotes so that it
    # reaches "bash -c" as a single argument.
    # NOTE(review): presumably ssh_remote forwards its arguments through ssh,
    # where the remote shell re-parses them - confirm.
    ssh_remote "$host" bash -c '"chmod o+x \"$HOME\""'
  done
}
# (Re)create the "tombstone" database on the Azure master and make sure it
# contains the tombstone table used to probe replication.
# Globals:   AZURE_MASTER (read)
# Returns:   status of the last ssh_remote call
function create-tombstone-table(){
  echo "Create tombstone database and table if not already existing"
  # DROP DATABASE and CREATE DATABASE are rejected by the server when they
  # arrive together in one multi-command -c string, so each gets its own call.
  # NOTE(review): the message above says "if not already existing" but the
  # database is dropped first - confirm which behavior is wanted.
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -c "drop database if exists tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -c "create database tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone \
    -c "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)"
}
# Insert a uniquely named row into the tombstone table on the Azure master and
# poll the GCP master candidate every 3 seconds until that row is visible
# there, printing the candidate's replay delay while waiting.
# Globals:   AZURE_MASTER, GCP_MASTER_CANDIDATE, ENVIRONMENT (read);
#            tombstone_msg, find_new_msg, gcp_cur_rep_delay (written)
# Returns:   0 once the row has arrived; loops until then (no timeout)
function check-gcp-replication-delay(){
  tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${ENVIRONMENT}"
  echo "Insert '$tombstone_msg' into tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone -c "insert into tombstone(note) values('${tombstone_msg}') returning *"
  # wait until the change is propagated
  while true
  do
    find_new_msg="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -Atd tombstone -c "select created_at from tombstone where note = '$tombstone_msg'")"
    if [[ -z "$find_new_msg" ]]
    then
      # The backslash keeps this a single remote command; without it the
      # newline splits it and "sudo gitlab-psql" runs on the local machine.
      gcp_cur_rep_delay="$(ssh_remote "$GCP_MASTER_CANDIDATE" \
        sudo gitlab-psql -Atd postgres -c "select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))")"
      echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
      sleep 3
    else
      echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
      break
    fi
  done
}
# Stop chef-client on every Azure and GCP database host and move /etc/chef
# aside to /etc/chef.migration.
# Globals:   AZURE_HOSTS, GCP_HOSTS (arrays, read)
function disable-chef(){
  local -a chef_nodes=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")
  for host in "${chef_nodes[@]}"
  do
    printf '%s\n' "Stopping chef on $host"
    ssh_remote "$host" sudo service chef-client stop
    ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration
  done
}
# Stop the consul service on every node, in this order: Azure pgbouncers,
# GCP pgbouncers, Azure database hosts, GCP database hosts.
# Globals:   AZURE_PGBOUNCERS, GCP_PGBOUNCERS, AZURE_HOSTS, GCP_HOSTS
#            (arrays, read)
function disable-consul(){
  local -a consul_nodes=(
    "${AZURE_PGBOUNCERS[@]}"
    "${GCP_PGBOUNCERS[@]}"
    "${AZURE_HOSTS[@]}"
    "${GCP_HOSTS[@]}"
  )
  for host in "${consul_nodes[@]}"
  do
    echo "Stopping consul on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul
  done
}
# Stop repmgrd everywhere so no automatic failover can happen: first on the
# GCP hosts, then on the Azure hosts other than the master, and on the Azure
# master last.
# Globals:   GCP_HOSTS, AZURE_HOSTS (arrays, read); AZURE_MASTER (read)
function disable-automatic-failover(){
  # Phase 1: all GCP nodes.
  for host in "${GCP_HOSTS[@]}"; do
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done

  # Phase 2: Azure nodes, leaving the master for the end.
  for host in "${AZURE_HOSTS[@]}"; do
    if [ "$host" != "$AZURE_MASTER" ]; then
      echo "Stopping repmgrd on $host"
      ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
    fi
  done

  # Phase 3: the Azure master itself.
  echo "Stopping repmgrd on $AZURE_MASTER"
  ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
# Empty the repmgr node table (repmgr_gitlab_cluster.repl_nodes) in the
# gitlab_repmgr database on the Azure master.
# Globals:   AZURE_MASTER (read)
function reset-automatic-failover-state(){
  local truncate_sql="TRUNCATE repmgr_gitlab_cluster.repl_nodes"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql \
    -d gitlab_repmgr -c "$truncate_sql"
}
# Write a recovery.conf on the Azure master that makes it a standby of the GCP
# master candidate, then stop PostgreSQL on that node.
# Globals:   AZURE_MASTER, GCP_MASTER_CANDIDATE, GITLAB_REPMGR_PASSWORD (read)
# Returns:   status of the stop sequence
function convert-azure-master-to-standby(){
  # Data directory, file owner and runit service directory match the values
  # the standalone failover scripts use for the same node.
  local recovery_conf=/var/opt/gitlab/postgresql/data/recovery.conf
  local pg_service=/opt/gitlab/sv/postgresql

  # The double quotes around %f and %p are escaped so they survive into the
  # file; unescaped they would close the surrounding string and disappear.
  echo "standby_mode = 'on'
primary_conninfo = 'user=gitlab_repmgr host=''$GCP_MASTER_CANDIDATE'' password=$GITLAB_REPMGR_PASSWORD port=5432 fallback_application_name=repmgr sslmode=prefer sslcompression=1 application_name=''$AZURE_MASTER'''
primary_slot_name = secondary_azureprd
restore_command = '/usr/bin/envdir /etc/wal-e.d/env /opt/wal-e/bin/wal-e wal-fetch -p 32 \"%f\" \"%p\"'
recovery_target_timeline = 'latest'" | \
    ssh_remote "$AZURE_MASTER" sudo tee "$recovery_conf" > /dev/null
  ssh_remote "$AZURE_MASTER" sudo chown gitlab-psql:gitlab-psql "$recovery_conf"
  # The file embeds the replication password, so keep it owner-only.
  ssh_remote "$AZURE_MASTER" sudo chmod 600 "$recovery_conf"

  # sv's wait option is lowercase -w; -W is not an sv option.
  # NOTE(review): PostgreSQL is left stopped here, whereas the headless-standby
  # script starts it again after the stop - confirm which is intended.
  ssh_remote "$AZURE_MASTER" sudo sv -w 1 stop "$pg_service" \
    || (ssh_remote "$AZURE_MASTER" sudo sv int "$pg_service" \
      && ssh_remote "$AZURE_MASTER" sudo sv -w 60 stop "$pg_service")
}
# Ask PostgreSQL on the Azure master whether it is in recovery.
# Globals:   AZURE_MASTER (read)
# Outputs:   confirmation on stdout, or a failure message on stderr
# Returns:   0 when the query output contains 't', 1 otherwise
function check-azure-master-is-standby(){
  if ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -t -A -c "select pg_is_in_recovery()" | grep -q 't'
  then
    echo "$AZURE_MASTER is standby"
    return 0
  else
    # The failure branch must say the opposite of the success branch.
    >&2 echo "$AZURE_MASTER is not standby"
    return 1
  fi
}
# Compare the current WAL position of the Azure master with that of the GCP
# master candidate (replay location when in recovery, current location
# otherwise).
# Globals:   AZURE_MASTER, GCP_MASTER_CANDIDATE (read);
#            azure_master_lsn, gcp_master_candidate_lsn (written)
# Returns:   0 when both LSNs are non-empty and equal, 1 otherwise
function check-gcp-nodes-has-same-azure-lsn(){
  azure_master_lsn="$(ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -t -A -c "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;")"
  gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \
    -t -A -c "select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;")"
  # Two failed queries would both yield "" and compare equal; refuse that.
  if [ -z "$azure_master_lsn" ] || [ -z "$gcp_master_candidate_lsn" ]
  then
    >&2 echo "Could not read LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    return 1
  fi
  if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]
  then
    echo "GCP and Azure have same LSN: $azure_master_lsn"
    return 0
  fi
  echo "GCP and Azure have NOT same LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
  return 1
}
# Promote the GCP master candidate to master with pg_ctl promote.
# Globals:   GCP_MASTER_CANDIDATE (read)
# Returns:   status of the remote pg_ctl call
function perform-gcp-candidate-master-promote(){
  # WARNING WARNING WARNING here switchover happens!
  # The data directory is /var/opt/gitlab/postgresql/data, the same path the
  # recovery.conf of the headless-standby script is written to.
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl \
    promote -D /var/opt/gitlab/postgresql/data
}
# Ask PostgreSQL on the GCP master candidate whether it has left recovery.
# Globals:   GCP_MASTER_CANDIDATE (read)
# Outputs:   confirmation on stdout, or a failure message on stderr
# Returns:   0 when the query output contains 'f', 1 otherwise
function check-gcp-candidate-master-is-master(){
  if ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \
    -t -A -c "select pg_is_in_recovery()" | grep -q 'f'
  then
    echo "$GCP_MASTER_CANDIDATE is master"
    return 0
  else
    # The failure branch must say the opposite of the success branch.
    >&2 echo "$GCP_MASTER_CANDIDATE is not master"
    return 1
  fi
}
# Register the GCP nodes with repmgr (candidate as master, the rest as
# standbys) and then start repmgrd on them, candidate first.
# Globals:   GCP_MASTER_CANDIDATE (read); GCP_HOSTS (array, read)
function enable-automatic-failover-on-gcp-only(){
  echo "Register $GCP_MASTER_CANDIDATE as master with repmgr"
  # gitlab-ctl repmgr takes the node role before the action
  # ("master register" / "standby register"), not the other way round.
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-ctl repmgr master register
  for host in "${GCP_HOSTS[@]}"
  do
    if [ "$GCP_MASTER_CANDIDATE" == "$host" ]
    then
      continue;
    fi
    echo "Register $host as standby with repmgr"
    ssh_remote "$host" sudo gitlab-ctl repmgr standby register
  done
  echo "Starting repmgrd on $GCP_MASTER_CANDIDATE"
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
  for host in "${GCP_HOSTS[@]}"
  do
    if [ "$GCP_MASTER_CANDIDATE" == "$host" ]
    then
      continue;
    fi
    echo "Starting repmgrd on $host"
    ssh_remote "$host" sudo sv start /opt/gitlab/sv/repmgrd
  done
}
# Run "gitlab-ctl repmgr-check-master" on the GCP master candidate (its stderr
# is discarded) and report the outcome.
# Globals:   GCP_MASTER_CANDIDATE (read)
# Returns:   0 when the remote check succeeds, 1 otherwise
function check-repmgr-master(){
  echo "Checking state of $GCP_MASTER_CANDIDATE"
  # Guard clause: bail out on the failing case first.
  if ! ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null
  then
    >&2 echo "$GCP_MASTER_CANDIDATE is not repmgr master"
    return 1
  fi
  echo "$GCP_MASTER_CANDIDATE is repmgr master"
  return 0
}
# For each GCP pgbouncer, show its databases and servers through the pgbouncer
# console and wait for a single keypress; anything but "y" aborts.
# Globals:   GCP_PGBOUNCERS (array, read); key (written)
# Returns:   1 as soon as a key other than "y" is read, 0 otherwise
function check-pgbouncer-node-in-gcp() {
  for host in "${GCP_PGBOUNCERS[@]}"; do
    echo "Check pgbouncer on $host"
    ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW DATABASES"
    ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW SERVERS"
    read -s -N 1 -p "Press [y] to continue, any other key to abort." key
    [ "$key" == "y" ] || return 1
  done
}
# Start the consul agent on the GCP database hosts and then on the GCP
# pgbouncers.
# Globals:   GCP_HOSTS, GCP_PGBOUNCERS (arrays, read)
function enable-consul-on-gcp-only(){
  local -a consul_nodes=("${GCP_HOSTS[@]}" "${GCP_PGBOUNCERS[@]}")
  for host in "${consul_nodes[@]}"; do
    echo "Starting consul agent on $host"
    ssh_remote "$host" sudo sv start /opt/gitlab/sv/consul
  done
}
# Move /etc/chef.migration back to /etc/chef and start chef-client on every
# Azure and GCP database host, then add the "other" execute bit to $HOME there.
# Globals:   AZURE_HOSTS, GCP_HOSTS (arrays, read)
# NOTE(review): despite the "gcp-only" name the loop covers the Azure hosts
# too, and the chmod step looks unrelated to chef - confirm both are intended.
function enable-chef-on-gcp-only(){
  local -a chef_nodes=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")
  for host in "${chef_nodes[@]}"
  do
    echo "Starting chef-client on $host"
    ssh_remote "$host" sudo mv /etc/chef.migration /etc/chef
    ssh_remote "$host" sudo service chef-client start
    echo "$host: chmod o+x \$HOME"
    ssh_remote "$host" bash -c '"chmod o+x \"$HOME\""'
  done
}
# Remove the "other" execute bit from $HOME of the login user on every Azure
# and GCP database host, undoing the matching "chmod o+x" step.
# Globals:   AZURE_HOSTS, GCP_HOSTS (arrays, read)
# Returns:   status of the last ssh_remote call
function restore-could-not-change-directory-to-message(){
  # Expand both arrays completely; a bare "$AZURE_HOSTS" is only element 0.
  for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"
  do
    echo "$host: chmod o-x \$HOME"
    # Issued once per host; the command is wrapped in an extra pair of double
    # quotes so that it reaches "bash -c" as a single argument.
    # NOTE(review): presumably ssh_remote forwards its arguments through ssh,
    # where the remote shell re-parses them - confirm.
    ssh_remote "$host" bash -c '"chmod o-x \"$HOME\""'
  done
}
......@@ -10,6 +10,7 @@ function do_step(){
else
echo
>&2 echo "!!!!!!!!!!!!!!!!!!!!! Step $@ Failed !!!!!!!!!!!!!!!!!!!!!"
return 1
fi
}
......@@ -139,6 +140,7 @@ function do_menu(){
if [ ! -z "$step" ]
then
script_type="$(step_type "$step")"
script="$(step_script "$step")"
echo "Step $step will be executed ($script_type $script):"
echo
if [ "$script_type" == "function" ]
......@@ -153,8 +155,12 @@ function do_menu(){
echo
if [ "$key" == "y" ]
then
next_step="$(next_step "$step")"
do_step "$script" > >(prepend_timestamp | tee -a migration.log) 2> >(prepend_timestamp | tee -a migration.log >&2)
if do_step "$script" > >(prepend_timestamp | tee -a migration.log) 2> >(prepend_timestamp | tee -a migration.log >&2)
then
next_step="$(next_step "$step")"
else
next_step="$(step_3digit_number "$step")"
fi
# wait for the redirect subshells to complete
sleep 1
fi
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment