Commit e51141c1 authored by Matteo Melli
Browse files

Standby to point to GCP candidate, fixes and more checks

parent 9eefd5dd
Pipeline #88512 passed with stage
in 25 seconds
......@@ -5,15 +5,18 @@ export steps=(
003_disable-chef
004_disable-consul
005_disable-automatic-failover
006_convert-azure-master-to-standby
007_check-gcp-nodes-has-same-azure-lsn
008_perform-gcp-candidate-master-promote
009_check-gcp-candidate-master-is-master
010_enable-automatic-failover-on-gcp-only
011_check-repmgr-master
012_enable-consul-on-gcp-only
013_enable-chef-on-gcp-only
014_restore-could-not-change-directory-to-message
006_reset-automatic-failover-state
007_convert-azure-master-to-standby
008_check-azure-master-is-standby
009_check-gcp-nodes-has-same-azure-lsn
010_perform-gcp-candidate-master-promote
011_check-gcp-candidate-master-is-master
012_enable-automatic-failover-on-gcp-only
013_check-repmgr-master
014_check-pgbouncer-node-in-gcp
015_enable-chef-on-gcp-only
016_enable-consul-on-gcp-only
017_restore-could-not-change-directory-to-message
)
function get-rid-of-could-not-change-directory-to-message(){
......@@ -39,7 +42,7 @@ function check-gcp-replication-delay(){
while true
do
find_new_msg="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -Atd tombstone -c "select created_at from tombstone where note = '$tombstone_msg'")"
if [[ -z ${find_new_msg+x} ]] || [[ "$find_new_msg" == "" ]]
if [[ -z "${find_new_msg+x}" ]] || [[ "$find_new_msg" == "" ]]
then
gcp_cur_rep_delay="$(ssh_remote "$GCP_MASTER_CANDIDATE"
sudo gitlab-psql -Atd postgres -c "select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))")"
......@@ -100,12 +103,20 @@ function disable-automatic-failover(){
echo "Stopping repmgrd on $host"
ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
done
echo "Stopping consul on $AZURE_MASTER"
echo "Stopping repmgrd on $AZURE_MASTER"
ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
# Clears repmgr's node table: issues
# "TRUNCATE repmgr_gitlab_cluster.repl_nodes" against the gitlab_repmgr
# database on $AZURE_MASTER, running gitlab-psql as the gitlab-psql user.
# Globals:  AZURE_MASTER (read)
# Returns:  the exit status of ssh_remote (defined outside this block).
function reset-automatic-failover-state(){
  local -r truncate_sql="TRUNCATE repmgr_gitlab_cluster.repl_nodes"
  local -a psql_cmd=(sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr -c "$truncate_sql")

  ssh_remote "$AZURE_MASTER" "${psql_cmd[@]}"
}
function convert-azure-master-to-standby(){
echo "standby_mode = 'on'
primary_conninfo = 'user=gitlab_repmgr host=''$GCP_MASTER_CANDIDATE'' password=$GITLAB_REPMGR_PASSWORD port=5432 fallback_application_name=repmgr sslmode=prefer sslcompression=1 application_name=''$AZURE_MASTER'''
primary_slot_name = secondary_azureprd
restore_command = '/usr/bin/envdir /etc/wal-e.d/env /opt/wal-e/bin/wal-e wal-fetch -p 32 "%f" "%p"'
recovery_target_timeline = 'latest'" | \
ssh_remote "$AZURE_MASTER" sudo tee /var/lib/opt/gitlab/postgresql/data/recovery.conf
ssh_remote "$AZURE_MASTER" sudo chown postgres:postgres /var/lib/opt/gitlab/postgresql/data/recovery.conf
......@@ -115,28 +126,38 @@ recovery_target_timeline = 'latest'" | \
&& ssh_remote "$AZURE_MASTER" sudo sv -W 60 stop /opt/gitlab/sv/postgres)
}
# Checks that PostgreSQL on $AZURE_MASTER is in recovery (i.e. a standby)
# by running "select pg_is_in_recovery()" there and looking for 't' in the
# unaligned, tuples-only output.
# Globals:  AZURE_MASTER (read)
# Outputs:  "<host> is standby" on stdout when in recovery,
#           "<host> is not standby" on stderr otherwise.
# Returns:  0 when in recovery, 1 otherwise (including when the remote
#           query produced no matching output).
function check-azure-master-is-standby(){
  if ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -t -A -c "select pg_is_in_recovery()" | grep -q 't'
  then
    echo "$AZURE_MASTER is standby"
    return 0
  fi

  # The failure message previously read "is standby", contradicting the
  # non-zero return code and misleading the operator.
  >&2 echo "$AZURE_MASTER is not standby"
  return 1
}
# Compares the WAL position (LSN) of $AZURE_MASTER with that of
# $GCP_MASTER_CANDIDATE, polling every 3 seconds until they match.
#
# Each host is asked for pg_last_xlog_replay_location() when it is in
# recovery and pg_current_xlog_location() otherwise.
#
# Globals:
#   AZURE_MASTER, GCP_MASTER_CANDIDATE (read)
#   LSN_CHECK_MAX_ATTEMPTS (read, optional) - maximum number of comparisons
#     before giving up; unset, 0 or non-numeric means retry forever, which
#     is the default behaviour.
#   azure_master_lsn, gcp_master_candidate_lsn (written) - intentionally
#     left global, as before; other steps may read them (TODO confirm).
# Outputs:  progress on stdout; the final mismatch message on stderr.
# Returns:  0 once both LSNs are equal and non-empty,
#           1 when LSN_CHECK_MAX_ATTEMPTS comparisons all failed.
function check-gcp-nodes-has-same-azure-lsn(){
  local max_attempts="${LSN_CHECK_MAX_ATTEMPTS:-0}"
  local attempt=0
  local -r lsn_query="select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;"

  # Anything that is not a plain non-negative integer means "no limit".
  case "$max_attempts" in
    ''|*[!0-9]*) max_attempts=0 ;;
  esac

  while true
  do
    azure_master_lsn="$(ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
      -t -A -c "$lsn_query")"
    gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \
      -t -A -c "$lsn_query")"

    # An empty LSN means the remote query produced nothing; two empty
    # strings must never be mistaken for "in sync".
    if [ -n "$azure_master_lsn" ] && [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]
    then
      echo "GCP and Azure have same LSN: $azure_master_lsn"
      return 0
    fi

    attempt=$((attempt + 1))
    if [ "$max_attempts" -gt 0 ] && [ "$attempt" -ge "$max_attempts" ]
    then
      break
    fi

    echo "GCP and Azure have different LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    sleep 3
  done

  # Only reached when an attempt limit was configured and exhausted.
  >&2 echo "GCP and Azure have NOT same LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
  return 1
}
# Runs `pg_ctl promote` for the PostgreSQL data directory on
# $GCP_MASTER_CANDIDATE, as the gitlab-psql user.
# Globals:  GCP_MASTER_CANDIDATE (read)
# Returns:  the exit status of ssh_remote (defined outside this block).
function perform-gcp-candidate-master-promote(){
  local -r pg_ctl_bin=/opt/gitlab/embedded/bin/pg_ctl
  local -r pg_data_dir=/var/lib/opt/gitlab/postgresql/data

  # WARNING: the switchover happens right here!
  ssh_remote "$GCP_MASTER_CANDIDATE" \
    sudo -u gitlab-psql "$pg_ctl_bin" promote -D "$pg_data_dir"
}
......@@ -167,7 +188,7 @@ function enable-automatic-failover-on-gcp-only(){
done
echo "Starting repmgrd on $GCP_MASTER_CANDIDATE"
ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv stop /opt/gitlab/sv/repmgrd
ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
for host in "${GCP_HOSTS[@]}"
do
if [ "$GCP_MASTER_CANDIDATE" == "$host" ]
......@@ -179,6 +200,32 @@ function enable-automatic-failover-on-gcp-only(){
done
}
# Asks `gitlab-ctl repmgr-check-master` (run as gitlab-consul on
# $GCP_MASTER_CANDIDATE) whether that node is the repmgr master.
# Globals:  GCP_MASTER_CANDIDATE (read)
# Outputs:  a progress line and the positive verdict on stdout; the
#           negative verdict on stderr. stderr of the remote command is
#           discarded.
# Returns:  0 when the remote check succeeds, 1 otherwise.
function check-repmgr-master(){
  echo "Checking state of $GCP_MASTER_CANDIDATE"

  # Guard clause: bail out as soon as the remote check reports failure.
  if ! ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null
  then
    >&2 echo "$GCP_MASTER_CANDIDATE is not repmgr master"
    return 1
  fi

  echo "$GCP_MASTER_CANDIDATE is repmgr master"
  return 0
}
# Shows the pgbouncer console output of "SHOW DATABASES" and "SHOW SERVERS"
# for every host in GCP_PGBOUNCERS and asks the operator to confirm each
# host with a single keypress.
# Globals:  GCP_PGBOUNCERS (read, array of hosts)
# Inputs:   one character per host from stdin; only "y" continues.
# Returns:  0 when every host was confirmed (or the list is empty),
#           1 as soon as any other key is pressed.
function check-pgbouncer-node-in-gcp() {
  # Keep the loop variable and the keypress out of the caller's scope.
  local host key

  for host in "${GCP_PGBOUNCERS[@]}"
  do
    echo "Check pgbouncer on $host"
    ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW DATABASES"
    ssh_remote "$host" gitlab-ctl pgb-console -c "SHOW SERVERS"

    key=""
    # -r: take the keypress literally instead of treating a backslash as
    # an escape character.
    read -r -s -N 1 -p "Press [y] to continue, any other key to abort." key
    if [ "$key" != "y" ]
    then
      return 1
    fi
  done

  return 0
}
function enable-consul-on-gcp-only(){
for host in "${GCP_HOSTS[@]}"
do
......@@ -202,18 +249,6 @@ function enable-chef-on-gcp-only(){
done
}
# Asks `gitlab-ctl repmgr-check-master` (run as gitlab-consul on
# $GCP_MASTER_CANDIDATE) whether that node is the repmgr master.
# Globals:  GCP_MASTER_CANDIDATE (read)
# Outputs:  a progress line and the positive verdict on stdout; the
#           negative verdict on stderr. stderr of the remote command is
#           discarded.
# Returns:  0 when the remote check succeeds, 1 otherwise.
function check-repmgr-master(){
  echo "Checking state of $GCP_MASTER_CANDIDATE"

  # Guard clause: bail out as soon as the remote check reports failure.
  if ! ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null
  then
    >&2 echo "$GCP_MASTER_CANDIDATE is not repmgr master"
    return 1
  fi

  echo "$GCP_MASTER_CANDIDATE is repmgr master"
  return 0
}
function restore-could-not-change-directory-to-message(){
for host in "$AZURE_HOSTS"
do
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment