Commit a8dce2c5 authored by Matteo Melli
Browse files

Full alpha implementation

parent bbc82600
......@@ -21,11 +21,11 @@ postgres-03-db-gprd.c.gitlab-production.internal
postgres-01-db-gprd.c.gitlab-production.internal
postgres-04-db-gprd.c.gitlab-production.internal
)
AZURE_PGBOUNCER=(
AZURE_PGBOUNCERS=(
pgbouncer-01.db.prd.gitlab.com
pgbouncer-02.db.prd.gitlab.com
)
GCP_PGBOUNCER=(
GCP_PGBOUNCERS=(
pgbouncer-01-db-gprd.c.gitlab-production.internal
pgbouncer-02-db-gprd.c.gitlab-production.internal
)
......
......@@ -30,10 +30,10 @@ postgres-02-db-gstg.c.gitlab-staging-1.internal
postgres-03-db-gstg.c.gitlab-staging-1.internal
)
AZURE_PGBOUNCER=(
AZURE_PGBOUNCERS=(
pgbouncer-01.db.stg.gitlab.com
)
GCP_PGBOUNCER=(
GCP_PGBOUNCERS=(
pgbouncer-01-db-gstg.c.gitlab-staging-1.internal
)
......
......@@ -7,12 +7,16 @@ export ENVIRONMENT=$1
# Load the per-environment settings, the shared helpers and the step
# implementations. ENVIRONMENT is exported from the first command-line
# argument; the expansions are quoted so an unusual value cannot be
# word-split or glob-expanded into a different file name.
source ".env_${ENVIRONMENT}" # That is, .env_staging or .env_production
source utilities
source "steps_${ENVIRONMENT}"
# Failover steps go here
#
# Smoke-test step: prints a marker line on stdout, then returns the status
# code given as the first argument.
function 00_test() {
  printf '%s\n' "Running things inside"
  return $1
}
# Check that every entry of "steps" resolves (via "type") to something
# callable before the menu is shown; abort with status 1 on the first step
# that does not.
for step in "${steps[@]}"; do
  if ! type "$(step_script "$step")" > /dev/null 2>&1; then
    >&2 echo "Function $(step_script "$step") does not exist for step $(step_3digit_number "$step")"
    exit 1
  fi
done
echo "menu"
do_menu
......
......@@ -4,45 +4,101 @@ export steps=(
002_disable-chef
003_disable-consul
004_disable-automatic-failover
005_forbid-writes-to-current-master
006_convert-azure-master-to-standby
007_check-gcp-nodes-has-same-azure-lsn
008_perform-gcp-candidate-master-promote
009_check-gcp-candidate-master-is-master
010_reduce-statement-timeout
011_configure-pgbouncer-for-gcp
012_ensure-priority-is-updated-in-repmgr
013_update-chef-cookbook
014_enable-automatic-failover-on-gcp-only
015_enable-consul-on-gcp-only
016_enable-chef-on-gcp-only
005_convert-azure-master-to-standby
006_check-gcp-nodes-has-same-azure-lsn
007_perform-gcp-candidate-master-promote
008_check-gcp-candidate-master-is-master
009_enable-automatic-failover-on-gcp-only
010_enable-consul-on-gcp-only
011_enable-chef-on-gcp-only
)
function 000_create-tombstone-table(){
return 0
#######################################
# (Re)create the "tombstone" database and its "tombstone" table on the
# Azure master.
# Globals:
#   AZURE_MASTER (read)
# Arguments:
#   None
# Returns:
#   Exit status of the remote command chain (non-zero as soon as one of the
#   remote commands fails).
#######################################
function create-tombstone-table(){
  # "create database if not exists" is not supported in Postgres,
  # so to make the following action idempotent and not depending on the pre-actions,
  # we better re-create tombstone DB and table from scratch.
  #
  # DROP DATABASE and CREATE DATABASE are sent as separate psql invocations:
  # several statements in one "-c" string are executed as a single
  # transaction, and neither statement is allowed inside a transaction block.
  local psql_cmd='sudo -u gitlab-psql gitlab-psql'
  local remote_cmd='cd /tmp'
  remote_cmd+=" && ${psql_cmd} postgres -c \"drop database if exists tombstone\""
  remote_cmd+=" && ${psql_cmd} postgres -c \"create database tombstone\""
  remote_cmd+=" && ${psql_cmd} tombstone -c \"create table if not exists tombstone (created_at timestamptz default now() primary key, note text)\""
  # One quoted argument: the script reaches the remote shell unchanged,
  # without local word splitting or pathname expansion.
  ssh_remote "$AZURE_MASTER" "$remote_cmd"
}
#######################################
# Check that changes made on the Azure master reach the GCP master
# candidate: insert a uniquely named row into tombstone.tombstone on the
# Azure master, then poll the GCP candidate every 3 seconds until the row is
# visible, printing the current replay delay while waiting.
# Globals:
#   ENVIRONMENT, AZURE_MASTER, GCP_MASTER_CANDIDATE (read)
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout, insert failure on stderr.
# Returns:
#   0 once the row is seen on the GCP candidate, 1 if the insert fails.
#######################################
function check-gcp-replication-delay(){
  local tombstone_msg find_new_msg gcp_cur_rep_delay
  tombstone_msg="$(date +'%Y%m%d_%H%M%S')_${ENVIRONMENT}"

  # Without a successful insert the row can never show up on the replica,
  # so bail out instead of polling forever.
  ssh_remote "$AZURE_MASTER" \
    "cd /tmp; sudo -u gitlab-psql gitlab-psql tombstone -c \"insert into tombstone(note) values('${tombstone_msg}') returning *\"" \
    || {
      >&2 echo "Could not insert tombstone message on $AZURE_MASTER (AZURE MASTER)."
      return 1
    }

  # wait until the change is propagated
  while true; do
    find_new_msg=$(
      ssh_remote "$GCP_MASTER_CANDIDATE" \
        "cd /tmp; sudo gitlab-psql -Atd tombstone -c \"select created_at from tombstone where note = '$tombstone_msg'\""
    )
    if [[ -n "$find_new_msg" ]]; then
      echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
      break
    fi
    # Seconds since the last replayed transaction on the candidate.
    gcp_cur_rep_delay=$(
      ssh_remote "$GCP_MASTER_CANDIDATE" \
        "cd /tmp; sudo gitlab-psql -Atd postgres -c 'select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'"
    )
    echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
    sleep 3
  done
}
function 001_check-gcp-replication-delay(){
return 0
# Stop chef-client on every Azure and GCP host and move /etc/chef aside to
# /etc/chef.migration.
function disable-chef() {
  # chef
  local -a all_hosts=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")
  for host in "${all_hosts[@]}"
  do
    printf 'stopping chef on %s\n' "$host"
    ssh_remote "$host" sudo service chef-client stop
    ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration
  done
}
# Placeholder step: performs no action and always reports success.
function 002_disable-chef() {
  true
}
function disable-consul(){
for host in "${AZURE_PGBOUNCERS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
for host in "${GCP_PGBOUNCERS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
function 003_disable-consul(){
return 0
}
for host in "${AZURE_HOSTS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
function 004_disable-automatic-failover(){
return 0
for host in "${GCP_HOSTS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
}
function 005_forbid-writes-to-current-master(){
return 0
# Stop the repmgrd service on all nodes: GCP hosts first, then every Azure
# host except the master, and the Azure master last.
function disable-automatic-failover() {
  for host in "${GCP_HOSTS[@]}"; do
    ssh "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done

  for host in "${AZURE_HOSTS[@]}"; do
    # The master is handled separately after the loop.
    if [[ "$AZURE_MASTER" != "$host" ]]; then
      ssh "$host" sudo sv stop /opt/gitlab/sv/repmgrd
    fi
  done

  ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
function 006_convert-azure-master-to-standby(){
function convert-azure-master-to-standby(){
ssh_remote "$AZURE_MASTER" $(cat << EOF
echo "
standby_mode = 'on'
......@@ -56,7 +112,7 @@ EOF
)
}
function 007_check-gcp-nodes-has-same-azure-lsn(){
function check-gcp-nodes-has-same-azure-lsn(){
while true
do
azure_master_lsn="$(ssh_remote "$AZURE_MASTER" $(cat << EOF
......@@ -85,48 +141,53 @@ EOF
done
}
function 008_perform-gcp-candidate-master-promote(){
function perform-gcp-candidate-master-promote(){
ssh_remote "$GCP_MASTER_CANDIDATE" $(cat << EOF
cd /tmp;
sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl \
promote -D /var/lib/opt/gitlab/postgresql/data;
EOF
)
)
}
function 009_check-gcp-candidate-master-is-master(){
function check-gcp-candidate-master-is-master(){
ssh_remote "$GCP_MASTER_CANDIDATE" $(cat << EOF
cd /tmp;
sudo -u gitlab-psql gitlab-psql postgres \
-t -A -c "select pg_is_in_recovery()";
EOF
) | grep -q 'f'
}
function 010_reduce-statement-timeout(){
return 0
}
function 011_configure-pgbouncer-for-gcp(){
return 0
}
function 012_ensure-priority-is-updated-in-repmgr(){
return 0
) | grep -q 'f'
}
function 013_update-chef-cookbook(){
return 0
#######################################
# Start the repmgrd service on the GCP nodes only: the GCP master candidate
# first, then every other GCP host.
# (The previous body issued "sv stop", i.e. it repeated the disable step
# instead of enabling anything.)
# Globals:
#   GCP_MASTER_CANDIDATE, GCP_HOSTS (read)
# Arguments:
#   None
#######################################
function enable-automatic-failover-on-gcp-only(){
  local host
  ssh "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
  for host in "${GCP_HOSTS[@]}"; do
    # Already started above.
    if [[ "$GCP_MASTER_CANDIDATE" == "$host" ]]; then
      continue
    fi
    ssh "$host" sudo sv start /opt/gitlab/sv/repmgrd
  done
}
# Placeholder step: performs no action and always reports success.
function 014_enable-automatic-failover-on-gcp-only() {
  true
}
function enable-consul-on-gcp-only(){
for host in "${GCP_HOSTS[@]}"
do
ssh "$host" sudo sv start /opt/gitlab/sv/consul
done
function 015_enable-consul-on-gcp-only(){
return 0
for host in "${GCP_PGBOUNCERS[@]}"
do
ssh "$host" sudo sv start /opt/gitlab/sv/consul
done
}
function 016_enable-chef-on-gcp-only(){
return 0
#######################################
# Re-enable chef on the GCP hosts only: move /etc/chef.migration back to
# /etc/chef and start chef-client.
# Azure hosts are deliberately left untouched, as the step name requires
# (the previous body also iterated over AZURE_HOSTS).
# Globals:
#   GCP_HOSTS (read)
# Arguments:
#   None
#######################################
function enable-chef-on-gcp-only(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "starting chef on $host"
    ssh_remote "$host" sudo mv /etc/chef.migration /etc/chef
    ssh_remote "$host" sudo service chef-client start
  done
}
# Aliases are only expanded in non-interactive shells when expand_aliases is
# set, so enable it before defining the alias used by the step functions.
shopt -s expand_aliases
# ssh_remote is the command the step functions use to run something on a
# remote host; here it is plain ssh. The trailing space in the alias value
# makes bash check the following word for alias expansion as well.
alias ssh_remote="ssh "
export steps=(
000_create-tombstone-table
001_check-gcp-replication-delay
002_disable-chef
003_disable-consul
004_disable-automatic-failover
005_forbid-writes-to-current-master
006_convert-azure-master-to-standby
007_check-gcp-nodes-has-same-azure-lsn
008_perform-gcp-candidate-master-promote
009_check-gcp-candidate-master-is-master
010_reduce-statement-timeout
011_configure-pgbouncer-for-gcp
012_ensure-priority-is-updated-in-repmgr
013_update-chef-cookbook
014_enable-automatic-failover-on-gcp-only
015_enable-consul-on-gcp-only
016_enable-chef-on-gcp-only
005_convert-azure-master-to-standby
006_check-gcp-nodes-has-same-azure-lsn
007_perform-gcp-candidate-master-promote
008_check-gcp-candidate-master-is-master
009_enable-automatic-failover-on-gcp-only
010_enable-consul-on-gcp-only
011_enable-chef-on-gcp-only
)
#######################################
# Check that GCP "main" replica is not lagging too much
# Globals:
# AZURE_MASTER
# GCP_MASTER_CANDIDATE
# Arguments:
# None
# Returns:
# None
#######################################
function 000_create-tombstone-table(){
# "create database if not exists" is not supported in Postgres,
# so to make the following action idempotent and not depending on the pre-actions,
# we better re-create tombstone DB and table from scratch
ssh_remote "$AZURE_MASTER" $(cat << EOF
cd /tmp;
sudo -u gitlab-psql gitlab-psql postgres \
-c "drop database if exists tombstone; create database tombstone";
sudo -u gitlab-psql gitlab-psql tombstone \
-c "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)"
function create-tombstone-table(){
# "create database if not exists" is not supported in Postgres,
# so to make the following action idempotent and not depending on the pre-actions,
# we better re-create tombstone DB and table from scratch
ssh_remote "$AZURE_MASTER" $(cat << EOF
cd /tmp;
sudo -u gitlab-psql gitlab-psql postgres \
-c "drop database if exists tombstone; create database tombstone";
sudo -u gitlab-psql gitlab-psql tombstone \
-c "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)"
EOF
)
return 0
}
function 001_check-gcp-replication-delay(){
tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${ENVIRONMENT}"
ssh_remote "$AZURE_MASTER" \
"cd /tmp; sudo -u gitlab-psql gitlab-psql tombstone -c \"insert into tombstone(note) values('${tombstone_msg}') returning *\""
# wait until the change is propagated
while [[ true ]]; do
find_new_msg=$(
)
}
function check-gcp-replication-delay(){
tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${ENVIRONMENT}"
ssh_remote "$AZURE_MASTER" \
"cd /tmp; sudo -u gitlab-psql gitlab-psql tombstone -c \"insert into tombstone(note) values('${tombstone_msg}') returning *\""
# wait until the change is propagated
while [[ true ]]; do
find_new_msg=$(
ssh_remote "$GCP_MASTER_CANDIDATE" \
"cd /tmp; sudo gitlab-psql -Atd tombstone -c \"select created_at from tombstone where note = '$tombstone_msg'\""
)
if [[ -z ${find_new_msg+x} ]] || [[ "$find_new_msg" == "" ]]; then
gcp_cur_rep_delay=$(
ssh_remote "$GCP_MASTER_CANDIDATE" \
"cd /tmp; sudo gitlab-psql -Atd tombstone -c \"select created_at from tombstone where note = '$tombstone_msg'\""
)
if [[ -z ${find_new_msg+x} ]] || [[ "$find_new_msg" == "" ]]; then
gcp_cur_rep_delay=$(
ssh_remote "$GCP_MASTER_CANDIDATE" \
"cd /tmp; sudo gitlab-psql -Atd postgres -c 'select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'"
)
echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
sleep 3
else
echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
break
fi
done
return 0
"cd /tmp; sudo gitlab-psql -Atd postgres -c 'select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'"
)
echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
sleep 3
else
echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
break
fi
done
}
function 002_disable-chef(){
echo "We're about to stop disable chef and stop repmgr on $GITLAB_ENV environment. To proceed, type '$GITLAB_ENV':"
read proceed_cmd
if [[ "$proceed_cmd" != "$GITLAB_ENV" ]]; then
>&2 echo "Stop."
return
fi
function disable-chef(){
# chef
for i in $(seq 1 $N_OF_HOSTS); do
for host in "${AZURE_HOST_PREFIX}$i${AZURE_HOST_SUFFIX}" "${GCP_HOST_PREFIX}$i${GCP_HOST_SUFFIX}"; do
echo "${1}ing chef on $host"
# WARNING: the following line modifies the state
if [[ "$1" == "stop" ]]; then
ssh_remote "$host" "sudo service chef-client stop"
ssh_remote "$host" "sudo mv /etc/chef /etc/chef.migration"
elif [[ "$1" == "start" ]]; then
ssh_remote "$host" "sudo mv /etc/chef.migration /etc/chef"
ssh_remote "$host" "sudo service chef-client start"
else
>&2 echo "1st argument must be either \"stop\" or \"start\" (provided: $1)."
return
fi
done
for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"; do
echo "stopping chef on $host"
ssh_remote "$host" sudo service chef-client stop
ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration
done
return 0
}
#######################################
# Stop the consul service on every pgbouncer host and every database host,
# in both Azure and GCP.
# The host lists are arrays; a bare "$NAME" expands to the first element
# only, so each list is expanded with "${NAME[@]}".
# Globals:
#   AZURE_PGBOUNCER, GCP_PGBOUNCER, AZURE_HOSTS, GCP_HOSTS (read)
# Arguments:
#   None
# Returns:
#   Always 0.
#######################################
function 003_disable-consul(){
  local host
  for host in "${AZURE_PGBOUNCER[@]}" "${GCP_PGBOUNCER[@]}" \
    "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"; do
    ssh "$host" sudo sv stop /opt/gitlab/sv/consul
  done
  return 0
}
#
#
#
#######################################
# Run an "sv" action against the repmgrd service on the Azure master.
# Globals:
#   AZURE_MASTER (read)
# Arguments:
#   $1 - sv action to perform; defaults to "stop" so that calling the step
#        without arguments no longer builds a malformed "sv" command.
# Returns:
#   Always 0.
#######################################
function 004_disable-automatic-failover(){
  local action="${1:-stop}"
  ssh_remote "$AZURE_MASTER" "sudo sv ${action} /opt/gitlab/sv/repmgrd"
  return 0
}
# Placeholder step: performs no action and always reports success.
function 005_forbid-writes-to-current-master() {
  true
}
# Placeholder step: performs no action and always reports success.
function 006_convert-azure-master-to-standby() {
  true
}
# Placeholder step: performs no action and always reports success.
function 007_check-gcp-nodes-has-same-azure-lsn() {
  true
}
function disable-consul(){
for host in "${AZURE_PGBOUNCERS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
for host in "${GCP_PGBOUNCERS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
function 008_perform-gcp-candidate-master-promote(){
return 0
}
for host in "${AZURE_HOSTS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
function 009_check-gcp-candidate-master-is-master(){
return 0
for host in "${GCP_HOSTS[@]}"
do
ssh "$host" sudo sv stop /opt/gitlab/sv/consul
done
}
function 010_reduce-statement-timeout(){
return 0
# Stop the repmgrd service on all nodes: GCP hosts first, then every Azure
# host except the master, and the Azure master last.
function disable-automatic-failover() {
  for host in "${GCP_HOSTS[@]}"; do
    ssh "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done

  for host in "${AZURE_HOSTS[@]}"; do
    # The master is handled separately after the loop.
    if [[ "$AZURE_MASTER" != "$host" ]]; then
      ssh "$host" sudo sv stop /opt/gitlab/sv/repmgrd
    fi
  done

  ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
#######################################
# Turn the Azure master into a standby: write a recovery.conf into the
# PostgreSQL data directory, fix its ownership and mode, then stop the
# postgres service (falling back to "sv int" plus a longer wait when the
# quick stop does not succeed).
#
# The script is sent as ONE quoted argument so its newlines survive. The
# previous unquoted $(cat <<EOF) collapsed every newline into a space, and
# with no ";" between the commands the chown/chmod/sv lines became extra
# arguments of "echo" and were never executed.
# Globals:
#   AZURE_MASTER (read)
# Arguments:
#   None
# Returns:
#   Exit status of the remote script.
#######################################
function convert-azure-master-to-standby(){
  local remote_script
  # Quoted delimiter: the text is taken literally; the backslash-newline
  # continuations are interpreted by the remote shell.
  remote_script="$(cat << 'EOF'
set -e
echo "
standby_mode = 'on'
recovery_target_timeline = 'latest'" > /var/lib/opt/gitlab/postgresql/data/recovery.conf
sudo chown postgres:postgres /var/lib/opt/gitlab/postgresql/data/recovery.conf
sudo chmod 600 /var/lib/opt/gitlab/postgresql/data/recovery.conf
sudo sv -W 1 stop /opt/gitlab/sv/postgres \
  || (sudo sv int /opt/gitlab/sv/postgres \
  && sudo sv -W 60 stop /opt/gitlab/sv/postgres)
EOF
)"
  # "set -e" above makes the remote script stop at the first failing
  # command, so postgres is not stopped when recovery.conf was not written.
  # NOTE(review): the redirect into the data directory runs as the ssh user,
  # not under sudo - confirm that user can write there.
  ssh_remote "$AZURE_MASTER" "$remote_script"
}
function 011_configure-pgbouncer-for-gcp(){
return 0
#######################################
# Wait until the GCP master candidate reports the same LSN as the Azure
# master. Each node is asked for its replay location when it is in recovery
# and for its current xlog location otherwise; the comparison is retried
# every 3 seconds.
# An empty answer (for instance a failed ssh or psql call) is never treated
# as a match: previously two failed queries compared equal and the step
# reported success.
# Globals:
#   AZURE_MASTER, GCP_MASTER_CANDIDATE (read)
# Arguments:
#   None
# Outputs:
#   Progress on stdout, unreadable-LSN warnings on stderr.
# Returns:
#   0 once both LSNs are non-empty and equal.
#######################################
function check-gcp-nodes-has-same-azure-lsn(){
  local lsn_query azure_master_lsn gcp_master_candidate_lsn
  lsn_query='cd /tmp; sudo -u gitlab-psql gitlab-psql postgres -t -A -c "select case when pg_is_in_recovery() then pg_last_xlog_replay_location() else pg_current_xlog_location() end;"'
  while true; do
    azure_master_lsn="$(ssh_remote "$AZURE_MASTER" "$lsn_query")"
    gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" "$lsn_query")"
    if [[ -z "$azure_master_lsn" || -z "$gcp_master_candidate_lsn" ]]; then
      >&2 echo "Could not read LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    elif [[ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]]; then
      echo "GCP and Azure have same LSN: $azure_master_lsn"
      return 0
    else
      echo "GCP and Azure have different LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    fi
    sleep 3
  done
}
function 012_ensure-priority-is-updated-in-repmgr(){
return 0
#######################################
# Promote the GCP master candidate by running "pg_ctl promote" on it as the
# gitlab-psql user.
# The remote command is passed as one quoted argument, so it is not subject
# to local word splitting or pathname expansion.
# Globals:
#   GCP_MASTER_CANDIDATE (read)
# Arguments:
#   None
# Returns:
#   Exit status of the remote command.
#######################################
function perform-gcp-candidate-master-promote(){
  local remote_cmd='cd /tmp && sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl promote -D /var/lib/opt/gitlab/postgresql/data'
  ssh_remote "$GCP_MASTER_CANDIDATE" "$remote_cmd"
}
function 013_update-chef-cookbook(){
return 0
#######################################
# Succeed only when the GCP master candidate reports that it is no longer in
# recovery, i.e. "select pg_is_in_recovery()" prints exactly "f".
# The whole output line must be "f" (grep -x); a bare substring match would
# also accept any other output that merely contains the letter f.
# Globals:
#   GCP_MASTER_CANDIDATE (read)
# Arguments:
#   None
# Returns:
#   0 if the candidate answered "f", non-zero otherwise.
#######################################
function check-gcp-candidate-master-is-master(){
  local remote_cmd='cd /tmp && sudo -u gitlab-psql gitlab-psql postgres -t -A -c "select pg_is_in_recovery()"'
  ssh_remote "$GCP_MASTER_CANDIDATE" "$remote_cmd" | grep -qx 'f'
}
#######################################
# Start the repmgrd service on the GCP nodes only: the GCP master candidate
# first, then every other GCP host.
# (The previous body issued "sv stop", i.e. it repeated the disable step
# instead of enabling anything.)
# Globals:
#   GCP_MASTER_CANDIDATE, GCP_HOSTS (read)
# Arguments:
#   None
#######################################
function enable-automatic-failover-on-gcp-only(){
  local host
  ssh "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
  for host in "${GCP_HOSTS[@]}"; do
    # Already started above.
    if [[ "$GCP_MASTER_CANDIDATE" == "$host" ]]; then
      continue
    fi
    ssh "$host" sudo sv start /opt/gitlab/sv/repmgrd
  done
}
# Placeholder step: performs no action and always reports success.
function 014_enable-automatic-failover-on-gcp-only() {
  true
}
function enable-consul-on-gcp-only(){
for host in "${GCP_HOSTS[@]}"
do
ssh "$host" sudo sv start /opt/gitlab/sv/consul
done
function 015_enable-consul-on-gcp-only(){
return 0
for host in "${GCP_PGBOUNCERS[@]}"
do
ssh "$host" sudo sv start /opt/gitlab/sv/consul
done
}
function 016_enable-chef-on-gcp-only(){
return 0
#######################################
# Re-enable chef on the GCP hosts only: move /etc/chef.migration back to
# /etc/chef and start chef-client.
# Azure hosts are deliberately left untouched, as the step name requires
# (the previous body also iterated over AZURE_HOSTS).
# Globals:
#   GCP_HOSTS (read)
# Arguments:
#   None
#######################################
function enable-chef-on-gcp-only(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "starting chef on $host"
    ssh_remote "$host" sudo mv /etc/chef.migration /etc/chef
    ssh_remote "$host" sudo service chef-client start
  done
}
......@@ -15,7 +15,7 @@ function step_number(){
printf "%d" "$(step_3digit_number "$1"|sed 's/0\+//')"
}
function step_name(){
function step_script(){
echo "$1"|cut -d _ -f 2-
}
......@@ -40,7 +40,7 @@ function do_menu(){
echo
for step in "${steps[@]}"
do
echo "$(step_3digit_number "$step")) $(step_name "$step")"
echo "$(step_3digit_number "$step")) $(step_script "$step")"
done
echo
if has_step "$next_step"
......@@ -97,7 +97,7 @@ function do_menu(){
if [ "$key" == "y" ]
then
next_step="$(printf "%03d" "$(($(step_number "$step")+1))")"
do_step "$step"
do_step "$(step_script "$step")"
fi
echo
step=
......
Markdown is supported
0%