Commit 802a421b authored by Matteo Melli

Merge branch 'database-wrangler-runbook' into database_wrangler_automation

parents 7c83f328 95921246
Pipeline #88408 passed with stage
in 14 seconds
source_vars
.project
\ No newline at end of file
# Production
# Host inventory for the production PostgreSQL failover scripts.
# This file is meant to be sourced; it only defines shell variables.
N_OF_HOSTS=4
AZURE_HOST_PREFIX="postgres-0"
AZURE_HOST_SUFFIX=".db.prd.gitlab.com"
GCP_HOST_PREFIX="postgres-0"
GCP_HOST_SUFFIX="-db-gprd.c.gitlab-production.internal"
AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}1${GCP_HOST_SUFFIX}"
# The step functions iterate over AZURE_HOSTS and GCP_HOSTS, so they must be
# defined here as they are for staging. They are derived from N_OF_HOSTS and
# the prefix/suffix pairs above (hosts 1..N_OF_HOSTS; the "-0" prefix means
# this only works while N_OF_HOSTS < 10).
AZURE_HOSTS=()
GCP_HOSTS=()
for ((_host_index = 1; _host_index <= N_OF_HOSTS; _host_index++)); do
  AZURE_HOSTS+=("${AZURE_HOST_PREFIX}${_host_index}${AZURE_HOST_SUFFIX}")
  GCP_HOSTS+=("${GCP_HOST_PREFIX}${_host_index}${GCP_HOST_SUFFIX}")
done
unset _host_index
# NOTE(review): AZURE_SLAVES lists postgres-02, which is also AZURE_MASTER;
# staging excludes the master from this list - confirm which is intended.
AZURE_SLAVES=(
  postgres-02.db.prd.gitlab.com
  postgres-03.db.prd.gitlab.com
  postgres-04.db.prd.gitlab.com
  postgres-01.db.prd.gitlab.com
)
GCP_SLAVES=(
  postgres-02-db-gprd.c.gitlab-production.internal
  postgres-03-db-gprd.c.gitlab-production.internal
  postgres-01-db-gprd.c.gitlab-production.internal
  postgres-04-db-gprd.c.gitlab-production.internal
)
AZURE_PGBOUNCERS=(
  pgbouncer-01.db.prd.gitlab.com
  pgbouncer-02.db.prd.gitlab.com
)
GCP_PGBOUNCERS=(
  pgbouncer-01-db-gprd.c.gitlab-production.internal
  pgbouncer-02-db-gprd.c.gitlab-production.internal
)
# Generic
max_rep_delay=10
# Staging
# Host inventory for the staging PostgreSQL failover scripts.
# This file is meant to be sourced; it only defines shell variables.
N_OF_HOSTS=2

# Naming scheme of the database hosts on each side.
AZURE_HOST_PREFIX="postgres0"
AZURE_HOST_SUFFIX=".db.stg.gitlab.com"
GCP_HOST_PREFIX="postgres-0"
GCP_HOST_SUFFIX="-db-gstg.c.gitlab-staging-1.internal"

# Current Azure master and the GCP node that is to be promoted.
AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}2${GCP_HOST_SUFFIX}"

# Every database host on each side (element order is preserved).
AZURE_HOSTS=(
  "${AZURE_HOST_PREFIX}1${AZURE_HOST_SUFFIX}"
  "${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
)
GCP_HOSTS=(
  "${GCP_HOST_PREFIX}2${GCP_HOST_SUFFIX}"
  "${GCP_HOST_PREFIX}1${GCP_HOST_SUFFIX}"
  "${GCP_HOST_PREFIX}3${GCP_HOST_SUFFIX}"
)

# Replica lists.
AZURE_SLAVES=("${AZURE_HOST_PREFIX}1${AZURE_HOST_SUFFIX}")
GCP_SLAVES=(
  "${GCP_HOST_PREFIX}2${GCP_HOST_SUFFIX}"
  "${GCP_HOST_PREFIX}3${GCP_HOST_SUFFIX}"
)

# PgBouncer hosts.
AZURE_PGBOUNCERS=("pgbouncer-01${AZURE_HOST_SUFFIX}")
GCP_PGBOUNCERS=("pgbouncer-01${GCP_HOST_SUFFIX}")

# Generic
max_rep_delay=10
\ No newline at end of file
#!/bin/bash
# Interactive entry point of the failover runbook.
# Usage: <this script> <environment>
# Sources env_<environment>, utilities and steps_<environment>, verifies that
# every entry of the steps array has a matching command, then opens the menu.
set -eu

if [[ $# -lt 1 ]]; then
  >&2 echo "Specify the environment"
  exit 1
fi

# Some of the scripts use one variable and some the other, so both are
# exported until the naming is unified.
export ENVIRONMENT=$1
export GITLAB_ENV=$ENVIRONMENT

# Quoted so an environment name can never be word-split or glob-expanded.
source "env_${ENVIRONMENT}" # That is, env_staging or env_production (test also supported)
source utilities
source "steps_${ENVIRONMENT}"

# Check all steps have a script
for step in "${steps[@]}"; do
  if ! type "$(step_script "$step")" > /dev/null 2>&1; then
    >&2 echo "Function $(step_script "$step") does not exist for step $(step_3digit_number "$step")"
    exit 1
  fi
done

echo "menu"
do_menu
#!/bin/bash
# Print the repmgr cluster status as reported on the Azure master.
# Usage: <this script> <environment>
# Sources env_<environment> to obtain AZURE_MASTER.
if [[ $# -lt 1 ]]; then
  >&2 echo "Specify the environment"
  exit 1
fi
shopt -s expand_aliases
alias ssh_remote="ssh "
source "env_${1}"
# The remote command is passed as one string; `cd /tmp` runs first so the
# command does not start in the login user's home directory.
ssh_remote "${AZURE_MASTER}" 'cd /tmp; sudo -u gitlab-psql gitlab-ctl repmgr cluster show'
\ No newline at end of file
#!/bin/bash
# Print the repmgr cluster status as reported on the Azure master.
# Usage: <this script> <environment>
# Sources env_<environment> to obtain AZURE_MASTER.
if [[ $# -lt 1 ]]; then
  >&2 echo "Specify the environment"
  exit 1
fi
shopt -s expand_aliases
alias ssh_remote="ssh "
source "env_${1}"
# The remote command is passed as one string; `cd /tmp` runs first so the
# command does not start in the login user's home directory.
ssh_remote "${AZURE_MASTER}" 'cd /tmp; sudo -u gitlab-psql gitlab-ctl repmgr cluster show'
# Ordered list of runbook steps. Each entry is "<3-digit number>_<function>";
# the part after the first underscore is the name of the function to run.
export steps=(
  "000_get-rid-of-could-not-change-directory-to-message"
  "001_create-tombstone-table"
  "002_check-gcp-replication-delay"
  "003_disable-chef"
  "004_disable-consul"
  "005_disable-automatic-failover"
  "006_convert-azure-master-to-standby"
  "007_check-gcp-nodes-has-same-azure-lsn"
  "008_perform-gcp-candidate-master-promote"
  "009_check-gcp-candidate-master-is-master"
  "010_enable-automatic-failover-on-gcp-only"
  "011_check-repmgr-master"
  "012_enable-consul-on-gcp-only"
  "013_enable-chef-on-gcp-only"
  "014_restore-could-not-change-directory-to-message"
)
# Step 000: add the "others may traverse" bit (o+x) to the login user's home
# directory on every Azure host.
# Globals:  AZURE_HOSTS (read)
# Returns:  status of the last ssh_remote call.
function get-rid-of-could-not-change-directory-to-message(){
  local host
  # "${AZURE_HOSTS[@]}" visits every host; a bare "$AZURE_HOSTS" expands to
  # the first element only.
  for host in "${AZURE_HOSTS[@]}"; do
    # ssh joins its arguments with spaces before the remote shell parses them,
    # so `bash -c 'chmod ...'` would reach the remote bash as a bare `chmod`.
    # Sending one single-quoted string keeps the command intact and lets the
    # remote shell expand $HOME.
    ssh_remote "$host" 'chmod o+x "$HOME"'
  done
}
# Step 001: recreate the `tombstone` database on the Azure master and create
# the `tombstone` table inside it.
# Globals:  AZURE_MASTER (read)
# Note: the database is dropped first, so any earlier tombstone rows are lost.
function create-tombstone-table(){
  local psql='sudo -u gitlab-psql gitlab-psql'
  local table_ddl='create table if not exists tombstone (created_at timestamptz default now() primary key, note text)'
  echo "Create tombstone database and table if not already existing"
  # ssh flattens its arguments into one string that the remote shell parses
  # again, so each SQL statement is shell-quoted with printf %q; otherwise the
  # remote shell would interpret the spaces, `;` and parentheses itself.
  # DROP and CREATE DATABASE are sent as separate commands because they cannot
  # run as part of a multi-statement -c string.
  ssh_remote "$AZURE_MASTER" "$psql postgres -c $(printf '%q' 'drop database if exists tombstone')"
  ssh_remote "$AZURE_MASTER" "$psql postgres -c $(printf '%q' 'create database tombstone')"
  ssh_remote "$AZURE_MASTER" "$psql tombstone -c $(printf '%q' "$table_ddl")"
}
# Step 002: insert a uniquely named row into the tombstone table on the Azure
# master and poll the GCP master candidate until that row is visible there,
# printing the candidate's replay delay while waiting.
# Globals:  AZURE_MASTER, GCP_MASTER_CANDIDATE, ENVIRONMENT (read);
#           tombstone_msg (written, left global as in the original)
function check-gcp-replication-delay(){
  local find_new_msg gcp_cur_rep_delay insert_sql lookup_sql
  local delay_sql='select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'
  tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${ENVIRONMENT}"
  insert_sql="insert into tombstone(note) values('${tombstone_msg}') returning *"
  lookup_sql="select created_at from tombstone where note = '${tombstone_msg}'"
  echo "Insert '$tombstone_msg' into tombstone"
  # Each SQL string is shell-quoted with printf %q because ssh hands the
  # joined command line to a remote shell, which would otherwise interpret
  # the quotes, parentheses and `*` in the SQL.
  ssh_remote "$AZURE_MASTER" "sudo -u gitlab-psql gitlab-psql tombstone -c $(printf '%q' "$insert_sql")"
  # wait until the change is propagated
  while true; do
    find_new_msg="$(ssh_remote "$GCP_MASTER_CANDIDATE" "sudo gitlab-psql -Atd tombstone -c $(printf '%q' "$lookup_sql")")"
    if [[ -z "$find_new_msg" ]]; then
      # The whole remote command is a single ssh_remote invocation; splitting
      # it across lines without a continuation would run ssh with no command
      # and then execute sudo locally.
      gcp_cur_rep_delay="$(ssh_remote "$GCP_MASTER_CANDIDATE" "sudo gitlab-psql -Atd postgres -c $(printf '%q' "$delay_sql")")"
      echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
      sleep 3
    else
      echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
      break
    fi
  done
}
# Step 003: stop chef-client on every Azure and GCP database host and move
# /etc/chef aside to /etc/chef.migration.
# Globals:  AZURE_HOSTS, GCP_HOSTS (read); host (written by the loop)
function disable-chef(){
  local -a chef_hosts=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")
  for host in "${chef_hosts[@]}"; do
    printf 'Stopping chef on %s\n' "$host"
    ssh_remote "$host" sudo service chef-client stop
    ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration
  done
}
# Helper: stop the consul runit service on each host given as an argument.
function _disable_consul_on_hosts(){
  for host in "$@"; do
    printf 'Stopping consul on %s\n' "$host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul
  done
}
# Step 004: stop consul on the Azure pgbouncers, the GCP pgbouncers, the Azure
# database hosts and the GCP database hosts, in that order.
# Globals:  AZURE_PGBOUNCERS, GCP_PGBOUNCERS, AZURE_HOSTS, GCP_HOSTS (read);
#           host (written by the loop)
function disable-consul(){
  _disable_consul_on_hosts "${AZURE_PGBOUNCERS[@]}"
  _disable_consul_on_hosts "${GCP_PGBOUNCERS[@]}"
  _disable_consul_on_hosts "${AZURE_HOSTS[@]}"
  _disable_consul_on_hosts "${GCP_HOSTS[@]}"
}
# Step 005: stop repmgrd on every GCP host, then on every Azure host except
# the master, and finally on the Azure master itself.
# Globals:  GCP_HOSTS, AZURE_HOSTS, AZURE_MASTER (read)
function disable-automatic-failover(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  for host in "${AZURE_HOSTS[@]}"; do
    # The master is handled last, after all other nodes.
    if [ "$AZURE_MASTER" == "$host" ]; then
      continue
    fi
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  # The message names the service that is actually stopped (repmgrd).
  echo "Stopping repmgrd on $AZURE_MASTER"
  ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
# Step 006: write a standby recovery.conf on the Azure master, restrict its
# ownership and mode, then stop the postgres runit service (trying a quick
# stop first and falling back to an interrupt plus a longer wait).
# Globals:  AZURE_MASTER (read)
# NOTE(review): runit documents the wait option as lowercase -w; confirm that
# -W is accepted by the sv build on these hosts.
function convert-azure-master-to-standby(){
  local recovery_conf=/var/lib/opt/gitlab/postgresql/data/recovery.conf
  local pg_service=/opt/gitlab/sv/postgres

  # The two settings are streamed over stdin to a remote `sudo tee`.
  printf '%s\n' "standby_mode = 'on'" "recovery_target_timeline = 'latest'" \
    | ssh_remote "$AZURE_MASTER" sudo tee "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chown postgres:postgres "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chmod 600 "$recovery_conf"

  ssh_remote "$AZURE_MASTER" sudo sv -W 1 stop "$pg_service" \
    || (ssh_remote "$AZURE_MASTER" sudo sv int "$pg_service" \
      && ssh_remote "$AZURE_MASTER" sudo sv -W 60 stop "$pg_service")
}
# Step 007: poll until the Azure master and the GCP master candidate report
# the same WAL location (replay location when in recovery, current location
# otherwise).
# Globals:  AZURE_MASTER, GCP_MASTER_CANDIDATE (read)
# Returns:  0 once both nodes report the same, non-empty LSN.
function check-gcp-nodes-has-same-azure-lsn(){
  local azure_master_lsn gcp_master_candidate_lsn remote_cmd
  local lsn_sql='select case when pg_is_in_recovery() then pg_last_xlog_replay_location() else pg_current_xlog_location() end;'
  # The SQL is shell-quoted with printf %q: ssh passes the joined command to a
  # remote shell, which would otherwise choke on the parentheses and `;`.
  remote_cmd="sudo -u gitlab-psql gitlab-psql postgres -t -A -c $(printf '%q' "$lsn_sql")"
  while true; do
    azure_master_lsn="$(ssh_remote "$AZURE_MASTER" "$remote_cmd")"
    gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" "$remote_cmd")"
    # Two failed queries both yield an empty string; that must never be
    # reported as "same LSN".
    if [[ -z "$azure_master_lsn" || -z "$gcp_master_candidate_lsn" ]]; then
      >&2 echo "Could not read LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
      sleep 3
      continue
    fi
    if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]; then
      echo "GCP and Azure have same LSN: $azure_master_lsn"
      return 0
    fi
    echo "GCP and Azure have different LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    sleep 3
  done
}
# Step 008: run `pg_ctl promote` as gitlab-psql on the GCP master candidate.
# Globals:  GCP_MASTER_CANDIDATE (read)
function perform-gcp-candidate-master-promote(){
  local pg_ctl=/opt/gitlab/embedded/bin/pg_ctl
  local data_dir=/var/lib/opt/gitlab/postgresql/data
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql "$pg_ctl" promote -D "$data_dir"
}
# Step 009: verify that the GCP master candidate is no longer in recovery.
# Globals:  GCP_MASTER_CANDIDATE (read)
# Returns:  0 when pg_is_in_recovery() reports exactly "f", 1 otherwise
#           (including when the query itself fails).
function check-gcp-candidate-master-is-master(){
  local in_recovery
  # printf %q keeps the parentheses of the SQL away from the remote shell that
  # ssh hands the joined command line to.
  in_recovery="$(ssh_remote "$GCP_MASTER_CANDIDATE" \
    "sudo -u gitlab-psql gitlab-psql postgres -t -A -c $(printf '%q' 'select pg_is_in_recovery()')")" \
    || in_recovery=
  # Compare the whole answer: matching any "f" would also accept unrelated
  # output that merely contains that letter.
  if [[ "$in_recovery" == "f" ]]; then
    echo "$GCP_MASTER_CANDIDATE is master"
    return 0
  else
    >&2 echo "$GCP_MASTER_CANDIDATE is not master"
    return 1
  fi
}
# Step 010: register the GCP master candidate as repmgr master and the other
# GCP hosts as standbys, then start repmgrd on all GCP hosts (master first).
# Globals:  GCP_MASTER_CANDIDATE, GCP_HOSTS (read)
function enable-automatic-failover-on-gcp-only(){
  local host
  echo "Register $GCP_MASTER_CANDIDATE as master with repmgr"
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-ctl repmgr register master
  for host in "${GCP_HOSTS[@]}"; do
    if [ "$GCP_MASTER_CANDIDATE" == "$host" ]; then
      continue
    fi
    echo "Register $host as standby with repmgr"
    ssh_remote "$host" sudo gitlab-ctl repmgr register standby
  done
  echo "Starting repmgrd on $GCP_MASTER_CANDIDATE"
  # Must be `start`: this step re-enables automatic failover, and the message
  # above announces a start.
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
  for host in "${GCP_HOSTS[@]}"; do
    if [ "$GCP_MASTER_CANDIDATE" == "$host" ]; then
      continue
    fi
    echo "Starting repmgrd on $host"
    ssh_remote "$host" sudo sv start /opt/gitlab/sv/repmgrd
  done
}
# Step 012: start the consul runit service on the GCP database hosts and then
# on the GCP pgbouncers.
# Globals:  GCP_HOSTS, GCP_PGBOUNCERS (read); host (written by the loops)
function enable-consul-on-gcp-only(){
  local consul_service=/opt/gitlab/sv/consul

  # Database hosts first ...
  for host in "${GCP_HOSTS[@]}"; do
    printf 'Starting consul agent on %s\n' "$host"
    ssh_remote "$host" sudo sv start "$consul_service"
  done

  # ... then the pgbouncers.
  for host in "${GCP_PGBOUNCERS[@]}"; do
    printf 'Starting consul agent on %s\n' "$host"
    ssh_remote "$host" sudo sv start "$consul_service"
  done
}
# Step 013: restore /etc/chef from /etc/chef.migration and start chef-client
# on the GCP database hosts only.
# Globals:  GCP_HOSTS (read)
# Azure hosts are left untouched so that this step matches its "gcp-only"
# name and the other *-on-gcp-only steps; their /etc/chef.migration stays in
# place.
function enable-chef-on-gcp-only(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "Starting chef-client on $host"
    ssh_remote "$host" sudo mv /etc/chef.migration /etc/chef
    ssh_remote "$host" sudo service chef-client start
  done
}
# Step 011: run `gitlab-ctl repmgr-check-master` as gitlab-consul on the GCP
# master candidate and report the outcome.
# Globals:  GCP_MASTER_CANDIDATE (read)
# Returns:  0 when the remote check succeeds, 1 otherwise.
function check-repmgr-master(){
  printf 'Checking state of %s\n' "$GCP_MASTER_CANDIDATE"
  # stderr of the remote check is discarded, as in the original.
  if ! ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null; then
    >&2 echo "$GCP_MASTER_CANDIDATE is not repmgr master"
    return 1
  fi
  echo "$GCP_MASTER_CANDIDATE is repmgr master"
  return 0
}
# Step 014: remove the "others may traverse" bit (o-x) from the login user's
# home directory on every Azure host, undoing step 000.
# Globals:  AZURE_HOSTS (read)
# Returns:  status of the last ssh_remote call.
function restore-could-not-change-directory-to-message(){
  local host
  # "${AZURE_HOSTS[@]}" visits every host; a bare "$AZURE_HOSTS" expands to
  # the first element only.
  for host in "${AZURE_HOSTS[@]}"; do
    # ssh joins its arguments with spaces before the remote shell parses them,
    # so `bash -c 'chmod ...'` would reach the remote bash as a bare `chmod`.
    # Sending one single-quoted string keeps the command intact and lets the
    # remote shell expand $HOME.
    ssh_remote "$host" 'chmod o-x "$HOME"'
  done
}
# Ordered list of runbook steps. Each entry is "<3-digit number>_<function>";
# the part after the first underscore is the name of the function to run.
export steps=(
  "000_get-rid-of-could-not-change-directory-to-message"
  "001_create-tombstone-table"
  "002_check-gcp-replication-delay"
  "003_disable-chef"
  "004_disable-consul"
  "005_disable-automatic-failover"
  "006_convert-azure-master-to-standby"
  "007_check-gcp-nodes-has-same-azure-lsn"
  "008_perform-gcp-candidate-master-promote"
  "009_check-gcp-candidate-master-is-master"
  "010_enable-automatic-failover-on-gcp-only"
  "011_check-repmgr-master"
  "012_enable-consul-on-gcp-only"
  "013_enable-chef-on-gcp-only"
  "014_restore-could-not-change-directory-to-message"
)
# Step 000: add the "others may traverse" bit (o+x) to the login user's home
# directory on every Azure host.
# Globals:  AZURE_HOSTS (read)
# Returns:  status of the last ssh_remote call.
function get-rid-of-could-not-change-directory-to-message(){
  local host
  # "${AZURE_HOSTS[@]}" visits every host; a bare "$AZURE_HOSTS" expands to
  # the first element only.
  for host in "${AZURE_HOSTS[@]}"; do
    # ssh joins its arguments with spaces before the remote shell parses them,
    # so `bash -c 'chmod ...'` would reach the remote bash as a bare `chmod`.
    # Sending one single-quoted string keeps the command intact and lets the
    # remote shell expand $HOME.
    ssh_remote "$host" 'chmod o+x "$HOME"'
  done
}
# Step 001: recreate the `tombstone` database on the Azure master and create
# the `tombstone` table inside it.
# Globals:  AZURE_MASTER (read)
# Note: the database is dropped first, so any earlier tombstone rows are lost.
function create-tombstone-table(){
  local psql='sudo -u gitlab-psql gitlab-psql'
  local table_ddl='create table if not exists tombstone (created_at timestamptz default now() primary key, note text)'
  echo "Create tombstone database and table if not already existing"
  # ssh flattens its arguments into one string that the remote shell parses
  # again, so each SQL statement is shell-quoted with printf %q; otherwise the
  # remote shell would interpret the spaces, `;` and parentheses itself.
  # DROP and CREATE DATABASE are sent as separate commands because they cannot
  # run as part of a multi-statement -c string.
  ssh_remote "$AZURE_MASTER" "$psql postgres -c $(printf '%q' 'drop database if exists tombstone')"
  ssh_remote "$AZURE_MASTER" "$psql postgres -c $(printf '%q' 'create database tombstone')"
  ssh_remote "$AZURE_MASTER" "$psql tombstone -c $(printf '%q' "$table_ddl")"
}
# Step 002: insert a uniquely named row into the tombstone table on the Azure
# master and poll the GCP master candidate until that row is visible there,
# printing the candidate's replay delay while waiting.
# Globals:  AZURE_MASTER, GCP_MASTER_CANDIDATE, ENVIRONMENT (read);
#           tombstone_msg (written, left global as in the original)
function check-gcp-replication-delay(){
  local find_new_msg gcp_cur_rep_delay insert_sql lookup_sql
  local delay_sql='select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'
  tombstone_msg=$(date +'%Y%m%d_%H%M%S')"_${ENVIRONMENT}"
  insert_sql="insert into tombstone(note) values('${tombstone_msg}') returning *"
  lookup_sql="select created_at from tombstone where note = '${tombstone_msg}'"
  echo "Insert '$tombstone_msg' into tombstone"
  # Each SQL string is shell-quoted with printf %q because ssh hands the
  # joined command line to a remote shell, which would otherwise interpret
  # the quotes, parentheses and `*` in the SQL.
  ssh_remote "$AZURE_MASTER" "sudo -u gitlab-psql gitlab-psql tombstone -c $(printf '%q' "$insert_sql")"
  # wait until the change is propagated
  while true; do
    find_new_msg="$(ssh_remote "$GCP_MASTER_CANDIDATE" "sudo gitlab-psql -Atd tombstone -c $(printf '%q' "$lookup_sql")")"
    if [[ -z "$find_new_msg" ]]; then
      # The whole remote command is a single ssh_remote invocation; splitting
      # it across lines without a continuation would run ssh with no command
      # and then execute sudo locally.
      gcp_cur_rep_delay="$(ssh_remote "$GCP_MASTER_CANDIDATE" "sudo gitlab-psql -Atd postgres -c $(printf '%q' "$delay_sql")")"
      echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
      sleep 3
    else
      echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
      break
    fi
  done
}
# Step 003: stop chef-client on every Azure and GCP database host and move
# /etc/chef aside to /etc/chef.migration.
# Globals:  AZURE_HOSTS, GCP_HOSTS (read); host (written by the loop)
function disable-chef(){
  local -a chef_hosts=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")
  for host in "${chef_hosts[@]}"; do
    printf 'Stopping chef on %s\n' "$host"
    ssh_remote "$host" sudo service chef-client stop
    ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration
  done
}
# Helper: stop the consul runit service on each host given as an argument.
function _disable_consul_on_hosts(){
  for host in "$@"; do
    printf 'Stopping consul on %s\n' "$host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul
  done
}
# Step 004: stop consul on the Azure pgbouncers, the GCP pgbouncers, the Azure
# database hosts and the GCP database hosts, in that order.
# Globals:  AZURE_PGBOUNCERS, GCP_PGBOUNCERS, AZURE_HOSTS, GCP_HOSTS (read);
#           host (written by the loop)
function disable-consul(){
  _disable_consul_on_hosts "${AZURE_PGBOUNCERS[@]}"
  _disable_consul_on_hosts "${GCP_PGBOUNCERS[@]}"
  _disable_consul_on_hosts "${AZURE_HOSTS[@]}"
  _disable_consul_on_hosts "${GCP_HOSTS[@]}"
}
# Step 005: stop repmgrd on every GCP host, then on every Azure host except
# the master, and finally on the Azure master itself.
# Globals:  GCP_HOSTS, AZURE_HOSTS, AZURE_MASTER (read)
function disable-automatic-failover(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  for host in "${AZURE_HOSTS[@]}"; do
    # The master is handled last, after all other nodes.
    if [ "$AZURE_MASTER" == "$host" ]; then
      continue
    fi
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  # The message names the service that is actually stopped (repmgrd).
  echo "Stopping repmgrd on $AZURE_MASTER"
  ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
# Step 006: write a standby recovery.conf on the Azure master, restrict its
# ownership and mode, then stop the postgres runit service (trying a quick
# stop first and falling back to an interrupt plus a longer wait).
# Globals:  AZURE_MASTER (read)
# NOTE(review): runit documents the wait option as lowercase -w; confirm that
# -W is accepted by the sv build on these hosts.
function convert-azure-master-to-standby(){
  local recovery_conf=/var/lib/opt/gitlab/postgresql/data/recovery.conf
  local pg_service=/opt/gitlab/sv/postgres

  # The two settings are streamed over stdin to a remote `sudo tee`.
  printf '%s\n' "standby_mode = 'on'" "recovery_target_timeline = 'latest'" \
    | ssh_remote "$AZURE_MASTER" sudo tee "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chown postgres:postgres "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chmod 600 "$recovery_conf"

  ssh_remote "$AZURE_MASTER" sudo sv -W 1 stop "$pg_service" \
    || (ssh_remote "$AZURE_MASTER" sudo sv int "$pg_service" \
      && ssh_remote "$AZURE_MASTER" sudo sv -W 60 stop "$pg_service")
}
# Step 007: poll until the Azure master and the GCP master candidate report
# the same WAL location (replay location when in recovery, current location
# otherwise).
# Globals:  AZURE_MASTER, GCP_MASTER_CANDIDATE (read)
# Returns:  0 once both nodes report the same, non-empty LSN.
function check-gcp-nodes-has-same-azure-lsn(){
  local azure_master_lsn gcp_master_candidate_lsn remote_cmd
  local lsn_sql='select case when pg_is_in_recovery() then pg_last_xlog_replay_location() else pg_current_xlog_location() end;'
  # The SQL is shell-quoted with printf %q: ssh passes the joined command to a
  # remote shell, which would otherwise choke on the parentheses and `;`.
  remote_cmd="sudo -u gitlab-psql gitlab-psql postgres -t -A -c $(printf '%q' "$lsn_sql")"
  while true; do
    azure_master_lsn="$(ssh_remote "$AZURE_MASTER" "$remote_cmd")"
    gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" "$remote_cmd")"
    # Two failed queries both yield an empty string; that must never be
    # reported as "same LSN".
    if [[ -z "$azure_master_lsn" || -z "$gcp_master_candidate_lsn" ]]; then
      >&2 echo "Could not read LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
      sleep 3
      continue
    fi
    if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]; then
      echo "GCP and Azure have same LSN: $azure_master_lsn"
      return 0
    fi
    echo "GCP and Azure have different LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    sleep 3
  done
}
# Step 008: run `pg_ctl promote` as gitlab-psql on the GCP master candidate.
# Globals:  GCP_MASTER_CANDIDATE (read)
function perform-gcp-candidate-master-promote(){
  local pg_ctl=/opt/gitlab/embedded/bin/pg_ctl
  local data_dir=/var/lib/opt/gitlab/postgresql/data
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql "$pg_ctl" promote -D "$data_dir"
}
# Step 009: verify that the GCP master candidate is no longer in recovery.
# Globals:  GCP_MASTER_CANDIDATE (read)
# Returns:  0 when pg_is_in_recovery() reports exactly "f", 1 otherwise
#           (including when the query itself fails).
function check-gcp-candidate-master-is-master(){
  local in_recovery
  # printf %q keeps the parentheses of the SQL away from the remote shell that
  # ssh hands the joined command line to.
  in_recovery="$(ssh_remote "$GCP_MASTER_CANDIDATE" \
    "sudo -u gitlab-psql gitlab-psql postgres -t -A -c $(printf '%q' 'select pg_is_in_recovery()')")" \
    || in_recovery=
  # Compare the whole answer: matching any "f" would also accept unrelated
  # output that merely contains that letter.
  if [[ "$in_recovery" == "f" ]]; then
    echo "$GCP_MASTER_CANDIDATE is master"
    return 0
  else
    >&2 echo "$GCP_MASTER_CANDIDATE is not master"
    return 1
  fi
}
# Step 010: register the GCP master candidate as repmgr master and the other
# GCP hosts as standbys, then start repmgrd on all GCP hosts (master first).
# Globals:  GCP_MASTER_CANDIDATE, GCP_HOSTS (read)
function enable-automatic-failover-on-gcp-only(){
  local host
  echo "Register $GCP_MASTER_CANDIDATE as master with repmgr"
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-ctl repmgr register master
  for host in "${GCP_HOSTS[@]}"; do
    if [ "$GCP_MASTER_CANDIDATE" == "$host" ]; then
      continue
    fi
    echo "Register $host as standby with repmgr"
    ssh_remote "$host" sudo gitlab-ctl repmgr register standby
  done
  echo "Starting repmgrd on $GCP_MASTER_CANDIDATE"
  # Must be `start`: this step re-enables automatic failover, and the message
  # above announces a start.
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
  for host in "${GCP_HOSTS[@]}"; do
    if [ "$GCP_MASTER_CANDIDATE" == "$host" ]; then
      continue
    fi
    echo "Starting repmgrd on $host"
    ssh_remote "$host" sudo sv start /opt/gitlab/sv/repmgrd
  done
}
# Step 012: start the consul runit service on the GCP database hosts and then
# on the GCP pgbouncers.
# Globals:  GCP_HOSTS, GCP_PGBOUNCERS (read); host (written by the loops)
function enable-consul-on-gcp-only(){
  local consul_service=/opt/gitlab/sv/consul

  # Database hosts first ...
  for host in "${GCP_HOSTS[@]}"; do
    printf 'Starting consul agent on %s\n' "$host"
    ssh_remote "$host" sudo sv start "$consul_service"
  done

  # ... then the pgbouncers.
  for host in "${GCP_PGBOUNCERS[@]}"; do
    printf 'Starting consul agent on %s\n' "$host"
    ssh_remote "$host" sudo sv start "$consul_service"
  done
}
# Step 013: restore /etc/chef from /etc/chef.migration and start chef-client
# on the GCP database hosts only.
# Globals:  GCP_HOSTS (read)
# Azure hosts are left untouched so that this step matches its "gcp-only"
# name and the other *-on-gcp-only steps; their /etc/chef.migration stays in
# place.
function enable-chef-on-gcp-only(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "Starting chef-client on $host"
    ssh_remote "$host" sudo mv /etc/chef.migration /etc/chef
    ssh_remote "$host" sudo service chef-client start
  done
}
# Step 011: run `gitlab-ctl repmgr-check-master` as gitlab-consul on the GCP
# master candidate and report the outcome.
# Globals:  GCP_MASTER_CANDIDATE (read)
# Returns:  0 when the remote check succeeds, 1 otherwise.
function check-repmgr-master(){
  printf 'Checking state of %s\n' "$GCP_MASTER_CANDIDATE"
  # stderr of the remote check is discarded, as in the original.
  if ! ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null; then
    >&2 echo "$GCP_MASTER_CANDIDATE is not repmgr master"
    return 1
  fi
  echo "$GCP_MASTER_CANDIDATE is repmgr master"
  return 0
}
# Step 014: remove the "others may traverse" bit (o-x) from the login user's
# home directory on every Azure host, undoing step 000.
# Globals:  AZURE_HOSTS (read)
# Returns:  status of the last ssh_remote call.
function restore-could-not-change-directory-to-message(){
  local host
  # "${AZURE_HOSTS[@]}" visits every host; a bare "$AZURE_HOSTS" expands to
  # the first element only.
  for host in "${AZURE_HOSTS[@]}"; do
    # ssh joins its arguments with spaces before the remote shell parses them,
    # so `bash -c 'chmod ...'` would reach the remote bash as a bare `chmod`.
    # Sending one single-quoted string keeps the command intact and lets the
    # remote shell expand $HOME.
    ssh_remote "$host" 'chmod o-x "$HOME"'
  done
}
#!/bin/bash
# Steps of the test environment. Each entry is "<3-digit number>_<function>":
# everything after the first underscore is the function that gets looked up
# and run, so the entries must resolve to 000_step and 001_step (a plain
# "000_step" entry would resolve to a function called "step").
export steps=(000_000_step 001_001_step)
# Test step: prints a fixed message and returns the status given as $1.
# Arguments: $1 - exit status to return (optional, defaults to 0)
function 000_step(){
  echo "Running things inside"
  # Defaulting keeps the step usable without an argument under `set -u`.
  return "${1:-0}"
}
# Test step: prints a fixed message and returns the status given as $1.
# Arguments: $1 - exit status to return (optional, defaults to 0)
function 001_step(){
  echo "Running things inside"
  # Defaulting keeps the step usable without an argument under `set -u`.
  return "${1:-0}"
}
#!/bin/bash
# Thin wrapper around ssh: every argument is forwarded unchanged.
# Arguments: whatever ssh accepts (the callers pass a host followed by the
#            remote command).
# Returns:   the exit status of ssh.
ssh_remote() {
  ssh "$@"
}
# Run one step (a command plus its arguments) exactly once and report the
# outcome.
# Arguments: the command to run and its arguments
# Outputs:   "Executing: ..." and the success message on stdout, the failure
#            message on stderr
# Exits:     with status 2 when the command fails
function do_step(){
  echo "Executing: $*"
  # The messages use "$*" (the command text). A backtick-quoted $@ inside the
  # string is a command substitution and would execute the step a second time.
  if "$@"; then
    echo "Step \`$*\` ran OK"
  else
    >&2 echo "Step \`$*\` Failed."
    exit 2
  fi
}
# Print the part of a step name before its first underscore
# (e.g. "001" for "001_create-tombstone-table").
# Arguments: $1 - step name
function step_3digit_number(){
  cut -d _ -f 1 <<< "$1"
}
# Print the numeric value of a step's number prefix without leading zeros
# (e.g. 10 for "010_enable-...", 0 for "000_...").
# Arguments: $1 - step name, "<digits>_<name>"
# Returns:   1 (with a message on stderr) when the prefix is not all digits.
function step_number(){
  local digits="${1%%_*}"
  if [[ ! "$digits" =~ ^[0-9]+$ ]]; then
    >&2 echo "Invalid step number '$digits'"
    return 1
  fi
  # 10# forces base 10, so leading zeros are neither read as octal nor
  # stripped from the middle or end of the number (100 stays 100).
  printf "%d" "$((10#$digits))"
}
# Print the part of a step name after its first underscore, i.e. the name of
# the function that implements the step.
# Arguments: $1 - step name
function step_script(){
  cut -d _ -f 2- <<< "$1"
}
# Tell whether the steps array contains a step whose number prefix equals $1.
# Arguments: $1 - step number prefix to look for (e.g. "003")
# Globals:   steps (read); found and step (written - both are deliberately
#            left global, exactly as before)
# Returns:   0 when a matching step exists, non-zero otherwise.
function has_step(){
  found=false
  for step in "${steps[@]}"; do
    if [ "$(step_3digit_number "$step")" != "$1" ]; then
      continue
    fi
    found=true
    break
  done
  # $found holds the name of the builtin to run: `true` or `false`.
  "$found"
}
function do_menu(){
next_step="$(step_3digit_number "${steps[0]}")"
while true
do
echo "Available steps:"
echo
for step in "${steps[@]}"
do
echo "$(step_3digit_number "$step")) $(step_script "$step")"
done
echo
if has_step "$next_step"
then
export read_prompt="Enter the step number to execute, next to execute step $next_step or quit to exit? "
else
export read_prompt="Enter the step number to execute or quit to exit? "
fi
command="$(rlwrap -C migration bash -c 'read -p "$read_prompt" command; echo "$command"')"
step=
case "$command" in
quit)
exit 0
;;
next)
found=false
for step in "${steps[@]}"