Commit 8702e166 authored by Nikolay's avatar Nikolay

Merge branch 'database_wrangler_automation' into 'master'

Database wrangler automation

See merge request gitlab-com/migration!170
parents 02759828 802a421b
Pipeline #88477 passed with stage
in 25 seconds
source_vars
.project
\ No newline at end of file
......@@ -333,9 +333,88 @@ state of the secondary to converge.
1. [ ] 🐘 {+ Database-Wrangler +}: Ensure the prospective failover target in GCP is up to date
* Staging: `postgres-01.db.gstg.gitlab.com`
* Production: `postgres-01-db-gprd.c.gitlab-production.internal`
* `sudo gitlab-psql -d gitlabhq_production -c "SELECT now() - pg_last_xact_replay_timestamp();"`
* Assuming the clocks are in sync, this value should be close to 0
* If this is a large number, GCP may not have some data that is in Azure
```shell
# Aliases are only expanded in non-interactive shells when expand_aliases is on.
shopt -s expand_aliases
alias ssh_remote="ssh " # TODO(NikolayS) change to "knife ..."
# TODO(NikolayS) move globals to the top / init stage
export GITLAB_ENV="staging"
# Host names are built as <prefix><index><suffix>; pick the naming scheme and
# the GCP promotion candidate for the selected environment.
case "$GITLAB_ENV" in
  production)
    export N_OF_HOSTS=4
    export AZURE_HOST_PREFIX="postgres-0"
    export AZURE_HOST_SUFFIX=".db.prd.gitlab.com"
    export GCP_HOST_PREFIX="postgres-0"
    export GCP_HOST_SUFFIX="-db-gprd.c.gitlab-production.internal"
    export GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}1${GCP_HOST_SUFFIX}"
    ;;
  staging)
    export N_OF_HOSTS=2
    export AZURE_HOST_PREFIX="postgres0"
    export AZURE_HOST_SUFFIX=".db.stg.gitlab.com"
    export GCP_HOST_PREFIX="postgres-0"
    export GCP_HOST_SUFFIX="-db-gstg.c.gitlab-staging-1.internal"
    export GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}2${GCP_HOST_SUFFIX}"
    ;;
  *)
    >&2 echo "\$GITLAB_ENV is not correctly defined ($GITLAB_ENV). Stop all work and exit (enter anything to proceed)."
    read smth
    exit 1
    ;;
esac
# In both environments the current Azure master is host number 2.
export AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
#######################################
# Write a fresh marker row into a scratch "tombstone" database on the Azure
# master and wait until that row becomes visible on the GCP master candidate.
# While waiting, the current replay delay reported by the candidate is printed.
# The function keeps polling every 3 seconds until the row arrives; it has no
# timeout of its own.
# Globals:
#   AZURE_MASTER, GCP_MASTER_CANDIDATE, GITLAB_ENV (read)
#   max_rep_delay (written; not used by this function)
# Arguments:
#   None
# Returns:
#   0 once the marker row is visible on the GCP master candidate
#######################################
handle_gcp_replication_delay() { # put definition to the top
  max_rep_delay=10 # TODO(NikolayS) is 10s ok? Double-check after turning SR on
  local setup_cmd tombstone_msg find_new_msg gcp_cur_rep_delay

  # "create database if not exists" is not supported in Postgres,
  # so to make the following action idempotent and not depending on the pre-actions,
  # we better re-create tombstone DB and table from scratch.
  # The remote script is passed as ONE quoted argument so the local shell
  # cannot word-split or glob-expand it before ssh sees it.
  setup_cmd='cd /tmp;'
  setup_cmd+=' sudo -u gitlab-psql gitlab-psql postgres -c "drop database if exists tombstone";'
  setup_cmd+=' sudo -u gitlab-psql gitlab-psql postgres -c "create database tombstone";'
  setup_cmd+=' sudo -u gitlab-psql gitlab-psql tombstone -c "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)"'
  ssh_remote "$AZURE_MASTER" "$setup_cmd"

  # The marker text is unique per run: timestamp plus environment name.
  tombstone_msg="$(date +'%Y%m%d_%H%M%S')_${GITLAB_ENV}"
  ssh_remote "$AZURE_MASTER" \
    "cd /tmp; sudo -u gitlab-psql gitlab-psql tombstone -c \"insert into tombstone(note) values('${tombstone_msg}') returning *\""

  # wait until the change is propagated
  while true; do
    find_new_msg=$(
      ssh_remote "$GCP_MASTER_CANDIDATE" \
        "cd /tmp; sudo gitlab-psql -Atd tombstone -c \"select created_at from tombstone where note = '$tombstone_msg'\""
    )
    if [[ -n "$find_new_msg" ]]; then
      echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE, continue."
      break
    fi
    gcp_cur_rep_delay=$(
      ssh_remote "$GCP_MASTER_CANDIDATE" \
        "cd /tmp; sudo gitlab-psql -Atd postgres -c 'select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'"
    )
    echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE. The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
    sleep 3
  done
}
handle_gcp_replication_delay
```
1. [ ] 🐺 {+ Coordinator +}: Now disable all sidekiq-cron jobs on the secondary
* In a dedicated rails console on the **secondary**:
* `loop { Sidekiq::Cron::Job.all.map(&:disable!); sleep 1 }`
......@@ -385,24 +464,82 @@ of errors while it is being promoted.
- [ ] `altssh.gitlab.com A 35.190.168.187`
- [ ] `*.gitlab.io A 35.185.44.232`
- **DO NOT** change `gitlab.io`.
1. [ ] 🐘 {+ Database-Wrangler +}: Update the priority of GCP nodes in the repmgr database. Run the following on the current primary:
1. [ ] 🐘 {+ Database-Wrangler +}: Disable chef on all nodes and shut down repmgr
```shell
# gitlab-psql -d gitlab_repmgr -c "update repmgr_gitlab_cluster.repl_nodes set priority=100 where name like '%gstg%'"
#######################################
# Stop or start chef-client on every Azure and GCP database host and stop or
# start repmgrd on the Azure master. Stopping also moves /etc/chef aside to
# /etc/chef.migration; starting moves it back. Consul is not handled yet.
# Asks for interactive confirmation (the environment name) on stdin first.
# Globals:
#   GITLAB_ENV, AZURE_MASTER, AZURE_HOST_PREFIX, AZURE_HOST_SUFFIX,
#   GCP_HOST_PREFIX, GCP_HOST_SUFFIX, N_OF_HOSTS
# Arguments:
#   $1 - "stop" or "start"
# Returns:
#   Non-zero without touching any host when the argument is invalid,
#   GITLAB_ENV is unset/empty, or the confirmation does not match.
#######################################
db_switch_chef_consul_repmgr_state() {
  local action="${1:-}"
  local action_label proceed_cmd host i

  # Validate the requested action before prompting or touching any host.
  case "$action" in
    stop) action_label="Stopping" ;;
    start) action_label="Starting" ;;
    *)
      >&2 echo "1st argument must be either \"stop\" or \"start\" (provided: $action)."
      return 1
      ;;
  esac

  # Reject unset AND empty: with an empty name, just pressing Enter at the
  # prompt below would count as a valid confirmation.
  if [[ -z "${GITLAB_ENV:-}" ]]; then
    >&2 echo "\$GITLAB_ENV is empty, cannot proceed."
    return 1
  fi

  echo "We're about to $action chef and repmgr on $GITLAB_ENV environment. To proceed, type '$GITLAB_ENV':"
  read -r proceed_cmd
  if [[ "$proceed_cmd" != "$GITLAB_ENV" ]]; then
    >&2 echo "Stop."
    return 1
  fi

  # chef
  for ((i = 1; i <= N_OF_HOSTS; i++)); do
    for host in "${AZURE_HOST_PREFIX}$i${AZURE_HOST_SUFFIX}" "${GCP_HOST_PREFIX}$i${GCP_HOST_SUFFIX}"; do
      echo "$action_label chef on $host"
      # WARNING: the following lines modify the state
      if [[ "$action" == "stop" ]]; then
        ssh_remote "$host" "sudo service chef-client stop"
        ssh_remote "$host" "sudo mv /etc/chef /etc/chef.migration"
      else
        ssh_remote "$host" "sudo mv /etc/chef.migration /etc/chef"
        ssh_remote "$host" "sudo service chef-client start"
      fi
    done
  done

  # consul
  # TODO(NikolayS) commands to stop consul see https://gitlab.com/gitlab-com/migration/issues/718#note_90342057

  # repmgr
  # WARNING: the following line modifies the state
  ssh_remote "$AZURE_MASTER" "sudo sv $action /opt/gitlab/sv/repmgrd"
}
db_switch_chef_consul_repmgr_state stop
```
1. [ ] 🐘 {+ Database-Wrangler +}: **Gracefully** turn off the **Azure** postgresql standby instances.
* Keep everything, just ensure it’s turned off
1. [ ] 🐘 {+ Database-Wrangler +}: Forbid writes to the current master (Azure) – do not allow any connections except replication and administrative (local via socket) ones.
```shell
$ knife ssh "role:staging-base-db-postgres AND NOT fqdn:CURRENT_PRIMARY" "gitlab-ctl stop postgresql"
# For each Azure PostgreSQL host 1..N_OF_HOSTS: replace pg_hba.conf with a
# two-rule file, reload the configuration and drop existing client sessions.
# WARNING: every command in this loop modifies state on the remote host.
for i in $(seq 1 $N_OF_HOSTS); do
host="${AZURE_HOST_PREFIX}$i${AZURE_HOST_SUFFIX}"
# Build the new file as pg_hba.conf.tmp. Rule 1: local (socket) connections,
# peer authentication with map=gitlab.
ssh_remote "$host" "sudo -u gitlab-psql sh -c \"echo 'local all all peer map=gitlab' > /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp\""
# Rule 2: replication connections for gitlab_repmgr from any IPv4 address (md5).
ssh_remote "$host" "sudo -u gitlab-psql sh -c \"echo 'host replication gitlab_repmgr 0.0.0.0/0 md5' >> /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp\""
# Keep a copy of the current pg_hba.conf before it is overwritten.
ssh_remote "$host" "sudo -u gitlab-psql cp /var/opt/gitlab/postgresql/data/pg_hba.conf /var/opt/gitlab/postgresql/data/pg_hba.conf.backup"
# Swap the new file into place.
ssh_remote "$host" "sudo -u gitlab-psql mv /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp /var/opt/gitlab/postgresql/data/pg_hba.conf"
# Ask PostgreSQL to re-read its configuration files.
ssh_remote "$host" "cd /tmp; sudo -u gitlab-psql gitlab-psql -c 'select pg_reload_conf()'"
# Terminate every other backend currently connected to gitlabhq_production.
ssh_remote "$host" "cd /tmp; sudo -u gitlab-psql gitlab-psql -c \
\"select pg_terminate_backend(pid) from pg_stat_activity where datname = 'gitlabhq_production' and pid <> pg_backend_pid()\""
done
```
1. [ ] 🐘 {+ Database-Wrangler +}: **Gracefully** turn off the **Azure** postgresql primary instance.
* Keep everything, just ensure it’s turned off
1. [ ] 🐘 {+ Database-Wrangler +}: Perform regular switchover to the main replica on GCP
```shell
$ knife ssh "fqdn:CURRENT_PRIMARY" "gitlab-ctl stop postgresql"
# WARNING WARNING WARNING here switchover happens!
# Runs `pg_ctl promote` for the data directory on the GCP master candidate.
# NOTE(review): the remote command has no sudo, so it runs as whatever user
# ssh_remote logs in as — confirm that user is allowed to promote this cluster.
ssh_remote "$GCP_MASTER_CANDIDATE" \
"/opt/gitlab/embedded/bin/pg_ctl -D /var/opt/gitlab/postgresql/data promote"
```
1. [ ] 🐘 {+ Database-Wrangler +}: After timeout of 30 seconds, repmgr should failover primary to the chosen node in GCP, and other nodes should automatically follow.
TODO --- more checks --- (below this line DB steps are not reworked // NikolayS)
- [ ] Confirm `gitlab-ctl repmgr cluster show` reflects the desired state
- [ ] Confirm pgbouncer node in GCP (Password is in 1password)
......
# Production
# Host names are built as <prefix><index><suffix>.
N_OF_HOSTS=4
AZURE_HOST_PREFIX="postgres-0"
AZURE_HOST_SUFFIX=".db.prd.gitlab.com"
GCP_HOST_PREFIX="postgres-0"
GCP_HOST_SUFFIX="-db-gprd.c.gitlab-production.internal"
AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}1${GCP_HOST_SUFFIX}"
# All database hosts per cloud. The step functions iterate AZURE_HOSTS and
# GCP_HOSTS, so they must be defined here (the entry script runs with set -u).
AZURE_HOSTS=(
  postgres-01.db.prd.gitlab.com
  postgres-02.db.prd.gitlab.com
  postgres-03.db.prd.gitlab.com
  postgres-04.db.prd.gitlab.com
)
GCP_HOSTS=(
  postgres-01-db-gprd.c.gitlab-production.internal
  postgres-02-db-gprd.c.gitlab-production.internal
  postgres-03-db-gprd.c.gitlab-production.internal
  postgres-04-db-gprd.c.gitlab-production.internal
)
# NOTE(review): AZURE_SLAVES lists AZURE_MASTER (postgres-02) and GCP_SLAVES
# lists GCP_MASTER_CANDIDATE (postgres-01) — confirm whether that is intended.
AZURE_SLAVES=(
  postgres-02.db.prd.gitlab.com
  postgres-03.db.prd.gitlab.com
  postgres-04.db.prd.gitlab.com
  postgres-01.db.prd.gitlab.com
)
GCP_SLAVES=(
  postgres-02-db-gprd.c.gitlab-production.internal
  postgres-03-db-gprd.c.gitlab-production.internal
  postgres-01-db-gprd.c.gitlab-production.internal
  postgres-04-db-gprd.c.gitlab-production.internal
)
AZURE_PGBOUNCERS=(
  pgbouncer-01.db.prd.gitlab.com
  pgbouncer-02.db.prd.gitlab.com
)
GCP_PGBOUNCERS=(
  pgbouncer-01-db-gprd.c.gitlab-production.internal
  pgbouncer-02-db-gprd.c.gitlab-production.internal
)
# Generic
max_rep_delay=10
# Staging
# Host names are built as <prefix><index><suffix>.
N_OF_HOSTS=2
AZURE_HOST_PREFIX="postgres0"
AZURE_HOST_SUFFIX=".db.stg.gitlab.com"
GCP_HOST_PREFIX="postgres-0"
GCP_HOST_SUFFIX="-db-gstg.c.gitlab-staging-1.internal"
AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}2${GCP_HOST_SUFFIX}"

# Database hosts per cloud.
AZURE_HOSTS=(postgres01.db.stg.gitlab.com postgres02.db.stg.gitlab.com)
GCP_HOSTS=(
  postgres-02-db-gstg.c.gitlab-staging-1.internal
  postgres-01-db-gstg.c.gitlab-staging-1.internal
  postgres-03-db-gstg.c.gitlab-staging-1.internal
)

# Standby hosts per cloud.
AZURE_SLAVES=(postgres01.db.stg.gitlab.com)
GCP_SLAVES=(
  postgres-02-db-gstg.c.gitlab-staging-1.internal
  postgres-03-db-gstg.c.gitlab-staging-1.internal
)

# pgbouncer hosts per cloud.
AZURE_PGBOUNCERS=(pgbouncer-01.db.stg.gitlab.com)
GCP_PGBOUNCERS=(pgbouncer-01-db-gstg.c.gitlab-staging-1.internal)

# Generic
max_rep_delay=10
\ No newline at end of file
#!/bin/bash
# Entry point of the failover tooling.
# Usage: <this script> <environment>
# Loads env_<environment>, utilities and steps_<environment> (looked up the way
# `source` resolves a bare file name), verifies that every listed step has a
# matching shell function, then opens the interactive menu.
set -eu
if [[ $# -lt 1 ]]; then
  >&2 echo "Specify the environment"
  exit 1
fi
# Because some of the script use one or another, we clone both,
# although we need to homogenize
export ENVIRONMENT=$1
export GITLAB_ENV=$ENVIRONMENT
source "env_${ENVIRONMENT}" # That is, env_staging or env_production (test also supported)
source utilities
source "steps_${ENVIRONMENT}"
# Check all steps have a script: step_script (from utilities) maps a step entry
# to the name of the function expected to implement it.
for step in "${steps[@]}"; do
  if ! type "$(step_script "$step")" > /dev/null 2>&1; then
    >&2 echo "Function $(step_script "$step") does not exist for step $(step_3digit_number "$step")"
    exit 1
  fi
done
echo "menu"
do_menu
#!/bin/bash
# Print the repmgr cluster state as reported on the Azure master.
# Usage: <this script> <environment>
# Sources env_<environment> to obtain AZURE_MASTER.
shopt -s expand_aliases
alias ssh_remote="ssh "
# Abort early when the environment is missing or its file cannot be loaded,
# instead of running ssh with an empty host name.
source "env_${1:?Specify the environment}" || exit 1
# The remote command is one quoted argument so the local shell does not
# word-split it before ssh sees it.
ssh_remote "${AZURE_MASTER}" 'cd /tmp; sudo -u gitlab-psql gitlab-ctl repmgr cluster show'
\ No newline at end of file
#!/bin/bash
# Print the repmgr cluster state as reported on the Azure master.
# Usage: <this script> <environment>
# Sources env_<environment> to obtain AZURE_MASTER.
shopt -s expand_aliases
alias ssh_remote="ssh "
# Abort early when the environment is missing or its file cannot be loaded,
# instead of running ssh with an empty host name.
source "env_${1:?Specify the environment}" || exit 1
# The remote command is one quoted argument so the local shell does not
# word-split it before ssh sees it.
ssh_remote "${AZURE_MASTER}" 'cd /tmp; sudo -u gitlab-psql gitlab-ctl repmgr cluster show'
# Ordered list of failover steps. Each entry has the form
# "<3-digit number>_<name>", where <name> is the name of a step function.
# NOTE(review): entries are resolved through step_script / step_3digit_number,
# which are defined outside this file — confirm how they split an entry.
export steps=(
000_get-rid-of-could-not-change-directory-to-message
001_create-tombstone-table
002_check-gcp-replication-delay
003_disable-chef
004_disable-consul
005_disable-automatic-failover
006_convert-azure-master-to-standby
007_check-gcp-nodes-has-same-azure-lsn
008_perform-gcp-candidate-master-promote
009_check-gcp-candidate-master-is-master
010_enable-automatic-failover-on-gcp-only
011_check-repmgr-master
012_enable-consul-on-gcp-only
013_enable-chef-on-gcp-only
014_restore-could-not-change-directory-to-message
)
# Add search (x) permission for "other" users to $HOME on every Azure host.
# Presumably this silences the "could not change directory to ..." warning
# named in the function title — confirm.
# Globals: AZURE_HOSTS (read)
# NOTE(review): if ssh_remote is plain ssh, the arguments are re-joined on the
# remote side and `bash -c` receives only "chmod" as its command — verify how
# ssh_remote forwards quoted arguments.
function get-rid-of-could-not-change-directory-to-message(){
  local host
  # "${AZURE_HOSTS[@]}" visits every host; a bare "$AZURE_HOSTS" is only the
  # first array element.
  for host in "${AZURE_HOSTS[@]}"; do
    ssh_remote "$host" bash -c 'chmod o+x "$HOME"'
  done
}
# Re-create the scratch "tombstone" database on the Azure master and create
# its single table (created_at, note).
# Globals: AZURE_MASTER (read)
function create-tombstone-table(){
  echo "Create tombstone database and table if not already existing"
  # DROP DATABASE and CREATE DATABASE are sent as separate psql invocations:
  # PostgreSQL refuses both inside a multi-command string.
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -c "drop database if exists tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -c "create database tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone \
    -c "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)"
}
# Insert a uniquely named marker row into tombstone on the Azure master and
# poll the GCP master candidate every 3 seconds until the row is visible there,
# printing the candidate's current replay delay while waiting.
# Globals: AZURE_MASTER, GCP_MASTER_CANDIDATE, ENVIRONMENT (read)
# Returns: 0 once the marker row has arrived (no timeout of its own).
function check-gcp-replication-delay(){
  local tombstone_msg find_new_msg gcp_cur_rep_delay
  tombstone_msg="$(date +'%Y%m%d_%H%M%S')_${ENVIRONMENT}"
  echo "Insert '$tombstone_msg' into tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone -c "insert into tombstone(note) values('${tombstone_msg}') returning *"
  # wait until the change is propagated
  while true; do
    find_new_msg="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -Atd tombstone -c "select created_at from tombstone where note = '$tombstone_msg'")"
    if [[ -n "$find_new_msg" ]]; then
      echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
      break
    fi
    # The trailing backslash keeps host and command in ONE ssh_remote call;
    # without it the query would run through a local sudo instead.
    gcp_cur_rep_delay="$(ssh_remote "$GCP_MASTER_CANDIDATE" \
      sudo gitlab-psql -Atd postgres -c "select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))")"
    echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
    sleep 3
  done
}
# Stop chef-client and move /etc/chef aside (to /etc/chef.migration) on every
# Azure host, then on every GCP host.
# Globals: AZURE_HOSTS, GCP_HOSTS (read)
function disable-chef(){
  local node
  local -a all_db_nodes=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")
  for node in "${all_db_nodes[@]}"
  do
    echo "Stopping chef on $node"
    ssh_remote "$node" sudo service chef-client stop
    ssh_remote "$node" sudo mv /etc/chef /etc/chef.migration
  done
}
# Helper: stop the consul service on each host given as an argument.
function _stop-consul-on(){
  local node
  for node in "$@"; do
    echo "Stopping consul on $node"
    ssh_remote "$node" sudo sv stop /opt/gitlab/sv/consul
  done
}
# Stop consul everywhere, group by group: Azure pgbouncers, GCP pgbouncers,
# Azure database hosts, GCP database hosts.
# Globals: AZURE_PGBOUNCERS, GCP_PGBOUNCERS, AZURE_HOSTS, GCP_HOSTS (read)
function disable-consul(){
  _stop-consul-on "${AZURE_PGBOUNCERS[@]}"
  _stop-consul-on "${GCP_PGBOUNCERS[@]}"
  _stop-consul-on "${AZURE_HOSTS[@]}"
  _stop-consul-on "${GCP_HOSTS[@]}"
}
# Stop repmgrd on all GCP hosts, then on the Azure hosts other than the
# master, and on the Azure master last.
# Globals: GCP_HOSTS, AZURE_HOSTS, AZURE_MASTER (read)
function disable-automatic-failover(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  for host in "${AZURE_HOSTS[@]}"; do
    # The master is handled separately after all standbys.
    if [[ "$AZURE_MASTER" == "$host" ]]; then
      continue
    fi
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  # The service stopped here is repmgrd, so the log line says so.
  echo "Stopping repmgrd on $AZURE_MASTER"
  ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
# Write a standby recovery.conf on the Azure master, restrict its ownership
# and mode, then stop the postgres service: first a stop with a 1 second wait
# and, if that fails, an interrupt followed by a stop with a 60 second wait.
# Globals: AZURE_MASTER (read)
# NOTE(review): this path is under /var/lib/opt/gitlab — confirm it is the
# data directory actually used on the target hosts.
function convert-azure-master-to-standby(){
  local recovery_conf=/var/lib/opt/gitlab/postgresql/data/recovery.conf
  printf '%s\n' "standby_mode = 'on'" "recovery_target_timeline = 'latest'" \
    | ssh_remote "$AZURE_MASTER" sudo tee "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chown postgres:postgres "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chmod 600 "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo sv -W 1 stop /opt/gitlab/sv/postgres \
    || (ssh_remote "$AZURE_MASTER" sudo sv int /opt/gitlab/sv/postgres \
      && ssh_remote "$AZURE_MASTER" sudo sv -W 60 stop /opt/gitlab/sv/postgres)
}
# Poll every 3 seconds until the Azure master and the GCP master candidate
# report the same WAL location (replay location when in recovery, current
# location otherwise).
# Globals: AZURE_MASTER, GCP_MASTER_CANDIDATE (read)
# Returns: 0 once both locations are non-empty and equal (no timeout).
function check-gcp-nodes-has-same-azure-lsn(){
  local azure_master_lsn gcp_master_candidate_lsn
  local lsn_query="select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;"
  while true; do
    azure_master_lsn="$(ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
      -t -A -c "$lsn_query")"
    gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \
      -t -A -c "$lsn_query")"
    if [[ -z "$azure_master_lsn" || -z "$gcp_master_candidate_lsn" ]]; then
      # Two empty answers would otherwise compare equal and be reported as
      # "same LSN"; treat a missing value as "not yet known" and retry.
      >&2 echo "Could not read LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    elif [[ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]]; then
      echo "GCP and Azure have same LSN: $azure_master_lsn"
      return 0
    else
      echo "GCP and Azure have different LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    fi
    sleep 3
  done
}
# Run `pg_ctl promote` as gitlab-psql on the GCP master candidate.
# Globals: GCP_MASTER_CANDIDATE (read)
# NOTE(review): the data directory is under /var/lib/opt/gitlab — confirm it
# matches the target hosts.
function perform-gcp-candidate-master-promote(){
  local -a promote_cmd=(
    sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl
    promote -D /var/lib/opt/gitlab/postgresql/data
  )
  ssh_remote "$GCP_MASTER_CANDIDATE" "${promote_cmd[@]}"
}
# Check that the GCP master candidate has left recovery: asks it for
# pg_is_in_recovery() and looks for "f" in the answer.
# Globals: GCP_MASTER_CANDIDATE (read)
# Returns: 0 when the candidate is a master, 1 otherwise (message on stderr).
function check-gcp-candidate-master-is-master(){
  if ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \
    -t -A -c "select pg_is_in_recovery()" | grep -q 'f'
  then
    echo "$GCP_MASTER_CANDIDATE is master"
    return 0
  else
    # Failure branch: report that the candidate is NOT a master.
    >&2 echo "$GCP_MASTER_CANDIDATE is not master"
    return 1
  fi
}
# Register the GCP master candidate as repmgr master and every other GCP host
# as standby, then start repmgrd on the candidate first and on the remaining
# GCP hosts afterwards.
# Globals: GCP_MASTER_CANDIDATE, GCP_HOSTS (read)
function enable-automatic-failover-on-gcp-only(){
  local host
  echo "Register $GCP_MASTER_CANDIDATE as master with repmgr"
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-ctl repmgr register master
  for host in "${GCP_HOSTS[@]}"; do
    if [[ "$GCP_MASTER_CANDIDATE" == "$host" ]]; then
      continue
    fi
    echo "Register $host as standby with repmgr"
    ssh_remote "$host" sudo gitlab-ctl repmgr register standby
  done
  echo "Starting repmgrd on $GCP_MASTER_CANDIDATE"
  # This step enables failover, so repmgrd on the candidate is started here.
  ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv start /opt/gitlab/sv/repmgrd
  for host in "${GCP_HOSTS[@]}"; do
    if [[ "$GCP_MASTER_CANDIDATE" == "$host" ]]; then
      continue
    fi
    echo "Starting repmgrd on $host"
    ssh_remote "$host" sudo sv start /opt/gitlab/sv/repmgrd
  done
}
# Helper: start the consul service on each host given as an argument.
function _start-consul-on(){
  local node
  for node in "$@"; do
    echo "Starting consul agent on $node"
    ssh_remote "$node" sudo sv start /opt/gitlab/sv/consul
  done
}
# Start consul on the GCP database hosts, then on the GCP pgbouncers.
# Globals: GCP_HOSTS, GCP_PGBOUNCERS (read)
function enable-consul-on-gcp-only(){
  _start-consul-on "${GCP_HOSTS[@]}"
  _start-consul-on "${GCP_PGBOUNCERS[@]}"
}
# Move /etc/chef.migration back to /etc/chef and start chef-client on the GCP
# database hosts only; Azure hosts are left with chef disabled, as the step
# name and the other "-on-gcp-only" steps state.
# Globals: GCP_HOSTS (read)
function enable-chef-on-gcp-only(){
  local host
  # chef
  for host in "${GCP_HOSTS[@]}"; do
    echo "Starting chef-client on $host"
    ssh_remote "$host" sudo mv /etc/chef.migration /etc/chef
    ssh_remote "$host" sudo service chef-client start
  done
}
# Ask the GCP master candidate (via gitlab-ctl repmgr-check-master, run as
# gitlab-consul, stderr discarded) whether it is the repmgr master.
# Globals: GCP_MASTER_CANDIDATE (read)
# Returns: 0 when the check succeeds, 1 otherwise (message on stderr).
function check-repmgr-master(){
  echo "Checking state of $GCP_MASTER_CANDIDATE"
  if ! ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null; then
    >&2 echo "$GCP_MASTER_CANDIDATE is not repmgr master"
    return 1
  fi
  echo "$GCP_MASTER_CANDIDATE is repmgr master"
  return 0
}
# Remove search (x) permission for "other" users from $HOME on every Azure
# host, undoing the chmod o+x applied by the first step.
# Globals: AZURE_HOSTS (read)
# NOTE(review): if ssh_remote is plain ssh, the arguments are re-joined on the
# remote side and `bash -c` receives only "chmod" as its command — verify how
# ssh_remote forwards quoted arguments.
function restore-could-not-change-directory-to-message(){
  local host
  # "${AZURE_HOSTS[@]}" visits every host; a bare "$AZURE_HOSTS" is only the
  # first array element.
  for host in "${AZURE_HOSTS[@]}"; do
    ssh_remote "$host" bash -c 'chmod o-x "$HOME"'
  done
}
# Ordered list of failover steps. Each entry has the form
# "<3-digit number>_<name>", where <name> is the name of a step function.
# NOTE(review): entries are resolved through step_script / step_3digit_number,
# which are defined outside this file — confirm how they split an entry.
export steps=(
000_get-rid-of-could-not-change-directory-to-message
001_create-tombstone-table
002_check-gcp-replication-delay
003_disable-chef
004_disable-consul
005_disable-automatic-failover
006_convert-azure-master-to-standby
007_check-gcp-nodes-has-same-azure-lsn
008_perform-gcp-candidate-master-promote
009_check-gcp-candidate-master-is-master
010_enable-automatic-failover-on-gcp-only
011_check-repmgr-master
012_enable-consul-on-gcp-only
013_enable-chef-on-gcp-only
014_restore-could-not-change-directory-to-message
)
# Add search (x) permission for "other" users to $HOME on every Azure host.
# Presumably this silences the "could not change directory to ..." warning
# named in the function title — confirm.
# Globals: AZURE_HOSTS (read)
# NOTE(review): if ssh_remote is plain ssh, the arguments are re-joined on the
# remote side and `bash -c` receives only "chmod" as its command — verify how
# ssh_remote forwards quoted arguments.
function get-rid-of-could-not-change-directory-to-message(){
  local host
  # "${AZURE_HOSTS[@]}" visits every host; a bare "$AZURE_HOSTS" is only the
  # first array element.
  for host in "${AZURE_HOSTS[@]}"; do
    ssh_remote "$host" bash -c 'chmod o+x "$HOME"'
  done
}
# Re-create the scratch "tombstone" database on the Azure master and create
# its single table (created_at, note).
# Globals: AZURE_MASTER (read)
function create-tombstone-table(){
  echo "Create tombstone database and table if not already existing"
  # DROP DATABASE and CREATE DATABASE are sent as separate psql invocations:
  # PostgreSQL refuses both inside a multi-command string.
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -c "drop database if exists tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
    -c "create database tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone \
    -c "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)"
}
# Insert a uniquely named marker row into tombstone on the Azure master and
# poll the GCP master candidate every 3 seconds until the row is visible there,
# printing the candidate's current replay delay while waiting.
# Globals: AZURE_MASTER, GCP_MASTER_CANDIDATE, ENVIRONMENT (read)
# Returns: 0 once the marker row has arrived (no timeout of its own).
function check-gcp-replication-delay(){
  local tombstone_msg find_new_msg gcp_cur_rep_delay
  tombstone_msg="$(date +'%Y%m%d_%H%M%S')_${ENVIRONMENT}"
  echo "Insert '$tombstone_msg' into tombstone"
  ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql tombstone -c "insert into tombstone(note) values('${tombstone_msg}') returning *"
  # wait until the change is propagated
  while true; do
    find_new_msg="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -Atd tombstone -c "select created_at from tombstone where note = '$tombstone_msg'")"
    if [[ -n "$find_new_msg" ]]; then
      echo "New tombstone message arrived to $GCP_MASTER_CANDIDATE."
      break
    fi
    # The trailing backslash keeps host and command in ONE ssh_remote call;
    # without it the query would run through a local sudo instead.
    gcp_cur_rep_delay="$(ssh_remote "$GCP_MASTER_CANDIDATE" \
      sudo gitlab-psql -Atd postgres -c "select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))")"
    echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..."
    sleep 3
  done
}
# Stop chef-client and move /etc/chef aside (to /etc/chef.migration) on every
# Azure host, then on every GCP host.
# Globals: AZURE_HOSTS, GCP_HOSTS (read)
function disable-chef(){
  local node
  local -a all_db_nodes=("${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}")
  for node in "${all_db_nodes[@]}"
  do
    echo "Stopping chef on $node"
    ssh_remote "$node" sudo service chef-client stop
    ssh_remote "$node" sudo mv /etc/chef /etc/chef.migration
  done
}
# Helper: stop the consul service on each host given as an argument.
function _stop-consul-on(){
  local node
  for node in "$@"; do
    echo "Stopping consul on $node"
    ssh_remote "$node" sudo sv stop /opt/gitlab/sv/consul
  done
}
# Stop consul everywhere, group by group: Azure pgbouncers, GCP pgbouncers,
# Azure database hosts, GCP database hosts.
# Globals: AZURE_PGBOUNCERS, GCP_PGBOUNCERS, AZURE_HOSTS, GCP_HOSTS (read)
function disable-consul(){
  _stop-consul-on "${AZURE_PGBOUNCERS[@]}"
  _stop-consul-on "${GCP_PGBOUNCERS[@]}"
  _stop-consul-on "${AZURE_HOSTS[@]}"
  _stop-consul-on "${GCP_HOSTS[@]}"
}
# Stop repmgrd on all GCP hosts, then on the Azure hosts other than the
# master, and on the Azure master last.
# Globals: GCP_HOSTS, AZURE_HOSTS, AZURE_MASTER (read)
function disable-automatic-failover(){
  local host
  for host in "${GCP_HOSTS[@]}"; do
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  for host in "${AZURE_HOSTS[@]}"; do
    # The master is handled separately after all standbys.
    if [[ "$AZURE_MASTER" == "$host" ]]; then
      continue
    fi
    echo "Stopping repmgrd on $host"
    ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd
  done
  # The service stopped here is repmgrd, so the log line says so.
  echo "Stopping repmgrd on $AZURE_MASTER"
  ssh_remote "$AZURE_MASTER" sudo sv stop /opt/gitlab/sv/repmgrd
}
# Write a standby recovery.conf on the Azure master, restrict its ownership
# and mode, then stop the postgres service: first a stop with a 1 second wait
# and, if that fails, an interrupt followed by a stop with a 60 second wait.
# Globals: AZURE_MASTER (read)
# NOTE(review): this path is under /var/lib/opt/gitlab — confirm it is the
# data directory actually used on the target hosts.
function convert-azure-master-to-standby(){
  local recovery_conf=/var/lib/opt/gitlab/postgresql/data/recovery.conf
  printf '%s\n' "standby_mode = 'on'" "recovery_target_timeline = 'latest'" \
    | ssh_remote "$AZURE_MASTER" sudo tee "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chown postgres:postgres "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo chmod 600 "$recovery_conf"
  ssh_remote "$AZURE_MASTER" sudo sv -W 1 stop /opt/gitlab/sv/postgres \
    || (ssh_remote "$AZURE_MASTER" sudo sv int /opt/gitlab/sv/postgres \
      && ssh_remote "$AZURE_MASTER" sudo sv -W 60 stop /opt/gitlab/sv/postgres)
}
# Poll every 3 seconds until the Azure master and the GCP master candidate
# report the same WAL location (replay location when in recovery, current
# location otherwise).
# Globals: AZURE_MASTER, GCP_MASTER_CANDIDATE (read)
# Returns: 0 once both locations are non-empty and equal (no timeout).
function check-gcp-nodes-has-same-azure-lsn(){
  local azure_master_lsn gcp_master_candidate_lsn
  local lsn_query="select case when pg_is_in_recovery()
then pg_last_xlog_replay_location()
else pg_current_xlog_location() end;"
  while true; do
    azure_master_lsn="$(ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql postgres \
      -t -A -c "$lsn_query")"
    gcp_master_candidate_lsn="$(ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \
      -t -A -c "$lsn_query")"
    if [[ -z "$azure_master_lsn" || -z "$gcp_master_candidate_lsn" ]]; then
      # Two empty answers would otherwise compare equal and be reported as
      # "same LSN"; treat a missing value as "not yet known" and retry.
      >&2 echo "Could not read LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    elif [[ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ]]; then
      echo "GCP and Azure have same LSN: $azure_master_lsn"
      return 0
    else
      echo "GCP and Azure have different LSN: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn"
    fi
    sleep 3
  done
}
# Run `pg_ctl promote` as gitlab-psql on the GCP master candidate.
# Globals: GCP_MASTER_CANDIDATE (read)
# NOTE(review): the data directory is under /var/lib/opt/gitlab — confirm it
# matches the target hosts.
function perform-gcp-candidate-master-promote(){
  local -a promote_cmd=(
    sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl
    promote -D /var/lib/opt/gitlab/postgresql/data
  )
  ssh_remote "$GCP_MASTER_CANDIDATE" "${promote_cmd[@]}"
}
function check-gcp-candidate-master-is-master(){
if ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql postgres \