From 1c722d7550956d14d4d772cdec08f6c84be3f900 Mon Sep 17 00:00:00 2001 From: Matteo Melli Date: Wed, 8 Aug 2018 17:44:07 +0200 Subject: [PATCH] Fixes, failback procedure and notes in README.md --- README.md | 10 ++++++ bin/migration | 2 +- bin/{utilities => migration_utilities} | 0 .../110-check-repmgr-state.sh | 2 +- .../111-check-replication-topology.sh | 24 +++++++------- .../060_go/p03/050-create-tombstone-table.sh | 4 +-- .../p03/051-check-gcp-replication-delay.sh | 6 ++-- .../p04/043-reset-automatic-failover-state.sh | 2 +- ...onvert-azure-master-to-headless-standby.sh | 2 +- .../p04/051-check-azure-master-is-standby.sh | 2 +- .../052-check-gcp-nodes-has-same-azure-lsn.sh | 4 +-- ...61-check-gcp-candidate-master-is-master.sh | 2 +- ...convert-azure-master-to-standby-of-gcp.sh} | 2 +- bin/scripts/04_failback/040-disable-chef.sh | 12 +++++++ bin/scripts/04_failback/041-disable-consul.sh | 29 ++++++++++++++++ .../042-disable-automatic-failover.sh | 23 +++++++++++++ .../043-reset-automatic-failover-state.sh | 9 +++++ ...-convert-cgp-master-to-standby-of-azure.sh | 14 ++++++++ .../051-check-gcp-master-is-standby.sh | 17 ++++++++++ .../052-check-azure-nodes-has-same-gcp-lsn.sh | 22 +++++++++++++ .../060-perform-azure-master-promote.sh | 10 ++++++ .../061-check-azure-master-is-master.sh | 17 ++++++++++ .../070-enable-automatic-failover.sh | 30 +++++++++++++++++ .../04_failback/071-check-repmgr-master.sh | 16 +++++++++ bin/scripts/04_failback/072-enable-consul.sh | 12 +++++++ .../073-check-pgbouncer-node-in-azure.sh | 18 ++++++++++ bin/steps_database_wrangler | 2 +- bin/steps_database_wrangler_failback | 33 +++++++++++++++++++ 28 files changed, 299 insertions(+), 27 deletions(-) rename bin/{utilities => migration_utilities} (100%) rename bin/scripts/02_failover/060_go/p04/{074-convert-azure-master-to-gcp-standby.sh => 074-convert-azure-master-to-standby-of-gcp.sh} (61%) create mode 100755 bin/scripts/04_failback/040-disable-chef.sh create mode 100755 
bin/scripts/04_failback/041-disable-consul.sh create mode 100755 bin/scripts/04_failback/042-disable-automatic-failover.sh create mode 100755 bin/scripts/04_failback/043-reset-automatic-failover-state.sh create mode 100755 bin/scripts/04_failback/050-convert-cgp-master-to-standby-of-azure.sh create mode 100755 bin/scripts/04_failback/051-check-gcp-master-is-standby.sh create mode 100755 bin/scripts/04_failback/052-check-azure-nodes-has-same-gcp-lsn.sh create mode 100755 bin/scripts/04_failback/060-perform-azure-master-promote.sh create mode 100755 bin/scripts/04_failback/061-check-azure-master-is-master.sh create mode 100755 bin/scripts/04_failback/070-enable-automatic-failover.sh create mode 100755 bin/scripts/04_failback/071-check-repmgr-master.sh create mode 100755 bin/scripts/04_failback/072-enable-consul.sh create mode 100755 bin/scripts/04_failback/073-check-pgbouncer-node-in-azure.sh create mode 100644 bin/steps_database_wrangler_failback diff --git a/README.md b/README.md index 5eaa578..a9a00b3 100644 --- a/README.md +++ b/README.md @@ -188,3 +188,13 @@ Then carry out the following steps: 1. **Verify `/opt/gitlab-migration/bin/verify-failover-config`**: You should receive a message indicating success 1. **Setup the workflow issues**": Run `/opt/gitlab-migration/bin/start-failover-procedure.sh`. This will setup several issues in the issue tracker for performing the checks, failover, tests, etc. * Any variables in the template in the format `__VARIABLE__` will be substituted with their values from the `bin/source_vars` file, saving manual effort. + +### Migration scripts + +1. Prepare file `env_<environment>` pointing environment variables to correct hosts. +1. Steps scripts are mapped in file `steps_<role>` in order of execution (To define failback steps just add `_failback` suffix to the role) +1. 
To run the runbook script menu use the `migration` script: + +```shell +bash bin/migration +``` diff --git a/bin/migration b/bin/migration index d605e90..42a0a18 100755 --- a/bin/migration +++ b/bin/migration @@ -14,7 +14,7 @@ export GITLAB_ENV=$ENVIRONMENT # shellcheck source=/dev/null source "${BASE}/source_vars" source "${BASE}/env_${ENVIRONMENT}" # That is, .env_staging or .env_production (test also supported) -source "${BASE}/utilities" +source "${BASE}/migration_utilities" source "${BASE}/steps_${ROLE}" #Check all steps have a script diff --git a/bin/utilities b/bin/migration_utilities similarity index 100% rename from bin/utilities rename to bin/migration_utilities diff --git a/bin/scripts/01_preflight/050_configuration_checks/110-check-repmgr-state.sh b/bin/scripts/01_preflight/050_configuration_checks/110-check-repmgr-state.sh index 6928a3d..9341350 100755 --- a/bin/scripts/01_preflight/050_configuration_checks/110-check-repmgr-state.sh +++ b/bin/scripts/01_preflight/050_configuration_checks/110-check-repmgr-state.sh @@ -8,5 +8,5 @@ source "${BASE}/env_${ENVIRONMENT}" echo "Checking repmgr state for host $AZURE_MASTER" echo echo "select * from repmgr_gitlab_cluster.repl_nodes" \ - | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr echo diff --git a/bin/scripts/01_preflight/050_configuration_checks/111-check-replication-topology.sh b/bin/scripts/01_preflight/050_configuration_checks/111-check-replication-topology.sh index 8305ba4..61d5a6b 100755 --- a/bin/scripts/01_preflight/050_configuration_checks/111-check-replication-topology.sh +++ b/bin/scripts/01_preflight/050_configuration_checks/111-check-replication-topology.sh @@ -14,7 +14,7 @@ do if [ "$host" != "$AZURE_MASTER" ] then if ! 
echo "select pg_is_in_recovery()" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q 't' then >&2 echo "Host $host is not standby" @@ -22,30 +22,30 @@ do fi else if ! echo "select pg_is_in_recovery()" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q 'f' then >&2 echo "Host $host is not master" all_ok=false fi if ! echo "select count(1) from pg_stat_replication" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q '4' then >&2 echo "Host $host is not replicated by 4 nodes:" echo "select * from pg_stat_replication" \ - | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres + | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres all_ok=false fi for slave_host in "${AZURE_SLAVES[@]}" "$GCP_MASTER_CANDIDATE" do if ! echo "select client_addr||'-'||state from pg_stat_replication" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming" then >&2 echo "Host $host is not correclty replicated by host $slave_host:" echo "select * from pg_stat_replication" \ - | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres + | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres all_ok=false fi done @@ -60,7 +60,7 @@ do if [ "$host" != "$GCP_MASTER_CANDIDATE" ] then if ! 
echo "select pg_is_in_recovery()" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q 't' then >&2 echo "Host $host is not standby" @@ -68,30 +68,30 @@ do fi else if ! echo "select pg_is_in_recovery()" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q 't' then >&2 echo "Host $host is not standby" all_ok=false fi if ! echo "select count(1) from pg_stat_replication" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q '3' then >&2 echo "Host $host is not replicated by 3 nodes:" echo "select * from pg_stat_replication" \ - | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres + | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres all_ok=false fi for slave_host in "${GCP_SLAVES[@]}" do if ! 
echo "select client_addr||'-'||state from pg_stat_replication" \ - | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres -A -t \ + | ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -A -t \ | grep -q "$(host -t A "$slave_host"|cut -d ' ' -f 4)/32-streaming" then >&2 echo "Host $host is not correclty replicated by host $slave_host:" echo "select * from pg_stat_replication" \ - | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -d postgres + | >&2 ssh_remote "$host" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres all_ok=false fi done diff --git a/bin/scripts/02_failover/060_go/p03/050-create-tombstone-table.sh b/bin/scripts/02_failover/060_go/p03/050-create-tombstone-table.sh index afb0060..5a2f0ca 100755 --- a/bin/scripts/02_failover/060_go/p03/050-create-tombstone-table.sh +++ b/bin/scripts/02_failover/060_go/p03/050-create-tombstone-table.sh @@ -7,6 +7,6 @@ source "${BASE}/env_${ENVIRONMENT}" echo "Create tombstone database and table if not already existing" echo "drop database if exists tombstone; create database tombstone" \ - | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres echo "create table if not exists tombstone (created_at timestamptz default now() primary key, note text)" \ - | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d tombstone + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d tombstone diff --git a/bin/scripts/02_failover/060_go/p03/051-check-gcp-replication-delay.sh b/bin/scripts/02_failover/060_go/p03/051-check-gcp-replication-delay.sh index d06ed91..67053ca 100755 --- a/bin/scripts/02_failover/060_go/p03/051-check-gcp-replication-delay.sh +++ b/bin/scripts/02_failover/060_go/p03/051-check-gcp-replication-delay.sh @@ -8,16 +8,16 @@ source "${BASE}/env_${ENVIRONMENT}" tombstone_msg=$(date 
+'%Y%m%d_%H%M%S')"_${ENVIRONMENT}" echo "Insert '$tombstone_msg' into tombstone" echo "insert into tombstone(note) values('${tombstone_msg}') returning *" \ - | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d tombstone + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d tombstone # wait until the change is propagated while true do find_new_msg="$(echo "select created_at from tombstone where note = '$tombstone_msg'" \ - | ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -A -t -d tombstone)" + | ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -v ON_ERROR_STOP=1 -A -t -d tombstone)" if [[ -z "${find_new_msg+x}" ]] || [[ "$find_new_msg" == "" ]] then gcp_cur_rep_delay="$(echo "select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))" \ - | ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -A -t -d postgres)" + | ssh_remote "$GCP_MASTER_CANDIDATE" sudo gitlab-psql -v ON_ERROR_STOP=1 -A -t -d postgres)" echo "New tombstone message is not seen on $GCP_MASTER_CANDIDATE (GCP MASTER CANDIDATE). The replication delay: ${gcp_cur_rep_delay}s. Wait 3 seconds..." 
sleep 3 else diff --git a/bin/scripts/02_failover/060_go/p04/043-reset-automatic-failover-state.sh b/bin/scripts/02_failover/060_go/p04/043-reset-automatic-failover-state.sh index 9e19bc9..db21467 100755 --- a/bin/scripts/02_failover/060_go/p04/043-reset-automatic-failover-state.sh +++ b/bin/scripts/02_failover/060_go/p04/043-reset-automatic-failover-state.sh @@ -6,4 +6,4 @@ set -eu source "${BASE}/env_${ENVIRONMENT}" echo "TRUNCATE repmgr_gitlab_cluster.repl_nodes" \ - | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d gitlab_repmgr + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr diff --git a/bin/scripts/02_failover/060_go/p04/050-convert-azure-master-to-headless-standby.sh b/bin/scripts/02_failover/060_go/p04/050-convert-azure-master-to-headless-standby.sh index f583279..3074a88 100755 --- a/bin/scripts/02_failover/060_go/p04/050-convert-azure-master-to-headless-standby.sh +++ b/bin/scripts/02_failover/060_go/p04/050-convert-azure-master-to-headless-standby.sh @@ -9,7 +9,7 @@ echo "Updating recovery.conf on $AZURE_MASTER" echo "standby_mode = 'on' recovery_target_timeline = 'latest'" | \ ssh_remote "$AZURE_MASTER" sudo tee /var/opt/gitlab/postgresql/data/recovery.conf > /dev/null -ssh_remote "$AZURE_MASTER" sudo chown gitlab-psql:gitlab-psql /var/opt/gitlab/postgresql/data/recovery.conf +ssh_remote "$AZURE_MASTER" sudo chown gitlab-psql:gitlab-psql /var/opt/gitlab/postgresql/data/recovery.conf ssh_remote "$AZURE_MASTER" sudo chmod 600 /var/opt/gitlab/postgresql/data/recovery.conf echo "Restarting postgres on $AZURE_MASTER" (ssh_remote "$AZURE_MASTER" sudo sv -w 1 stop /opt/gitlab/sv/postgresql \ diff --git a/bin/scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh b/bin/scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh index bb7fd99..2389cb0 100755 --- a/bin/scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh +++ 
b/bin/scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh @@ -6,7 +6,7 @@ set -eu source "${BASE}/env_${ENVIRONMENT}" if echo "select pg_is_in_recovery()" \ - | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres -t -A \ + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \ | grep -q 't' then echo "$AZURE_MASTER is standby" diff --git a/bin/scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh b/bin/scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh index f4fb759..59c0628 100755 --- a/bin/scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh +++ b/bin/scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh @@ -8,11 +8,11 @@ source "${BASE}/env_${ENVIRONMENT}" azure_master_lsn="$(echo "select case when pg_is_in_recovery() then pg_last_xlog_replay_location() else pg_current_xlog_location() end;" \ - | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -d postgres -t -A)" + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A)" gcp_master_candidate_lsn="$(echo "select case when pg_is_in_recovery() then pg_last_xlog_replay_location() else pg_current_xlog_location() end;" \ - | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -d postgres -t -A)" + | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A)" if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ] then echo "GCP and Azure have same LSN: $azure_master_lsn" diff --git a/bin/scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh b/bin/scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh index 7c79464..d2bacc4 100755 --- a/bin/scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh +++ b/bin/scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh @@ -6,7 
+6,7 @@ set -eu source "${BASE}/env_${ENVIRONMENT}" if echo "select pg_is_in_recovery()" \ - | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -d postgres -t -A \ + | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \ | grep -q 'f' then echo "$GCP_MASTER_CANDIDATE is master" diff --git a/bin/scripts/02_failover/060_go/p04/074-convert-azure-master-to-gcp-standby.sh b/bin/scripts/02_failover/060_go/p04/074-convert-azure-master-to-standby-of-gcp.sh similarity index 61% rename from bin/scripts/02_failover/060_go/p04/074-convert-azure-master-to-gcp-standby.sh rename to bin/scripts/02_failover/060_go/p04/074-convert-azure-master-to-standby-of-gcp.sh index 4e0013e..3e0b640 100755 --- a/bin/scripts/02_failover/060_go/p04/074-convert-azure-master-to-gcp-standby.sh +++ b/bin/scripts/02_failover/060_go/p04/074-convert-azure-master-to-standby-of-gcp.sh @@ -6,4 +6,4 @@ set -eu source "${BASE}/env_${ENVIRONMENT}" echo "Convert postgres on $AZURE_MASTER as standby of $GCP_MASTER_CANDIDATE" -ssh_remote "$AZURE_MASTER" gitlab-ctl repmgr standby follow "$GCP_MASTER_CANDIDATE" +ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-ctl repmgr standby follow "$GCP_MASTER_CANDIDATE" diff --git a/bin/scripts/04_failback/040-disable-chef.sh b/bin/scripts/04_failback/040-disable-chef.sh new file mode 100755 index 0000000..c9347e2 --- /dev/null +++ b/bin/scripts/04_failback/040-disable-chef.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}"; do + echo "Stopping chef on $host" + ssh_remote "$host" sudo service chef-client stop + ssh_remote "$host" sudo mv /etc/chef /etc/chef.migration +done \ No newline at end of file diff --git a/bin/scripts/04_failback/041-disable-consul.sh b/bin/scripts/04_failback/041-disable-consul.sh new file mode 100755 index 0000000..3c8ec47 --- /dev/null +++ 
b/bin/scripts/04_failback/041-disable-consul.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +for host in "${AZURE_PGBOUNCERS[@]}" +do + echo "Stopping consul on $host" + ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul +done +for host in "${GCP_PGBOUNCERS[@]}" +do + echo "Stopping consul on $host" + ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul +done + +for host in "${AZURE_HOSTS[@]}" +do + echo "Stopping consul on $host" + ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul +done + +for host in "${GCP_HOSTS[@]}" +do + echo "Stopping consul on $host" + ssh_remote "$host" sudo sv stop /opt/gitlab/sv/consul +done diff --git a/bin/scripts/04_failback/042-disable-automatic-failover.sh b/bin/scripts/04_failback/042-disable-automatic-failover.sh new file mode 100755 index 0000000..0b518a5 --- /dev/null +++ b/bin/scripts/04_failback/042-disable-automatic-failover.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +for host in "${AZURE_HOSTS[@]}" +do + echo "Stopping repmgrd on $host" + ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd +done +for host in "${GCP_HOSTS[@]}" +do + if [ "$GCP_MASTER_CANDIDATE" == "$host" ] + then + continue + fi + echo "Stopping repmgrd on $host" + ssh_remote "$host" sudo sv stop /opt/gitlab/sv/repmgrd +done +echo "Stopping repmgrd on $GCP_MASTER_CANDIDATE" +ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv stop /opt/gitlab/sv/repmgrd diff --git a/bin/scripts/04_failback/043-reset-automatic-failover-state.sh b/bin/scripts/04_failback/043-reset-automatic-failover-state.sh new file mode 100755 index 0000000..d40f7a2 --- /dev/null +++ b/bin/scripts/04_failback/043-reset-automatic-failover-state.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +echo "TRUNCATE repmgr_gitlab_cluster.repl_nodes" \ + | ssh_remote "$GCP_MASTER_CANDIDATE" sudo 
-u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d gitlab_repmgr diff --git a/bin/scripts/04_failback/050-convert-cgp-master-to-standby-of-azure.sh b/bin/scripts/04_failback/050-convert-cgp-master-to-standby-of-azure.sh new file mode 100755 index 0000000..67e5852 --- /dev/null +++ b/bin/scripts/04_failback/050-convert-cgp-master-to-standby-of-azure.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +echo "Coping recovery.done to recovery.conf on $GCP_MASTER_CANDIDATE" +ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql cp /var/opt/gitlab/postgresql/data/recovery.done /var/opt/gitlab/postgresql/data/recovery.conf +echo "Restarting postgres on $GCP_MASTER_CANDIDATE" +(ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 1 stop /opt/gitlab/sv/postgresql \ + || (ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv int /opt/gitlab/sv/postgresql \ + && ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 60 stop /opt/gitlab/sv/postgresql)) \ + && ssh_remote "$GCP_MASTER_CANDIDATE" sudo sv -w 60 start /opt/gitlab/sv/postgresql diff --git a/bin/scripts/04_failback/051-check-gcp-master-is-standby.sh b/bin/scripts/04_failback/051-check-gcp-master-is-standby.sh new file mode 100755 index 0000000..0105936 --- /dev/null +++ b/bin/scripts/04_failback/051-check-gcp-master-is-standby.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +if echo "select pg_is_in_recovery()" \ + | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \ + | grep -q 't' +then + echo "$GCP_MASTER_CANDIDATE is standby" + exit 0 +else + >&2 echo "$GCP_MASTER_CANDIDATE is NOT standby" + exit 1 +fi diff --git a/bin/scripts/04_failback/052-check-azure-nodes-has-same-gcp-lsn.sh b/bin/scripts/04_failback/052-check-azure-nodes-has-same-gcp-lsn.sh new file mode 100755 index 0000000..59c0628 --- /dev/null +++ 
b/bin/scripts/04_failback/052-check-azure-nodes-has-same-gcp-lsn.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +azure_master_lsn="$(echo "select case when pg_is_in_recovery() + then pg_last_xlog_replay_location() + else pg_current_xlog_location() end;" \ + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A)" +gcp_master_candidate_lsn="$(echo "select case when pg_is_in_recovery() + then pg_last_xlog_replay_location() + else pg_current_xlog_location() end;" \ + | ssh_remote "$GCP_MASTER_CANDIDATE" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A)" +if [ "$azure_master_lsn" == "$gcp_master_candidate_lsn" ] +then + echo "GCP and Azure have same LSN: $azure_master_lsn" + exit 0 +fi +echo "GCP and Azure have NOT same LSN. Current LSNs are: Azure/$azure_master_lsn GCP/$gcp_master_candidate_lsn" +exit 1 diff --git a/bin/scripts/04_failback/060-perform-azure-master-promote.sh b/bin/scripts/04_failback/060-perform-azure-master-promote.sh new file mode 100755 index 0000000..7dc2bba --- /dev/null +++ b/bin/scripts/04_failback/060-perform-azure-master-promote.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +# WARNING WARNING WARNING here switchback happens! 
+ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql /opt/gitlab/embedded/bin/pg_ctl \ + promote -D /var/opt/gitlab/postgresql/data diff --git a/bin/scripts/04_failback/061-check-azure-master-is-master.sh b/bin/scripts/04_failback/061-check-azure-master-is-master.sh new file mode 100755 index 0000000..b4db778 --- /dev/null +++ b/bin/scripts/04_failback/061-check-azure-master-is-master.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +if echo "select pg_is_in_recovery()" \ + | ssh_remote "$AZURE_MASTER" sudo -u gitlab-psql gitlab-psql -v ON_ERROR_STOP=1 -d postgres -t -A \ + | grep -q 'f' +then + echo "$AZURE_MASTER is master" + exit 0 +else + >&2 echo "$AZURE_MASTER is NOT master" + exit 1 +fi diff --git a/bin/scripts/04_failback/070-enable-automatic-failover.sh b/bin/scripts/04_failback/070-enable-automatic-failover.sh new file mode 100755 index 0000000..3588124 --- /dev/null +++ b/bin/scripts/04_failback/070-enable-automatic-failover.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +echo "Register $AZURE_MASTER as master with repmgr" +ssh_remote "$AZURE_MASTER" sudo gitlab-ctl repmgr master register +for host in "${GCP_HOSTS[@]}" "${AZURE_HOSTS[@]}" +do + if [ "$AZURE_MASTER" == "$host" ] + then + continue; + fi + echo "Register $host as standby with repmgr" + ssh_remote "$host" sudo gitlab-ctl repmgr standby register +done + +echo "Starting repmgrd on $AZURE_MASTER" +ssh_remote "$AZURE_MASTER" sudo sv start /opt/gitlab/sv/repmgrd +for host in "${GCP_HOSTS[@]}" "${AZURE_HOSTS[@]}" +do + if [ "$AZURE_MASTER" == "$host" ] + then + continue; + fi + echo "Starting repmgrd on $host" + ssh_remote "$host" sudo sv start /opt/gitlab/sv/repmgrd +done diff --git a/bin/scripts/04_failback/071-check-repmgr-master.sh b/bin/scripts/04_failback/071-check-repmgr-master.sh new file mode 100755 index 0000000..5c5e842 --- /dev/null +++ 
b/bin/scripts/04_failback/071-check-repmgr-master.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +echo "Checking state of $AZURE_MASTER" +if ssh_remote "$AZURE_MASTER" sudo -u gitlab-consul gitlab-ctl repmgr-check-master 2> /dev/null +then + echo "$AZURE_MASTER is repmgr master" + exit 0 +else + >&2 echo "$AZURE_MASTER is not repmgr master" + exit 1 +fi diff --git a/bin/scripts/04_failback/072-enable-consul.sh b/bin/scripts/04_failback/072-enable-consul.sh new file mode 100755 index 0000000..25da748 --- /dev/null +++ b/bin/scripts/04_failback/072-enable-consul.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +# Restart consul on every node where 041-disable-consul.sh stopped it (database hosts and pgbouncers) +for host in "${GCP_HOSTS[@]}" "${AZURE_HOSTS[@]}" "${GCP_PGBOUNCERS[@]}" "${AZURE_PGBOUNCERS[@]}"; do + echo "Starting consul agent on $host" + ssh_remote "$host" sudo sv start /opt/gitlab/sv/consul +done diff --git a/bin/scripts/04_failback/073-check-pgbouncer-node-in-azure.sh b/bin/scripts/04_failback/073-check-pgbouncer-node-in-azure.sh new file mode 100755 index 0000000..3793b58 --- /dev/null +++ b/bin/scripts/04_failback/073-check-pgbouncer-node-in-azure.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -eu + +# shellcheck source=/dev/null +source "${BASE}/env_${ENVIRONMENT}" + +for host in "${GCP_PGBOUNCERS[@]}" "${AZURE_PGBOUNCERS[@]}" +do + echo "Check pgbouncer on $host" + echo "SHOW DATABASES" | ssh_remote "$host" gitlab-ctl pgb-console + echo "SHOW SERVERS" | ssh_remote "$host" gitlab-ctl pgb-console + read -r -s -N 1 -p "Press [y] to continue, any other key to abort." 
key + if [ "$key" != "y" ] + then + exit 1 + fi +done diff --git a/bin/steps_database_wrangler b/bin/steps_database_wrangler index 5aa6d21..0931110 100644 --- a/bin/steps_database_wrangler +++ b/bin/steps_database_wrangler @@ -13,11 +13,11 @@ file:scripts/02_failover/060_go/p04/051-check-azure-master-is-standby.sh file:scripts/02_failover/060_go/p04/052-check-gcp-nodes-has-same-azure-lsn.sh file:scripts/02_failover/060_go/p04/060-perform-gcp-candidate-master-promote.sh file:scripts/02_failover/060_go/p04/061-check-gcp-candidate-master-is-master.sh -file:scripts/02_failover/060_go/p04/062-convert-azure-master-to-gcp-standby.sh file:scripts/02_failover/060_go/p04/070-enable-automatic-failover-on-gcp-only.sh file:scripts/02_failover/060_go/p04/071-check-repmgr-master.sh file:scripts/02_failover/060_go/p04/072-enable-consul-on-gcp-only.sh file:scripts/02_failover/060_go/p04/073-check-pgbouncer-node-in-gcp.sh +file:scripts/02_failover/060_go/p04/074-convert-azure-master-to-standby-of-gcp.sh function:restore-could-not-change-directory-to-message ) diff --git a/bin/steps_database_wrangler_failback b/bin/steps_database_wrangler_failback new file mode 100644 index 0000000..28a6953 --- /dev/null +++ b/bin/steps_database_wrangler_failback @@ -0,0 +1,33 @@ +export steps=( +function:get-rid-of-could-not-change-directory-to-message +file:scripts/04_failback/040-disable-chef.sh +file:scripts/04_failback/041-disable-consul.sh +file:scripts/04_failback/042-disable-automatic-failover.sh +file:scripts/04_failback/043-reset-automatic-failover-state.sh +file:scripts/04_failback/050-convert-cgp-master-to-standby-of-azure.sh +file:scripts/04_failback/051-check-gcp-master-is-standby.sh +file:scripts/04_failback/052-check-azure-nodes-has-same-gcp-lsn.sh +file:scripts/04_failback/060-perform-azure-master-promote.sh +file:scripts/04_failback/061-check-azure-master-is-master.sh +file:scripts/04_failback/070-enable-automatic-failover.sh +file:scripts/04_failback/071-check-repmgr-master.sh 
+file:scripts/04_failback/072-enable-consul.sh +file:scripts/04_failback/073-check-pgbouncer-node-in-azure.sh +function:restore-could-not-change-directory-to-message +) + +function get-rid-of-could-not-change-directory-to-message(){ + for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}" + do + echo "$host: chmod o+x \$HOME" + ssh_remote "$host" bash -c '"chmod o+x \"$HOME\""' + done +} + +function restore-could-not-change-directory-to-message(){ + for host in "${AZURE_HOSTS[@]}" "${GCP_HOSTS[@]}" + do + echo "$host: chmod o-x \$HOME" + ssh_remote "$host" bash -c '"chmod o-x \"$HOME\""' + done +} -- GitLab