Commit b415be33 authored by Nikolay's avatar Nikolay
Browse files

separate tombstone table setup and its usage

scripted host prefixes and suffixes

Reworked DB steps till switchover itself
parent 95fb3e36
......@@ -349,13 +349,24 @@ state of the secondary to converge.
# Pick the environment this run targets; only "production" and "staging" are
# handled below, anything else aborts.
export GITLAB_ENV="staging"
if [[ "$GITLAB_ENV" == "production" ]]; then
# NOTE(review): these two literal assignments are re-assigned at the end of
# this branch from the prefix/suffix variables; the later values win.
export AZURE_MASTER="postgres-02.db.prd.gitlab.com"
export GCP_MASTER_CANDIDATE="postgres-01-db-gprd.c.gitlab-production.internal"
# Host names are composed as <PREFIX><index><SUFFIX>, index in 1..N_OF_HOSTS.
export N_OF_HOSTS=4
export AZURE_HOST_PREFIX="postgres-0"
export AZURE_HOST_SUFFIX=".db.prd.gitlab.com"
export GCP_HOST_PREFIX="postgres-0"
export GCP_HOST_SUFFIX="-db-gprd.c.gitlab-production.internal"
export AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
export GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}1${GCP_HOST_SUFFIX}"
elif [[ "$GITLAB_ENV" == "staging" ]]; then
# NOTE(review): these two literal assignments are re-assigned at the end of
# this branch, and the resulting host names differ from the literals here —
# confirm which values are intended.
export AZURE_MASTER="postgres01.db.stg.gitlab.com"
export GCP_MASTER_CANDIDATE="postgres-01-db-gprd.c.gitlab-staging-1.internal"
# Host names are composed as <PREFIX><index><SUFFIX>, index in 1..N_OF_HOSTS.
export N_OF_HOSTS=2
export AZURE_HOST_PREFIX="postgres0"
export AZURE_HOST_SUFFIX=".db.stg.gitlab.com"
export GCP_HOST_PREFIX="postgres-0"
export GCP_HOST_SUFFIX="-db-gstg.c.gitlab-staging-1.internal"
export AZURE_MASTER="${AZURE_HOST_PREFIX}2${AZURE_HOST_SUFFIX}"
export GCP_MASTER_CANDIDATE="${GCP_HOST_PREFIX}2${GCP_HOST_SUFFIX}"
else
# Unknown environment: report on stderr, wait for a keypress so the message
# can be read, then terminate the shell with a non-zero status.
>&2 echo "\$GITLAB_ENV is not correctly defined ($GITLAB_ENV)"
>&2 echo "\$GITLAB_ENV is not correctly defined ($GITLAB_ENV). Stop all work and exit (enter anything to proceed)."
read smth
exit 1
fi
......@@ -372,17 +383,8 @@ state of the secondary to converge.
check_gcp_replication_delay() { # put definition to the top
max_rep_delay=10 # TODO(NikolayS) is 10s ok? Double-check after turning SR on
# TODO(NikolayS) write to the tombstone table right before checking
ssh_remote "$AZURE_MASTER" $(cat << EOF
cd /tmp;
sudo -u gitlab-psql gitlab-psql postgres \
-c "drop database if exists tombstone" \
-c "create database tombstone";
sudo -u gitlab-psql gitlab-psql tombstone \
-c "create table tombstone (created_at timestamptz default now() primary key, note text)" \
-c "insert into tombstone(note) values('pre-switchover check') returning *"
EOF
)
ssh_remote "$AZURE_MASTER" \
"cd /tmp; sudo -u gitlab-psql gitlab-psql tombstone -c \"insert into tombstone(note) values('pre-switchover check') returning *\""
gcp_cur_rep_delay=$(ssh_remote "$GCP_MASTER_CANDIDATE" \
"sudo gitlab-psql -Atd postgres -c 'select round(extract(epoch from (now() - pg_last_xact_replay_timestamp())))'")
......@@ -394,6 +396,16 @@ EOF
fi
}
# One-time setup on the Azure master: (re)create the "tombstone" database and
# its table, then run the replication-delay check.
# The remote script is passed as ONE quoted argument (like every other
# ssh_remote call here) instead of being word-split by an unquoted $(...).
# The heredoc delimiter is quoted so the text is sent verbatim; the trailing
# backslashes are line continuations interpreted by the remote shell.
ssh_remote "$AZURE_MASTER" "$(cat << 'EOF'
cd /tmp;
sudo -u gitlab-psql gitlab-psql postgres \
  -c "drop database if exists tombstone" \
  -c "create database tombstone";
sudo -u gitlab-psql gitlab-psql tombstone \
  -c "create table tombstone (created_at timestamptz default now() primary key, note text)"
EOF
)"
check_gcp_replication_delay
```
1. [ ] 🐺 {+ Coordinator +}: Now disable all sidekiq-cron jobs on the secondary
......@@ -445,24 +457,87 @@ of errors while it is being promoted.
- [ ] `altssh.gitlab.com A 35.190.168.187`
- [ ] `*.gitlab.io A 35.185.44.232`
- **DO NOT** change `gitlab.io`.
1. [ ] 🐘 {+ Database-Wrangler +}: Shut down repmgr
1. [ ] 🐘 {+ Database-Wrangler +}: Disable chef on all nodes and shut down repmgr
```shell
#######################################
# Stop or start chef (on every Azure and GCP database host) and repmgrd
# (on the Azure master). Consul handling is not implemented yet (see TODO).
# Asks for interactive confirmation before touching anything.
# Globals:
#   GITLAB_ENV, AZURE_MASTER, AZURE_HOST_PREFIX, AZURE_HOST_SUFFIX,
#   GCP_HOST_PREFIX, GCP_HOST_SUFFIX, N_OF_HOSTS
# Arguments:
#   $1 - action, either "stop" or "start"
# Inputs:
#   Reads one line from stdin; it must equal $GITLAB_ENV to proceed.
# Returns:
#   1 if GITLAB_ENV is unset/empty, the action or N_OF_HOSTS is invalid,
#   or the confirmation does not match; otherwise the status of the final
#   repmgrd command.
#######################################
db_switch_chef_consul_repmgr_state() {
  local action="${1:-}"
  local proceed_cmd host verb
  local i

  # ":-" covers both unset AND empty. An empty value must be rejected,
  # otherwise an empty confirmation line would match it below.
  if [[ -z "${GITLAB_ENV:-}" ]]; then
    >&2 echo "\$GITLAB_ENV is empty, cannot proceed."
    return 1
  fi

  # Validate the action up front, before prompting or contacting any host.
  case "$action" in
    stop)
      verb="stopping"
      ;;
    start)
      verb="starting"
      ;;
    *)
      >&2 echo "1st argument must be either \"stop\" or \"start\" (provided: $action)."
      return 1
      ;;
  esac

  # N_OF_HOSTS drives the host loop; refuse anything but a positive integer.
  if [[ ! "${N_OF_HOSTS:-}" =~ ^[1-9][0-9]*$ ]]; then
    >&2 echo "\$N_OF_HOSTS must be a positive integer (provided: ${N_OF_HOSTS:-})."
    return 1
  fi

  if [[ "$action" == "stop" ]]; then
    echo "We're about to disable chef and stop repmgr on $GITLAB_ENV environment. To proceed, type '$GITLAB_ENV':"
  else
    echo "We're about to enable chef and start repmgr on $GITLAB_ENV environment. To proceed, type '$GITLAB_ENV':"
  fi
  read -r proceed_cmd
  if [[ "$proceed_cmd" != "$GITLAB_ENV" ]]; then
    >&2 echo "Stop."
    return 1
  fi

  # chef
  for ((i = 1; i <= N_OF_HOSTS; i++)); do
    for host in "${AZURE_HOST_PREFIX}${i}${AZURE_HOST_SUFFIX}" "${GCP_HOST_PREFIX}${i}${GCP_HOST_SUFFIX}"; do
      echo "${verb} chef on $host"
      # WARNING: the following lines modify the state
      if [[ "$action" == "stop" ]]; then
        # Stop the service first, then move its config aside.
        ssh_remote "$host" "sudo service chef-client stop"
        ssh_remote "$host" "sudo mv /etc/chef /etc/chef.migration"
      else
        # Reverse order: restore the config, then start the service.
        ssh_remote "$host" "sudo mv /etc/chef.migration /etc/chef"
        ssh_remote "$host" "sudo service chef-client start"
      fi
    done
  done

  # consul
  # TODO(NikolayS) commands to stop consul see https://gitlab.com/gitlab-com/migration/issues/718#note_90342057

  # repmgr
  # WARNING: the following line modifies the state
  ssh_remote "$AZURE_MASTER" "sudo sv $action /opt/gitlab/sv/repmgrd"
}
db_switch_chef_consul_repmgr_state stop
```
1. [ ] 🐘 {+ Database-Wrangler +}: **Gracefully** turn off the **Azure** postgresql standby instances.
* Keep everything, just ensure it’s turned off
1. [ ] 🐘 {+ Database-Wrangler +}: Forbid writes to the current master (Azure) – do not allow any connections except replication and administrative (local via socket) ones.
```shell
$ knife ssh "role:staging-base-db-postgres AND NOT fqdn:CURRENT_PRIMARY" "gitlab-ctl stop postgresql"
# On every Azure host: install a minimal pg_hba.conf that only allows local
# (socket) connections and replication for gitlab_repmgr, keep a backup of the
# previous file, reload the config, and drop existing client sessions.
for i in $(seq 1 "$N_OF_HOSTS"); do
  host="${AZURE_HOST_PREFIX}${i}${AZURE_HOST_SUFFIX}"
  # Build the new file next to the real one, then swap it in.
  ssh_remote "$host" "sudo -u gitlab-psql sh -c \"echo 'local all all peer map=gitlab' > /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp\""
  ssh_remote "$host" "sudo -u gitlab-psql sh -c \"echo 'host replication gitlab_repmgr 0.0.0.0/0 md5' >> /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp\""
  ssh_remote "$host" "sudo -u gitlab-psql cp /var/opt/gitlab/postgresql/data/pg_hba.conf /var/opt/gitlab/postgresql/data/pg_hba.conf.backup"
  ssh_remote "$host" "sudo -u gitlab-psql mv /var/opt/gitlab/postgresql/data/pg_hba.conf.tmp /var/opt/gitlab/postgresql/data/pg_hba.conf"
  ssh_remote "$host" "cd /tmp; sudo -u gitlab-psql gitlab-psql -c 'select pg_reload_conf()'"
  # The SQL is wrapped in (escaped) double quotes so the single-quoted
  # database name reaches PostgreSQL as a string literal; nesting it inside
  # single quotes made the remote shell strip those quotes.
  # pg_backend_pid() is excluded so this session does not terminate itself.
  ssh_remote "$host" "cd /tmp; sudo -u gitlab-psql gitlab-psql -c \"select pg_terminate_backend(pid) from pg_stat_activity where datname = 'gitlabhq_production' and pid <> pg_backend_pid()\""
done
```
1. [ ] 🐘 {+ Database-Wrangler +}: **Gracefully** turn off the **Azure** postgresql primary instance.
* Keep everything, just ensure it’s turned off
1. [ ] 🐘 {+ Database-Wrangler +}: Perform regular switchover to the main replica on GCP
```shell
$ knife ssh "fqdn:CURRENT_PRIMARY" "gitlab-ctl stop postgresql"
# TODO (NikolayS) Prepare trigger_file much earlier or use `pg_ctl promote`
# Write the trigger_file setting into recovery.conf on the GCP candidate and
# ask PostgreSQL to reload its configuration.
# - The redirection must run inside `sh -c`; without it sudo would try to
#   execute the whole quoted string as a command name.
# - The remote script is passed as ONE quoted argument, sent verbatim
#   (quoted heredoc delimiter), matching the other ssh_remote calls.
# NOTE(review): this OVERWRITES recovery.conf instead of appending to it, and
# pg_reload_conf() may not re-read recovery.conf on this PostgreSQL version —
# confirm before relying on it, or switch to `pg_ctl promote` per the TODO.
ssh_remote "$GCP_MASTER_CANDIDATE" "$(cat << 'EOF'
cd /tmp;
sudo -u gitlab-psql sh -c "echo \"trigger_file = '/var/opt/gitlab/postgresql/data/trigger_file'\" > /var/opt/gitlab/postgresql/data/recovery.conf";
sudo -u gitlab-psql gitlab-psql -c 'select pg_reload_conf()'
EOF
)"
# WARNING WARNING WARNING here switchover happens!
# Creating the trigger file is what asks the standby to promote itself.
ssh_remote "$GCP_MASTER_CANDIDATE" "sudo -u gitlab-psql touch /var/opt/gitlab/postgresql/data/trigger_file"
```
1. [ ] 🐘 {+ Database-Wrangler +}: After timeout of 30 seconds, repmgr should failover primary to the chosen node in GCP, and other nodes should automatically follow.
TODO --- more checks --- (below this line DB steps are not reworked // NikolayS)
- [ ] Confirm `gitlab-ctl repmgr cluster show` reflects the desired state
- [ ] Confirm pgbouncer node in GCP (Password is in 1password)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment