Commit 00683b0b authored by Ilya Frolov's avatar Ilya Frolov Committed by Alex Hanselka

add some automation and instructions for testing backups

parent cd4e0c44
#!/bin/bash
# vim: ai:ts=8:sw=8:noet
# This script automates the creation of VM on Azure to test production DB backup:
# it creates a resource group and a VM whose cloud-init installs gitlab-ee and
# wal-e, ready for a manual wal-e restore.
set -eufo pipefail
IFS=$'\t\n'
# Command requirements
command -v az >/dev/null 2>&1 || { echo 'Please install az utility'; exit 1; }
# jq is used below to extract the VM public IP from az's JSON output
command -v jq >/dev/null 2>&1 || { echo 'Please install jq utility'; exit 1; }
# Variables to change always
RG_NAME='BADJune2017-proddb' # Name of the resource group
# Variables to change only if you know what you are doing
RG_LOC='eastus2' # Location to create restoration resource group in
VM_NAME='restoreproddb' # How the VM should be named
VM_USERNAME='restore' # How the first user should be named
# Main flow
# Generate rsa keypair in current dir if not existent
test -f "./${RG_NAME}_rsa4096" || \
	ssh-keygen -f "./${RG_NAME}_rsa4096" \
		-t rsa \
		-C "ephemeral ${USER}'s key for ${RG_NAME}" \
		-N '' \
		-b 4096
echo "Creating separate resource group for restoration:"
az group create --verbose \
	--location "${RG_LOC}" \
	--name "${RG_NAME}"
echo "Creating VM ${VM_NAME}"
# Cloud-init payload executed on first boot of the VM (unquoted EOF is
# intentional, though nothing in this payload needs local expansion)
CUSTOM_DATA=$(cat <<EOF
#!/bin/bash
export DEBIAN_FRONTEND=noninteractive
# Format and mount gitlab storage (assuming one extra disk attached, hence sdc)
mkfs.ext4 -q /dev/sdc
mkdir -p /var/opt/gitlab && mount /dev/sdc /var/opt/gitlab
# Set apt config, update repos and disable postfix prompt
curl https://packages.gitlab.com/install/repositories/gitlab/gitlab-ee/script.deb.sh | sudo bash
debconf-set-selections <<< "postfix postfix/main_mailer_type string 'No configuration'"
# install everything in one go
apt-get -y install daemontools lzop gcc make python3 virtualenv python3-dev libssl-dev gitlab-ee ca-certificates postfix
gitlab-ctl reconfigure
# stop postgres just after reconfig
gitlab-ctl stop postgresql
# to save some wtf figuring out
sed -i 's/^max_replication_slots = 0/max_replication_slots = 100/' /var/opt/gitlab/postgresql/data/postgresql.conf
# Configure wal-e
mkdir -p /opt/wal-e /etc/wal-e.d/env
virtualenv --python=python3 /opt/wal-e
/opt/wal-e/bin/pip3 install boto azure wal-e
# prepare for vault
touch /etc/wal-e.d/env/AWS_ACCESS_KEY_ID
touch /etc/wal-e.d/env/AWS_SECRET_ACCESS_KEY
touch /etc/wal-e.d/env/WALE_S3_PREFIX
# this is not secret
echo 'us-east-1' > /etc/wal-e.d/env/AWS_REGION
# precreate recovery.conf
cat > /var/opt/gitlab/postgresql/data/recovery.conf <<RECOVERY
restore_command = '/usr/bin/envdir /etc/wal-e.d/env /opt/wal-e/bin/wal-e wal-fetch "%f" "%p"'
recovery_target_time = '2017-XX-YY 06:00:00'
recovery_target_action = 'promote'
RECOVERY
chown gitlab-psql:gitlab-psql /var/opt/gitlab/postgresql/data/recovery.conf
EOF
)
# jq -r strips the JSON quotes, otherwise the echoed ssh command below would
# contain a literal "-quoted IP and not be copy-pasteable
VM_IP=$(az vm create --verbose \
	--resource-group "${RG_NAME}" \
	--location "${RG_LOC}" \
	--name "${VM_NAME}" \
	--image "UbuntuLTS" \
	--admin-username "${VM_USERNAME}" \
	--authentication-type "ssh" \
	--ssh-key-value "./${RG_NAME}_rsa4096.pub" \
	--size "Standard_DS3_v2" \
	--data-disk-sizes-gb 1024 \
	--custom-data "${CUSTOM_DATA}" | jq -r ".publicIpAddress")
echo "All done, please proceed (see tail -f /var/log/cloud-init-output.log):"
echo ssh "${VM_USERNAME}@${VM_IP}" -i "./${RG_NAME}_rsa4096" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null
echo "After you are done, don't forget to remove resource group ${RG_NAME}"
#!/bin/bash
# vim: ai:ts=8:sw=8:noet
# This script automates the creation of VM on DO to test secondary DB backup:
# it creates a droplet whose cloud-init installs postgres and wal-e, ready
# for a manual wal-e restore.
set -eufo pipefail
IFS=$'\t\n'
# Command requirements
command -v doctl >/dev/null 2>&1 || { echo 'Please install doctl utility'; exit 1; }
# pwgen is expanded LOCALLY inside the USER_DATA heredoc below; without this
# check a missing pwgen would silently produce a weak fixed root password
command -v pwgen >/dev/null 2>&1 || { echo 'Please install pwgen utility'; exit 1; }
# Variables to change always
RESTORE='customers'
# Variables to change only if you know what you are doing
DO_REGION='nyc3' # Location to create restoration resource group in
VM_NAME="bkp${RESTORE}" # How the VM should be named
# Main flow
# Check what we're restoring
if [[ ! "${RESTORE}" =~ ^(license|version|customers)$ ]]; then
	echo "Box to test restore should be one of: license, version, customers"
	exit 1
else
	# license/version live on older boxes; customers is 16.04 with pg 9.5
	RESTORE_IMAGE='ubuntu-14-05-x64'
	RESTORE_PG_VER='9.3'
	if [[ "${RESTORE}" == 'customers' ]]; then
		RESTORE_IMAGE='ubuntu-16-04-x64'
		RESTORE_PG_VER='9.5'
	fi
fi
# Generate rsa keypair in current dir if not existent
test -f "./${RESTORE}_rsa4096" || \
	ssh-keygen -f "./${RESTORE}_rsa4096" \
		-t rsa \
		-C "ephemeral ${USER}'s key for ${RESTORE}" \
		-N '' \
		-b 4096
echo "Creating VM ${VM_NAME}"
# Cloud-init payload. Unquoted EOF is intentional: $(cat ...pub),
# $(pwgen ...) and ${RESTORE_PG_VER} are expanded locally, before droplet
# creation; everything else runs verbatim on the droplet.
USER_DATA=$(cat <<EOF
#!/bin/bash
export DEBIAN_FRONTEND=noninteractive
mkdir -p /root/.ssh
echo "$(cat "./${RESTORE}_rsa4096.pub")" >> /root/.ssh/authorized_keys
chmod 0700 /root/.ssh && chmod 0400 /root/.ssh/authorized_keys
sed -i 's/^PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
echo -e '\nPermitRootLogin without-password\n' >> /etc/ssh/sshd_config
service ssh reload
chpasswd <<< "root:plzdontbugmedigitalocean$(pwgen -s -1 32)"
# install postgres
apt-get update && apt-get -y install daemontools lzop gcc make python3 virtualenvwrapper python3-dev libssl-dev postgresql gnupg-agent pinentry-curses
service postgresql stop
# Configure wal-e
mkdir -p /opt/wal-e /etc/wal-e.d/env
virtualenv --python=python3 /opt/wal-e
/opt/wal-e/bin/pip3 install --upgrade pip
/opt/wal-e/bin/pip3 install boto azure wal-e
# prepare for vault
touch /etc/wal-e.d/env/AWS_ACCESS_KEY_ID
touch /etc/wal-e.d/env/AWS_SECRET_ACCESS_KEY
touch /etc/wal-e.d/env/WALE_S3_PREFIX
touch /etc/wal-e.d/env/WALE_GPG_KEY_ID
touch /etc/wal-e.d/env/GPG_AGENT_INFO
# this is not secret
echo 'us-east-1' > /etc/wal-e.d/env/AWS_REGION
# precreate recovery.conf
cat > /var/lib/postgresql/${RESTORE_PG_VER}/main/recovery.conf <<RECOVERY
restore_command = '/usr/bin/envdir /etc/wal-e.d/env /opt/wal-e/bin/wal-e wal-fetch "%f" "%p"'
recovery_target_time = '2017-XX-YY 06:00:00'
# disabled on secondary
# recovery_target_action = 'promote'
RECOVERY
chown postgres:postgres /var/lib/postgresql/${RESTORE_PG_VER}/main/recovery.conf
# Manual steps cheat-sheet (if encrypted):
# Restore latest backup cmd
# /usr/bin/envdir /etc/wal-e.d/env /opt/wal-e/bin/wal-e backup-list 2>/dev/null | tail -1 | cut -d ' ' -f1 | xargs -n1 /usr/bin/envdir /etc/wal-e.d/env /opt/wal-e/bin/wal-e backup-fetch /var/lib/postgresql/${RESTORE_PG_VER}/main
# Then, to restore wal-e chunks, under user postgres:
# gpg --allow-secret-key-import --import /etc/wal-e.d/ops-contact+dbcrypt.key
# gpg --import-ownertrust /etc/wal-e.d/gpg_owner_trust
# (force gpg agent remember passphrase -- see official docs)
# put GPG_AGENT_INFO value to /etc/wal-e.d/env/GPG_AGENT_INFO
# Note: in 16.04 you should manually construct this value as sockpath:pid:1.
# put key id to /etc/wal-e.d/env/WALE_GPG_KEY_ID
# echo 'use-agent' > ~/.gnupg/gpg.conf
# start postgres
EOF
)
# doctl prints just the IP thanks to --no-header/--format; --wait blocks
# until the droplet is active so the IP is available
VM_IP="$(doctl compute droplet create \
	"${VM_NAME}" \
	--no-header \
	--format PublicIPv4 \
	--image "${RESTORE_IMAGE}" \
	--region "${DO_REGION}" \
	--size '512mb' \
	--user-data "${USER_DATA}" \
	--verbose \
	--wait)"
echo "All done, please proceed (see tail -f /var/log/cloud-init-output.log):"
echo ssh "root@${VM_IP}" -i "./${RESTORE}_rsa4096" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null
echo "After you are done, don't forget to remove droplet ${RESTORE}"
#!/bin/bash
# vim: ai:ts=8:sw=8:noet
# This script automates the creation of VM on Azure to test snapshot backups:
# creates a VM, then recreates the 16 data disks from a dated snapshot set
# and attaches them at matching LUNs.
set -eufo pipefail
IFS=$'\t\n'
# Command requirements
command -v az >/dev/null 2>&1 || { echo 'Please install az utility'; exit 1; }
# jq is used below to extract the VM public IP from az's JSON output
command -v jq >/dev/null 2>&1 || { echo 'Please install jq utility'; exit 1; }
# Variables to change always
RDATE='2017-07-05' # Date of snapshot to restore
RG_NAME='BADJune2017-f08' # Name of the resource group
# Variables to change if you want test different box
RESTORE='file-08' # Which machine to restore (leave file-08 if you are unsure)
# Variables to change only if you know what you are doing
RG_LOC='eastus2' # Location to create restoration resource group in
VM_NAME='restorevm' # How the VM should be named
VM_USERNAME='restore' # How the first user should be named
SUB="c802e1f4-573f-4049-8645-4f735e6411b3" # our subscription
# Main flow
# Generate rsa keypair in current dir if not existent
test -f "./${RG_NAME}_rsa4096" || \
	ssh-keygen -f "./${RG_NAME}_rsa4096" \
		-t rsa \
		-C "ephemeral ${USER}'s key for ${RG_NAME}" \
		-N '' \
		-b 4096
# Check if the snapshot to restore exists (and check if we can login)
echo "Will try restoring this snapshots resource group:"
az group list --verbose \
	--query "[?name=='snapshots-${RDATE}']"
echo "Creating separate resource group for restoration:"
az group create --verbose \
	--location "${RG_LOC}" \
	--name "${RG_NAME}"
echo "Creating VM ${VM_NAME}"
# jq -r strips the JSON quotes, otherwise the echoed ssh command below would
# contain a literal "-quoted IP and not be copy-pasteable
VM_IP=$(az vm create --verbose \
	--resource-group "${RG_NAME}" \
	--location "${RG_LOC}" \
	--name "${VM_NAME}" \
	--image "UbuntuLTS" \
	--admin-username "${VM_USERNAME}" \
	--authentication-type "ssh" \
	--ssh-key-value "./${RG_NAME}_rsa4096.pub" \
	--size "Standard_DS13_v2" | jq -r ".publicIpAddress")
# NOTE: disk creation/attachment is done strictly serially on purpose:
# az client breaks _horribly_ when run concurrently, because it actively
# writes to $HOME/.azure/ and races with itself. The only option/hack I
# see now is to duplicate $HOME/.azure/ per thread.
# Try this:
# az group list # check its working
# chmod 0400 ~/.azure/clouds.config
# az group list # see error completely unrelated to readonly file
echo "Creating VM disks"
for lun in {0..15}; do
	az disk create --verbose \
		--resource-group "${RG_NAME}" \
		--name "${RESTORE}-restore-${lun}" \
		--source "/subscriptions/${SUB}/resourceGroups/snapshots-${RDATE}/providers/Microsoft.Compute/snapshots/${RESTORE}-datadisk-${lun}-snap-${RDATE}"
done
echo "Attaching VM disks"
for lun in {0..15}; do
	az vm disk attach --verbose \
		--resource-group "${RG_NAME}" \
		--disk "${RESTORE}-restore-${lun}" \
		--vm-name "${VM_NAME}" \
		--lun "${lun}"
done
echo "All done, please proceed:"
echo ssh "${VM_USERNAME}@${VM_IP}" -i "./${RG_NAME}_rsa4096" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null
echo "After you are done, don't forget to remove resource group ${RG_NAME}"
#!/bin/bash
# vim: ai:ts=8:sw=8:noet
# This automates creation of packagecloud box on AWS to test backup restore procedures
set -eufo pipefail
IFS=$'\t\n'
# Command requirements
command -v aws >/dev/null 2>&1 || { echo 'Please install aws utility'; exit 1; }
# Variables to change only if you know what you are doing
AWS_REGION='us-east-1' # Location to create restoration resource group in
VM_NAME='restorepkgc' # How the VM should be named
VM_USERNAME='restore' # How the first user should be named
# Main flow
# Generate rsa keypair in current dir if not existent
test -f "./${VM_NAME}_rsa4096" || \
	ssh-keygen -f "./${VM_NAME}_rsa4096" \
		-t rsa \
		-C "ephemeral ${USER}'s key for ${VM_NAME}" \
		-N '' \
		-b 4096
# Get latest Trusty AMI (sort by name, newest last)
echo -n "Latest Trusty AMI: "
AWS_AMI="$(aws ec2 describe-images \
	--region "${AWS_REGION}" \
	--filters "Name=name,Values=*ubuntu-trusty-14.04-amd64-server*" \
		"Name=root-device-type,Values=ebs" \
		"Name=virtualization-type,Values=hvm" \
	--query 'sort_by(Images, &Name)[-1].ImageId' \
	--output text)"
echo "${AWS_AMI}"
# Use playground subnet for that
echo -n "Will use 'playground' subnet, id: "
AWS_SUBNET="$(aws ec2 describe-subnets \
	--region "${AWS_REGION}" \
	--filters "Name=tag:Name,Values=playground" \
	--query "Subnets[0].SubnetId" \
	--output text)"
echo "${AWS_SUBNET}"
# Use some old security group with ssh in and tcp out for now
AWS_SG="sg-9bf638fc"
echo "Creating VM ${VM_NAME}"
# Cloud-init payload. Unquoted EOF is intentional: $(cat ...pub) is expanded
# locally so the instance gets our ephemeral public key.
CUSTOM_DATA=$(cat <<EOF
#!/bin/bash
export DEBIAN_FRONTEND=noninteractive
echo "$(cat "./${VM_NAME}_rsa4096.pub")" > /home/ubuntu/.ssh/authorized_keys
chown -R ubuntu:ubuntu /home/ubuntu/.ssh/authorized_keys
chmod 0400 /home/ubuntu/.ssh/authorized_keys
mkfs.ext4 -q /dev/xvdf
mkdir -p /var/opt/packagecloud
mount /dev/xvdf /var/opt/packagecloud
apt-get update && apt-get -y install s3cmd
EOF
)
AWS_IID="$(aws ec2 run-instances \
	--count 1 \
	--enable-api-termination \
	--ebs-optimized \
	--instance-type c4.2xlarge \
	--associate-public-ip-address \
	--subnet-id "${AWS_SUBNET}" \
	--security-group-ids "${AWS_SG}" \
	--image-id "${AWS_AMI}" \
	--user-data "${CUSTOM_DATA}" \
	--block-device-mappings "DeviceName=/dev/sdf,Ebs={VolumeSize=2048,VolumeType=gp2,DeleteOnTermination=true}" \
	--tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=${VM_NAME}}]" \
	--query "Instances[0].InstanceId" \
	--output text)"
echo "Created instance: ${AWS_IID}, trying to get IP"
# The public IP may not be associated yet right after run-instances, hence
# the re-query command echoed below for the user to re-run
VM_IP="$(aws ec2 describe-instances \
	--instance-ids "${AWS_IID}" \
	--query "Reservations[].Instances[].NetworkInterfaces[].Association.PublicIp" \
	--output text)"
echo "rerun this to query for ip again"
echo aws ec2 describe-instances \
	--instance-ids "${AWS_IID}" \
	--query 'Reservations[].Instances[].NetworkInterfaces[].Association.PublicIp' \
	--output text
echo ssh "ubuntu@${VM_IP}" -i "./${VM_NAME}_rsa4096" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null
echo "After you are done, don't forget to remove ec2 machine ${VM_NAME}"
# sed '/^#/d;/^$/d;/^backups/d;/ssl/d' /etc/packagecloud/packagecloud.rb
# disable redirects to https too
......@@ -16,8 +16,16 @@ At 3AM UTC, the snapshot cleanup script runs. This script simply checks for reso
## How do I restore?
Currently the restore is an extremely manual process. We hope to create a more
automated way of restoring soon. The below guide assumes that you must create a new server and that you are not attempting to reattach to an already existing server.
Currently the restore is an ~~extremely~~ manual process. We hope to create a more
automated way of restoring soon. The below guide assumes that you must create a
new server and that you are not attempting to reattach to an already existing
server. Currently there is _some_ automation available, for the steps up to the
ssh commands. To take advantage of it:
* create new dir
* put backup_scripts/03-azure-snapshot.sh there
* make sure `az` client is working (run `az group list` for a test)
* run `time bash 03-azure-snapshot.sh`. In 25 minutes the server will be created
with all the disks and you can proceed directly to mounting the filesystem.
1. Create an Ubuntu server in Azure (DS13-v2).
1. Create disks from the snapshots, placing them in the same resource group as the server
......
......@@ -63,6 +63,16 @@ the problems with uploading to S3 automatically and the giant database. The foll
guide assumes that there has been a catastrophic event that will require a complete
rebuild and will thus begin with building and configuring the server.
Semi-automated way:
1. Make sure your `aws` cli is working (`aws ec2 describe-vpcs` as test cmd)
1. `mkdir ./bad && cd ./bad`
1. grab backup_scripts/04-packagecloud.sh
1. `time bash 04-packagecloud.sh`
1. As soon as cloud-init is done (`tail -f /var/log/cloud-init-output.log`),
you can proceed with configuring secrets for packagecloud.
Manual way:
1. Build a new server! The current specs are listed below.
* Instance Size: c4.2xlarge
* Root Disk Size: 8GB (gp2)
......
......@@ -121,6 +121,21 @@ Before we start, take a deep breath and don't panic.
[PSQL_WAL]: https://www.postgresql.org/docs/current/static/wal-intro.html
## Creating servers for testing backups
The semi-automated procedure (production db):
1. `mkdir ./bad && cd ./bad`
1. grab backup_scripts/01-prod-db.sh
1. `time bash 01-prod-db.sh`
1. continue from customizing wal-e access keys and selecting time to restore.
(Make sure the cloud-init finished: `tail -f /var/log/cloud-init-output.log`)
The semi-automated procedure (secondary db):
1. `mkdir ./bad && cd ./bad`
1. grab backup_scripts/02-secondary-db.sh, edit variables
1. `time bash 02-secondary-db.sh`
1. continue from customizing wal-e access keys and selecting time to restore.
(Make sure the cloud-init finished: `tail -f /var/log/cloud-init-output.log`)
The Manual procedure:
1. For testing of primary database restore, create on Azure:
1. Resource group, say, `backup-may-2017`
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment