bundles/backup-client: rework backup generation
All checks were successful
kunsi/bundlewrap/pipeline/head This commit looks good
All checks were successful
kunsi/bundlewrap/pipeline/head This commit looks good
This commit is contained in:
parent
4e5cb69d1c
commit
14e4415e5f
6 changed files with 159 additions and 107 deletions
|
@ -1,42 +1,28 @@
|
||||||
#!/usr/bin/env python3
|
#!/bin/bash
|
||||||
|
|
||||||
from os.path import getmtime, isfile
|
statusfile="/var/tmp/backup.monitoring"
|
||||||
from sys import exit
|
|
||||||
from time import time
|
|
||||||
|
|
||||||
statusfile = '/var/tmp/backup.monitoring'
|
if [[ ! -r "$statusfile" ]]
|
||||||
if not isfile(statusfile):
|
then
|
||||||
print('Status file not found')
|
echo "cannot read $statusfile"
|
||||||
exit(3)
|
exit 3
|
||||||
|
fi
|
||||||
|
|
||||||
mtime = getmtime(statusfile)
|
. "$statusfile"
|
||||||
now = time()
|
|
||||||
|
|
||||||
if now-mtime > 60*60*24*2:
|
if [[ -z "$msg" ]] || [[ -z "$status" ]] || [[ -z "$timestamp" ]]
|
||||||
print('Status file is older than 2 days!')
|
then
|
||||||
exit(3)
|
echo "status file is corrupt, cannot read status"
|
||||||
|
exit 3
|
||||||
|
fi
|
||||||
|
|
||||||
with open(statusfile, 'r') as f:
|
two_days_ago=$(($(date +%s) - 86400*2))
|
||||||
status = f.read().splitlines()
|
|
||||||
|
|
||||||
exitcode = status[0].strip()
|
if [[ $timestamp -lt $two_days_ago ]]
|
||||||
|
then
|
||||||
|
echo "last saved status is older than two days"
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
if exitcode == 'ok':
|
echo "$msg"
|
||||||
print('OK')
|
exit "$status"
|
||||||
exit(0)
|
|
||||||
elif exitcode == 'rsync_error':
|
|
||||||
print('rsync produced some errors, exit codes were:')
|
|
||||||
for line in status[1:]:
|
|
||||||
print(line)
|
|
||||||
exit(2)
|
|
||||||
elif exitcode == 'hook':
|
|
||||||
print('run-parts /etc/backup-pre-hook.d failed with exit code {}'.format(status[1]))
|
|
||||||
exit(2)
|
|
||||||
elif exitcode == 'abort_no_key':
|
|
||||||
print('no ssh key found in /etc/backup.priv!')
|
|
||||||
exit(1)
|
|
||||||
else:
|
|
||||||
# garbage in file
|
|
||||||
for line in status:
|
|
||||||
print(line)
|
|
||||||
exit(3)
|
|
||||||
|
|
|
@ -1,77 +1,121 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
statusfile=/var/tmp/backup.monitoring
|
statusfile="/var/tmp/backup.monitoring"
|
||||||
|
logdir="/var/log/backup-client"
|
||||||
|
lock="/tmp/backup-client-is-running"
|
||||||
ssh_login="${username}@${server}"
|
ssh_login="${username}@${server}"
|
||||||
ssh_cmnd="ssh -o IdentityFile=/etc/backup.priv -o StrictHostKeyChecking=accept-new -p ${port}"
|
ssh_opts="-o IdentityFile=/etc/backup.priv -o StrictHostKeyChecking=accept-new -p ${port}"
|
||||||
nodename="${node.name}"
|
nodename="${node.name}"
|
||||||
|
|
||||||
<%text>
|
<%text>
|
||||||
|
try="${1:-<unknown>}"
|
||||||
[[ -n "$DEBUG" ]] && set -x
|
[[ -n "$DEBUG" ]] && set -x
|
||||||
NL=$'\n'
|
|
||||||
|
|
||||||
if ! [[ -f /etc/backup.priv ]]
|
|
||||||
then
|
|
||||||
echo "/etc/backup.priv does not exist" | logger -t backup-client -p user.error
|
|
||||||
echo "abort_no_key" > "$statusfile"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
run-parts --exit-on-error -- /etc/backup-pre-hooks.d
|
|
||||||
exitcode=$?
|
|
||||||
if [[ $exitcode != 0 ]]
|
|
||||||
then
|
|
||||||
echo "run-parts /etc/backup-pre-hooks.d exited $exitcode" | logger -t backup-client -p user.error
|
|
||||||
echo "hook $exitcode" > "$statusfile"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
do_backup() {
|
do_backup() {
|
||||||
rsync_errorcodes_for_this_path=""
|
echo "==> starting backup for '$1'"
|
||||||
backup_has_successfully_run="no"
|
|
||||||
|
|
||||||
for try in {1..5}
|
# Compress level 1 is a good compromise between speed and cpu usage.
|
||||||
do
|
rsync --compress-level=1 -aAP --numeric-ids --delete --relative \
|
||||||
echo "Backup for '$1', try $try ..." | logger -t backup-client -p user.info
|
--rsync-path="/usr/bin/rsync --fake-super" \
|
||||||
|
-e "ssh $ssh_opts" \
|
||||||
|
"$1" "$ssh_login":backups/
|
||||||
|
|
||||||
# Compress level 1 is a good compromise between speed and cpu usage.
|
# Exit code 24 means some files have vanished during rsync.
|
||||||
rsync --compress-level=1 -aAP --numeric-ids --delete --relative \
|
# I don't know why, but this is very common, apparently?
|
||||||
--rsync-path="/usr/bin/rsync --fake-super" \
|
exitcode=$?
|
||||||
-e "$ssh_cmnd" \
|
echo "==> backup for '$1' exited $exitcode"
|
||||||
"$1" "$ssh_login":backups/
|
if [[ $exitcode != 0 ]] && [[ $exitcode != 24 ]]
|
||||||
|
|
||||||
# Exit code 24 means some files have vanished during rsync.
|
|
||||||
# I don't know why, but this is very common, apparently?
|
|
||||||
exitcode=$?
|
|
||||||
echo "Backup for '$1' try $try exited $exitcode" | logger -t backup-client -p user.info
|
|
||||||
if [[ $exitcode != 0 ]] && [[ $exitcode != 24 ]]
|
|
||||||
then
|
|
||||||
rsync_errorcodes_for_this_path+=" $exitcode"
|
|
||||||
sleep 30
|
|
||||||
else
|
|
||||||
backup_has_successfully_run="yes"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ "$backup_has_successfully_run" != "yes" ]]
|
|
||||||
then
|
then
|
||||||
echo "Backup for '$1' did not succeed!" | logger -t backup-client -p user.error
|
rsync_errors+=" $1 ($exitcode)"
|
||||||
rsync_errors+="${NL}${1}${rsync_errorcodes_for_this_path}"
|
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
rsync_errors=""
|
on_exit() {
|
||||||
|
rmdir "$lock"
|
||||||
|
echo "*** END BACKUP RUN $(date '+%F %T %z') ***"
|
||||||
|
}
|
||||||
|
|
||||||
$ssh_cmnd $ssh_login "sudo /usr/local/bin/rotate-single-backup-client $nodename"
|
prepare_and_cleanup_logdir() {
|
||||||
|
# rsync logs tend to get very large. That's why we pipe them through
|
||||||
|
# gzip when writing. Because we're running multiple tries, we cannot
|
||||||
|
# rely on logrotate to rotate the logs, we have to do it ourselves.
|
||||||
|
# Of course that means we have to clean up after ourselves, too.
|
||||||
|
mkdir -p "$logdir"
|
||||||
|
find "$logdir" -type f -mtime +14 -name "*.log" -delete
|
||||||
|
find "$logdir" -type f -mtime +14 -name "*.gz" -delete
|
||||||
|
}
|
||||||
|
|
||||||
|
save_result_for_monitoring() {
|
||||||
|
code=$1
|
||||||
|
msg=$2
|
||||||
|
printf "status=%q\n" "$code" > "$statusfile"
|
||||||
|
printf "msg=%q\n" "$msg" >> "$statusfile"
|
||||||
|
printf "timestamp=%q\n" "$(date +%s)" >> "$statusfile"
|
||||||
|
}
|
||||||
|
|
||||||
|
if ! mkdir "$lock" >/dev/null 2>&1
|
||||||
|
then
|
||||||
|
save_result_for_monitoring 2 "could not get lock"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
trap "on_exit" EXIT
|
||||||
|
|
||||||
|
# redirect stdout and stderr to logfile
|
||||||
|
prepare_and_cleanup_logdir
|
||||||
|
logfile="$logdir/backup--$(date '+%F--%H-%M-%S')--$$.log.gz"
|
||||||
|
echo "All log output will go to $logfile" | logger -it backup-client
|
||||||
|
exec > >(gzip >"$logfile")
|
||||||
|
exec 2>&1
|
||||||
|
|
||||||
|
# this is where the real work starts
|
||||||
|
ts_begin=$(date +%s)
|
||||||
|
|
||||||
|
echo "*** BEGIN BACKUP RUN $(date '+%F %T %z') ***"
|
||||||
|
echo "This is attempt $try"
|
||||||
|
echo "using ssh options [$ssh_opts]"
|
||||||
|
echo "using ssh login [$ssh_login]"
|
||||||
|
|
||||||
|
if ! [[ -f /etc/backup.priv ]]
|
||||||
|
then
|
||||||
|
save_result_for_monitoring 2 "/etc/backup.priv does not exist"
|
||||||
|
exit 100
|
||||||
|
fi
|
||||||
|
|
||||||
|
for i in /etc/backup-pre-hooks.d/*
|
||||||
|
do
|
||||||
|
[[ -x "$i" ]] || continue
|
||||||
|
|
||||||
|
echo "Running pre-hook '$i'"
|
||||||
|
if ! $i
|
||||||
|
then
|
||||||
|
save_result_for_monitoring 2 "pre-hook '$i' failed to run"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
rsync_errors=""
|
||||||
</%text>
|
</%text>
|
||||||
|
|
||||||
% for path in sorted(paths):
|
% for path in sorted(paths):
|
||||||
do_backup "${path}"
|
do_backup "${path}"
|
||||||
% endfor
|
% endfor
|
||||||
|
|
||||||
|
<%text>
|
||||||
if [[ -n "$rsync_errors" ]]
|
if [[ -n "$rsync_errors" ]]
|
||||||
then
|
then
|
||||||
echo "rsync_error$rsync_errors" > "$statusfile"
|
save_result_for_monitoring 2 "rsync failed: $rsync_errors"
|
||||||
else
|
exit 1
|
||||||
echo "ok" > "$statusfile"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
ssh $ssh_opts $ssh_login "sudo /usr/local/bin/rotate-single-backup-client $nodename" </dev/null
|
||||||
|
ssh_error=$?
|
||||||
|
if [[ $ssh_error -ne 0 ]]
|
||||||
|
then
|
||||||
|
save_result_for_monitoring 2 "rotating backups failed with status code $ssh_error"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
ts_end=$(date +%s)
|
||||||
|
echo "Success"
|
||||||
|
save_result_for_monitoring 0 "Backup finished at $(date '+%F %T %z') (took $((ts_end - ts_begin)) seconds)"
|
||||||
|
</%text>
|
||||||
|
|
22
bundles/backup-client/files/generate-backup-with-retries
Normal file
22
bundles/backup-client/files/generate-backup-with-retries
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Try generating a backup multiple times. If one attempt succeeds, we're
|
||||||
|
# done. If not, there will be logs for every attempt, plus monitoring
|
||||||
|
# will read the result of the last backup.
|
||||||
|
for try in {1..3}
|
||||||
|
do
|
||||||
|
generate-backup "$try"
|
||||||
|
exitcode=$?
|
||||||
|
|
||||||
|
if [[ $exitcode -eq 100 ]]
|
||||||
|
then
|
||||||
|
# fatal error, cannot recover
|
||||||
|
exit 1
|
||||||
|
elif [[ $exitcode -eq 0 ]]
|
||||||
|
then
|
||||||
|
# successful backup
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
sleep 60
|
||||||
|
fi
|
||||||
|
done
|
|
@ -1,10 +0,0 @@
|
||||||
/var/log/backup-client/*.log {
|
|
||||||
compress
|
|
||||||
copytruncate
|
|
||||||
daily
|
|
||||||
dateext
|
|
||||||
missingok
|
|
||||||
notifempty
|
|
||||||
rotate 14
|
|
||||||
sharedscripts
|
|
||||||
}
|
|
|
@ -18,12 +18,25 @@ else:
|
||||||
backup_paths = node.metadata.get('backups/paths', set())
|
backup_paths = node.metadata.get('backups/paths', set())
|
||||||
|
|
||||||
if node.metadata.get('backups/exclude_from_backups', False):
|
if node.metadata.get('backups/exclude_from_backups', False):
|
||||||
files['/etc/backup.priv'] = {
|
# make sure nobody tries to do something funny
|
||||||
'delete': True,
|
for file in [
|
||||||
}
|
'/etc/backup.priv',
|
||||||
|
'/usr/local/bin/generate-backup',
|
||||||
|
'/usr/local/bin/generate-backup-with-retries',
|
||||||
|
'/var/tmp/backup.monitoring', # status file
|
||||||
|
]:
|
||||||
|
files[file] = {
|
||||||
|
'delete': True,
|
||||||
|
}
|
||||||
|
|
||||||
else:
|
else:
|
||||||
backup_target = repo.get_node(node.metadata.get('backup-client/target'))
|
backup_target = repo.get_node(node.metadata.get('backup-client/target'))
|
||||||
|
|
||||||
|
files['/etc/backup.priv'] = {
|
||||||
|
'content': repo.vault.decrypt_file(join('backup', 'keys', f'{node.name}.key.vault')),
|
||||||
|
'mode': '0400',
|
||||||
|
}
|
||||||
|
|
||||||
files['/usr/local/bin/generate-backup'] = {
|
files['/usr/local/bin/generate-backup'] = {
|
||||||
'content_type': 'mako',
|
'content_type': 'mako',
|
||||||
'context': {
|
'context': {
|
||||||
|
@ -35,9 +48,8 @@ else:
|
||||||
'mode': '0700',
|
'mode': '0700',
|
||||||
}
|
}
|
||||||
|
|
||||||
files['/etc/backup.priv'] = {
|
files['/usr/local/bin/generate-backup-with-retries'] = {
|
||||||
'content': repo.vault.decrypt_file(join('backup', 'keys', f'{node.name}.key.vault')),
|
'mode': '0700',
|
||||||
'mode': '0400',
|
|
||||||
}
|
}
|
||||||
|
|
||||||
files['/usr/local/share/icinga/plugins/check_backup_last_run'] = {
|
files['/usr/local/share/icinga/plugins/check_backup_last_run'] = {
|
||||||
|
@ -45,15 +57,13 @@ files['/usr/local/share/icinga/plugins/check_backup_last_run'] = {
|
||||||
}
|
}
|
||||||
|
|
||||||
files['/etc/logrotate.d/backup-client'] = {
|
files['/etc/logrotate.d/backup-client'] = {
|
||||||
'source': 'logrotate.conf',
|
'delete': True,
|
||||||
}
|
}
|
||||||
|
|
||||||
directories['/etc/backup-pre-hooks.d'] = {
|
directories['/etc/backup-pre-hooks.d'] = {
|
||||||
'purge': True,
|
'purge': True,
|
||||||
}
|
}
|
||||||
|
|
||||||
directories['/var/log/backup-client'] = {}
|
|
||||||
|
|
||||||
for hname, hcontent in node.metadata.get('backup-client/pre-hooks', {}).items():
|
for hname, hcontent in node.metadata.get('backup-client/pre-hooks', {}).items():
|
||||||
files[f'/etc/backup-pre-hooks.d/50-{hname}'] = {
|
files[f'/etc/backup-pre-hooks.d/50-{hname}'] = {
|
||||||
'content': '#!/bin/sh\n\n' + hcontent,
|
'content': '#!/bin/sh\n\n' + hcontent,
|
||||||
|
|
|
@ -20,7 +20,7 @@ def cron(metadata):
|
||||||
return {
|
return {
|
||||||
'cron': {
|
'cron': {
|
||||||
# spread backups between 00:00 and 04:59 UTC
|
# spread backups between 00:00 and 04:59 UTC
|
||||||
'backup': '{} {} * * * root /usr/local/bin/generate-backup > /var/log/backup-client/backup.log 2>&1'.format(
|
'backup': '{} {} * * * root /usr/local/bin/generate-backup-with-retries'.format(
|
||||||
(node.magic_number % 60),
|
(node.magic_number % 60),
|
||||||
(node.magic_number % 4),
|
(node.magic_number % 4),
|
||||||
),
|
),
|
||||||
|
|
Loading…
Reference in a new issue