# Patch overview (commentary only; text before the first "diff --git" header
# is skipped by patch tools).
#
# NOTE: in this copy of the patch the original line breaks and indentation
# are collapsed, so it cannot be applied as-is.
#
# bundles/backup-client/files/check_backup_last_run
#   Rewritten from Python to bash. The check sources
#   /var/tmp/backup.monitoring and expects it to define msg, status and
#   timestamp. It exits 3 when the file is unreadable or any of the three
#   is empty, exits 2 when timestamp is more than 86400*2 seconds old, and
#   otherwise prints $msg and exits with $status.
#
# bundles/backup-client/files/generate-backup (Mako template)
#   - Takes the attempt number as $1 (stored in $try, only echoed).
#   - Uses "mkdir $lock" as a lock; the EXIT trap (on_exit) removes it.
#   - Redirects stdout/stderr into a gzip'd per-run file under $logdir and
#     deletes *.log / *.gz files there matching "-mtime +14".
#   - save_result_for_monitoring writes status/msg/timestamp to the status
#     file with printf %q, which is the format the check above sources.
#   - Exits 100 when /etc/backup.priv is missing; the retry wrapper treats
#     100 as fatal.
#   - Pre-hooks are now run one by one (executable files only) instead of
#     via run-parts.
#   - do_backup runs rsync once per path; exit codes other than 0 and 24
#     are collected in rsync_errors. The old 5-try loop is removed.
#
# bundles/backup-client/files/generate-backup-with-retries (new file)
#   Runs generate-backup up to three times: exit code 100 -> exit 1,
#   exit code 0 -> exit 0, anything else -> sleep 60 and try again.
#
# bundles/backup-client/files/logrotate.conf is deleted.
#
# bundles/backup-client/items.py
#   - Excluded nodes now delete the key, both scripts and the status file.
#   - The retry wrapper is installed with mode 0700.
#   - /etc/logrotate.d/backup-client is set to 'delete'.
#   - The /var/log/backup-client directory item is dropped (the script
#     creates it with mkdir -p).
#
# bundles/backup-client/metadata.py
#   Cron now calls generate-backup-with-retries without output redirection.
#
# NOTE(review): the generate-backup hunk header says -1,77 +1,121 (net +44
#   lines), but by my count this copy shows 52 removed and 85 added lines
#   (net +33). Together with ts_begin being assigned but never read, the
#   added <%text> having no visible closing tag, and try="${1:-}" having an
#   empty default, this suggests some added text is missing here. In
#   particular no call writing a success status is visible - confirm
#   against the real commit before concluding it is absent.
# NOTE(review): generate-backup-with-retries calls "generate-backup" by bare
#   name; presumably /usr/local/bin is on PATH where cron runs it - verify.
# NOTE(review): after the third failed attempt the wrapper still sleeps 60
#   and then ends with sleep's exit status - confirm this is intended.
# NOTE(review): 'if ! $i' runs the hook path unquoted; paths containing
#   whitespace would be word-split.
# NOTE(review): the unchanged cron comment says "00:00 and 04:59", but the
#   hour field is node.magic_number % 4, i.e. 0..3 - confirm which is meant.
diff --git a/bundles/backup-client/files/check_backup_last_run b/bundles/backup-client/files/check_backup_last_run index a5e0313..1523869 100644 --- a/bundles/backup-client/files/check_backup_last_run +++ b/bundles/backup-client/files/check_backup_last_run @@ -1,42 +1,28 @@ -#!/usr/bin/env python3 +#!/bin/bash -from os.path import getmtime, isfile -from sys import exit -from time import time +statusfile="/var/tmp/backup.monitoring" -statusfile = '/var/tmp/backup.monitoring' -if not isfile(statusfile): - print('Status file not found') - exit(3) +if [[ ! -r "$statusfile" ]] +then + echo "cannot read $statusfile" + exit 3 +fi -mtime = getmtime(statusfile) -now = time() +. "$statusfile" -if now-mtime > 60*60*24*2: - print('Status file is older than 2 days!') - exit(3) +if [[ -z "$msg" ]] || [[ -z "$status" ]] || [[ -z "$timestamp" ]] +then + echo "status file is corrupt, cannot read status" + exit 3 +fi -with open(statusfile, 'r') as f: - status = f.read().splitlines() +two_days_ago=$(($(date +%s) - 86400*2)) -exitcode = status[0].strip() +if [[ $timestamp -lt $two_days_ago ]] +then + echo "last saved status is older than two days" + exit 2 +fi -if exitcode == 'ok': - print('OK') - exit(0) -elif exitcode == 'rsync_error': - print('rsync produced some errors, exit codes were:') - for line in status[1:]: - print(line) - exit(2) -elif exitcode == 'hook': - print('run-parts /etc/backup-pre-hook.d failed with exit code {}'.format(status[1])) - exit(2) -elif exitcode == 'abort_no_key': - print('no ssh key found in /etc/backup.priv!') - exit(1) -else: - # garbage in file - for line in status: - print(line) - exit(3) +echo "$msg" +exit "$status" diff --git a/bundles/backup-client/files/generate-backup b/bundles/backup-client/files/generate-backup index ef648f4..bfe11bd 100644 --- a/bundles/backup-client/files/generate-backup +++ b/bundles/backup-client/files/generate-backup @@ -1,77 +1,121 @@ #!/bin/bash -statusfile=/var/tmp/backup.monitoring 
+statusfile="/var/tmp/backup.monitoring" +logdir="/var/log/backup-client" +lock="/tmp/backup-client-is-running" ssh_login="${username}@${server}" -ssh_cmnd="ssh -o IdentityFile=/etc/backup.priv -o StrictHostKeyChecking=accept-new -p ${port}" +ssh_opts="-o IdentityFile=/etc/backup.priv -o StrictHostKeyChecking=accept-new -p ${port}" nodename="${node.name}" <%text> +try="${1:-}" [[ -n "$DEBUG" ]] && set -x -NL=$'\n' - -if ! [[ -f /etc/backup.priv ]] -then - echo "/etc/backup.priv does not exist" | logger -t backup-client -p user.error - echo "abort_no_key" > "$statusfile" - exit 1 -fi - -run-parts --exit-on-error -- /etc/backup-pre-hooks.d -exitcode=$? -if [[ $exitcode != 0 ]] -then - echo "run-parts /etc/backup-pre-hooks.d exited $exitcode" | logger -t backup-client -p user.error - echo "hook $exitcode" > "$statusfile" - exit 1 -fi do_backup() { - rsync_errorcodes_for_this_path="" - backup_has_successfully_run="no" + echo "==> starting backup for '$1'" - for try in {1..5} - do - echo "Backup for '$1', try $try ..." | logger -t backup-client -p user.info + # Compress level 1 is a good compromise between speed and cpu usage. + rsync --compress-level=1 -aAP --numeric-ids --delete --relative \ + --rsync-path="/usr/bin/rsync --fake-super" \ + -e "ssh $ssh_opts" \ + "$1" "$ssh_login":backups/ - # Compress level 1 is a good compromise between speed and cpu usage. - rsync --compress-level=1 -aAP --numeric-ids --delete --relative \ - --rsync-path="/usr/bin/rsync --fake-super" \ - -e "$ssh_cmnd" \ - "$1" "$ssh_login":backups/ - - # Exit code 24 means some files have vanished during rsync. - # I don't know why, but this is very common, apparently? - exitcode=$? 
- echo "Backup for '$1' try $try exited $exitcode" | logger -t backup-client -p user.info - if [[ $exitcode != 0 ]] && [[ $exitcode != 24 ]] - then - rsync_errorcodes_for_this_path+=" $exitcode" - sleep 30 - else - backup_has_successfully_run="yes" - break - fi - done - - if [[ "$backup_has_successfully_run" != "yes" ]] + # Exit code 24 means some files have vanished during rsync. + # I don't know why, but this is very common, apparently? + exitcode=$? + echo "==> backup for '$1' exited $exitcode" + if [[ $exitcode != 0 ]] && [[ $exitcode != 24 ]] then - echo "Backup for '$1' did not succeed!" | logger -t backup-client -p user.error - rsync_errors+="${NL}${1}${rsync_errorcodes_for_this_path}" + rsync_errors+=" $1 ($exitcode)" fi } -rsync_errors="" +on_exit() { + rmdir "$lock" + echo "*** END BACKUP RUN $(date '+%F %T %z') ***" +} -$ssh_cmnd $ssh_login "sudo /usr/local/bin/rotate-single-backup-client $nodename" +prepare_and_cleanup_logdir() { + # rsync logs tend to get very large. That's why we pipe them through + # gzip when writing. Because we're running multiple tries, we cannot + # rely on logrotate to rotate the logs, we have to do it ourselves. + # Of course that means we have to clean up after ourselves, too. + mkdir -p "$logdir" + find "$logdir" -type f -mtime +14 -name "*.log" -delete + find "$logdir" -type f -mtime +14 -name "*.gz" -delete +} + +save_result_for_monitoring() { + code=$1 + msg=$2 + printf "status=%q\n" "$code" > "$statusfile" + printf "msg=%q\n" "$msg" >> "$statusfile" + printf "timestamp=%q\n" "$(date +%s)" >> "$statusfile" +} + +if ! 
mkdir "$lock" >/dev/null 2>&1 +then + save_result_for_monitoring 2 "could not get lock" + exit 1 +fi +trap "on_exit" EXIT + +# redirect stdout and stderr to logfile +prepare_and_cleanup_logdir +logfile="$logdir/backup--$(date '+%F--%H-%M-%S')--$$.log.gz" +echo "All log output will go to $logfile" | logger -it backup-client +exec > >(gzip >"$logfile") +exec 2>&1 + +# this is where the real work starts +ts_begin=$(date +%s) + +echo "*** BEGIN BACKUP RUN $(date '+%F %T %z') ***" +echo "This is attempt $try" +echo "using ssh options [$ssh_opts]" +echo "using ssh login [$ssh_login]" + +if ! [[ -f /etc/backup.priv ]] +then + save_result_for_monitoring 2 "/etc/backup.priv does not exist" + exit 100 +fi + +for i in /etc/backup-pre-hooks.d/* +do + [[ -x "$i" ]] || continue + + echo "Running pre-hook '$i'" + if ! $i + then + save_result_for_monitoring 2 "pre-hook '$i' failed to run" + exit 1 + fi +done + +rsync_errors="" % for path in sorted(paths): do_backup "${path}" % endfor +<%text> if [[ -n "$rsync_errors" ]] then - echo "rsync_error$rsync_errors" > "$statusfile" -else - echo "ok" > "$statusfile" + save_result_for_monitoring 2 "rsync failed: $rsync_errors" + exit 1 fi + +ssh $ssh_opts $ssh_login "sudo /usr/local/bin/rotate-single-backup-client $nodename" diff --git a/bundles/backup-client/files/generate-backup-with-retries b/bundles/backup-client/files/generate-backup-with-retries new file mode 100644 index 0000000..6dc89e2 --- /dev/null +++ b/bundles/backup-client/files/generate-backup-with-retries @@ -0,0 +1,22 @@ +#!/bin/bash + +# Try generating a backup multiple times. If one attempt succeeds, we're +# done. If not, there will be logs for every attempt, plus monitoring +# will read the result of the last backup. +for try in {1..3} +do + generate-backup "$try" + exitcode=$? 
+ + if [[ $exitcode -eq 100 ]] + then + # fatal error, cannot recover + exit 1 + elif [[ $exitcode -eq 0 ]] + then + # successful backup + exit 0 + else + sleep 60 + fi +done diff --git a/bundles/backup-client/files/logrotate.conf b/bundles/backup-client/files/logrotate.conf deleted file mode 100644 index cd23372..0000000 --- a/bundles/backup-client/files/logrotate.conf +++ /dev/null @@ -1,10 +0,0 @@ -/var/log/backup-client/*.log { - compress - copytruncate - daily - dateext - missingok - notifempty - rotate 14 - sharedscripts -} diff --git a/bundles/backup-client/items.py b/bundles/backup-client/items.py index 3d91c8e..6538803 100644 --- a/bundles/backup-client/items.py +++ b/bundles/backup-client/items.py @@ -18,12 +18,25 @@ else: backup_paths = node.metadata.get('backups/paths', set()) if node.metadata.get('backups/exclude_from_backups', False): - files['/etc/backup.priv'] = { - 'delete': True, - } + # make sure nobody tries to do something funny + for file in [ + '/etc/backup.priv', + '/usr/local/bin/generate-backup', + '/usr/local/bin/generate-backup-with-retries', + '/var/tmp/backup.monitoring', # status file + ]: + files[file] = { + 'delete': True, + } + else: backup_target = repo.get_node(node.metadata.get('backup-client/target')) + files['/etc/backup.priv'] = { + 'content': repo.vault.decrypt_file(join('backup', 'keys', f'{node.name}.key.vault')), + 'mode': '0400', + } + files['/usr/local/bin/generate-backup'] = { 'content_type': 'mako', 'context': { @@ -35,9 +48,8 @@ else: 'mode': '0700', } - files['/etc/backup.priv'] = { - 'content': repo.vault.decrypt_file(join('backup', 'keys', f'{node.name}.key.vault')), - 'mode': '0400', + files['/usr/local/bin/generate-backup-with-retries'] = { + 'mode': '0700', } files['/usr/local/share/icinga/plugins/check_backup_last_run'] = { @@ -45,15 +57,13 @@ files['/usr/local/share/icinga/plugins/check_backup_last_run'] = { } files['/etc/logrotate.d/backup-client'] = { - 'source': 'logrotate.conf', + 'delete': True, } 
directories['/etc/backup-pre-hooks.d'] = { 'purge': True, } -directories['/var/log/backup-client'] = {} - for hname, hcontent in node.metadata.get('backup-client/pre-hooks', {}).items(): files[f'/etc/backup-pre-hooks.d/50-{hname}'] = { 'content': '#!/bin/sh\n\n' + hcontent, diff --git a/bundles/backup-client/metadata.py b/bundles/backup-client/metadata.py index 0410fda..3192399 100644 --- a/bundles/backup-client/metadata.py +++ b/bundles/backup-client/metadata.py @@ -20,7 +20,7 @@ def cron(metadata): return { 'cron': { # spread backups between 00:00 and 04:59 UTC - 'backup': '{} {} * * * root /usr/local/bin/generate-backup > /var/log/backup-client/backup.log 2>&1'.format( + 'backup': '{} {} * * * root /usr/local/bin/generate-backup-with-retries'.format( (node.magic_number % 60), (node.magic_number % 4), ),