import errno
import os
import socket
import stat
import sys
import time
from contextlib import contextmanager
from datetime import datetime, timezone
from functools import partial
from getpass import getuser
from io import BytesIO
from itertools import groupby
from shutil import get_terminal_size

import msgpack
from .logger import create_logger
logger = create_logger()

from . import xattr
from .cache import ChunkListEntry
from .chunker import Chunker
from .compress import Compressor
from .constants import *  # NOQA
from .hashindex import ChunkIndex, ChunkIndexEntry
from .helpers import Manifest
from .helpers import Chunk, ChunkIteratorFileWrapper, open_item
from .helpers import Error, IntegrityError
from .helpers import uid2user, user2uid, gid2group, group2gid
from .helpers import parse_timestamp, to_localtime
from .helpers import format_time, format_timedelta, format_file_size, file_status
from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates, swidth_slice
from .helpers import decode_dict, StableDict
from .helpers import int_to_bigint, bigint_to_int, bin_to_hex
from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
from .helpers import PathPrefixPattern, FnmatchPattern
from .helpers import consume, chunkit
from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec
from .item import Item, ArchiveItem
from .key import key_factory
from .platform import acl_get, acl_set, set_flags, get_flags, swidth
from .remote import cache_if_remote
from .repository import Repository

has_lchmod = hasattr(os, 'lchmod')

flags_normal = os.O_RDONLY | getattr(os, 'O_BINARY', 0)
flags_noatime = flags_normal | getattr(os, 'O_NOATIME', 0)


class Statistics:

    def __init__(self):
        self.osize = self.csize = self.usize = self.nfiles = 0
        self.last_progress = 0  # timestamp when last progress was shown

    def update(self, size, csize, unique):
        self.osize += size
        self.csize += csize
        if unique:
            self.usize += csize

    summary = "{label:15} {stats.osize_fmt:>20s} {stats.csize_fmt:>20s} {stats.usize_fmt:>20s}"

    def __str__(self):
        return self.summary.format(stats=self, label='This archive:')

    def __repr__(self):
        return "<{cls} object at {hash:#x} ({self.osize}, {self.csize}, {self.usize})>".format(
            cls=type(self).__name__, hash=id(self), self=self)

    @property
    def osize_fmt(self):
        return format_file_size(self.osize)

    @property
    def usize_fmt(self):
        return format_file_size(self.usize)

    @property
    def csize_fmt(self):
        return format_file_size(self.csize)

    def show_progress(self, item=None, final=False, stream=None, dt=None):
        now = time.time()
        if dt is None or now - self.last_progress > dt:
            self.last_progress = now
            columns, lines = get_terminal_size()
            if not final:
                msg = '{0.osize_fmt} O {0.csize_fmt} C {0.usize_fmt} D {0.nfiles} N '.format(self)
                path = remove_surrogates(item.path) if item else ''
                space = columns - swidth(msg)
                if space < 12:
                    msg = ''
                    space = columns - swidth(msg)
                if space >= 8:
                    msg += ellipsis_truncate(path, space)
            else:
                msg = ' ' * columns
            print(msg, file=stream or sys.stderr, end="\r", flush=True)
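# Usage sketch (illustrative, not part of the original module; values are made up):
# Statistics is driven once per stored chunk during a backup run.
#
#     stats = Statistics()
#     stats.update(size=4096, csize=1024, unique=True)   # new chunk: also counts towards usize
#     stats.update(size=4096, csize=1024, unique=False)  # duplicate chunk: only osize/csize grow
#     print(stats)  # renders Statistics.summary with label 'This archive:'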
def is_special(mode):
    # file types that get special treatment in --read-special mode
    return stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode)


class BackupOSError(Exception):
    """
    Wrapper for OSError raised while accessing backup files.

    Borg does different kinds of IO, and IO failures have different consequences.
    This wrapper represents failures of input file or extraction IO.
    These are non-critical and are only reported (exit code = 1, warning).

    Any unwrapped IO error is critical and aborts execution (for example repository IO failure).
    """
    def __init__(self, os_error):
        self.os_error = os_error
        self.errno = os_error.errno
        self.strerror = os_error.strerror
        self.filename = os_error.filename

    def __str__(self):
        return str(self.os_error)


class BackupIO:
    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type and issubclass(exc_type, OSError):
            raise BackupOSError(exc_val) from exc_val


backup_io = BackupIO()
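# Usage sketch (illustrative, the filename is hypothetical): any OSError raised
# in the guarded block is re-raised as the non-critical BackupOSError, so
# callers can report it as a warning instead of aborting:
#
#     with backup_io:
#         st = os.lstat('/some/input/file')
#
# The module-level backup_io instance is stateless, so sharing one instance
# across all call sites is safe.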
def backup_io_iter(iterator):
    while True:
        try:
            with backup_io:
                item = next(iterator)
        except StopIteration:
            return
        yield item


class DownloadPipeline:

    def __init__(self, repository, key):
        self.repository = repository
        self.key = key

    def unpack_many(self, ids, filter=None, preload=False):
        """
        Return iterator of items.

        *ids* is a chunk ID list of an item stream. *filter* is a callable
        to decide whether an item will be yielded. *preload* preloads the data chunks of every yielded item.

        Warning: if *preload* is True then all data chunks of every yielded item have to be retrieved,
        otherwise preloaded chunks will accumulate in RemoteRepository and create a memory leak.
        """
        unpacker = msgpack.Unpacker(use_list=False)
        for _, data in self.fetch_many(ids):
            unpacker.feed(data)
            items = [Item(internal_dict=item) for item in unpacker]
            for item in items:
                if 'chunks' in item:
                    item.chunks = [ChunkListEntry(*e) for e in item.chunks]
            if filter:
                items = [item for item in items if filter(item)]
            if preload:
                for item in items:
                    if 'chunks' in item:
                        self.repository.preload([c.id for c in item.chunks])
            for item in items:
                yield item

    def fetch_many(self, ids, is_preloaded=False):
        for id_, data in zip(ids, self.repository.get_many(ids, is_preloaded=is_preloaded)):
            yield self.key.decrypt(id_, data)
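# Consumption sketch (illustrative; ids, filter and the consumer function are
# hypothetical): iterate the item stream, yielding only regular files, and
# preload their data chunks so a following extract loop does not stall on
# repository round trips:
#
#     pipeline = DownloadPipeline(repository, key)
#     for item in pipeline.unpack_many(item_stream_ids,
#                                      filter=lambda item: 'chunks' in item,
#                                      preload=True):
#         consume_all_chunks_of(item)  # required, see the warning in unpack_many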
class ChunkBuffer:
    BUFFER_SIZE = 8 * 1024 * 1024

    def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
        self.buffer = BytesIO()
        self.packer = msgpack.Packer(unicode_errors='surrogateescape')
        self.chunks = []
        self.key = key
        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)

    def add(self, item):
        self.buffer.write(self.packer.pack(item.as_dict()))
        if self.is_full():
            self.flush()

    def write_chunk(self, chunk):
        raise NotImplementedError

    def flush(self, flush=False):
        if self.buffer.tell() == 0:
            return
        self.buffer.seek(0)
        chunks = list(Chunk(bytes(s)) for s in self.chunker.chunkify(self.buffer))
        self.buffer.seek(0)
        self.buffer.truncate(0)
        # Leave the last partial chunk in the buffer unless flush is True
        end = None if flush or len(chunks) == 1 else -1
        for chunk in chunks[:end]:
            self.chunks.append(self.write_chunk(chunk))
        if end == -1:
            self.buffer.write(chunks[-1].data)

    def is_full(self):
        return self.buffer.tell() > self.BUFFER_SIZE
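# Buffering sketch (descriptive of the class above): packed items accumulate
# in the BytesIO buffer; once it exceeds BUFFER_SIZE (8 MiB), add() triggers
# flush(), which chunkifies the buffered item stream and hands all but the
# trailing partial chunk to write_chunk(). flush(flush=True) also writes that
# trailing chunk, which is how the item stream is finalized when an archive
# is saved.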
class CacheChunkBuffer(ChunkBuffer):

    def __init__(self, cache, key, stats, chunker_params=ITEMS_CHUNKER_PARAMS):
        super().__init__(key, chunker_params)
        self.cache = cache
        self.stats = stats

    def write_chunk(self, chunk):
        id_, _, _ = self.cache.add_chunk(self.key.id_hash(chunk.data), chunk, self.stats)
        return id_
class Archive:

    class DoesNotExist(Error):
        """Archive {} does not exist"""

    class AlreadyExists(Error):
        """Archive {} already exists"""

    class IncompatibleFilesystemEncodingError(Error):
        """Failed to encode filename "{}" into file system encoding "{}". Consider configuring the LANG environment variable."""

    def __init__(self, repository, key, manifest, name, cache=None, create=False,
                 checkpoint_interval=300, numeric_owner=False, noatime=False, noctime=False, progress=False,
                 chunker_params=CHUNKER_PARAMS, start=None, end=None, compression=None, compression_files=None,
                 consider_part_files=False):
        self.cwd = os.getcwd()
        self.key = key
        self.repository = repository
        self.cache = cache
        self.manifest = manifest
        self.hard_links = {}
        self.stats = Statistics()
        self.show_progress = progress
        self.name = name
        self.checkpoint_interval = checkpoint_interval
        self.numeric_owner = numeric_owner
        self.noatime = noatime
        self.noctime = noctime
        if start is None:
            start = datetime.utcnow()
        self.chunker_params = chunker_params
        self.start = start
        if end is None:
            end = datetime.utcnow()
        self.end = end
        self.consider_part_files = consider_part_files
        self.pipeline = DownloadPipeline(self.repository, self.key)
        if create:
            self.file_compression_logger = create_logger('borg.debug.file-compression')
            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
            self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
            self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
                                                            compression_files or [])
            key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
            if name in manifest.archives:
                raise self.AlreadyExists(name)
            self.last_checkpoint = time.time()
            i = 0
            while True:
                self.checkpoint_name = '%s.checkpoint%s' % (name, i and ('.%d' % i) or '')
                if self.checkpoint_name not in manifest.archives:
                    break
                i += 1
        else:
            info = self.manifest.archives.get(name)
            if info is None:
                raise self.DoesNotExist(name)
            self.load(info.id)
            self.zeros = b'\0' * (1 << chunker_params[1])

    def _load_meta(self, id):
        _, data = self.key.decrypt(id, self.repository.get(id))
        metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
        if metadata.version != 1:
            raise Exception('Unknown archive metadata version')
        return metadata

    def load(self, id):
        self.id = id
        self.metadata = self._load_meta(self.id)
        self.metadata.cmdline = [safe_decode(arg) for arg in self.metadata.cmdline]
        self.name = self.metadata.name

    @property
    def ts(self):
        """Timestamp of archive creation (start) in UTC"""
        ts = self.metadata.time
        return parse_timestamp(ts)

    @property
    def ts_end(self):
        """Timestamp of archive creation (end) in UTC"""
        # fall back to time if there is no time_end present in metadata
        ts = self.metadata.get('time_end') or self.metadata.time
        return parse_timestamp(ts)

    @property
    def fpr(self):
        return bin_to_hex(self.id)

    @property
    def duration(self):
        return format_timedelta(self.end - self.start)

    @property
    def duration_from_meta(self):
        return format_timedelta(self.ts_end - self.ts)

    def __str__(self):
        return '''\
Archive name: {0.name}
Archive fingerprint: {0.fpr}
Time (start): {start}
Time (end): {end}
Duration: {0.duration}
Number of files: {0.stats.nfiles}'''.format(
            self,
            start=format_time(to_localtime(self.start.replace(tzinfo=timezone.utc))),
            end=format_time(to_localtime(self.end.replace(tzinfo=timezone.utc))))

    def __repr__(self):
        return 'Archive(%r)' % self.name

    def item_filter(self, item, filter=None):
        if not self.consider_part_files and 'part' in item:
            # this is a part(ial) file, we usually don't want to consider it.
            return False
        return filter(item) if filter else True

    def iter_items(self, filter=None, preload=False):
        for item in self.pipeline.unpack_many(self.metadata.items, preload=preload,
                                              filter=lambda item: self.item_filter(item, filter)):
            yield item

    def add_item(self, item, show_progress=True):
        if show_progress and self.show_progress:
            self.stats.show_progress(item=item, dt=0.2)
        self.items_buffer.add(item)

    def write_checkpoint(self):
        self.save(self.checkpoint_name)
        del self.manifest.archives[self.checkpoint_name]
        self.cache.chunk_decref(self.id, self.stats)

    def save(self, name=None, comment=None, timestamp=None, additional_metadata=None):
        name = name or self.name
        if name in self.manifest.archives:
            raise self.AlreadyExists(name)
        self.items_buffer.flush(flush=True)
        if timestamp is None:
            self.end = datetime.utcnow()
            start = self.start
            end = self.end
        else:
            self.end = timestamp
            start = timestamp
            end = timestamp  # we only have 1 value
        metadata = {
            'version': 1,
            'name': name,
            'comment': comment or '',
            'items': self.items_buffer.chunks,
            'cmdline': sys.argv,
            'hostname': socket.gethostname(),
            'username': getuser(),
            'time': start.isoformat(),
            'time_end': end.isoformat(),
            'chunker_params': self.chunker_params,
        }
        metadata.update(additional_metadata or {})
        metadata = ArchiveItem(metadata)
        data = msgpack.packb(metadata.as_dict(), unicode_errors='surrogateescape')
        self.id = self.key.id_hash(data)
        self.cache.add_chunk(self.id, Chunk(data), self.stats)
        self.manifest.archives[name] = (self.id, metadata.time)
        self.manifest.write()
        self.repository.commit()
        self.cache.commit()
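    # Ordering note (descriptive of the code above, rationale is a plausible
    # reading, not a statement from the authors): save() first flushes the
    # items buffer and stores the archive metadata chunk, then updates and
    # writes the manifest, and only then commits the repository followed by
    # the cache. A crash in between would leave a committed repository and a
    # stale local cache, which can be rebuilt from the repository.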
    def calc_stats(self, cache):
        def add(id):
            count, size, csize = cache.chunks[id]
            stats.update(size, csize, count == 1)
            cache.chunks[id] = count - 1, size, csize

        def add_file_chunks(chunks):
            for id, _, _ in chunks:
                add(id)

        # This function is a bit evil since it abuses the cache to calculate
        # the stats. The cache transaction must be rolled back afterwards
        unpacker = msgpack.Unpacker(use_list=False)
        cache.begin_txn()
        stats = Statistics()
        add(self.id)
        for id, chunk in zip(self.metadata.items, self.repository.get_many(self.metadata.items)):
            add(id)
            _, data = self.key.decrypt(id, chunk)
            unpacker.feed(data)
            for item in unpacker:
                item = Item(internal_dict=item)
                if 'chunks' in item:
                    stats.nfiles += 1
                    add_file_chunks(item.chunks)
        cache.rollback()
        return stats

    def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False,
                     hardlink_masters=None, stripped_components=0, original_path=None, pi=None):
        """
        Extract archive item.

        :param item: the item to extract
        :param restore_attrs: restore file attributes
        :param dry_run: do not write any data
        :param stdout: write extracted data to stdout
        :param sparse: write sparse files (chunk-granularity, independent of the original being sparse)
        :param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly
        :param stripped_components: stripped leading path components to correct hard link extraction
        :param original_path: 'path' key as stored in archive
        :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
        """
        hardlink_masters = hardlink_masters or {}
        has_damaged_chunks = 'chunks_healthy' in item
        if dry_run or stdout:
            if 'chunks' in item:
                for _, data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True):
                    if pi:
                        pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                    if stdout:
                        sys.stdout.buffer.write(data)
                if stdout:
                    sys.stdout.buffer.flush()
                if has_damaged_chunks:
                    logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
                                   remove_surrogates(item.path))
            return

        original_path = original_path or item.path
        dest = self.cwd
        if item.path.startswith(('/', '..')):
            raise Exception('Path should be relative and local')
        path = os.path.join(dest, item.path)
        # Attempt to remove existing files, ignore errors on failure
        try:
            st = os.lstat(path)
            if stat.S_ISDIR(st.st_mode):
                os.rmdir(path)
            else:
                os.unlink(path)
        except UnicodeEncodeError:
            raise self.IncompatibleFilesystemEncodingError(path, sys.getfilesystemencoding()) from None
        except OSError:
            pass
        mode = item.mode
        if stat.S_ISREG(mode):
            with backup_io:
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
            # Hard link?
            if 'source' in item:
                source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:])
                with backup_io:
                    if os.path.exists(path):
                        os.unlink(path)
                if item.source not in hardlink_masters:
                    os.link(source, path)
                    return
                item.chunks, link_target = hardlink_masters[item.source]
                if link_target:
                    # Hard link was extracted previously, just link
                    with backup_io:
                        os.link(link_target, path)
                    return
                # Extract chunks, since the item which had the chunks was not extracted
            with backup_io:
                fd = open(path, 'wb')
            with fd:
                ids = [c.id for c in item.chunks]
                for _, data in self.pipeline.fetch_many(ids, is_preloaded=True):
                    if pi:
                        pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                    with backup_io:
                        if sparse and self.zeros.startswith(data):
                            # all-zero chunk: create a hole in a sparse file
                            fd.seek(len(data), 1)
                        else:
                            fd.write(data)
                with backup_io:
                    pos = fd.tell()
                    fd.truncate(pos)
                    fd.flush()
                    self.restore_attrs(path, item, fd=fd.fileno())
            if has_damaged_chunks:
                logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
                               remove_surrogates(item.path))
            if hardlink_masters:
                # Update master entry with extracted file path, so that following hardlinks don't extract twice.
                hardlink_masters[item.get('source') or original_path] = (None, path)
            return
        with backup_io:
            # No repository access beyond this point.
            if stat.S_ISDIR(mode):
                if not os.path.exists(path):
                    os.makedirs(path)
                if restore_attrs:
                    self.restore_attrs(path, item)
            elif stat.S_ISLNK(mode):
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                source = item.source
                if os.path.exists(path):
                    os.unlink(path)
                try:
                    os.symlink(source, path)
                except UnicodeEncodeError:
                    raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None
                self.restore_attrs(path, item, symlink=True)
            elif stat.S_ISFIFO(mode):
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                os.mkfifo(path)
                self.restore_attrs(path, item)
            elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode):
                os.mknod(path, item.mode, item.rdev)
                self.restore_attrs(path, item)
            else:
                raise Exception('Unknown archive item type %r' % item.mode)

    def restore_attrs(self, path, item, symlink=False, fd=None):
        """
        Restore filesystem attributes on *path* (*fd*) from *item*.

        Does not access the repository.
        """
        uid = gid = None
        if not self.numeric_owner:
            uid = user2uid(item.user)
            gid = group2gid(item.group)
        uid = item.uid if uid is None else uid
        gid = item.gid if gid is None else gid
        # This code is a bit of a mess due to os specific differences
        try:
            if fd:
                os.fchown(fd, uid, gid)
            else:
                os.lchown(path, uid, gid)
        except OSError:
            pass
        if fd:
            os.fchmod(fd, item.mode)
        elif not symlink:
            os.chmod(path, item.mode)
        elif has_lchmod:  # Not available on Linux
            os.lchmod(path, item.mode)
        mtime = item.mtime
        if 'atime' in item:
            atime = item.atime
        else:
            # old archives only had mtime in item metadata
            atime = mtime
        try:
            if fd:
                os.utime(fd, None, ns=(atime, mtime))
            else:
                os.utime(path, None, ns=(atime, mtime), follow_symlinks=False)
        except OSError:
            # some systems don't support calling utime on a symlink
            pass
        acl_set(path, item, self.numeric_owner)
        if 'bsdflags' in item:
            try:
                set_flags(path, item.bsdflags, fd=fd)
            except OSError:
                pass
        # chown removes Linux capabilities, so set the extended attributes at the end, after chown, since they include
        # the Linux capabilities in the "security.capability" attribute.
        xattrs = item.get('xattrs', {})
        for k, v in xattrs.items():
            try:
                xattr.setxattr(fd or path, k, v, follow_symlinks=False)
            except OSError as e:
                if e.errno not in (errno.ENOTSUP, errno.EACCES):
                    # only raise if the errno is not on our ignore list:
                    # ENOTSUP == xattrs not supported here
                    # EACCES == permission denied to set this specific xattr
                    # (this may happen related to security.* keys)
                    raise
    def set_meta(self, key, value):
        metadata = self._load_meta(self.id)
        setattr(metadata, key, value)
        data = msgpack.packb(metadata.as_dict(), unicode_errors='surrogateescape')
        new_id = self.key.id_hash(data)
        self.cache.add_chunk(new_id, Chunk(data), self.stats)
        self.manifest.archives[self.name] = (new_id, metadata.time)
        self.cache.chunk_decref(self.id, self.stats)
        self.id = new_id

    def rename(self, name):
        if name in self.manifest.archives:
            raise self.AlreadyExists(name)
        oldname = self.name
        self.name = name
        self.set_meta('name', name)
        del self.manifest.archives[oldname]

    def delete(self, stats, progress=False, forced=False):
        class ChunksIndexError(Error):
            """Chunk ID {} missing from chunks index, corrupted chunks index - aborting transaction."""

        def chunk_decref(id, stats):
            nonlocal error
            try:
                self.cache.chunk_decref(id, stats)
            except KeyError:
                cid = bin_to_hex(id)
                raise ChunksIndexError(cid)
            except Repository.ObjectNotFound as e:
                # object not in repo - strange, but we wanted to delete it anyway.
                if not forced:
                    raise
                error = True

        error = False
        try:
            unpacker = msgpack.Unpacker(use_list=False)
            items_ids = self.metadata.items
            pi = ProgressIndicatorPercent(total=len(items_ids), msg="Decrementing references %3.0f%%")
            for (i, (items_id, data)) in enumerate(zip(items_ids, self.repository.get_many(items_ids))):
                if progress:
                    pi.show(i)
                _, data = self.key.decrypt(items_id, data)
                unpacker.feed(data)
                chunk_decref(items_id, stats)
                try:
                    for item in unpacker:
                        item = Item(internal_dict=item)
                        if 'chunks' in item:
                            for chunk_id, size, csize in item.chunks:
                                chunk_decref(chunk_id, stats)
                except (TypeError, ValueError):
                    # if items metadata spans multiple chunks and one chunk got dropped somehow,
                    # it could be that unpacker yields bad types
                    if not forced:
                        raise
                    error = True
            if progress:
                pi.finish()
        except (msgpack.UnpackException, Repository.ObjectNotFound):
            # items metadata corrupted
            if not forced:
                raise
            error = True
        # in forced delete mode, we try hard to delete at least the manifest entry,
        # if possible also the archive superblock, even if processing the items raises
        # some harmless exception.
        chunk_decref(self.id, stats)
        del self.manifest.archives[self.name]
        if error:
            logger.warning('forced deletion succeeded, but the deleted archive was corrupted.')
            logger.warning('borg check --repair is required to free all space.')

    def stat_simple_attrs(self, st):
        attrs = dict(
            mode=st.st_mode,
            uid=st.st_uid,
            gid=st.st_gid,
            mtime=st.st_mtime_ns,
        )
        # borg can work with archives only having mtime (older attic archives do not have
        # atime/ctime). it can be useful to omit atime/ctime, if they change without the
        # file content changing - e.g. to get better metadata deduplication.
        if not self.noatime:
            attrs['atime'] = st.st_atime_ns
        if not self.noctime:
            attrs['ctime'] = st.st_ctime_ns
        if self.numeric_owner:
            attrs['user'] = attrs['group'] = None
        else:
            attrs['user'] = uid2user(st.st_uid)
            attrs['group'] = gid2group(st.st_gid)
        return attrs

    def stat_ext_attrs(self, st, path):
        attrs = {}
        with backup_io:
            xattrs = xattr.get_all(path, follow_symlinks=False)
            bsdflags = get_flags(path, st)
            acl_get(path, attrs, st, self.numeric_owner)
        if xattrs:
            attrs['xattrs'] = StableDict(xattrs)
        if bsdflags:
            attrs['bsdflags'] = bsdflags
        return attrs

    def stat_attrs(self, st, path):
        attrs = self.stat_simple_attrs(st)
        attrs.update(self.stat_ext_attrs(st, path))
        return attrs

    def process_dir(self, path, st):
        item = Item(path=make_path_safe(path))
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        return 'd'  # directory

    def process_fifo(self, path, st):
        item = Item(path=make_path_safe(path))
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        return 'f'  # fifo

    def process_dev(self, path, st):
        item = Item(path=make_path_safe(path), rdev=st.st_rdev)
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        if stat.S_ISCHR(st.st_mode):
            return 'c'  # char device
        elif stat.S_ISBLK(st.st_mode):
            return 'b'  # block device

    def process_symlink(self, path, st):
        with backup_io:
            source = os.readlink(path)
        item = Item(path=make_path_safe(path), source=source)
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        return 's'  # symlink

    def write_part_file(self, item, from_chunk, number):
        item = Item(internal_dict=item.as_dict())
        length = len(item.chunks)
        # the item should only have the *additional* chunks we processed after the last partial item:
        item.chunks = item.chunks[from_chunk:]
        item.path += '.borg_part_%d' % number
        item.part = number
        number += 1
        self.add_item(item, show_progress=False)
        self.write_checkpoint()
        return length, number

    def chunk_file(self, item, cache, stats, chunk_iter, chunk_processor=None, **chunk_kw):
        if not chunk_processor:
            def chunk_processor(data):
                return cache.add_chunk(self.key.id_hash(data), Chunk(data, **chunk_kw), stats)

        item.chunks = []
        from_chunk = 0
        part_number = 1
        for data in chunk_iter:
            item.chunks.append(chunk_processor(data))
            if self.show_progress:
                self.stats.show_progress(item=item, dt=0.2)
            if self.checkpoint_interval and time.time() - self.last_checkpoint > self.checkpoint_interval:
                from_chunk, part_number = self.write_part_file(item, from_chunk, part_number)
                self.last_checkpoint = time.time()
        else:
            if part_number > 1:
                if item.chunks[from_chunk:]:
                    # if we already have created a part item inside this file, we want to put the final
                    # chunks (if any) into a part item also (so all parts can be concatenated to get
                    # the complete file):
                    from_chunk, part_number = self.write_part_file(item, from_chunk, part_number)
                    self.last_checkpoint = time.time()

                # if we created part files, we have referenced all chunks from the part files,
                # but we also will reference the same chunks also from the final, complete file:
                for chunk in item.chunks:
                    cache.chunk_incref(chunk.id, stats)
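    # Part-file sketch (descriptive, names taken from the code above): when a
    # checkpoint fires mid-file, write_part_file() stores the chunks seen so
    # far as e.g. 'path.borg_part_1', the next checkpoint as
    # 'path.borg_part_2', and so on; concatenating all parts in order yields
    # the complete file contents. The final, complete item references every
    # chunk again, hence the chunk_incref() loop above.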
    def process_stdin(self, path, cache):
        uid, gid = 0, 0
        t = int(time.time()) * 1000000000
        item = Item(
            path=path,
            mode=0o100660,  # regular file, ug=rw
            uid=uid, user=uid2user(uid),
            gid=gid, group=gid2group(gid),
            mtime=t, atime=t, ctime=t,
        )
        fd = sys.stdin.buffer  # binary
        self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd)))
        self.stats.nfiles += 1
        self.add_item(item)
        return 'i'  # stdin

    def process_file(self, path, st, cache, ignore_inode=False):
        status = None
        safe_path = make_path_safe(path)
        # Is it a hard link?
        if st.st_nlink > 1:
            source = self.hard_links.get((st.st_ino, st.st_dev))
            if source is not None:
                item = Item(path=safe_path, source=source)
                item.update(self.stat_attrs(st, path))
                self.add_item(item)
                status = 'h'  # regular file, hardlink (to already seen inodes)
                return status
            else:
                self.hard_links[st.st_ino, st.st_dev] = safe_path
        is_special_file = is_special(st.st_mode)
        if not is_special_file:
            path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
            ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode)
        else:
            # in --read-special mode, we may be called for special files.
            # there should be no information in the cache about special files processed in
            # read-special mode, but we better play safe as this was wrong in the past:
            path_hash = ids = None
        first_run = not cache.files and cache.do_files
        if first_run:
            logger.debug('Processing files ...')
        chunks = None
        if ids is not None:
            # Make sure all ids are available
            for id_ in ids:
                if not cache.seen_chunk(id_):
                    break
            else:
                chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids]
                status = 'U'  # regular file, unchanged
        else:
            status = 'A'  # regular file, added
        item = Item(
            path=safe_path,
            hardlink_master=st.st_nlink > 1,  # item is a hard link and has the chunks
        )
        item.update(self.stat_simple_attrs(st))
        # Only chunkify the file if needed
        if chunks is not None:
            item.chunks = chunks
        else:
            compress = self.compression_decider1.decide(path)
            self.file_compression_logger.debug('%s -> compression %s', path, compress['name'])
            with backup_io:
                fh = Archive._open_rb(path)
            with os.fdopen(fh, 'rb') as fd:
                self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh)), compress=compress)
            if not is_special_file:
                # we must not memorize special files, because the contents of e.g. a
                # block or char device will change without its mtime/size/inode changing.
                cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
            status = status or 'M'  # regular file, modified (if not 'A' already)
        item.update(self.stat_attrs(st, path))
        if is_special_file:
            # we processed a special file like a regular file. reflect that in mode,
            # so it can be extracted / accessed in FUSE mount like a regular file:
            item.mode = stat.S_IFREG | stat.S_IMODE(item.mode)
        self.stats.nfiles += 1
        self.add_item(item)
        return status

    @staticmethod
    def list_archives(repository, key, manifest, cache=None):
        # expensive! see also Manifest.list_archive_infos.
        for name in manifest.archives:
            yield Archive(repository, key, manifest, name, cache=cache)

    @staticmethod
    def _open_rb(path):
        try:
            # if we have O_NOATIME, this likely will succeed if we are root or owner of file:
            return os.open(path, flags_noatime)
        except PermissionError:
            if flags_noatime == flags_normal:
                # we do not have O_NOATIME, no need to try again:
                raise
            # Was this EPERM due to the O_NOATIME flag? Try again without it:
            return os.open(path, flags_normal)
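# Background note (platform behavior as commonly documented, not taken from
# this module): O_NOATIME is a Linux-specific open(2) flag that suppresses
# access-time updates while reading, so a backup run does not touch every
# file's atime. The kernel only allows it for the file's owner or a
# privileged process and raises EPERM otherwise, which is why _open_rb()
# falls back to flags_normal on PermissionError. On platforms without
# os.O_NOATIME, flags_noatime equals flags_normal and the retry is skipped.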
def valid_msgpacked_dict(d, keys_serialized):
    """check if the data <d> looks like a msgpacked dict"""
    d_len = len(d)
    if d_len == 0:
        return False
    if d[0] & 0xf0 == 0x80:  # object is a fixmap (up to 15 elements)
        offs = 1
    elif d[0] == 0xde:  # object is a map16 (up to 2^16-1 elements)
        offs = 3
    else:
        # object is not a map (dict)
        # note: we must not have dicts with > 2^16-1 elements
        return False
    if d_len <= offs:
        return False
    # is the first dict key a bytestring?
    if d[offs] & 0xe0 == 0xa0:  # key is a small bytestring (up to 31 chars)
        pass
    elif d[offs] in (0xd9, 0xda, 0xdb):  # key is a str8, str16 or str32
        pass
    else:
        # key is not a bytestring
        return False
    # is the bytestring any of the expected key names?
    key_serialized = d[offs:]
    return any(key_serialized.startswith(pattern) for pattern in keys_serialized)
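# Byte-layout sketch (illustrative, assuming a small dict and short key): a
# msgpacked dict like {'path': ...} starts with a fixmap marker
# (0x80 | element count, here 0x81), immediately followed by the first key as
# a fixstr (0xa0 | length, here 0xa4 followed by b'path'). So for a candidate
# byte prefix d of an item stream,
#
#     valid_msgpacked_dict(d, [msgpack.packb('path')])
#
# checks d[0] for the map marker and then matches the serialized key bytes at
# the offset right after it.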
class RobustUnpacker:
    """A restartable/robust version of the streaming msgpack unpacker"""

    class UnpackerCrashed(Exception):
        """raise if unpacker crashed"""

    def __init__(self, validator, item_keys):
        super().__init__()
        self.item_keys = [msgpack.packb(name.encode()) for name in item_keys]
        self.validator = validator
        self._buffered_data = []
        self._resync = False
        self._unpacker = msgpack.Unpacker(object_hook=StableDict)

    def resync(self):
        self._buffered_data = []
        self._resync = True

    def feed(self, data):
        if self._resync:
            self._buffered_data.append(data)
        else:
            self._unpacker.feed(data)

    def __iter__(self):
        return self

    def __next__(self):
        def unpack_next():
            try:
                return next(self._unpacker)
            except (TypeError, ValueError) as err:
                # transform exceptions that might be raised when feeding
                # msgpack with invalid data to a more specific exception
                raise self.UnpackerCrashed(str(err))

        if self._resync:
            data = b''.join(self._buffered_data)
            while self._resync:
                if not data:
                    raise StopIteration
                # Abort early if the data does not look like a serialized item dict
                if not valid_msgpacked_dict(data, self.item_keys):
                    data = data[1:]
                    continue
                self._unpacker = msgpack.Unpacker(object_hook=StableDict)
                self._unpacker.feed(data)
                try:
                    item = unpack_next()
                except (self.UnpackerCrashed, StopIteration):
                    # as long as we are resyncing, we also ignore StopIteration
                    pass
                else:
                    if self.validator(item):
                        self._resync = False
                        return item
                data = data[1:]
        else:
            return unpack_next()
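# Recovery sketch (descriptive of the class above; validator and item_keys
# values are illustrative): when item metadata is damaged, the caller switches
# to resync mode and feeds raw chunk data; __next__ then slides a byte offset
# forward until the buffer starts with something that looks like a msgpacked
# item dict, re-creates the underlying unpacker there, and leaves resync mode
# once the validator accepts a decoded item:
#
#     unpacker = RobustUnpacker(validator=lambda item: isinstance(item, dict),
#                               item_keys=['path', 'mode'])
#     unpacker.resync()
#     unpacker.feed(possibly_corrupt_bytes)
#     for item in unpacker:
#         ...  # only validated items are yielded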
class ArchiveChecker:

    def __init__(self):
        self.error_found = False
        self.possibly_superseded = set()

    def check(self, repository, repair=False, archive=None, first=0, last=0, sort_by='', prefix='',
              verify_data=False, save_space=False):
        """Perform a set of checks on 'repository'

        :param repair: enable repair mode, write updated or corrected data into repository
        :param archive: only check this archive
        :param first/last/sort_by: only check this number of first/last archives ordered by sort_by
        :param prefix: only check archives with this prefix
        :param verify_data: integrity verification of data referenced by archives
        :param save_space: Repository.commit(save_space)
        """
        logger.info('Starting archive consistency check...')
        self.check_all = archive is None and not any((first, last, prefix))
        self.repair = repair
        self.repository = repository
        self.init_chunks()
        if not self.chunks:
            logger.error('Repository contains no apparent data at all, cannot continue check/repair.')
            return False
        self.key = self.identify_key(repository)
        if verify_data:
            self.verify_data()
        if Manifest.MANIFEST_ID not in self.chunks:
            logger.error("Repository manifest not found!")
            self.error_found = True
            self.manifest = self.rebuild_manifest()
        else:
            try:
                self.manifest, _ = Manifest.load(repository, key=self.key)
            except IntegrityError as exc:
                logger.error('Repository manifest is corrupted: %s', exc)
                self.error_found = True
                del self.chunks[Manifest.MANIFEST_ID]
                self.manifest = self.rebuild_manifest()
        self.rebuild_refcounts(archive=archive, first=first, last=last, sort_by=sort_by, prefix=prefix)
        self.orphan_chunks_check()
        self.finish(save_space=save_space)
        if self.error_found:
            logger.error('Archive consistency check complete, problems found.')
        else:
            logger.info('Archive consistency check complete, no problems found.')
        return self.repair or not self.error_found

    def init_chunks(self):
        """Fetch a list of all object keys from repository"""
        # Explicitly set the initial hash table capacity to avoid performance issues
        # due to hash table "resonance".
        # Since reconstruction of archive items can add some new chunks, add 10 % headroom
        capacity = int(len(self.repository) / ChunkIndex.MAX_LOAD_FACTOR * 1.1)
        self.chunks = ChunkIndex(capacity)
        marker = None
        while True:
            result = self.repository.list(limit=10000, marker=marker)
            if not result:
                break
            marker = result[-1]
            init_entry = ChunkIndexEntry(refcount=0, size=0, csize=0)
            for id_ in result:
                self.chunks[id_] = init_entry

    def identify_key(self, repository):
        try:
            some_chunkid, _ = next(self.chunks.iteritems())
        except StopIteration:
            # repo is completely empty, no chunks
            return None
        cdata = repository.get(some_chunkid)
        return key_factory(repository, cdata)

    def verify_data(self):
        logger.info('Starting cryptographic data integrity verification...')
        chunks_count_index = len(self.chunks)
        chunks_count_segments = 0
        errors = 0
        defect_chunks = []
        pi = ProgressIndicatorPercent(total=chunks_count_index, msg="Verifying data %6.2f%%", step=0.01)
        marker = None
        while True:
            chunk_ids = self.repository.scan(limit=100, marker=marker)
            if not chunk_ids:
                break
            chunks_count_segments += len(chunk_ids)
            marker = chunk_ids[-1]
            chunk_data_iter = self.repository.get_many(chunk_ids)
            chunk_ids_revd = list(reversed(chunk_ids))
            while chunk_ids_revd:
                pi.show()
                chunk_id = chunk_ids_revd.pop(-1)  # better efficiency
                try:
                    encrypted_data = next(chunk_data_iter)
                except (Repository.ObjectNotFound, IntegrityError) as err:
                    self.error_found = True
                    errors += 1
                    logger.error('chunk %s: %s', bin_to_hex(chunk_id), err)
                    if isinstance(err, IntegrityError):
                        defect_chunks.append(chunk_id)
                    # as the exception killed our generator, make a new one for remaining chunks:
                    if chunk_ids_revd:
                        chunk_ids = list(reversed(chunk_ids_revd))
                        chunk_data_iter = self.repository.get_many(chunk_ids)
                else:
                    try:
                        _chunk_id = None if chunk_id == Manifest.MANIFEST_ID else chunk_id
                        _, data = self.key.decrypt(_chunk_id, encrypted_data)
                    except IntegrityError as integrity_error:
                        self.error_found = True
                        errors += 1
                        logger.error('chunk %s, integrity error: %s', bin_to_hex(chunk_id), integrity_error)
                        defect_chunks.append(chunk_id)
        pi.finish()
        if chunks_count_index != chunks_count_segments:
            logger.error('Repo/Chunks index object count vs. segment files object count mismatch.')
            logger.error('Repo/Chunks index: %d objects != segment files: %d objects',
                         chunks_count_index, chunks_count_segments)
        if defect_chunks:
            if self.repair:
                # if we kill the defect chunk here, subsequent actions within this "borg check"
                # run will find missing chunks and replace them with all-zero replacement
                # chunks and flag the files as "repaired".
                # if another backup is done later and the missing chunks get backed up again,
                # a "borg check" afterwards can heal all files where this chunk was missing.
                logger.warning('Found defect chunks. They will be deleted now, so affected files can '
                               'get repaired now and maybe healed later.')
                for defect_chunk in defect_chunks:
                    # remote repo (ssh): retry might help for strange network / NIC / RAM errors
                    # as the chunk will be retransmitted from remote server.
                    # local repo (fs): as chunks.iteritems loop usually pumps a lot of data through,
                    # a defect chunk is likely not in the fs cache any more and really gets re-read
                    # from the underlying media.
                    encrypted_data = self.repository.get(defect_chunk)
                    try:
                        _chunk_id = None if defect_chunk == Manifest.MANIFEST_ID else defect_chunk
                        self.key.decrypt(_chunk_id, encrypted_data)
                    except IntegrityError:
                        # failed twice -> get rid of this chunk
                        del self.chunks[defect_chunk]
                        self.repository.delete(defect_chunk)
                        logger.debug('chunk %s deleted.', bin_to_hex(defect_chunk))
|
|
|
|
else:
|
|
|
|
logger.warning('chunk %s not deleted, did not consistently fail.')
|
|
|
|
else:
|
|
|
|
logger.warning('Found defect chunks. With --repair, they would get deleted, so affected '
|
|
|
|
'files could get repaired then and maybe healed later.')
|
|
|
|
for defect_chunk in defect_chunks:
|
|
|
|
logger.debug('chunk %s is defect.', bin_to_hex(defect_chunk))
|
2016-05-13 20:50:34 +00:00
|
|
|
log = logger.error if errors else logger.info
|
2016-10-04 20:05:26 +00:00
|
|
|
log('Finished cryptographic data integrity verification, verified %d chunks with %d integrity errors.',
|
|
|
|
chunks_count_segments, errors)
|
2016-05-13 20:50:34 +00:00
|
|
|
|

    def rebuild_manifest(self):
        """Rebuild the manifest object if it is missing

        Iterates through all objects in the repository looking for archive metadata blocks.
        """
        required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS)

        def valid_archive(obj):
            if not isinstance(obj, dict):
                return False
            keys = set(obj)
            return required_archive_keys.issubset(keys)

        logger.info('Rebuilding missing manifest, this might take some time...')
        # as we have lost the manifest, we do not know any more what valid item keys we had.
        # collecting any key we encounter in a damaged repo seems unwise, thus we just use
        # the hardcoded list from the source code. thus, it is not recommended to rebuild a
        # lost manifest on an older borg version than the most recent one that was ever used
        # within this repository (assuming that newer borg versions support more item keys).
        manifest = Manifest(self.key, self.repository)
        archive_keys_serialized = [msgpack.packb(name.encode()) for name in ARCHIVE_KEYS]
        for chunk_id, _ in self.chunks.iteritems():
            cdata = self.repository.get(chunk_id)
            try:
                _, data = self.key.decrypt(chunk_id, cdata)
            except IntegrityError as exc:
                logger.error('Skipping corrupted chunk: %s', exc)
                self.error_found = True
                continue
            if not valid_msgpacked_dict(data, archive_keys_serialized):
                continue
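            # cheap pre-filter before unpacking: archive metadata must contain a 'cmdline'
            # key and the msgpack encoding of 'version': 1 (0xa7 = fixstr of length 7, 0x01 = 1)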
            if b'cmdline' not in data or b'\xa7version\x01' not in data:
                continue
            try:
                archive = msgpack.unpackb(data)
            # Ignore exceptions that might be raised when feeding
            # msgpack with invalid data
            except (TypeError, ValueError, StopIteration):
                continue
            if valid_archive(archive):
                archive = ArchiveItem(internal_dict=archive)
                logger.info('Found archive %s', archive.name)
                manifest.archives[archive.name] = (chunk_id, archive.time)
        logger.info('Manifest rebuild complete.')
        return manifest

    def rebuild_refcounts(self, archive=None, first=0, last=0, sort_by='', prefix=''):
        """Rebuild object reference counts by walking the metadata

        Missing and/or incorrect data is repaired when detected
        """
        # Exclude the manifest from chunks
        del self.chunks[Manifest.MANIFEST_ID]

        def mark_as_possibly_superseded(id_):
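            # chunks referenced only by superseded (old) metadata end up with refcount 0;
            # remember them so orphan_chunks_check() can tell this expected garbage apart
            # from truly orphaned objects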
            if self.chunks.get(id_, ChunkIndexEntry(0, 0, 0)).refcount == 0:
                self.possibly_superseded.add(id_)

        def add_callback(chunk):
            id_ = self.key.id_hash(chunk.data)
            cdata = self.key.encrypt(chunk)
            add_reference(id_, len(chunk.data), len(cdata), cdata)
            return id_

        def add_reference(id_, size, csize, cdata=None):
            try:
                self.chunks.incref(id_)
            except KeyError:
                assert cdata is not None
                self.chunks[id_] = ChunkIndexEntry(refcount=1, size=size, csize=csize)
                if self.repair:
                    self.repository.put(id_, cdata)

        def verify_file_chunks(item):
            """Verifies that all file chunks are present.

            Missing file chunks will be replaced with new chunks of the same length containing all zeros.
            If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
            """
            offset = 0
            chunk_list = []
            chunks_replaced = False
            has_chunks_healthy = 'chunks_healthy' in item
            chunks_current = item.chunks
            chunks_healthy = item.chunks_healthy if has_chunks_healthy else chunks_current
            assert len(chunks_current) == len(chunks_healthy)
            for chunk_current, chunk_healthy in zip(chunks_current, chunks_healthy):
                chunk_id, size, csize = chunk_healthy
                if chunk_id not in self.chunks:
                    # a chunk of the healthy list is missing
                    if chunk_current == chunk_healthy:
                        logger.error('{}: New missing file chunk detected (Byte {}-{}). '
                                     'Replacing with all-zero chunk.'.format(item.path, offset, offset + size))
                        self.error_found = chunks_replaced = True
                        data = bytes(size)
                        chunk_id = self.key.id_hash(data)
                        cdata = self.key.encrypt(Chunk(data))
                        csize = len(cdata)
                        add_reference(chunk_id, size, csize, cdata)
                    else:
                        logger.info('{}: Previously missing file chunk is still missing (Byte {}-{}). It has an '
                                    'all-zero replacement chunk already.'.format(item.path, offset, offset + size))
                        chunk_id, size, csize = chunk_current
                        add_reference(chunk_id, size, csize)
                else:
                    if chunk_current == chunk_healthy:
                        # normal case, all fine.
                        add_reference(chunk_id, size, csize)
                    else:
                        logger.info('{}: Healed previously missing file chunk! '
                                    '(Byte {}-{}).'.format(item.path, offset, offset + size))
                        add_reference(chunk_id, size, csize)
                        mark_as_possibly_superseded(chunk_current[0])  # maybe orphaned the all-zero replacement chunk
                chunk_list.append([chunk_id, size, csize])  # list-typed element as chunks_healthy is list-of-lists
                offset += size
            if chunks_replaced and not has_chunks_healthy:
                # if this is the first repair, remember the correct chunk IDs, so we can maybe heal the file later
                item.chunks_healthy = item.chunks
            if has_chunks_healthy and chunk_list == chunks_healthy:
                logger.info('{}: Completely healed previously damaged file!'.format(item.path))
                del item.chunks_healthy
            item.chunks = chunk_list

        def robust_iterator(archive):
            """Iterates through all archive items

            Missing item chunks will be skipped and the msgpack stream will be restarted
            """
            item_keys = frozenset(key.encode() for key in self.manifest.item_keys)
            required_item_keys = frozenset(key.encode() for key in REQUIRED_ITEM_KEYS)
            unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and 'path' in item,
                                      self.manifest.item_keys)
            _state = 0

            def missing_chunk_detector(chunk_id):
                nonlocal _state
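                # key function for groupby(): _state is even while item chunks are present
                # and odd while they are missing, flipping on every present/missing transition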
                if _state % 2 != int(chunk_id not in self.chunks):
                    _state += 1
                return _state

            def report(msg, chunk_id, chunk_no):
                cid = bin_to_hex(chunk_id)
                msg += ' [chunk: %06d_%s]' % (chunk_no, cid)  # see "debug dump-archive-items"
                self.error_found = True
                logger.error(msg)

            def list_keys_safe(keys):
                return ', '.join(k.decode() if isinstance(k, bytes) else str(k) for k in keys)

            def valid_item(obj):
                if not isinstance(obj, StableDict):
                    return False, 'not a dictionary'
                # A bug in Attic up to and including release 0.13 added a (meaningless) b'acl' key to every item.
                # We ignore it here, should it exist. See test_attic013_acl_bug for details.
                obj.pop(b'acl', None)
                keys = set(obj)
                if not required_item_keys.issubset(keys):
                    return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys)
                if not keys.issubset(item_keys):
                    return False, 'invalid keys: ' + list_keys_safe(keys - item_keys)
                return True, ''

            i = 0
            for state, items in groupby(archive.items, missing_chunk_detector):
                items = list(items)
                if state % 2:
                    for chunk_id in items:
                        report('item metadata chunk missing', chunk_id, i)
                        i += 1
                    continue
                if state > 0:
                    unpacker.resync()
                for chunk_id, cdata in zip(items, repository.get_many(items)):
                    _, data = self.key.decrypt(chunk_id, cdata)
                    unpacker.feed(data)
                    try:
                        for item in unpacker:
                            valid, reason = valid_item(item)
                            if valid:
                                yield Item(internal_dict=item)
                            else:
                                report('Did not get expected metadata dict when unpacking item metadata (%s)' % reason,
                                       chunk_id, i)
                    except RobustUnpacker.UnpackerCrashed:
                        report('Unpacker crashed while unpacking item metadata, trying to resync...', chunk_id, i)
                        unpacker.resync()
                    except Exception:
                        report('Exception while unpacking item metadata', chunk_id, i)
                        raise
                    i += 1

        if archive is None:
            sort_by = sort_by.split(',')
            if any((first, last, prefix)):
                archive_infos = self.manifest.archives.list(sort_by=sort_by, prefix=prefix, first=first, last=last)
            else:
                archive_infos = self.manifest.archives.list(sort_by=sort_by)
        else:
            # we only want one specific archive
            info = self.manifest.archives.get(archive)
            if info is None:
                logger.error("Archive '%s' not found.", archive)
                archive_infos = []
            else:
                archive_infos = [info]
        num_archives = len(archive_infos)
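
        # avoid repeated round-trips to a remote repository by caching fetched objects
        # locally; cache_if_remote is a pass-through for local repositories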
        with cache_if_remote(self.repository) as repository:
            for i, info in enumerate(archive_infos):
                logger.info('Analyzing archive {} ({}/{})'.format(info.name, i + 1, num_archives))
                archive_id = info.id
                if archive_id not in self.chunks:
                    logger.error('Archive metadata block is missing!')
                    self.error_found = True
                    del self.manifest.archives[info.name]
                    continue
                mark_as_possibly_superseded(archive_id)
                cdata = self.repository.get(archive_id)
                _, data = self.key.decrypt(archive_id, cdata)
                archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
                if archive.version != 1:
                    raise Exception('Unknown archive metadata version')
                archive.cmdline = [safe_decode(arg) for arg in archive.cmdline]
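                # re-serialize all (possibly repaired) items through a ChunkBuffer whose
                # write_chunk hook (add_callback) encrypts, stores and refcounts each new
                # metadata chunk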
                items_buffer = ChunkBuffer(self.key)
                items_buffer.write_chunk = add_callback
                for item in robust_iterator(archive):
                    if 'chunks' in item:
                        verify_file_chunks(item)
                    items_buffer.add(item)
                items_buffer.flush(flush=True)
                for previous_item_id in archive.items:
                    mark_as_possibly_superseded(previous_item_id)
                archive.items = items_buffer.chunks
                data = msgpack.packb(archive.as_dict(), unicode_errors='surrogateescape')
                new_archive_id = self.key.id_hash(data)
                cdata = self.key.encrypt(Chunk(data))
                add_reference(new_archive_id, len(data), len(cdata), cdata)
                self.manifest.archives[info.name] = (new_archive_id, info.ts)

    def orphan_chunks_check(self):
        if self.check_all:
            unused = {id_ for id_, entry in self.chunks.iteritems() if entry.refcount == 0}
            orphaned = unused - self.possibly_superseded
            if orphaned:
                logger.error('{} orphaned objects found!'.format(len(orphaned)))
                self.error_found = True
            if self.repair:
                for id_ in unused:
                    self.repository.delete(id_)
        else:
            logger.info('Orphaned objects check skipped (needs all archives checked).')

    def finish(self, save_space=False):
        if self.repair:
            self.manifest.write()
            self.repository.commit(save_space=save_space)


class ArchiveRecreater:
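    """Recreate archives: filter out unwanted items, re-chunk and/or re-compress file data."""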

    class Interrupted(Exception):
        def __init__(self, metadata=None):
            self.metadata = metadata or {}

    @staticmethod
    def is_temporary_archive(archive_name):
        return archive_name.endswith('.recreate')

    def __init__(self, repository, manifest, key, cache, matcher,
                 exclude_caches=False, exclude_if_present=None, keep_tag_files=False,
                 chunker_params=None, compression=None, compression_files=None, always_recompress=False,
                 dry_run=False, stats=False, progress=False, file_status_printer=None,
                 checkpoint_interval=1800):
        self.repository = repository
        self.key = key
        self.manifest = manifest
        self.cache = cache

        self.matcher = matcher
        self.exclude_caches = exclude_caches
        self.exclude_if_present = exclude_if_present or []
        self.keep_tag_files = keep_tag_files

        self.rechunkify = chunker_params is not None
        if self.rechunkify:
            logger.debug('Rechunking archives to %s', chunker_params)
        self.chunker_params = chunker_params or CHUNKER_PARAMS
        self.recompress = bool(compression)
        self.always_recompress = always_recompress
        self.compression = compression or CompressionSpec('none')
        self.seen_chunks = set()
        self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
                                                        compression_files or [])
        key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))

        self.dry_run = dry_run
        self.stats = stats
        self.progress = progress
        self.print_file_status = file_status_printer or (lambda *args: None)
        self.checkpoint_interval = None if dry_run else checkpoint_interval

    def recreate(self, archive_name, comment=None, target_name=None):
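        """Recreate archive_name in place, or as target_name without touching the original."""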
        assert not self.is_temporary_archive(archive_name)
        archive = self.open_archive(archive_name)
        target = self.create_target(archive, target_name)
        if self.exclude_if_present or self.exclude_caches:
            self.matcher_add_tagged_dirs(archive)
        if self.matcher.empty() and not self.recompress and not target.recreate_rechunkify and comment is None:
            logger.info("Skipping archive %s, nothing to do", archive_name)
            return
        self.process_items(archive, target)
        replace_original = target_name is None
        self.save(archive, target, comment, replace_original=replace_original)

    def process_items(self, archive, target):
        matcher = self.matcher
        target_is_subset = not matcher.empty()
        hardlink_masters = {} if target_is_subset else None

        def item_is_hardlink_master(item):
            return (target_is_subset and
                    stat.S_ISREG(item.mode) and
                    item.get('hardlink_master', True) and
                    'source' not in item and
                    not matcher.match(item.path))

        for item in archive.iter_items():
            if item_is_hardlink_master(item):
                hardlink_masters[item.path] = (item.get('chunks'), None)
                continue
            if not matcher.match(item.path):
                self.print_file_status('x', item.path)
                continue
            if target_is_subset and stat.S_ISREG(item.mode) and item.get('source') in hardlink_masters:
                # master of this hard link is outside the target subset
                chunks, new_source = hardlink_masters[item.source]
                if new_source is None:
                    # First item to use this master, move the chunks
                    item.chunks = chunks
                    hardlink_masters[item.source] = (None, item.path)
                    del item.source
                else:
                    # Master was already moved, only update this item's source
                    item.source = new_source
            if self.dry_run:
                self.print_file_status('-', item.path)
            else:
                self.process_item(archive, target, item)
        if self.progress:
            target.stats.show_progress(final=True)

    def process_item(self, archive, target, item):
        if 'chunks' in item:
            self.process_chunks(archive, target, item)
            target.stats.nfiles += 1
        target.add_item(item)
        self.print_file_status(file_status(item.mode), item.path)

    def process_chunks(self, archive, target, item):
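        """Store the item's file content in the target archive.

        Fast path: when neither re-compression nor re-chunking is requested, just
        incref the chunks the item already has.
        """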
        if not self.recompress and not target.recreate_rechunkify:
            for chunk_id, size, csize in item.chunks:
                self.cache.chunk_incref(chunk_id, target.stats)
            return item.chunks
        chunk_iterator = self.iter_chunks(archive, target, list(item.chunks))
        compress = self.compression_decider1.decide(item.path)
        chunk_processor = partial(self.chunk_processor, target, compress)
        target.chunk_file(item, self.cache, target.stats, chunk_iterator, chunk_processor)

    def chunk_processor(self, target, compress, data):
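        """Deduplicate, optionally re-compress, and store a single chunk, returning its chunk list entry."""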
        chunk_id = self.key.id_hash(data)
        if chunk_id in self.seen_chunks:
            return self.cache.chunk_incref(chunk_id, target.stats)
        chunk = Chunk(data, compress=compress)
        compression_spec, chunk = self.key.compression_decider2.decide(chunk)
        overwrite = self.recompress
        if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
            # Check if this chunk is already compressed the way we want it
            old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
            if Compressor.detect(old_chunk.data).name == compression_spec['name']:
                # Stored chunk has the same compression we wanted
                overwrite = False
        chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
        self.seen_chunks.add(chunk_entry.id)
        return chunk_entry

    def iter_chunks(self, archive, target, chunks):
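        """Yield the file's content for the target archive: stored chunk data as-is, or re-chunked by the target chunker."""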
        chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _, _ in chunks])
        if target.recreate_rechunkify:
            # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
            # (does not load the entire file into memory)
            file = ChunkIteratorFileWrapper(chunk_iterator)
            yield from target.chunker.chunkify(file)
        else:
            for chunk in chunk_iterator:
                yield chunk.data

    def save(self, archive, target, comment=None, replace_original=True):
        if self.dry_run:
            return
        timestamp = archive.ts.replace(tzinfo=None)
        if comment is None:
            comment = archive.metadata.get('comment', '')
        target.save(timestamp=timestamp, comment=comment, additional_metadata={
            'cmdline': archive.metadata.cmdline,
            'recreate_cmdline': sys.argv,
        })
        if replace_original:
            archive.delete(Statistics(), progress=self.progress)
            target.rename(archive.name)
        if self.stats:
            target.end = datetime.utcnow()
            log_multi(DASHES,
                      str(target),
                      DASHES,
                      str(target.stats),
                      str(self.cache),
                      DASHES)

    def matcher_add_tagged_dirs(self, archive):
        """Add excludes to the matcher created by exclude_caches and exclude_if_present."""
        def exclude(dir, tag_item):
            if self.keep_tag_files:
                tag_files.append(PathPrefixPattern(tag_item.path))
                tagged_dirs.append(FnmatchPattern(dir + '/'))
            else:
                tagged_dirs.append(PathPrefixPattern(dir))

        matcher = self.matcher
        tag_files = []
        tagged_dirs = []
        # build hardlink masters, but only for paths ending in CACHE_TAG_NAME, so we can read hard-linked TAGs
        cachedir_masters = {}

        for item in archive.iter_items(
                filter=lambda item: item.path.endswith(CACHE_TAG_NAME) or matcher.match(item.path)):
            if item.path.endswith(CACHE_TAG_NAME):
                cachedir_masters[item.path] = item
            if stat.S_ISREG(item.mode):
                dir, tag_file = os.path.split(item.path)
                if tag_file in self.exclude_if_present:
                    exclude(dir, item)
                if self.exclude_caches and tag_file == CACHE_TAG_NAME:
                    if 'chunks' in item:
                        file = open_item(archive, item)
                    else:
                        file = open_item(archive, cachedir_masters[item.source])
                    if file.read(len(CACHE_TAG_CONTENTS)).startswith(CACHE_TAG_CONTENTS):
                        exclude(dir, item)
        matcher.add(tag_files, True)
        matcher.add(tagged_dirs, False)

    def create_target(self, archive, target_name=None):
        """Create target archive."""
        target_name = target_name or archive.name + '.recreate'
        target = self.create_target_archive(target_name)
        # If the archives use the same chunker params, then don't rechunkify
        source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
        target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
        if target.recreate_rechunkify:
            logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
        return target

    def create_target_archive(self, name):
        target = Archive(self.repository, self.key, self.manifest, name, create=True,
                         progress=self.progress, chunker_params=self.chunker_params, cache=self.cache,
                         checkpoint_interval=self.checkpoint_interval, compression=self.compression)
        return target

    def open_archive(self, name, **kwargs):
        return Archive(self.repository, self.key, self.manifest, name, cache=self.cache, **kwargs)
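

# Minimal driver sketch (hypothetical, not part of the borg CLI code), assuming an already
# opened repository together with its manifest, key, cache and an (empty) matcher object:
#
#     recreater = ArchiveRecreater(repository, manifest, key, cache, matcher,
#                                  chunker_params=CHUNKER_PARAMS, dry_run=True)
#     recreater.recreate('my-archive', comment='rechunked copy')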