import errno
import os
import socket
import stat
import sys
import time
from contextlib import contextmanager
from datetime import datetime, timezone
from getpass import getuser
from io import BytesIO
from itertools import groupby
from shutil import get_terminal_size

import msgpack
from .logger import create_logger
logger = create_logger()

from . import xattr
from .cache import ChunkListEntry
from .chunker import Chunker
from .compress import Compressor
from .constants import *  # NOQA
from .hashindex import ChunkIndex, ChunkIndexEntry
from .helpers import Manifest
from .helpers import Chunk, ChunkIteratorFileWrapper, open_item
from .helpers import Error, IntegrityError
from .helpers import uid2user, user2uid, gid2group, group2gid
from .helpers import parse_timestamp, to_localtime
from .helpers import format_time, format_timedelta, format_file_size, file_status
from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates
from .helpers import decode_dict, StableDict
from .helpers import int_to_bigint, bigint_to_int, bin_to_hex
from .helpers import ProgressIndicatorPercent, log_multi
from .helpers import PathPrefixPattern, FnmatchPattern
from .helpers import consume
from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec
from .item import Item
from .key import key_factory
from .platform import acl_get, acl_set, set_flags, get_flags, swidth
from .remote import cache_if_remote
from .repository import Repository

has_lchmod = hasattr(os, 'lchmod')

flags_normal = os.O_RDONLY | getattr(os, 'O_BINARY', 0)
flags_noatime = flags_normal | getattr(os, 'O_NOATIME', 0)

class Statistics:

    def __init__(self):
        self.osize = self.csize = self.usize = self.nfiles = 0
        self.last_progress = 0  # timestamp when last progress was shown

    def update(self, size, csize, unique):
        self.osize += size
        self.csize += csize
        if unique:
            self.usize += csize

    summary = "{label:15} {stats.osize_fmt:>20s} {stats.csize_fmt:>20s} {stats.usize_fmt:>20s}"

    def __str__(self):
        return self.summary.format(stats=self, label='This archive:')

    def __repr__(self):
        return "<{cls} object at {hash:#x} ({self.osize}, {self.csize}, {self.usize})>".format(
            cls=type(self).__name__, hash=id(self), self=self)

    @property
    def osize_fmt(self):
        return format_file_size(self.osize)

    @property
    def usize_fmt(self):
        return format_file_size(self.usize)

    @property
    def csize_fmt(self):
        return format_file_size(self.csize)

    def show_progress(self, item=None, final=False, stream=None, dt=None):
        now = time.time()
        if dt is None or now - self.last_progress > dt:
            self.last_progress = now
            columns, lines = get_terminal_size()
            if not final:
                msg = '{0.osize_fmt} O {0.csize_fmt} C {0.usize_fmt} D {0.nfiles} N '.format(self)
                path = remove_surrogates(item.path) if item else ''
                space = columns - swidth(msg)
                if space < swidth('...') + swidth(path):
                    path = '%s...%s' % (path[:(space // 2) - swidth('...')], path[-space // 2:])
                msg += "{0:<{space}}".format(path, space=space)
            else:
                msg = ' ' * columns
            print(msg, file=stream or sys.stderr, end="\r", flush=True)

def is_special(mode):
    # file types that get special treatment in --read-special mode
    return stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode)


class BackupOSError(Exception):
    """
    Wrapper for OSError raised while accessing backup files.

    Borg does different kinds of IO, and IO failures have different consequences.
    This wrapper represents failures of input file or extraction IO.
    These are non-critical and are only reported (exit code = 1, warning).

    Any unwrapped IO error is critical and aborts execution (for example repository IO failure).
    """
    def __init__(self, os_error):
        self.os_error = os_error
        self.errno = os_error.errno
        self.strerror = os_error.strerror
        self.filename = os_error.filename

    def __str__(self):
        return str(self.os_error)


@contextmanager
def backup_io():
    """Context manager changing OSError to BackupOSError."""
    try:
        yield
    except OSError as os_error:
        raise BackupOSError(os_error) from os_error


def backup_io_iter(iterator):
    while True:
        try:
            with backup_io():
                item = next(iterator)
        except StopIteration:
            return
        yield item
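
# Illustrative note (a usage sketch for readability, not part of the original module):
# backup_io() is meant to wrap only *input file / extraction* IO, e.g.
#
#     with backup_io():
#         fd = open(path, 'rb')   # an OSError here surfaces as BackupOSError (warning, exit code 1)
#
# Repository IO is deliberately left unwrapped, so such failures remain fatal.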

class DownloadPipeline:

    def __init__(self, repository, key):
        self.repository = repository
        self.key = key

    def unpack_many(self, ids, filter=None, preload=False):
        unpacker = msgpack.Unpacker(use_list=False)
        for _, data in self.fetch_many(ids):
            unpacker.feed(data)
            items = [Item(internal_dict=item) for item in unpacker]
            if filter:
                items = [item for item in items if filter(item)]
            for item in items:
                if 'chunks' in item:
                    item.chunks = [ChunkListEntry(*e) for e in item.chunks]
            if preload:
                for item in items:
                    if 'chunks' in item:
                        self.repository.preload([c.id for c in item.chunks])
            for item in items:
                yield item

    def fetch_many(self, ids, is_preloaded=False):
        for id_, data in zip(ids, self.repository.get_many(ids, is_preloaded=is_preloaded)):
            yield self.key.decrypt(id_, data)

class ChunkBuffer:
    BUFFER_SIZE = 8 * 1024 * 1024

    def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
        self.buffer = BytesIO()
        self.packer = msgpack.Packer(unicode_errors='surrogateescape')
        self.chunks = []
        self.key = key
        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)

    def add(self, item):
        self.buffer.write(self.packer.pack(item.as_dict()))
        if self.is_full():
            self.flush()

    def write_chunk(self, chunk):
        raise NotImplementedError

    def flush(self, flush=False):
        if self.buffer.tell() == 0:
            return
        self.buffer.seek(0)
        chunks = list(Chunk(bytes(s)) for s in self.chunker.chunkify(self.buffer))
        self.buffer.seek(0)
        self.buffer.truncate(0)
        # Leave the last partial chunk in the buffer unless flush is True
        end = None if flush or len(chunks) == 1 else -1
        for chunk in chunks[:end]:
            self.chunks.append(self.write_chunk(chunk))
        if end == -1:
            self.buffer.write(chunks[-1].data)

    def is_full(self):
        return self.buffer.tell() > self.BUFFER_SIZE


class CacheChunkBuffer(ChunkBuffer):

    def __init__(self, cache, key, stats, chunker_params=ITEMS_CHUNKER_PARAMS):
        super().__init__(key, chunker_params)
        self.cache = cache
        self.stats = stats

    def write_chunk(self, chunk):
        id_, _, _ = self.cache.add_chunk(self.key.id_hash(chunk.data), chunk, self.stats)
        return id_
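
# Illustrative note (flow summary added for readability, not in the original source): while an
# archive is created, each Item's metadata dict is msgpack-packed into the buffer above; the
# content-defined Chunker cuts that stream into chunks, CacheChunkBuffer.write_chunk() stores
# them in the cache/repository and collects their ids, and Archive.save() below records that
# id list as the archive's 'items' metadata entry.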

class Archive:

    class DoesNotExist(Error):
        """Archive {} does not exist"""

    class AlreadyExists(Error):
        """Archive {} already exists"""

    class IncompatibleFilesystemEncodingError(Error):
        """Failed to encode filename "{}" into file system encoding "{}". Consider configuring the LANG environment variable."""

    def __init__(self, repository, key, manifest, name, cache=None, create=False,
                 checkpoint_interval=300, numeric_owner=False, progress=False,
                 chunker_params=CHUNKER_PARAMS, start=None, end=None, compression=None, compression_files=None,
                 consider_part_files=False):
        self.cwd = os.getcwd()
        self.key = key
        self.repository = repository
        self.cache = cache
        self.manifest = manifest
        self.hard_links = {}
        self.stats = Statistics()
        self.show_progress = progress
        self.name = name
        self.checkpoint_interval = checkpoint_interval
        self.numeric_owner = numeric_owner
        if start is None:
            start = datetime.utcnow()
        self.chunker_params = chunker_params
        self.start = start
        if end is None:
            end = datetime.utcnow()
        self.end = end
        self.consider_part_files = consider_part_files
        self.pipeline = DownloadPipeline(self.repository, self.key)
        if create:
            self.file_compression_logger = create_logger('borg.debug.file-compression')
            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
            self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
            self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
                                                            compression_files or [])
            key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
            if name in manifest.archives:
                raise self.AlreadyExists(name)
            self.last_checkpoint = time.time()
            i = 0
            while True:
                self.checkpoint_name = '%s.checkpoint%s' % (name, i and ('.%d' % i) or '')
                if self.checkpoint_name not in manifest.archives:
                    break
                i += 1
        else:
            if name not in self.manifest.archives:
                raise self.DoesNotExist(name)
            info = self.manifest.archives[name]
            self.load(info[b'id'])
            self.zeros = b'\0' * (1 << chunker_params[1])

    def _load_meta(self, id):
        _, data = self.key.decrypt(id, self.repository.get(id))
        metadata = msgpack.unpackb(data)
        if metadata[b'version'] != 1:
            raise Exception('Unknown archive metadata version')
        return metadata

    def load(self, id):
        self.id = id
        self.metadata = self._load_meta(self.id)
        decode_dict(self.metadata, ARCHIVE_TEXT_KEYS)
        self.metadata[b'cmdline'] = [safe_decode(arg) for arg in self.metadata[b'cmdline']]
        self.name = self.metadata[b'name']

    @property
    def ts(self):
        """Timestamp of archive creation (start) in UTC"""
        ts = self.metadata[b'time']
        return parse_timestamp(ts)

    @property
    def ts_end(self):
        """Timestamp of archive creation (end) in UTC"""
        # fall back to time if there is no time_end present in metadata
        ts = self.metadata.get(b'time_end') or self.metadata[b'time']
        return parse_timestamp(ts)

    @property
    def fpr(self):
        return bin_to_hex(self.id)

    @property
    def duration(self):
        return format_timedelta(self.end - self.start)

    @property
    def duration_from_meta(self):
        return format_timedelta(self.ts_end - self.ts)

    def __str__(self):
        return '''\
Archive name: {0.name}
Archive fingerprint: {0.fpr}
Time (start): {start}
Time (end):   {end}
Duration: {0.duration}
Number of files: {0.stats.nfiles}'''.format(
            self,
            start=format_time(to_localtime(self.start.replace(tzinfo=timezone.utc))),
            end=format_time(to_localtime(self.end.replace(tzinfo=timezone.utc))))

    def __repr__(self):
        return 'Archive(%r)' % self.name

    def item_filter(self, item, filter=None):
        if not self.consider_part_files and 'part' in item:
            # this is a part(ial) file, we usually don't want to consider it.
            return False
        return filter(item) if filter else True

    def iter_items(self, filter=None, preload=False):
        for item in self.pipeline.unpack_many(self.metadata[b'items'], preload=preload,
                                              filter=lambda item: self.item_filter(item, filter)):
            yield item

    def add_item(self, item, show_progress=True):
        if show_progress and self.show_progress:
            self.stats.show_progress(item=item, dt=0.2)
        self.items_buffer.add(item)

    def write_checkpoint(self):
        self.save(self.checkpoint_name)
        del self.manifest.archives[self.checkpoint_name]
        self.cache.chunk_decref(self.id, self.stats)

    def save(self, name=None, comment=None, timestamp=None, additional_metadata=None):
        name = name or self.name
        if name in self.manifest.archives:
            raise self.AlreadyExists(name)
        self.items_buffer.flush(flush=True)
        if timestamp is None:
            self.end = datetime.utcnow()
            start = self.start
            end = self.end
        else:
            self.end = timestamp
            start = timestamp
            end = timestamp  # we only have 1 value
        metadata = {
            'version': 1,
            'name': name,
            'comment': comment,
            'items': self.items_buffer.chunks,
            'cmdline': sys.argv,
            'hostname': socket.gethostname(),
            'username': getuser(),
            'time': start.isoformat(),
            'time_end': end.isoformat(),
            'chunker_params': self.chunker_params,
        }
        metadata.update(additional_metadata or {})
        data = msgpack.packb(StableDict(metadata), unicode_errors='surrogateescape')
        self.id = self.key.id_hash(data)
        self.cache.add_chunk(self.id, Chunk(data), self.stats)
        self.manifest.archives[name] = {'id': self.id, 'time': metadata['time']}
        self.manifest.write()
        self.repository.commit()
        self.cache.commit()

    def calc_stats(self, cache):
        def add(id):
            count, size, csize = cache.chunks[id]
            stats.update(size, csize, count == 1)
            cache.chunks[id] = count - 1, size, csize

        def add_file_chunks(chunks):
            for id, _, _ in chunks:
                add(id)

        # This function is a bit evil since it abuses the cache to calculate
        # the stats. The cache transaction must be rolled back afterwards
        unpacker = msgpack.Unpacker(use_list=False)
        cache.begin_txn()
        stats = Statistics()
        add(self.id)
        for id, chunk in zip(self.metadata[b'items'], self.repository.get_many(self.metadata[b'items'])):
            add(id)
            _, data = self.key.decrypt(id, chunk)
            unpacker.feed(data)
            for item in unpacker:
                item = Item(internal_dict=item)
                if 'chunks' in item:
                    stats.nfiles += 1
                    add_file_chunks(item.chunks)
        cache.rollback()
        return stats

    def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False,
                     hardlink_masters=None, original_path=None):
        """
        Extract archive item.

        :param item: the item to extract
        :param restore_attrs: restore file attributes
        :param dry_run: do not write any data
        :param stdout: write extracted data to stdout
        :param sparse: write sparse files (chunk-granularity, independent of the original being sparse)
        :param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly
        :param original_path: 'path' key as stored in archive
        """
        has_damaged_chunks = 'chunks_healthy' in item
        if dry_run or stdout:
            if 'chunks' in item:
                for _, data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True):
                    if stdout:
                        sys.stdout.buffer.write(data)
                if stdout:
                    sys.stdout.buffer.flush()
                if has_damaged_chunks:
                    logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
                                   remove_surrogates(item.path))
            return

        original_path = original_path or item.path
        dest = self.cwd
        if item.path.startswith(('/', '..')):
            raise Exception('Path should be relative and local')
        path = os.path.join(dest, item.path)
        # Attempt to remove existing files, ignore errors on failure
        try:
            st = os.lstat(path)
            if stat.S_ISDIR(st.st_mode):
                os.rmdir(path)
            else:
                os.unlink(path)
        except UnicodeEncodeError:
            raise self.IncompatibleFilesystemEncodingError(path, sys.getfilesystemencoding()) from None
        except OSError:
            pass
        mode = item.mode
        if stat.S_ISREG(mode):
            with backup_io():
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
            # Hard link?
            if 'source' in item:
                source = os.path.join(dest, item.source)
                with backup_io():
                    if os.path.exists(path):
                        os.unlink(path)
                    if not hardlink_masters:
                        os.link(source, path)
                        return
                item.chunks, link_target = hardlink_masters[item.source]
                if link_target:
                    # Hard link was extracted previously, just link
                    with backup_io():
                        os.link(link_target, path)
                    return
                # Extract chunks, since the item which had the chunks was not extracted
            with backup_io():
                fd = open(path, 'wb')
            with fd:
                ids = [c.id for c in item.chunks]
                for _, data in self.pipeline.fetch_many(ids, is_preloaded=True):
                    with backup_io():
                        if sparse and self.zeros.startswith(data):
                            # all-zero chunk: create a hole in a sparse file
                            fd.seek(len(data), 1)
                        else:
                            fd.write(data)
                with backup_io():
                    pos = fd.tell()
                    fd.truncate(pos)
                    fd.flush()
                    self.restore_attrs(path, item, fd=fd.fileno())
            if has_damaged_chunks:
                logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
                               remove_surrogates(item.path))
            if hardlink_masters:
                # Update master entry with extracted file path, so that following hardlinks don't extract twice.
                hardlink_masters[item.get('source') or original_path] = (None, path)
            return
        with backup_io():
            # No repository access beyond this point.
            if stat.S_ISDIR(mode):
                if not os.path.exists(path):
                    os.makedirs(path)
                if restore_attrs:
                    self.restore_attrs(path, item)
            elif stat.S_ISLNK(mode):
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                source = item.source
                if os.path.exists(path):
                    os.unlink(path)
                try:
                    os.symlink(source, path)
                except UnicodeEncodeError:
                    raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None
                self.restore_attrs(path, item, symlink=True)
            elif stat.S_ISFIFO(mode):
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                os.mkfifo(path)
                self.restore_attrs(path, item)
            elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode):
                os.mknod(path, item.mode, item.rdev)
                self.restore_attrs(path, item)
            else:
                raise Exception('Unknown archive item type %r' % item.mode)

    def restore_attrs(self, path, item, symlink=False, fd=None):
        """
        Restore filesystem attributes on *path* (*fd*) from *item*.

        Does not access the repository.
        """
        uid = gid = None
        if not self.numeric_owner:
            uid = user2uid(item.user)
            gid = group2gid(item.group)
        uid = item.uid if uid is None else uid
        gid = item.gid if gid is None else gid
        # This code is a bit of a mess due to os specific differences
        try:
            if fd:
                os.fchown(fd, uid, gid)
            else:
                os.lchown(path, uid, gid)
        except OSError:
            pass
        if fd:
            os.fchmod(fd, item.mode)
        elif not symlink:
            os.chmod(path, item.mode)
        elif has_lchmod:  # Not available on Linux
            os.lchmod(path, item.mode)
        mtime = item.mtime
        if 'atime' in item:
            atime = item.atime
        else:
            # old archives only had mtime in item metadata
            atime = mtime
        try:
            if fd:
                os.utime(fd, None, ns=(atime, mtime))
            else:
                os.utime(path, None, ns=(atime, mtime), follow_symlinks=False)
        except OSError:
            # some systems don't support calling utime on a symlink
            pass
        acl_set(path, item, self.numeric_owner)
        if 'bsdflags' in item:
            try:
                set_flags(path, item.bsdflags, fd=fd)
            except OSError:
                pass
        # chown removes Linux capabilities, so set the extended attributes at the end, after chown,
        # since they include the Linux capabilities in the "security.capability" attribute.
        xattrs = item.get('xattrs', {})
        for k, v in xattrs.items():
            try:
                xattr.setxattr(fd or path, k, v, follow_symlinks=False)
            except OSError as e:
                if e.errno not in (errno.ENOTSUP, errno.EACCES):
                    # only raise if the errno is not on our ignore list:
                    # ENOTSUP == xattrs not supported here
                    # EACCES == permission denied to set this specific xattr
                    # (this may happen related to security.* keys)
                    raise

    def set_meta(self, key, value):
        metadata = StableDict(self._load_meta(self.id))
        metadata[key] = value
        data = msgpack.packb(metadata, unicode_errors='surrogateescape')
        new_id = self.key.id_hash(data)
        self.cache.add_chunk(new_id, Chunk(data), self.stats)
        self.manifest.archives[self.name] = {'id': new_id, 'time': metadata[b'time']}
        self.cache.chunk_decref(self.id, self.stats)
        self.id = new_id

    def rename(self, name):
        if name in self.manifest.archives:
            raise self.AlreadyExists(name)
        oldname = self.name
        self.name = name
        self.set_meta(b'name', name)
        del self.manifest.archives[oldname]

    def delete(self, stats, progress=False, forced=False):
        class ChunksIndexError(Error):
            """Chunk ID {} missing from chunks index, corrupted chunks index - aborting transaction."""

        def chunk_decref(id, stats):
            nonlocal error
            try:
                self.cache.chunk_decref(id, stats)
            except KeyError:
                cid = bin_to_hex(id)
                raise ChunksIndexError(cid)
            except Repository.ObjectNotFound as e:
                # object not in repo - strange, but we wanted to delete it anyway.
                if not forced:
                    raise
                error = True

        error = False
        try:
            unpacker = msgpack.Unpacker(use_list=False)
            items_ids = self.metadata[b'items']
            pi = ProgressIndicatorPercent(total=len(items_ids), msg="Decrementing references %3.0f%%", same_line=True)
            for (i, (items_id, data)) in enumerate(zip(items_ids, self.repository.get_many(items_ids))):
                if progress:
                    pi.show(i)
                _, data = self.key.decrypt(items_id, data)
                unpacker.feed(data)
                chunk_decref(items_id, stats)
                try:
                    for item in unpacker:
                        item = Item(internal_dict=item)
                        if 'chunks' in item:
                            for chunk_id, size, csize in item.chunks:
                                chunk_decref(chunk_id, stats)
                except (TypeError, ValueError):
                    # if items metadata spans multiple chunks and one chunk got dropped somehow,
                    # it could be that unpacker yields bad types
                    if not forced:
                        raise
                    error = True
            if progress:
                pi.finish()
        except (msgpack.UnpackException, Repository.ObjectNotFound):
            # items metadata corrupted
            if not forced:
                raise
            error = True
        # in forced delete mode, we try hard to delete at least the manifest entry,
        # if possible also the archive superblock, even if processing the items raises
        # some harmless exception.
        chunk_decref(self.id, stats)
        del self.manifest.archives[self.name]
        if error:
            logger.warning('forced deletion succeeded, but the deleted archive was corrupted.')
            logger.warning('borg check --repair is required to free all space.')

    def stat_simple_attrs(self, st):
        attrs = dict(
            mode=st.st_mode,
            uid=st.st_uid,
            gid=st.st_gid,
            atime=st.st_atime_ns,
            ctime=st.st_ctime_ns,
            mtime=st.st_mtime_ns,
        )
        if self.numeric_owner:
            attrs['user'] = attrs['group'] = None
        else:
            attrs['user'] = uid2user(st.st_uid)
            attrs['group'] = gid2group(st.st_gid)
        return attrs

    def stat_ext_attrs(self, st, path):
        attrs = {}
        with backup_io():
            xattrs = xattr.get_all(path, follow_symlinks=False)
            bsdflags = get_flags(path, st)
            acl_get(path, attrs, st, self.numeric_owner)
        if xattrs:
            attrs['xattrs'] = StableDict(xattrs)
        if bsdflags:
            attrs['bsdflags'] = bsdflags
        return attrs

    def stat_attrs(self, st, path):
        attrs = self.stat_simple_attrs(st)
        attrs.update(self.stat_ext_attrs(st, path))
        return attrs

    def process_dir(self, path, st):
        item = Item(path=make_path_safe(path))
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        return 'd'  # directory

    def process_fifo(self, path, st):
        item = Item(path=make_path_safe(path))
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        return 'f'  # fifo

    def process_dev(self, path, st):
        item = Item(path=make_path_safe(path), rdev=st.st_rdev)
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        if stat.S_ISCHR(st.st_mode):
            return 'c'  # char device
        elif stat.S_ISBLK(st.st_mode):
            return 'b'  # block device

    def process_symlink(self, path, st):
        source = os.readlink(path)
        item = Item(path=make_path_safe(path), source=source)
        item.update(self.stat_attrs(st, path))
        self.add_item(item)
        return 's'  # symlink
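
    # Illustrative note (explanation added for readability, not in the original source):
    # chunk_file() below feeds file data through the content-defined chunker and, whenever
    # checkpoint_interval elapses, write_part() saves the chunks collected so far as a
    # '<path>.borg_part_N' item via write_checkpoint(), so an interrupted backup keeps the
    # data that was already transferred.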

    def chunk_file(self, item, cache, stats, fd, fh=-1, **chunk_kw):
        def write_part(item, from_chunk, number):
            item = Item(internal_dict=item.as_dict())
            length = len(item.chunks)
            # the item should only have the *additional* chunks we processed after the last partial item:
            item.chunks = item.chunks[from_chunk:]
            item.path += '.borg_part_%d' % number
            item.part = number
            number += 1
            self.add_item(item, show_progress=False)
            self.write_checkpoint()
            return length, number

        item.chunks = []
        from_chunk = 0
        part_number = 1
        for data in backup_io_iter(self.chunker.chunkify(fd, fh)):
            item.chunks.append(cache.add_chunk(self.key.id_hash(data), Chunk(data, **chunk_kw), stats))
            if self.show_progress:
                self.stats.show_progress(item=item, dt=0.2)
            if self.checkpoint_interval and time.time() - self.last_checkpoint > self.checkpoint_interval:
                from_chunk, part_number = write_part(item, from_chunk, part_number)
                self.last_checkpoint = time.time()
        else:
            if part_number > 1:
                if item.chunks[from_chunk:]:
                    # if we already have created a part item inside this file, we want to put the final
                    # chunks (if any) into a part item also (so all parts can be concatenated to get
                    # the complete file):
                    from_chunk, part_number = write_part(item, from_chunk, part_number)
                    self.last_checkpoint = time.time()

                # if we created part files, we have referenced all chunks from the part files,
                # but we also will reference the same chunks also from the final, complete file:
                for chunk in item.chunks:
                    cache.chunk_incref(chunk.id, stats)

    def process_stdin(self, path, cache):
        uid, gid = 0, 0
        t = int(time.time()) * 1000000000
        item = Item(
            path=path,
            mode=0o100660,  # regular file, ug=rw
            uid=uid, user=uid2user(uid),
            gid=gid, group=gid2group(gid),
            mtime=t, atime=t, ctime=t,
        )
        fd = sys.stdin.buffer  # binary
        self.chunk_file(item, cache, self.stats, fd)
        self.stats.nfiles += 1
        self.add_item(item)
        return 'i'  # stdin

    def process_file(self, path, st, cache, ignore_inode=False):
        status = None
        safe_path = make_path_safe(path)
        # Is it a hard link?
        if st.st_nlink > 1:
            source = self.hard_links.get((st.st_ino, st.st_dev))
            if (st.st_ino, st.st_dev) in self.hard_links:
                item = Item(path=safe_path, source=source)
                item.update(self.stat_attrs(st, path))
                self.add_item(item)
                status = 'h'  # regular file, hardlink (to already seen inodes)
                return status
            else:
                self.hard_links[st.st_ino, st.st_dev] = safe_path
        is_special_file = is_special(st.st_mode)
        if not is_special_file:
            path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
            ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode)
        else:
            # in --read-special mode, we may be called for special files.
            # there should be no information in the cache about special files processed in
            # read-special mode, but we better play safe as this was wrong in the past:
            path_hash = ids = None
        first_run = not cache.files and cache.do_files
        if first_run:
            logger.debug('Processing files ...')
        chunks = None
        if ids is not None:
            # Make sure all ids are available
            for id_ in ids:
                if not cache.seen_chunk(id_):
                    break
            else:
                chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids]
                status = 'U'  # regular file, unchanged
        else:
            status = 'A'  # regular file, added
        item = Item(
            path=safe_path,
            hardlink_master=st.st_nlink > 1,  # item is a hard link and has the chunks
        )
        item.update(self.stat_simple_attrs(st))
        # Only chunkify the file if needed
        if chunks is not None:
            item.chunks = chunks
        else:
            compress = self.compression_decider1.decide(path)
            self.file_compression_logger.debug('%s -> compression %s', path, compress['name'])
            with backup_io():
                fh = Archive._open_rb(path)
            with os.fdopen(fh, 'rb') as fd:
                self.chunk_file(item, cache, self.stats, fd, fh, compress=compress)
            if not is_special_file:
                # we must not memorize special files, because the contents of e.g. a
                # block or char device will change without its mtime/size/inode changing.
                cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
            status = status or 'M'  # regular file, modified (if not 'A' already)
        item.update(self.stat_attrs(st, path))
        if is_special_file:
            # we processed a special file like a regular file. reflect that in mode,
            # so it can be extracted / accessed in FUSE mount like a regular file:
            item.mode = stat.S_IFREG | stat.S_IMODE(item.mode)
        self.stats.nfiles += 1
        self.add_item(item)
        return status

    @staticmethod
    def list_archives(repository, key, manifest, cache=None):
        # expensive! see also Manifest.list_archive_infos.
        for name, info in manifest.archives.items():
            yield Archive(repository, key, manifest, name, cache=cache)

    @staticmethod
    def _open_rb(path):
        try:
            # if we have O_NOATIME, this likely will succeed if we are root or owner of file:
            return os.open(path, flags_noatime)
        except PermissionError:
            if flags_noatime == flags_normal:
                # we do not have O_NOATIME, no need to try again:
                raise
            # Was this EPERM due to the O_NOATIME flag? Try again without it:
            return os.open(path, flags_normal)

def valid_msgpacked_dict(d, keys_serialized):
    """check if the data <d> looks like a msgpacked dict"""
    d_len = len(d)
    if d_len == 0:
        return False
    if d[0] & 0xf0 == 0x80:  # object is a fixmap (up to 15 elements)
        offs = 1
    elif d[0] == 0xde:  # object is a map16 (up to 2^16-1 elements)
        offs = 3
    else:
        # object is not a map (dict)
        # note: we must not have dicts with > 2^16-1 elements
        return False
    if d_len <= offs:
        return False
    # is the first dict key a bytestring?
    if d[offs] & 0xe0 == 0xa0:  # key is a small bytestring (up to 31 chars)
        pass
    elif d[offs] in (0xd9, 0xda, 0xdb):  # key is a str8, str16 or str32
        pass
    else:
        # key is not a bytestring
        return False
    # is the bytestring any of the expected key names?
    key_serialized = d[offs:]
    return any(key_serialized.startswith(pattern) for pattern in keys_serialized)
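
# Illustrative example (added note, not in the original source): a packed single-key dict
# such as {'path': ...} starts with 0x81 (a fixmap holding one entry) followed by 0xa4 b'path'
# (a fixstr key of length 4), which is exactly the byte layout the prefix checks above accept.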

class RobustUnpacker:
    """A restartable/robust version of the streaming msgpack unpacker
    """
    class UnpackerCrashed(Exception):
        """raise if unpacker crashed"""

    def __init__(self, validator, item_keys):
        super().__init__()
        self.item_keys = [msgpack.packb(name.encode()) for name in item_keys]
        self.validator = validator
        self._buffered_data = []
        self._resync = False
        self._unpacker = msgpack.Unpacker(object_hook=StableDict)

    def resync(self):
        self._buffered_data = []
        self._resync = True

    def feed(self, data):
        if self._resync:
            self._buffered_data.append(data)
        else:
            self._unpacker.feed(data)

    def __iter__(self):
        return self

    def __next__(self):
        def unpack_next():
            try:
                return next(self._unpacker)
            except (TypeError, ValueError) as err:
                # transform exceptions that might be raised when feeding
                # msgpack with invalid data to a more specific exception
                raise self.UnpackerCrashed(str(err))

        if self._resync:
            data = b''.join(self._buffered_data)
            while self._resync:
                if not data:
                    raise StopIteration
                # Abort early if the data does not look like a serialized item dict
                if not valid_msgpacked_dict(data, self.item_keys):
                    data = data[1:]
                    continue
                self._unpacker = msgpack.Unpacker(object_hook=StableDict)
                self._unpacker.feed(data)
                try:
                    item = unpack_next()
                except (self.UnpackerCrashed, StopIteration):
                    # as long as we are resyncing, we also ignore StopIteration
                    pass
                else:
                    if self.validator(item):
                        self._resync = False
                        return item
                data = data[1:]
        else:
            return unpack_next()
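
# Note (summary added for readability, not in the original source): ArchiveChecker.check()
# below runs the consistency check roughly as: list all repository objects (init_chunks),
# detect the key type (identify_key), optionally decrypt-verify every chunk (verify_data),
# load or rebuild the manifest, rebuild reference counts from the archive metadata, then
# check for orphan chunks and finish.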

class ArchiveChecker:

    def __init__(self):
        self.error_found = False
        self.possibly_superseded = set()

    def check(self, repository, repair=False, archive=None, last=None, prefix=None, verify_data=False,
              save_space=False):
        """Perform a set of checks on 'repository'

        :param repair: enable repair mode, write updated or corrected data into repository
        :param archive: only check this archive
        :param last: only check this number of recent archives
        :param prefix: only check archives with this prefix
        :param verify_data: integrity verification of data referenced by archives
        :param save_space: Repository.commit(save_space)
        """
        logger.info('Starting archive consistency check...')
        self.check_all = archive is None and last is None and prefix is None
        self.repair = repair
        self.repository = repository
        self.init_chunks()
        self.key = self.identify_key(repository)
        if verify_data:
            self.verify_data()
        if Manifest.MANIFEST_ID not in self.chunks:
            logger.error("Repository manifest not found!")
            self.error_found = True
            self.manifest = self.rebuild_manifest()
        else:
            self.manifest, _ = Manifest.load(repository, key=self.key)
        self.rebuild_refcounts(archive=archive, last=last, prefix=prefix)
        self.orphan_chunks_check()
        self.finish(save_space=save_space)
        if self.error_found:
            logger.error('Archive consistency check complete, problems found.')
        else:
            logger.info('Archive consistency check complete, no problems found.')
        return self.repair or not self.error_found
|
|
|
def init_chunks(self):
|
2014-02-24 22:37:21 +00:00
|
|
|
"""Fetch a list of all object keys from repository
|
|
|
|
"""
|
2015-08-08 22:36:17 +00:00
|
|
|
# Explicitly set the initial hash table capacity to avoid performance issues
|
2014-02-17 17:25:25 +00:00
|
|
|
# due to hash table "resonance"
|
2016-07-09 12:36:09 +00:00
|
|
|
capacity = int(len(self.repository) * 1.35 + 1) # > len * 1.0 / HASH_MAX_LOAD (see _hashindex.c)
|
2014-07-10 13:32:12 +00:00
|
|
|
self.chunks = ChunkIndex(capacity)
|
2014-02-16 21:21:18 +00:00
|
|
|
marker = None
|
|
|
|
while True:
|
|
|
|
result = self.repository.list(limit=10000, marker=marker)
|
|
|
|
if not result:
|
|
|
|
break
|
|
|
|
marker = result[-1]
|
2016-04-16 15:48:47 +00:00
|
|
|
init_entry = ChunkIndexEntry(refcount=0, size=0, csize=0)
|
2014-02-16 21:21:18 +00:00
|
|
|
for id_ in result:
|
2016-04-16 15:48:47 +00:00
|
|
|
self.chunks[id_] = init_entry
|
2014-02-16 21:21:18 +00:00
|
|
|
|
|
|
|
def identify_key(self, repository):
|
2016-06-12 21:36:56 +00:00
|
|
|
try:
|
|
|
|
some_chunkid, _ = next(self.chunks.iteritems())
|
|
|
|
except StopIteration:
|
|
|
|
# repo is completely empty, no chunks
|
|
|
|
return None
|
|
|
|
cdata = repository.get(some_chunkid)
|
2014-02-16 21:21:18 +00:00
|
|
|
return key_factory(repository, cdata)
|
|
|
|
|

    def verify_data(self):
        logger.info('Starting cryptographic data integrity verification...')
        count = len(self.chunks)
        errors = 0
        pi = ProgressIndicatorPercent(total=count, msg="Verifying data %6.2f%%", step=0.01, same_line=True)
        for chunk_id, (refcount, *_) in self.chunks.iteritems():
            pi.show()
            try:
                encrypted_data = self.repository.get(chunk_id)
            except Repository.ObjectNotFound:
                self.error_found = True
                errors += 1
                logger.error('chunk %s not found', bin_to_hex(chunk_id))
                continue
            try:
                _chunk_id = None if chunk_id == Manifest.MANIFEST_ID else chunk_id
                _, data = self.key.decrypt(_chunk_id, encrypted_data)
            except IntegrityError as integrity_error:
                self.error_found = True
                errors += 1
                logger.error('chunk %s, integrity error: %s', bin_to_hex(chunk_id), integrity_error)
        pi.finish()
        log = logger.error if errors else logger.info
        log('Finished cryptographic data integrity verification, verified %d chunks with %d integrity errors.',
            count, errors)

    def rebuild_manifest(self):
        """Rebuild the manifest object if it is missing

        Iterates through all objects in the repository looking for archive metadata blocks.
        """
        required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS)

        def valid_archive(obj):
            if not isinstance(obj, dict):
                return False
            keys = set(obj)
            return required_archive_keys.issubset(keys)

        logger.info('Rebuilding missing manifest, this might take some time...')
        # as we have lost the manifest, we do not know any more what valid item keys we had.
        # collecting any key we encounter in a damaged repo seems unwise, thus we just use
        # the hardcoded list from the source code. thus, it is not recommended to rebuild a
        # lost manifest on an older borg version than the most recent one that was ever used
        # within this repository (assuming that newer borg versions support more item keys).
        manifest = Manifest(self.key, self.repository)
        archive_keys_serialized = [msgpack.packb(name.encode()) for name in ARCHIVE_KEYS]
        for chunk_id, _ in self.chunks.iteritems():
            cdata = self.repository.get(chunk_id)
            _, data = self.key.decrypt(chunk_id, cdata)
            if not valid_msgpacked_dict(data, archive_keys_serialized):
                continue
            if b'cmdline' not in data or b'\xa7version\x01' not in data:
                continue
            try:
                archive = msgpack.unpackb(data)
            # Ignore exceptions that might be raised when feeding
            # msgpack with invalid data
            except (TypeError, ValueError, StopIteration):
                continue
            if valid_archive(archive):
                logger.info('Found archive %s', archive[b'name'].decode('utf-8'))
                manifest.archives[archive[b'name'].decode('utf-8')] = {b'id': chunk_id, b'time': archive[b'time']}
        logger.info('Manifest rebuild complete.')
        return manifest
2015-12-12 23:39:15 +00:00
|
|
|
    def rebuild_refcounts(self, archive=None, last=None, prefix=None):
        """Rebuild object reference counts by walking the metadata

        Missing and/or incorrect data is repaired when detected
        """
        # Exclude the manifest from chunks
        del self.chunks[Manifest.MANIFEST_ID]

        def mark_as_possibly_superseded(id_):
            if self.chunks.get(id_, ChunkIndexEntry(0, 0, 0)).refcount == 0:
                self.possibly_superseded.add(id_)

        def add_callback(chunk):
            id_ = self.key.id_hash(chunk.data)
            cdata = self.key.encrypt(chunk)
            add_reference(id_, len(chunk.data), len(cdata), cdata)
            return id_

        def add_reference(id_, size, csize, cdata=None):
            try:
                self.chunks.incref(id_)
            except KeyError:
                assert cdata is not None
                self.chunks[id_] = ChunkIndexEntry(refcount=1, size=size, csize=csize)
                if self.repair:
                    self.repository.put(id_, cdata)

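        # Editorial note (not original code): add_reference() is the single bookkeeping point while
        # refcounts are rebuilt -- chunks already in the index just get their refcount bumped, unknown
        # chunks are registered with refcount 1 and, when running with --repair, written back into the
        # repository from the supplied 'cdata'.
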
        def verify_file_chunks(item):
            """Verifies that all file chunks are present.

            Missing file chunks will be replaced with new chunks of the same length containing all zeros.
            If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
            """
            offset = 0
            chunk_list = []
            chunks_replaced = False
            has_chunks_healthy = 'chunks_healthy' in item
            chunks_current = item.chunks
            chunks_healthy = item.chunks_healthy if has_chunks_healthy else chunks_current
            assert len(chunks_current) == len(chunks_healthy)
            for chunk_current, chunk_healthy in zip(chunks_current, chunks_healthy):
                chunk_id, size, csize = chunk_healthy
                if chunk_id not in self.chunks:
                    # a chunk of the healthy list is missing
                    if chunk_current == chunk_healthy:
                        logger.error('{}: New missing file chunk detected (Byte {}-{}). '
                                     'Replacing with all-zero chunk.'.format(item.path, offset, offset + size))
                        self.error_found = chunks_replaced = True
                        data = bytes(size)
                        chunk_id = self.key.id_hash(data)
                        cdata = self.key.encrypt(Chunk(data))
                        csize = len(cdata)
                        add_reference(chunk_id, size, csize, cdata)
                    else:
                        logger.info('{}: Previously missing file chunk is still missing (Byte {}-{}). It has an '
                                    'all-zero replacement chunk already.'.format(item.path, offset, offset + size))
                        chunk_id, size, csize = chunk_current
                        add_reference(chunk_id, size, csize)
                else:
                    if chunk_current == chunk_healthy:
                        # normal case, all fine.
                        add_reference(chunk_id, size, csize)
                    else:
                        logger.info('{}: Healed previously missing file chunk! '
                                    '(Byte {}-{}).'.format(item.path, offset, offset + size))
                        add_reference(chunk_id, size, csize)
                        mark_as_possibly_superseded(chunk_current[0])  # maybe orphaned the all-zero replacement chunk
                chunk_list.append([chunk_id, size, csize])  # list-typed element as chunks_healthy is list-of-lists
                offset += size
            if chunks_replaced and not has_chunks_healthy:
                # if this is the first repair, remember the correct chunk IDs, so we can maybe heal the file later
                item.chunks_healthy = item.chunks
            if has_chunks_healthy and chunk_list == chunks_healthy:
                logger.info('{}: Completely healed previously damaged file!'.format(item.path))
                del item.chunks_healthy
            item.chunks = chunk_list

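        # Editorial summary (not original code): verify_file_chunks() keeps two parallel chunk lists per
        # damaged file -- item.chunks is what can currently be restored (possibly containing all-zero
        # placeholder chunks), while item.chunks_healthy remembers the original chunk IDs so a later
        # 'borg check --repair' can swap the real chunks back in once they reappear and drop
        # chunks_healthy again when the file is fully healed.
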
        def robust_iterator(archive):
            """Iterates through all archive items

            Missing item chunks will be skipped and the msgpack stream will be restarted
            """
            item_keys = frozenset(key.encode() for key in self.manifest.item_keys)
            required_item_keys = frozenset(key.encode() for key in REQUIRED_ITEM_KEYS)
            unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and 'path' in item,
                                      self.manifest.item_keys)
            _state = 0

            def missing_chunk_detector(chunk_id):
                nonlocal _state
                if _state % 2 != int(chunk_id not in self.chunks):
                    _state += 1
                return _state

            def report(msg, chunk_id, chunk_no):
                cid = bin_to_hex(chunk_id)
                msg += ' [chunk: %06d_%s]' % (chunk_no, cid)  # see debug-dump-archive-items
                self.error_found = True
                logger.error(msg)

            def valid_item(obj):
                if not isinstance(obj, StableDict):
                    return False
                keys = set(obj)
                return required_item_keys.issubset(keys) and keys.issubset(item_keys)

            i = 0
            for state, items in groupby(archive[b'items'], missing_chunk_detector):
                items = list(items)
                if state % 2:
                    for chunk_id in items:
                        report('item metadata chunk missing', chunk_id, i)
                        i += 1
                    continue
                if state > 0:
                    unpacker.resync()
                for chunk_id, cdata in zip(items, repository.get_many(items)):
                    _, data = self.key.decrypt(chunk_id, cdata)
                    unpacker.feed(data)
                    try:
                        for item in unpacker:
                            if valid_item(item):
                                yield Item(internal_dict=item)
                            else:
                                report('Did not get expected metadata dict when unpacking item metadata', chunk_id, i)
                    except RobustUnpacker.UnpackerCrashed as err:
                        report('Unpacker crashed while unpacking item metadata, trying to resync...', chunk_id, i)
                        unpacker.resync()
                    except Exception:
                        report('Exception while unpacking item metadata', chunk_id, i)
                        raise
                    i += 1

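        # Editorial note (not original code): missing_chunk_detector together with groupby() splits
        # archive[b'items'] into alternating runs of present and missing item metadata chunk IDs --
        # _state stays even while the chunk IDs exist in self.chunks and becomes odd while they are
        # missing, so 'state % 2' above identifies the runs of missing chunks and each return to a
        # present run triggers an unpacker resync.
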
        if archive is None:
            # we need last N or all archives
            archive_items = sorted(self.manifest.archives.items(), reverse=True,
                                   key=lambda name_info: name_info[1][b'time'])
            if prefix is not None:
                archive_items = [item for item in archive_items if item[0].startswith(prefix)]
            num_archives = len(archive_items)
            end = None if last is None else min(num_archives, last)
        else:
            # we only want one specific archive
            archive_items = [item for item in self.manifest.archives.items() if item[0] == archive]
            if not archive_items:
                logger.error("Archive '%s' not found.", archive)
            num_archives = 1
            end = 1

        with cache_if_remote(self.repository) as repository:
            for i, (name, info) in enumerate(archive_items[:end]):
                logger.info('Analyzing archive {} ({}/{})'.format(name, num_archives - i, num_archives))
                archive_id = info[b'id']
                if archive_id not in self.chunks:
                    logger.error('Archive metadata block is missing!')
                    self.error_found = True
                    del self.manifest.archives[name]
                    continue
                mark_as_possibly_superseded(archive_id)
                cdata = self.repository.get(archive_id)
                _, data = self.key.decrypt(archive_id, cdata)
                archive = StableDict(msgpack.unpackb(data))
                if archive[b'version'] != 1:
                    raise Exception('Unknown archive metadata version')
                decode_dict(archive, ARCHIVE_TEXT_KEYS)
                archive[b'cmdline'] = [safe_decode(arg) for arg in archive[b'cmdline']]
                items_buffer = ChunkBuffer(self.key)
                items_buffer.write_chunk = add_callback
                for item in robust_iterator(archive):
                    if 'chunks' in item:
                        verify_file_chunks(item)
                    items_buffer.add(item)
                items_buffer.flush(flush=True)
                for previous_item_id in archive[b'items']:
                    mark_as_possibly_superseded(previous_item_id)
                archive[b'items'] = items_buffer.chunks
                data = msgpack.packb(archive, unicode_errors='surrogateescape')
                new_archive_id = self.key.id_hash(data)
                cdata = self.key.encrypt(Chunk(data))
                add_reference(new_archive_id, len(data), len(cdata), cdata)
                info[b'id'] = new_archive_id

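    # Editorial note on rebuild_refcounts() above (not original code): after every item has been
    # re-checked, the archive metadata itself is rewritten -- repaired item metadata is flushed through
    # items_buffer/add_callback, the old item metadata chunks are marked as possibly superseded, and the
    # manifest entry is pointed at the freshly encrypted archive metadata chunk via info[b'id'].
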
    def orphan_chunks_check(self):
        if self.check_all:
            unused = {id_ for id_, entry in self.chunks.iteritems() if entry.refcount == 0}
            orphaned = unused - self.possibly_superseded
            if orphaned:
                logger.error('{} orphaned objects found!'.format(len(orphaned)))
                self.error_found = True
            if self.repair:
                for id_ in unused:
                    self.repository.delete(id_)
        else:
            logger.info('Orphaned objects check skipped (needs all archives checked).')

    def finish(self, save_space=False):
        if self.repair:
            self.manifest.write()
            self.repository.commit(save_space=save_space)


class ArchiveRecreater:
    AUTOCOMMIT_THRESHOLD = 512 * 1024 * 1024
    """Commit (compact segments) after this many (or 1 % of repository size, whichever is greater) bytes."""

    class FakeTargetArchive:
        def __init__(self):
            self.stats = Statistics()

    class Interrupted(Exception):
        def __init__(self, metadata=None):
            self.metadata = metadata or {}

    @staticmethod
    def is_temporary_archive(archive_name):
        return archive_name.endswith('.recreate')

    def __init__(self, repository, manifest, key, cache, matcher,
                 exclude_caches=False, exclude_if_present=None, keep_tag_files=False,
                 chunker_params=None, compression=None, compression_files=None, always_recompress=False,
                 dry_run=False, stats=False, progress=False, file_status_printer=None):
        self.repository = repository
        self.key = key
        self.manifest = manifest
        self.cache = cache

        self.matcher = matcher
        self.exclude_caches = exclude_caches
        self.exclude_if_present = exclude_if_present or []
        self.keep_tag_files = keep_tag_files

        self.chunker_params = chunker_params or CHUNKER_PARAMS
        self.recompress = bool(compression)
        self.always_recompress = always_recompress
        self.compression = compression or CompressionSpec('none')
        self.seen_chunks = set()
        self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
                                                        compression_files or [])
        key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))

        self.autocommit_threshold = max(self.AUTOCOMMIT_THRESHOLD, self.cache.chunks_stored_size() / 100)
        logger.debug("Autocommit threshold: %s", format_file_size(self.autocommit_threshold))

        self.dry_run = dry_run
        self.stats = stats
        self.progress = progress
        self.print_file_status = file_status_printer or (lambda *args: None)

        self.interrupt = False
        self.errors = False

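    # Hedged usage sketch (editorial, not original code; the surrounding objects and the exact call
    # sequence used by the 'recreate' command are assumptions):
    #
    #     recreater = ArchiveRecreater(repository, manifest, key, cache, matcher,
    #                                  compression=CompressionSpec('lz4'), progress=True)
    #     recreater.recreate('my-archive', comment='recompressed with lz4')
    #     manifest.write()
    #     repository.commit()
    #     cache.commit()
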
    def recreate(self, archive_name, comment=None, target_name=None):
        assert not self.is_temporary_archive(archive_name)
        archive = self.open_archive(archive_name)
        target, resume_from = self.create_target_or_resume(archive, target_name)
        if self.exclude_if_present or self.exclude_caches:
            self.matcher_add_tagged_dirs(archive)
        if self.matcher.empty() and not self.recompress and not target.recreate_rechunkify and comment is None:
            logger.info("Skipping archive %s, nothing to do", archive_name)
            return True
        try:
            self.process_items(archive, target, resume_from)
        except self.Interrupted as e:
            return self.save(archive, target, completed=False, metadata=e.metadata)
        replace_original = target_name is None
        return self.save(archive, target, comment, replace_original=replace_original)

    def process_items(self, archive, target, resume_from=None):
        matcher = self.matcher
        target_is_subset = not matcher.empty()
        hardlink_masters = {} if target_is_subset else None

        def item_is_hardlink_master(item):
            return (target_is_subset and
                    stat.S_ISREG(item.mode) and
                    item.get('hardlink_master', True) and
                    'source' not in item and
                    not matcher.match(item.path))

        for item in archive.iter_items():
            if item_is_hardlink_master(item):
                # Re-visit all of these items in the archive even when fast-forwarding to rebuild hardlink_masters
                hardlink_masters[item.path] = (item.get('chunks'), None)
                continue
            if resume_from:
                # Fast forward to after the last processed file
                if item.path == resume_from:
                    logger.info('Fast-forwarded to %s', remove_surrogates(item.path))
                    resume_from = None
                continue
            if not matcher.match(item.path):
                self.print_file_status('x', item.path)
                continue
            if target_is_subset and stat.S_ISREG(item.mode) and item.get('source') in hardlink_masters:
                # master of this hard link is outside the target subset
                chunks, new_source = hardlink_masters[item.source]
                if new_source is None:
                    # First item to use this master, move the chunks
                    item.chunks = chunks
                    hardlink_masters[item.source] = (None, item.path)
                    del item.source
                else:
                    # Master was already moved, only update this item's source
                    item.source = new_source
            if self.dry_run:
                self.print_file_status('-', item.path)
            else:
                try:
                    self.process_item(archive, target, item)
                except self.Interrupted:
                    if self.progress:
                        target.stats.show_progress(final=True)
                    raise
        if self.progress:
            target.stats.show_progress(final=True)

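    # Editorial note (not original code): when only a subset of the archive is recreated,
    # hardlink_masters remembers the chunk list of every regular-file hardlink master that falls
    # outside that subset; the first hardlink slave inside the subset then adopts those chunks and
    # becomes the new master ('source') for the remaining slaves.
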
    def process_item(self, archive, target, item):
        if 'chunks' in item:
            item.chunks = self.process_chunks(archive, target, item)
            target.stats.nfiles += 1
        target.add_item(item)
        self.print_file_status(file_status(item.mode), item.path)
        if self.interrupt:
            raise self.Interrupted

    def process_chunks(self, archive, target, item):
        """Return new chunk ID list for 'item'."""
        if not self.recompress and not target.recreate_rechunkify:
            for chunk_id, size, csize in item.chunks:
                self.cache.chunk_incref(chunk_id, target.stats)
            return item.chunks
        new_chunks = self.process_partial_chunks(target)
        chunk_iterator = self.create_chunk_iterator(archive, target, item)
        consume(chunk_iterator, len(new_chunks))
        compress = self.compression_decider1.decide(item.path)
        for chunk in chunk_iterator:
            chunk.meta['compress'] = compress
            chunk_id = self.key.id_hash(chunk.data)
            if chunk_id in self.seen_chunks:
                new_chunks.append(self.cache.chunk_incref(chunk_id, target.stats))
            else:
                compression_spec, chunk = self.key.compression_decider2.decide(chunk)
                overwrite = self.recompress
                if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
                    # Check if this chunk is already compressed the way we want it
                    old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
                    if Compressor.detect(old_chunk.data).name == compression_spec['name']:
                        # Stored chunk has the same compression we wanted
                        overwrite = False
                chunk_id, size, csize = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
                new_chunks.append((chunk_id, size, csize))
                self.seen_chunks.add(chunk_id)
                if self.recompress:
                    # This tracks how many bytes are uncommitted but compactable, since we are recompressing
                    # existing chunks.
                    target.recreate_uncomitted_bytes += csize
                    if target.recreate_uncomitted_bytes >= self.autocommit_threshold:
                        # Issue commits to limit additional space usage when recompressing chunks
                        target.recreate_uncomitted_bytes = 0
                        self.repository.commit()
            if self.progress:
                target.stats.show_progress(item=item, dt=0.2)
            if self.interrupt:
                raise self.Interrupted({
                    'recreate_partial_chunks': new_chunks,
                })
        return new_chunks

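    # Editorial note (not original code): the 'overwrite' flag above means a stored chunk is only
    # rewritten when recompression was requested AND either always_recompress is set or the compression
    # detected in the stored chunk differs from the requested spec; chunks that already carry the
    # desired compression are simply re-referenced.
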
    def create_chunk_iterator(self, archive, target, item):
        """Return iterator of chunks to store for 'item' from 'archive' in 'target'."""
        chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _, _ in item.chunks])
        if target.recreate_rechunkify:
            # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
            # (does not load the entire file into memory)
            file = ChunkIteratorFileWrapper(chunk_iterator)

            def _chunk_iterator():
                for data in target.chunker.chunkify(file):
                    yield Chunk(data)

            chunk_iterator = _chunk_iterator()
        return chunk_iterator

    def process_partial_chunks(self, target):
        """Return chunks from a previous run for archive 'target' (if any) or an empty list."""
        if not target.recreate_partial_chunks:
            return []
        # No incref, create_target_or_resume already did that before deleting the old target archive
        # So just copy these over
        partial_chunks = target.recreate_partial_chunks
        target.recreate_partial_chunks = None
        for chunk_id, size, csize in partial_chunks:
            self.seen_chunks.add(chunk_id)
        logger.debug('Copied %d chunks from a partially processed item', len(partial_chunks))
        return partial_chunks

    def save(self, archive, target, comment=None, completed=True, metadata=None, replace_original=True):
        """Save target archive. If completed, replace source. If not, save temporary with additional 'metadata' dict."""
        if self.dry_run:
            return completed
        if completed:
            timestamp = archive.ts.replace(tzinfo=None)
            if comment is None:
                comment = archive.metadata.get(b'comment', '')
            target.save(timestamp=timestamp, comment=comment, additional_metadata={
                'cmdline': archive.metadata[b'cmdline'],
                'recreate_cmdline': sys.argv,
            })
            if replace_original:
                archive.delete(Statistics(), progress=self.progress)
                target.rename(archive.name)
            if self.stats:
                target.end = datetime.utcnow()
                log_multi(DASHES,
                          str(target),
                          DASHES,
                          str(target.stats),
                          str(self.cache),
                          DASHES)
        else:
            additional_metadata = metadata or {}
            additional_metadata.update({
                'recreate_source_id': archive.id,
                'recreate_args': sys.argv[1:],
            })
            target.save(name=archive.name + '.recreate', additional_metadata=additional_metadata)
            logger.info('Run the same command again to resume.')
        return completed

    def matcher_add_tagged_dirs(self, archive):
        """Add excludes to the matcher created by exclude_caches and exclude_if_present."""
        def exclude(dir, tag_item):
            if self.keep_tag_files:
                tag_files.append(PathPrefixPattern(tag_item.path))
                tagged_dirs.append(FnmatchPattern(dir + '/'))
            else:
                tagged_dirs.append(PathPrefixPattern(dir))

        matcher = self.matcher
        tag_files = []
        tagged_dirs = []
        # build hardlink masters, but only for paths ending in CACHE_TAG_NAME, so we can read hard-linked TAGs
        cachedir_masters = {}

        for item in archive.iter_items(
                filter=lambda item: item.path.endswith(CACHE_TAG_NAME) or matcher.match(item.path)):
            if item.path.endswith(CACHE_TAG_NAME):
                cachedir_masters[item.path] = item
            if stat.S_ISREG(item.mode):
                dir, tag_file = os.path.split(item.path)
                if tag_file in self.exclude_if_present:
                    exclude(dir, item)
                if self.exclude_caches and tag_file == CACHE_TAG_NAME:
                    if 'chunks' in item:
                        file = open_item(archive, item)
                    else:
                        file = open_item(archive, cachedir_masters[item.source])
                    if file.read(len(CACHE_TAG_CONTENTS)).startswith(CACHE_TAG_CONTENTS):
                        exclude(dir, item)
        matcher.add(tag_files, True)
        matcher.add(tagged_dirs, False)

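    # Editorial note (not original code): CACHE_TAG_NAME and CACHE_TAG_CONTENTS come in via the constants
    # import and implement the cache-directory tagging convention -- a directory is excluded when it
    # contains a tag file whose content starts with the well-known CACHEDIR.TAG signature.
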
    def create_target_or_resume(self, archive, target_name=None):
        """Create new target archive or resume from temporary archive, if it exists. Return archive, resume from path"""
        if self.dry_run:
            return self.FakeTargetArchive(), None
        target_name = target_name or archive.name + '.recreate'
        resume = target_name in self.manifest.archives
        target, resume_from = None, None
        if resume:
            target, resume_from = self.try_resume(archive, target_name)
        if not target:
            target = self.create_target_archive(target_name)
        # If the archives use the same chunker params, then don't rechunkify
        target.recreate_rechunkify = tuple(archive.metadata.get(b'chunker_params')) != self.chunker_params
        return target, resume_from

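    # Editorial note (not original code): resume support hinges on the unfinished '<name>.recreate'
    # archive written by save(completed=False) together with its 'recreate_source_id', 'recreate_args'
    # and 'recreate_partial_chunks' metadata; create_target_or_resume() detects that archive on the next
    # run, try_resume() replays its items into a fresh '.temp' target, and processing then continues
    # after the last completed path.
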
    def try_resume(self, archive, target_name):
        """Try to resume from temporary archive. Return (target archive, resume from path) if successful."""
        logger.info('Found %s, will resume interrupted operation', target_name)
        old_target = self.open_archive(target_name)
        if not self.can_resume(archive, old_target, target_name):
            return None, None
        target = self.create_target_archive(target_name + '.temp')
        logger.info('Replaying items from interrupted operation...')
        last_old_item = self.copy_items(old_target, target)
        resume_from = getattr(last_old_item, 'path', None)
        self.incref_partial_chunks(old_target, target)
        old_target.delete(Statistics(), progress=self.progress)
        logger.info('Done replaying items')
        return target, resume_from

    def incref_partial_chunks(self, source_archive, target_archive):
        target_archive.recreate_partial_chunks = source_archive.metadata.get(b'recreate_partial_chunks', [])
        for chunk_id, size, csize in target_archive.recreate_partial_chunks:
            if not self.cache.seen_chunk(chunk_id):
                try:
                    # Repository has __contains__, RemoteRepository doesn't
                    # `chunk_id in repo` doesn't read the data though, so we try to use that if possible.
                    get_or_in = getattr(self.repository, '__contains__', self.repository.get)
                    if get_or_in(chunk_id) is False:
                        raise Repository.ObjectNotFound(chunk_id, self.repository)
                except Repository.ObjectNotFound:
                    # delete/prune/check between invocations: these chunks are gone.
                    target_archive.recreate_partial_chunks = None
                    break
                # fast-lane insert into chunks cache
                self.cache.chunks[chunk_id] = (1, size, csize)
                target_archive.stats.update(size, csize, True)
                continue
            # incref now, otherwise a source_archive.delete() might delete these chunks
            self.cache.chunk_incref(chunk_id, target_archive.stats)

    def copy_items(self, source_archive, target_archive):
        item = None
        for item in source_archive.iter_items():
            if 'chunks' in item:
                for chunk in item.chunks:
                    self.cache.chunk_incref(chunk.id, target_archive.stats)
                target_archive.stats.nfiles += 1
            target_archive.add_item(item)
        if self.progress:
            target_archive.stats.show_progress(final=True)
        return item

    def can_resume(self, archive, old_target, target_name):
        resume_id = old_target.metadata[b'recreate_source_id']
        resume_args = [safe_decode(arg) for arg in old_target.metadata[b'recreate_args']]
        if resume_id != archive.id:
            logger.warning('Source archive changed, will discard %s and start over', target_name)
            logger.warning('Saved fingerprint: %s', bin_to_hex(resume_id))
            logger.warning('Current fingerprint: %s', archive.fpr)
            old_target.delete(Statistics(), progress=self.progress)
            return False
        if resume_args != sys.argv[1:]:
            logger.warning('Command line changed, this might lead to inconsistencies')
            logger.warning('Saved: %s', repr(resume_args))
            logger.warning('Current: %s', repr(sys.argv[1:]))
            # Just warn in this case, don't start over
        return True

    def create_target_archive(self, name):
        target = Archive(self.repository, self.key, self.manifest, name, create=True,
                         progress=self.progress, chunker_params=self.chunker_params, cache=self.cache,
                         checkpoint_interval=0, compression=self.compression)
        target.recreate_partial_chunks = None
        target.recreate_uncomitted_bytes = 0
        return target

    def open_archive(self, name, **kwargs):
        return Archive(self.repository, self.key, self.manifest, name, cache=self.cache, **kwargs)