Merge pull request #1921 from enkore/f/recreate-fixes

recreate fixes
commit e9d7f928e2, authored by enkore, 2016-12-03 00:02:01 +01:00, committed by GitHub
3 changed files with 26 additions and 35 deletions

File: archive.py

@@ -1394,10 +1394,6 @@ class ArchiveChecker:
 
 
 class ArchiveRecreater:
-    class FakeTargetArchive:
-        def __init__(self):
-            self.stats = Statistics()
-
     class Interrupted(Exception):
         def __init__(self, metadata=None):
             self.metadata = metadata or {}
@@ -1421,6 +1417,9 @@ class ArchiveRecreater:
 
         self.exclude_if_present = exclude_if_present or []
         self.keep_tag_files = keep_tag_files
+        self.rechunkify = chunker_params is not None
+        if self.rechunkify:
+            logger.debug('Rechunking archives to %s', chunker_params)
         self.chunker_params = chunker_params or CHUNKER_PARAMS
         self.recompress = bool(compression)
         self.always_recompress = always_recompress
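
The new rechunkify flag records whether the user passed --chunker-params at all: only an explicit request may trigger rechunking, while self.chunker_params still falls back to CHUNKER_PARAMS so the target archive always has concrete parameters. A minimal standalone sketch of these semantics (the default tuple matches borg 1.x defaults as far as I know, but treat it as an assumption here):

    CHUNKER_PARAMS = (19, 23, 21, 4095)  # assumed borg 1.x defaults

    def chunker_setup(chunker_params=None):
        rechunkify = chunker_params is not None        # user explicitly asked?
        effective = chunker_params or CHUNKER_PARAMS   # target params either way
        return rechunkify, effective

    assert chunker_setup() == (False, CHUNKER_PARAMS)  # no flag: never rechunk
    assert chunker_setup((10, 23, 16, 4095)) == (True, (10, 23, 16, 4095))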
@@ -1434,7 +1433,7 @@ class ArchiveRecreater:
         self.stats = stats
         self.progress = progress
         self.print_file_status = file_status_printer or (lambda *args: None)
-        self.checkpoint_interval = checkpoint_interval
+        self.checkpoint_interval = None if dry_run else checkpoint_interval
 
     def recreate(self, archive_name, comment=None, target_name=None):
         assert not self.is_temporary_archive(archive_name)
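
Nulling checkpoint_interval on dry runs means the checkpointing machinery can never fire while nothing is supposed to be written to the repository. A hedged sketch of the guard pattern this enables (maybe_checkpoint is a hypothetical helper, not borg's actual code):

    import time

    def maybe_checkpoint(last, interval, write_checkpoint):
        # interval=None (dry run) short-circuits: no checkpoint is ever written
        if interval is not None and time.monotonic() - last >= interval:
            write_checkpoint()
            return time.monotonic()
        return last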
@@ -1444,10 +1443,10 @@ class ArchiveRecreater:
         self.matcher_add_tagged_dirs(archive)
         if self.matcher.empty() and not self.recompress and not target.recreate_rechunkify and comment is None:
             logger.info("Skipping archive %s, nothing to do", archive_name)
-            return True
+            return
         self.process_items(archive, target)
         replace_original = target_name is None
-        return self.save(archive, target, comment, replace_original=replace_original)
+        self.save(archive, target, comment, replace_original=replace_original)
 
     def process_items(self, archive, target):
         matcher = self.matcher
@@ -1494,12 +1493,11 @@ class ArchiveRecreater:
             self.print_file_status(file_status(item.mode), item.path)
 
     def process_chunks(self, archive, target, item):
-        """Return new chunk ID list for 'item'."""
        if not self.recompress and not target.recreate_rechunkify:
            for chunk_id, size, csize in item.chunks:
                self.cache.chunk_incref(chunk_id, target.stats)
            return item.chunks
-        chunk_iterator = self.create_chunk_iterator(archive, target, list(item.chunks))
+        chunk_iterator = self.iter_chunks(archive, target, list(item.chunks))
        compress = self.compression_decider1.decide(item.path)
        chunk_processor = partial(self.chunk_processor, target, compress)
        target.chunk_file(item, self.cache, target.stats, chunk_iterator, chunk_processor)
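
When neither recompression nor rechunking is requested, process_chunks takes the fast path: it only bumps the reference count of each existing chunk and returns the old chunk list unchanged, so no chunk data is fetched, rechunked, or rewritten. A simplified model of that refcounting (borg's real cache also tracks sizes; this is an illustration only):

    refcounts = {}

    def chunk_incref(chunk_id):
        # Reusing a chunk is a metadata-only operation: the stored data stays
        # put, it just gains one more referencing archive.
        refcounts[chunk_id] = refcounts.get(chunk_id, 0) + 1
        return chunk_id

    item_chunks = [b'id1', b'id2', b'id1']
    new_chunks = [chunk_incref(cid) for cid in item_chunks]
    assert refcounts == {b'id1': 2, b'id2': 1}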
@@ -1517,24 +1515,22 @@ class ArchiveRecreater:
             if Compressor.detect(old_chunk.data).name == compression_spec['name']:
                 # Stored chunk has the same compression we wanted
                 overwrite = False
-        chunk_id, size, csize = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
-        self.seen_chunks.add(chunk_id)
-        return chunk_id, size, csize
+        chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
+        self.seen_chunks.add(chunk_entry.id)
+        return chunk_entry
 
-    def create_chunk_iterator(self, archive, target, chunks):
-        """Return iterator of chunks to store for 'item' from 'archive' in 'target'."""
+    def iter_chunks(self, archive, target, chunks):
         chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _, _ in chunks])
         if target.recreate_rechunkify:
             # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
             # (does not load the entire file into memory)
             file = ChunkIteratorFileWrapper(chunk_iterator)
-            return target.chunker.chunkify(file)
+            yield from target.chunker.chunkify(file)
         else:
             for chunk in chunk_iterator:
                 yield chunk.data
 
     def save(self, archive, target, comment=None, replace_original=True):
-        """Save target archive. If completed, replace source. If not, save temporary with additional 'metadata' dict."""
         if self.dry_run:
             return
         timestamp = archive.ts.replace(tzinfo=None)
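
Beyond the rename, the important fix in iter_chunks is replacing 'return' with 'yield from'. Because the function body contains a yield, Python compiles the whole function as a generator, and a 'return value' inside a generator does not hand the value to the caller; it just ends the generator (the value only travels on StopIteration). The old rechunkify branch therefore produced an iterator that was exhausted immediately. A minimal demonstration of the pitfall:

    def broken(fast):
        if fast:
            return iter([1, 2, 3])  # inside a generator: ends it, yields nothing
        yield from [4, 5, 6]

    def fixed(fast):
        if fast:
            yield from iter([1, 2, 3])  # delegate to the sub-iterator instead
        else:
            yield from [4, 5, 6]

    assert list(broken(True)) == []        # silently empty
    assert list(fixed(True)) == [1, 2, 3]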
@@ -1591,12 +1587,13 @@ class ArchiveRecreater:
 
     def create_target(self, archive, target_name=None):
         """Create target archive."""
-        if self.dry_run:
-            return self.FakeTargetArchive(), None
         target_name = target_name or archive.name + '.recreate'
         target = self.create_target_archive(target_name)
         # If the archives use the same chunker params, then don't rechunkify
-        target.recreate_rechunkify = tuple(archive.metadata.get('chunker_params', [])) != self.chunker_params
+        source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
+        target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
+        if target.recreate_rechunkify:
+            logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
         return target
 
     def create_target_archive(self, name):
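
create_target now combines both conditions: the user must have requested rechunking (self.rechunkify) and the source archive's recorded chunker params must differ from the target's. Archives that predate recording chunker_params yield an empty tuple, which compares unequal to any real params, so an explicit request still rechunks them. A standalone sketch of the decision:

    def needs_rechunkify(requested, source_params, target_params):
        # Only rechunk on explicit request AND an actual parameter difference.
        return requested and tuple(source_params) != target_params

    TARGET = (19, 23, 21, 4095)
    assert not needs_rechunkify(False, (10, 23, 16, 4095), TARGET)  # no --chunker-params
    assert not needs_rechunkify(True, TARGET, TARGET)               # same params: nothing to do
    assert needs_rechunkify(True, (), TARGET)                       # unknown source params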

File: archiver.py

@@ -1101,11 +1101,11 @@ class Archiver:
             if recreater.is_temporary_archive(name):
                 continue
             print('Processing', name)
-            if not recreater.recreate(name, args.comment):
-                break
-        manifest.write()
-        repository.commit()
-        cache.commit()
+            recreater.recreate(name, args.comment)
+        if not args.dry_run:
+            manifest.write()
+            repository.commit()
+            cache.commit()
         return self.exit_code
 
     @with_repository(manifest=False, exclusive=True)
@@ -2356,6 +2356,8 @@ class Archiver:
         recreate_epilog = textwrap.dedent("""
         Recreate the contents of existing archives.
 
+        This is an *experimental* feature. Do *not* use this on your only backup.
+
         --exclude, --exclude-from and PATH have the exact same semantics
         as in "borg create". If PATHs are specified the resulting archive
         will only contain files from these PATHs.
@@ -2372,15 +2374,6 @@ class Archiver:
         used to have upgraded Borg 0.xx or Attic archives deduplicate with
         Borg 1.x archives.
 
-        borg recreate is signal safe. Send either SIGINT (Ctrl-C on most terminals) or
-        SIGTERM to request termination.
-
-        Use the *exact same* command line to resume the operation later - changing excludes
-        or paths will lead to inconsistencies (changed excludes will only apply to newly
-        processed files/dirs). Changing compression leads to incorrect size information
-        (which does not cause any data loss, but can be misleading).
-        Changing chunker params between invocations might lead to data loss.
-
         USE WITH CAUTION.
         Depending on the PATHs and patterns given, recreate can be used to permanently
         delete files from archives.
@@ -2395,8 +2388,8 @@ class Archiver:
         When rechunking space usage can be substantial, expect at least the entire
         deduplicated size of the archives using the previous chunker params.
 
-        When recompressing approximately 1 % of the repository size or 512 MB
-        (whichever is greater) of additional space is used.
+        When recompressing expect approx. (throughput / checkpoint-interval) in space usage,
+        assuming all chunks are recompressed.
         """)
         subparser = subparsers.add_parser('recreate', parents=[common_parser], add_help=False,
                                           description=self.do_recreate.__doc__,
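
The reworded estimate ties transient space use to the amount of data recompressed between two checkpoints rather than to total repository size; read "(throughput / checkpoint-interval)" as "the data processed per checkpoint interval". A back-of-the-envelope calculation with assumed numbers (both values are illustrative, not taken from the PR):

    throughput = 50 * 1024**2        # assume 50 MiB/s recompression throughput
    checkpoint_interval = 300        # assume --checkpoint-interval 300 (seconds)
    transient = throughput * checkpoint_interval
    print(transient / 1024**3)       # ~14.6 GiB accumulated between checkpoints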

File: testsuite/archiver.py

@@ -1823,6 +1823,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         self.cmd('recreate', self.repository_location, '--chunker-params', 'default')
         self.check_cache()
         # test1 and test2 do deduplicate after recreate
+        assert int(self.cmd('list', self.repository_location + '::test1', 'input/large_file', '--format={size}'))
         assert not int(self.cmd('list', self.repository_location + '::test1', 'input/large_file',
                                 '--format', '{unique_chunks}'))
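
The added size assertion guards the dedup check below it: if input/large_file were missing from the recreated archive, the unique_chunks query would come back empty or zero and the second assertion would pass vacuously. Paraphrased (inside the test case, with repo standing in for self.repository_location):

    size = int(self.cmd('list', repo + '::test1', 'input/large_file', '--format={size}'))
    unique = int(self.cmd('list', repo + '::test1', 'input/large_file', '--format', '{unique_chunks}'))
    assert size > 0     # the file is really there ...
    assert unique == 0  # ... so zero unique chunks truly means full deduplication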