Merge pull request #1921 from enkore/f/recreate-fixes

recreate fixes
enkore 2016-12-03 00:02:01 +01:00 committed by GitHub
commit e9d7f928e2
3 changed files with 26 additions and 35 deletions


@@ -1394,10 +1394,6 @@ class ArchiveChecker:
 class ArchiveRecreater:
     class FakeTargetArchive:
         def __init__(self):
             self.stats = Statistics()
 
-    class Interrupted(Exception):
-        def __init__(self, metadata=None):
-            self.metadata = metadata or {}
-
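Note that the Interrupted exception deleted here was the hook for recreate's old checkpoint/resume protocol; with it gone, recreate() below stops returning a success flag, and the resume instructions disappear from the recreate epilog further down.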
@@ -1421,6 +1417,9 @@ class ArchiveRecreater:
         self.exclude_if_present = exclude_if_present or []
         self.keep_tag_files = keep_tag_files
+        self.rechunkify = chunker_params is not None
+        if self.rechunkify:
+            logger.debug('Rechunking archives to %s', chunker_params)
         self.chunker_params = chunker_params or CHUNKER_PARAMS
         self.recompress = bool(compression)
         self.always_recompress = always_recompress
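The new rechunkify flag relies on the usual None-sentinel idiom: passing chunker_params at all means "rechunk to these params", while None means "keep whatever the archives already use". A minimal sketch of the distinction (default tuple as in borg, function name illustrative):

CHUNKER_PARAMS = (19, 23, 21, 4095)  # borg's default chunker params

def configure(chunker_params=None):
    # 'is not None' distinguishes "rechunk" from "not requested";
    # the 'or' then falls back to the defaults for the target archive.
    rechunkify = chunker_params is not None
    params = chunker_params or CHUNKER_PARAMS
    return rechunkify, params

assert configure() == (False, CHUNKER_PARAMS)
assert configure((10, 23, 16, 4095)) == (True, (10, 23, 16, 4095))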
@@ -1434,7 +1433,7 @@ class ArchiveRecreater:
         self.stats = stats
         self.progress = progress
         self.print_file_status = file_status_printer or (lambda *args: None)
-        self.checkpoint_interval = checkpoint_interval
+        self.checkpoint_interval = None if dry_run else checkpoint_interval
 
     def recreate(self, archive_name, comment=None, target_name=None):
         assert not self.is_temporary_archive(archive_name)
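Setting checkpoint_interval to None for dry runs complements the dry-run early return in save() further down: a dry run must not write checkpoint archives either. A minimal sketch of how a None interval disables time-based checkpointing (assumed semantics, illustrative names):

import time

def maybe_checkpoint(last, interval, write_checkpoint):
    # interval=None means checkpointing is disabled entirely (dry run).
    if interval is None:
        return last
    now = time.monotonic()
    if now - last >= interval:
        write_checkpoint()  # persist a '<name>.checkpoint' archive
        return now
    return last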
@@ -1444,10 +1443,10 @@ class ArchiveRecreater:
         self.matcher_add_tagged_dirs(archive)
         if self.matcher.empty() and not self.recompress and not target.recreate_rechunkify and comment is None:
             logger.info("Skipping archive %s, nothing to do", archive_name)
-            return True
+            return
         self.process_items(archive, target)
         replace_original = target_name is None
-        return self.save(archive, target, comment, replace_original=replace_original)
+        self.save(archive, target, comment, replace_original=replace_original)
 
     def process_items(self, archive, target):
         matcher = self.matcher
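Since there is no "interrupted, resume later" state anymore, recreate() no longer returns True/False, and save() is called purely for its side effects; the matching hunk in the Archiver class below drops the "if not recreater.recreate(...): break" caller protocol accordingly.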
@@ -1494,12 +1493,11 @@ class ArchiveRecreater:
             self.print_file_status(file_status(item.mode), item.path)
 
     def process_chunks(self, archive, target, item):
         """Return new chunk ID list for 'item'."""
         if not self.recompress and not target.recreate_rechunkify:
             for chunk_id, size, csize in item.chunks:
                 self.cache.chunk_incref(chunk_id, target.stats)
             return item.chunks
-        chunk_iterator = self.create_chunk_iterator(archive, target, list(item.chunks))
+        chunk_iterator = self.iter_chunks(archive, target, list(item.chunks))
         compress = self.compression_decider1.decide(item.path)
         chunk_processor = partial(self.chunk_processor, target, compress)
         target.chunk_file(item, self.cache, target.stats, chunk_iterator, chunk_processor)
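target.chunk_file() is handed a processor that receives only the chunk data; the target archive and the compression decision are pre-bound with functools.partial. A toy illustration of that currying (names and values are illustrative):

from functools import partial

def chunk_processor(target, compress, data):
    return (target, compress, len(data))

bound = partial(chunk_processor, 'target-archive', 'lz4')
assert bound(b'abcd') == ('target-archive', 'lz4', 4)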
@@ -1517,24 +1515,22 @@ class ArchiveRecreater:
             if Compressor.detect(old_chunk.data).name == compression_spec['name']:
                 # Stored chunk has the same compression we wanted
                 overwrite = False
-        chunk_id, size, csize = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
-        self.seen_chunks.add(chunk_id)
-        return chunk_id, size, csize
+        chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
+        self.seen_chunks.add(chunk_entry.id)
+        return chunk_entry
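Switching from tuple unpacking to chunk_entry assumes cache.add_chunk() returns a named tuple (ChunkListEntry in borg) rather than a bare (id, size, csize) tuple. A minimal sketch of why that is a drop-in change:

from collections import namedtuple

ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize')

entry = ChunkListEntry(id=b'\x00' * 32, size=4096, csize=1024)
assert entry.id == entry[0]    # attribute access instead of indexing
chunk_id, size, csize = entry  # still unpacks like the old plain tuple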
-    def create_chunk_iterator(self, archive, target, chunks):
-        """Return iterator of chunks to store for 'item' from 'archive' in 'target'."""
+    def iter_chunks(self, archive, target, chunks):
         chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _, _ in chunks])
         if target.recreate_rechunkify:
             # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
             # (does not load the entire file into memory)
             file = ChunkIteratorFileWrapper(chunk_iterator)
-            return target.chunker.chunkify(file)
+            yield from target.chunker.chunkify(file)
         else:
             for chunk in chunk_iterator:
                 yield chunk.data
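The one-word change from return to yield from fixes a classic generator pitfall: because the else branch contains a yield, the whole function is a generator function, so the old "return target.chunker.chunkify(file)" did not hand the chunker's iterator to the caller; it merely stopped iteration (the value ends up on StopIteration.value). A self-contained demonstration:

def broken(rechunk):
    if rechunk:
        return iter([1, 2, 3])  # inside a generator: acts like StopIteration
    else:
        yield from [4, 5, 6]

def fixed(rechunk):
    if rechunk:
        yield from [1, 2, 3]
    else:
        yield from [4, 5, 6]

assert list(broken(True)) == []        # the rechunk branch produced nothing
assert list(fixed(True)) == [1, 2, 3]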
 
     def save(self, archive, target, comment=None, replace_original=True):
         """Save target archive. If completed, replace source. If not, save temporary with additional 'metadata' dict."""
         if self.dry_run:
             return
         timestamp = archive.ts.replace(tzinfo=None)
@@ -1591,12 +1587,13 @@ class ArchiveRecreater:
     def create_target(self, archive, target_name=None):
         """Create target archive."""
         if self.dry_run:
             return self.FakeTargetArchive(), None
         target_name = target_name or archive.name + '.recreate'
         target = self.create_target_archive(target_name)
         # If the archives use the same chunker params, then don't rechunkify
-        target.recreate_rechunkify = tuple(archive.metadata.get('chunker_params', [])) != self.chunker_params
+        source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
+        target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
+        if target.recreate_rechunkify:
+            logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
         return target
 
     def create_target_archive(self, name):
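create_target now rechunks only when rechunking was requested and the chunker params actually differ. An archive with no recorded chunker_params (e.g. one converted from Attic or Borg 0.xx) compares as an empty tuple, which never equals real params, so it is always rechunked on request. A condensed sketch of that decision:

def needs_rechunk(requested, archive_metadata, target_params):
    source_params = tuple(archive_metadata.get('chunker_params', []))
    return requested and source_params != target_params

assert not needs_rechunk(False, {'chunker_params': [10, 23, 16, 4095]}, (19, 23, 21, 4095))
assert needs_rechunk(True, {}, (19, 23, 21, 4095))  # source params unknown
assert not needs_rechunk(True, {'chunker_params': [19, 23, 21, 4095]}, (19, 23, 21, 4095))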


@@ -1101,11 +1101,11 @@ class Archiver:
             if recreater.is_temporary_archive(name):
                 continue
             print('Processing', name)
-            if not recreater.recreate(name, args.comment):
-                break
-        manifest.write()
-        repository.commit()
-        cache.commit()
+            recreater.recreate(name, args.comment)
+        if not args.dry_run:
+            manifest.write()
+            repository.commit()
+            cache.commit()
         return self.exit_code
 
     @with_repository(manifest=False, exclusive=True)
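Two fixes meet in this hunk: recreate() no longer returns a status to break on, and the manifest/repository/cache commits are now skipped for dry runs, so --dry-run leaves the repository untouched (previously those commits ran unconditionally).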
@@ -2356,6 +2356,8 @@ class Archiver:
         recreate_epilog = textwrap.dedent("""
         Recreate the contents of existing archives.
 
+        This is an *experimental* feature. Do *not* use this on your only backup.
+
         --exclude, --exclude-from and PATH have the exact same semantics
         as in "borg create". If PATHs are specified the resulting archive
         will only contain files from these PATHs.
@@ -2372,15 +2374,6 @@ class Archiver:
         used to have upgraded Borg 0.xx or Attic archives deduplicate with
         Borg 1.x archives.
 
-        borg recreate is signal safe. Send either SIGINT (Ctrl-C on most terminals) or
-        SIGTERM to request termination.
-
-        Use the *exact same* command line to resume the operation later - changing excludes
-        or paths will lead to inconsistencies (changed excludes will only apply to newly
-        processed files/dirs). Changing compression leads to incorrect size information
-        (which does not cause any data loss, but can be misleading).
-        Changing chunker params between invocations might lead to data loss.
-
         USE WITH CAUTION.
         Depending on the PATHs and patterns given, recreate can be used to permanently
         delete files from archives.
@@ -2395,8 +2388,8 @@ class Archiver:
         When rechunking space usage can be substantial, expect at least the entire
         deduplicated size of the archives using the previous chunker params.
-        When recompressing approximately 1 % of the repository size or 512 MB
-        (whichever is greater) of additional space is used.
+        When recompressing expect approx. (throughput / checkpoint-interval) in space usage,
+        assuming all chunks are recompressed.
         """)
         subparser = subparsers.add_parser('recreate', parents=[common_parser], add_help=False,
                                           description=self.do_recreate.__doc__,
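Reading the new rule of thumb as "the data recompressed between two checkpoints" (throughput multiplied by the checkpoint interval), a back-of-the-envelope check, with both figures assumed purely for illustration:

throughput = 50 * 1024**2  # assumed: 50 MiB/s recompression rate
interval = 1800            # assumed: 30-minute checkpoint interval
transient = throughput * interval
print('%.1f GiB' % (transient / 1024**3))  # ~87.9 GiB between checkpoints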


@@ -1823,6 +1823,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         self.cmd('recreate', self.repository_location, '--chunker-params', 'default')
         self.check_cache()
         # test1 and test2 do deduplicate after recreate
         assert int(self.cmd('list', self.repository_location + '::test1', 'input/large_file', '--format={size}'))
+        assert not int(self.cmd('list', self.repository_location + '::test1', 'input/large_file',
+                                '--format', '{unique_chunks}'))
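The added assertion pins down what "deduplicate after recreate" means: once both archives are rechunked with the same parameters, every chunk of input/large_file is shared between test1 and test2, so its {size} stays non-zero while its {unique_chunks} count drops to zero.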