extract: When doing a partial restore don't leak prefetched chunks.

The filter function passed to iter_items (with preload=True) must never return True for items that
are not actually extracted later, because that would leak prefetched chunks.

When restoring hard-linked files, the item containing the actual chunks might not be matched,
or might be implicitly removed from the restore by strip_components. For this reason the chunk list of all
items that can potentially be used as a hardlink target needs to be stored.

To achieve both requirements at the same time, the filter function needs to store the needed information
for the hardlinks while not returning True just because an item could be a hardlink target.

Known problems: When using progress indication, the calculated extracted_size can now be smaller
than the actual extracted size in the presence of hard links (when the master is not restored), whereas
previously it could be bigger (when a potential master was not used in the restore).
This commit is contained in:
Martin Hostettler 2016-08-22 22:58:54 +02:00
parent e0b8635098
commit 972392e290
2 changed files with 12 additions and 15 deletions

View File

@ -161,11 +161,11 @@ class DownloadPipeline:
for _, data in self.fetch_many(ids): for _, data in self.fetch_many(ids):
unpacker.feed(data) unpacker.feed(data)
items = [Item(internal_dict=item) for item in unpacker] items = [Item(internal_dict=item) for item in unpacker]
if filter:
items = [item for item in items if filter(item)]
for item in items: for item in items:
if 'chunks' in item: if 'chunks' in item:
item.chunks = [ChunkListEntry(*e) for e in item.chunks] item.chunks = [ChunkListEntry(*e) for e in item.chunks]
if filter:
items = [item for item in items if filter(item)]
if preload: if preload:
for item in items: for item in items:
if 'chunks' in item: if 'chunks' in item:

View File

@ -417,15 +417,15 @@ class Archiver:
self.print_file_status(status, path) self.print_file_status(status, path)
@staticmethod @staticmethod
def build_filter(matcher, is_hardlink_master, strip_components=0): def build_filter(matcher, peek_and_store_hardlink_masters, strip_components=0):
if strip_components: if strip_components:
def item_filter(item): def item_filter(item):
return (is_hardlink_master(item) or peek_and_store_hardlink_masters(item)
matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:])) return matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:])
else: else:
def item_filter(item): def item_filter(item):
return (is_hardlink_master(item) or peek_and_store_hardlink_masters(item)
matcher.match(item.path)) return matcher.match(item.path)
return item_filter return item_filter
@with_repository() @with_repository()
@ -450,11 +450,12 @@ class Archiver:
partial_extract = not matcher.empty() or strip_components partial_extract = not matcher.empty() or strip_components
hardlink_masters = {} if partial_extract else None hardlink_masters = {} if partial_extract else None
def item_is_hardlink_master(item): def peek_and_store_hardlink_masters(item):
return (partial_extract and stat.S_ISREG(item.mode) and if (partial_extract and stat.S_ISREG(item.mode) and
item.get('hardlink_master', True) and 'source' not in item) item.get('hardlink_master', True) and 'source' not in item):
hardlink_masters[item.get('path')] = (item.get('chunks'), None)
filter = self.build_filter(matcher, item_is_hardlink_master, strip_components) filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components)
if progress: if progress:
progress_logger = logging.getLogger(ProgressIndicatorPercent.LOGGER) progress_logger = logging.getLogger(ProgressIndicatorPercent.LOGGER)
progress_logger.info('Calculating size') progress_logger.info('Calculating size')
@ -465,10 +466,6 @@ class Archiver:
for item in archive.iter_items(filter, preload=True): for item in archive.iter_items(filter, preload=True):
orig_path = item.path orig_path = item.path
if item_is_hardlink_master(item):
hardlink_masters[orig_path] = (item.get('chunks'), None)
if not matcher.match(item.path):
continue
if strip_components: if strip_components:
item.path = os.sep.join(orig_path.split(os.sep)[strip_components:]) item.path = os.sep.join(orig_path.split(os.sep)[strip_components:])
if not args.dry_run: if not args.dry_run: