From 9e534c1929c3ec5a80ff5149b92eab2bd0efa313 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 16 Apr 2023 13:42:33 +0200 Subject: [PATCH 1/3] Archive.extract_item: remove unused params, make most params kwargs stripped_components: this is done already in do_extract, it modifies item.path accordingly. original_path: not used any more. also: run black. --- src/borg/archive.py | 16 +--------------- src/borg/archiver/extract_cmd.py | 10 +--------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index c181a2a53..fefb2bb46 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -780,18 +780,7 @@ def extract_helper(self, item, path, hlm, *, dry_run=False): # In this case, we *want* to extract twice, because there is no other way. pass - def extract_item( - self, - item, - restore_attrs=True, - dry_run=False, - stdout=False, - sparse=False, - hlm=None, - stripped_components=0, - original_path=None, - pi=None, - ): + def extract_item(self, item, *, restore_attrs=True, dry_run=False, stdout=False, sparse=False, hlm=None, pi=None): """ Extract archive item. @@ -801,8 +790,6 @@ def extract_item( :param stdout: write extracted data to stdout :param sparse: write sparse files (chunk-granularity, independent of the original being sparse) :param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly - :param stripped_components: stripped leading path components to correct hard link extraction - :param original_path: 'path' key as stored in archive :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes) """ has_damaged_chunks = "chunks_healthy" in item @@ -834,7 +821,6 @@ def extract_item( raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.") return - original_path = original_path or item.path dest = self.cwd if item.path.startswith(("/", "../")): raise Exception("Path should be relative and local") diff --git a/src/borg/archiver/extract_cmd.py b/src/borg/archiver/extract_cmd.py index a3fed094e..af64b1ad9 100644 --- a/src/borg/archiver/extract_cmd.py +++ b/src/borg/archiver/extract_cmd.py @@ -75,15 +75,7 @@ def do_extract(self, args, repository, manifest, archive): dirs.append(item) archive.extract_item(item, stdout=stdout, restore_attrs=False) else: - archive.extract_item( - item, - stdout=stdout, - sparse=sparse, - hlm=hlm, - stripped_components=strip_components, - original_path=orig_path, - pi=pi, - ) + archive.extract_item(item, stdout=stdout, sparse=sparse, hlm=hlm, pi=pi) except (BackupOSError, BackupError) as e: self.print_warning("%s: %s", remove_surrogates(orig_path), e) From 7786cc7cb4e380bec58581c39128d8c9a3c96ff4 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 16 Apr 2023 18:46:40 +0200 Subject: [PATCH 2/3] extract: support extraction of atime/mtime on win32 --- src/borg/archive.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/borg/archive.py b/src/borg/archive.py index fefb2bb46..bd3895c0e 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -984,6 +984,16 @@ def restore_attrs(self, path, item, symlink=False, fd=None): set_flags(path, item.bsdflags, fd=fd) except OSError: pass + else: # win32 + # set timestamps rather late + mtime = item.mtime + atime = item.atime if "atime" in item else mtime + try: + # note: no fd support on win32 + os.utime(path, None, ns=(atime, mtime)) + except OSError: + # some systems don't support calling utime on a symlink + pass def set_meta(self, key, value): metadata = self._load_meta(self.id) From 573275e67850bc9cb8383c9e7ff7bc81c72f7cfb Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 16 Apr 2023 15:34:40 +0200 Subject: [PATCH 3/3] extract --continue: continue a previously interrupted extraction, fixes #1356 This skips over all previously fully extracted regular files, but will delete and fully re-extract incomplete files. --- src/borg/archive.py | 37 +++++++++++++++++- src/borg/archiver/extract_cmd.py | 11 +++++- src/borg/testsuite/archiver/extract_cmd.py | 45 +++++++++++++++++++++- 3 files changed, 89 insertions(+), 4 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index bd3895c0e..3caf50608 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -780,7 +780,18 @@ def extract_helper(self, item, path, hlm, *, dry_run=False): # In this case, we *want* to extract twice, because there is no other way. pass - def extract_item(self, item, *, restore_attrs=True, dry_run=False, stdout=False, sparse=False, hlm=None, pi=None): + def extract_item( + self, + item, + *, + restore_attrs=True, + dry_run=False, + stdout=False, + sparse=False, + hlm=None, + pi=None, + continue_extraction=False, + ): """ Extract archive item. @@ -791,7 +802,27 @@ def extract_item(self, item, *, restore_attrs=True, dry_run=False, stdout=False, :param sparse: write sparse files (chunk-granularity, independent of the original being sparse) :param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes) + :param continue_extraction: continue a previously interrupted extraction of same archive """ + + def same_item(item, st): + """is the archived item the same as the fs item at same path with stat st?""" + if not stat.S_ISREG(st.st_mode): + # we only "optimize" for regular files. + # other file types are less frequent and have no content extraction we could "optimize away". + return False + if item.mode != st.st_mode or item.size != st.st_size: + # the size check catches incomplete previous file extraction + return False + if item.get("mtime") != st.st_mtime_ns: + # note: mtime is "extracted" late, after xattrs and ACLs, but before flags. + return False + # this is good enough for the intended use case: + # continuing an extraction of same archive that initially started in an empty directory. + # there is a very small risk that "bsdflags" of one file are wrong: + # if a previous extraction was interrupted between setting the mtime and setting non-default flags. + return True + has_damaged_chunks = "chunks_healthy" in item if dry_run or stdout: with self.extract_helper(item, "", hlm, dry_run=dry_run or stdout) as hardlink_set: @@ -828,7 +859,9 @@ def extract_item(self, item, *, restore_attrs=True, dry_run=False, stdout=False, # Attempt to remove existing files, ignore errors on failure try: st = os.stat(path, follow_symlinks=False) - if stat.S_ISDIR(st.st_mode): + if continue_extraction and same_item(item, st): + return # done! we already have fully extracted this file in a previous run. + elif stat.S_ISDIR(st.st_mode): os.rmdir(path) else: os.unlink(path) diff --git a/src/borg/archiver/extract_cmd.py b/src/borg/archiver/extract_cmd.py index af64b1ad9..452b9a9a5 100644 --- a/src/borg/archiver/extract_cmd.py +++ b/src/borg/archiver/extract_cmd.py @@ -42,6 +42,7 @@ def do_extract(self, args, repository, manifest, archive): stdout = args.stdout sparse = args.sparse strip_components = args.strip_components + continue_extraction = args.continue_extraction dirs = [] hlm = HardLinkManager(id_type=bytes, info_type=str) # hlid -> path @@ -75,7 +76,9 @@ def do_extract(self, args, repository, manifest, archive): dirs.append(item) archive.extract_item(item, stdout=stdout, restore_attrs=False) else: - archive.extract_item(item, stdout=stdout, sparse=sparse, hlm=hlm, pi=pi) + archive.extract_item( + item, stdout=stdout, sparse=sparse, hlm=hlm, pi=pi, continue_extraction=continue_extraction + ) except (BackupOSError, BackupError) as e: self.print_warning("%s: %s", remove_surrogates(orig_path), e) @@ -166,6 +169,12 @@ def build_parser_extract(self, subparsers, common_parser, mid_common_parser): action="store_true", help="create holes in output sparse file from all-zero chunks", ) + subparser.add_argument( + "--continue", + dest="continue_extraction", + action="store_true", + help="continue a previously interrupted extraction of same archive", + ) subparser.add_argument("name", metavar="NAME", type=archivename_validator, help="specify the archive name") subparser.add_argument( "paths", metavar="PATH", nargs="*", type=str, help="paths to extract; patterns are supported" diff --git a/src/borg/testsuite/archiver/extract_cmd.py b/src/borg/testsuite/archiver/extract_cmd.py index f3266d011..db1b3e4d2 100644 --- a/src/borg/testsuite/archiver/extract_cmd.py +++ b/src/borg/testsuite/archiver/extract_cmd.py @@ -13,7 +13,7 @@ from ...helpers import flags_noatime, flags_normal from .. import changedir, same_ts_ns from .. import are_symlinks_supported, are_hardlinks_supported, is_utime_fully_supported, is_birthtime_fully_supported -from ..platform import is_darwin +from ..platform import is_darwin, is_win32 from . import ( ArchiverTestCaseBase, ArchiverTestCaseBinaryBase, @@ -621,6 +621,49 @@ def patched_setxattr_EACCES(*args, **kwargs): with patch.object(xattr, "setxattr", patched_setxattr_EACCES): self.cmd(f"--repo={self.repository_location}", "extract", "test", exit_code=EXIT_WARNING) + def test_extract_continue(self): + CONTENTS1, CONTENTS2, CONTENTS3 = b"contents1" * 100, b"contents2" * 200, b"contents3" * 300 + self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION) + self.create_regular_file("file1", contents=CONTENTS1) + self.create_regular_file("file2", contents=CONTENTS2) + self.create_regular_file("file3", contents=CONTENTS3) + self.cmd(f"--repo={self.repository_location}", "create", "arch", "input") + with changedir("output"): + # we simulate an interrupted/partial extraction: + self.cmd(f"--repo={self.repository_location}", "extract", "arch") + # do not modify file1, it stands for a successfully extracted file + file1_st = os.stat("input/file1") + # simulate a partially extracted file2 (smaller size, archived mtime not yet set) + file2_st = os.stat("input/file2") + os.truncate("input/file2", 123) # -> incorrect size, incorrect mtime + # simulate file3 has not yet been extracted + file3_st = os.stat("input/file3") + os.remove("input/file3") + with changedir("output"): + # now try to continue extracting, using the same archive, same output dir: + self.cmd(f"--repo={self.repository_location}", "extract", "arch", "--continue") + now_file1_st = os.stat("input/file1") + assert file1_st.st_ino == now_file1_st.st_ino # file1 was NOT extracted again + assert file1_st.st_mtime_ns == now_file1_st.st_mtime_ns # has correct mtime + new_file2_st = os.stat("input/file2") + assert file2_st.st_ino != new_file2_st.st_ino # file2 was extracted again + assert file2_st.st_mtime_ns == new_file2_st.st_mtime_ns # has correct mtime + new_file3_st = os.stat("input/file3") + assert file3_st.st_ino != new_file3_st.st_ino # file3 was extracted again + assert file3_st.st_mtime_ns == new_file3_st.st_mtime_ns # has correct mtime + # windows has a strange ctime behaviour when deleting and recreating a file + if not is_win32: + assert file1_st.st_ctime_ns == now_file1_st.st_ctime_ns # file not extracted again + assert file2_st.st_ctime_ns != new_file2_st.st_ctime_ns # file extracted again + assert file3_st.st_ctime_ns != new_file3_st.st_ctime_ns # file extracted again + # check if all contents (and thus also file sizes) are correct: + with open("input/file1", "rb") as f: + assert f.read() == CONTENTS1 + with open("input/file2", "rb") as f: + assert f.read() == CONTENTS2 + with open("input/file3", "rb") as f: + assert f.read() == CONTENTS3 + class RemoteArchiverTestCase(RemoteArchiverTestCaseBase, ArchiverTestCase): """run the same tests, but with a remote repository"""