From 2bc91e5010e40576e326943da3beedd97e0b6369 Mon Sep 17 00:00:00 2001
From: Peter Gerber <peter@arbitrary.ch>
Date: Mon, 25 Oct 2021 09:46:02 +0000
Subject: [PATCH 1/2] Speed up search for next valid object in segment in
 --repair mode

When an object is corrupted, the start position of the next object
will not be known as the size field belonging to the corrupted
object may be corrupted as well. In order to find the next object
within the segment, the remainder is scanned for the next valid
object, byte-by-byte. An object is considered valid if the CRC
checksum matches the content. However, while doing so, the scan accepted
any object size that fit within the remainder of the segment. As a
result, in particular when the corruption occurred near the start
of a segment, CRC checksums were calculated for large objects,
often hundreds of megabytes in size, despite the size being limited
to 20 MiB. This change skips the CRC calculation when the object
header indicates an impossible size, thereby greatly reducing the
number of CPU cycles used for CRC calculations.
In my case, this brought down the time for repair from hours to mere
minutes.

This also has the additional benefit that there is some verification
in addition to the CRC checksum. The 4-byte checksum is rather
short considering the amount of data that might be in an archive.

Likely fixes the hanging --repair in #5995 also.
---
 src/borg/repository.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/borg/repository.py b/src/borg/repository.py
index c0926b58a..cd673bd3f 100644
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -1518,10 +1518,8 @@ class LoggedIO:
                         dst_fd.write(MAGIC)
                         while len(d) >= self.header_fmt.size:
                             crc, size, tag = self.header_fmt.unpack(d[:self.header_fmt.size])
-                            if size < self.header_fmt.size or size > len(d):
-                                d = d[1:]
-                                continue
-                            if crc32(d[4:size]) & 0xffffffff != crc:
+                            if size > MAX_OBJECT_SIZE or size < self.header_fmt.size or size > len(d)
+                               or crc32(d[4:size]) & 0xffffffff != crc:
                                 d = d[1:]
                                 continue
                             dst_fd.write(d[:size])

From 6c21404143ce9bf73272e2c05d2626e353687e3a Mon Sep 17 00:00:00 2001
From: Peter Gerber <peter@arbitrary.ch>
Date: Mon, 25 Oct 2021 19:24:44 +0000
Subject: [PATCH 2/2] Validate tag ID when --repair[ing] an object

This too should make the scan faster as, assuming the data is
random, we can skip CRC checks for almost 94% of the incorrect
header locations solely based on the tag.

As a drawback, this will limit the number of tags that can be
added without breaking backwards compatibility to 16, with
13 currently unused.
---
 src/borg/repository.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/borg/repository.py b/src/borg/repository.py
index cd673bd3f..7bf925e62 100644
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -38,6 +38,15 @@ TAG_PUT = 0
 TAG_DELETE = 1
 TAG_COMMIT = 2
 
+# Highest ID usable as TAG_* value
+#
+# Code may expect not to find any tags exceeding this value. In particular,
+# in order to speed up `borg check --repair`, any tag greater than MAX_TAG_ID
+# is assumed to be corrupted. When increasing this value, in order to add more
+# tags, keep in mind that old versions of Borg accessing a new repository
+# may not be able to handle the new tags.
+MAX_TAG_ID = 15
+
 FreeSpace = partial(defaultdict, int)
 
 
@@ -1518,8 +1527,8 @@ class LoggedIO:
                         dst_fd.write(MAGIC)
                         while len(d) >= self.header_fmt.size:
                             crc, size, tag = self.header_fmt.unpack(d[:self.header_fmt.size])
-                            if size > MAX_OBJECT_SIZE or size < self.header_fmt.size or size > len(d)
-                               or crc32(d[4:size]) & 0xffffffff != crc:
+                            if size > MAX_OBJECT_SIZE or tag > MAX_TAG_ID or size < self.header_fmt.size \
+                               or size > len(d) or crc32(d[4:size]) & 0xffffffff != crc:
                                 d = d[1:]
                                 continue
                             dst_fd.write(d[:size])
@@ -1548,6 +1557,10 @@ class LoggedIO:
 
     def _read(self, fd, fmt, header, segment, offset, acceptable_tags, read_data=True):
         # some code shared by read() and iter_objects()
+
+        # See comment on MAX_TAG_ID for details
+        assert max(acceptable_tags) <= MAX_TAG_ID, 'Exceeding MAX_TAG_ID will break backwards compatibility'
+
         try:
             hdr_tuple = fmt.unpack(header)
         except struct.error as err: