From 2bc91e5010e40576e326943da3beedd97e0b6369 Mon Sep 17 00:00:00 2001 From: Peter Gerber Date: Mon, 25 Oct 2021 09:46:02 +0000 Subject: [PATCH] Speed up search for next valid object in segment in --repair mode When an object is corrupted, the start position of the next object will not be known as the size field belonging to the corrupted object may be corrupted as well. In order to find the next object within the segment, the remainder is scanned for the next valid object, byte-by-byte. An object is considered valid if the CRC checksum matches the content. However, in doing so, the scan accepted any object size that fit within the remainder of the segment. As a result, in particular when the corruption occurred near the start of a segment, CRC checksums were calculated for large objects, often hundreds of megabytes in size, despite the object size being limited to 20 MiB. This change makes it so that CRC calculation is skipped when the object header indicates an impossible size, thereby greatly reducing the number of CPU cycles used for CRC calculations. In my case, this brought down the time for repair from hours to mere minutes. This also has the additional benefit that there is some verification in addition to the CRC checksum. The 4-byte checksum is rather short considering the amount of data that might be in an archive. This likely also fixes the hanging --repair in #5995.
--- src/borg/repository.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index c0926b58a..cd673bd3f 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1518,10 +1518,8 @@ class LoggedIO: dst_fd.write(MAGIC) while len(d) >= self.header_fmt.size: crc, size, tag = self.header_fmt.unpack(d[:self.header_fmt.size]) - if size < self.header_fmt.size or size > len(d): - d = d[1:] - continue - if crc32(d[4:size]) & 0xffffffff != crc: + if size > MAX_OBJECT_SIZE or size < self.header_fmt.size or size > len(d) + or crc32(d[4:size]) & 0xffffffff != crc: d = d[1:] continue dst_fd.write(d[:size])