# File: src/borg/crypto/file_integrity.py (new file)
import hashlib
import io
import json
import os
from hmac import compare_digest

from ..helpers import IntegrityError
from ..logger import create_logger

logger = create_logger()


class FileLikeWrapper:
    """
    Forward the basic file protocol (context manager, tell/seek, read/write, flush, fileno)
    to the object stored in ``self.fd``.

    This class does not set ``self.fd`` itself; subclasses are expected to assign it
    before any of these methods is used.
    """

    def __enter__(self):
        self.fd.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.fd.__exit__(exc_type, exc_val, exc_tb)

    def tell(self):
        return self.fd.tell()

    def seek(self, offset, whence=io.SEEK_SET):
        return self.fd.seek(offset, whence)

    def write(self, data):
        return self.fd.write(data)

    def read(self, n=None):
        return self.fd.read(n)

    def flush(self):
        self.fd.flush()

    def fileno(self):
        return self.fd.fileno()


class SHA512FileHashingWrapper(FileLikeWrapper):
    """
    Wrapper for file-like objects that computes a hash on-the-fly while reading/writing.

    WARNING: Seeks should only be used to query the size of the file, not
    to skip data, because skipped data isn't read and not hashed into the digest.

    Similarly skipping while writing to create sparse files is also not supported.

    Data has to be read/written in a symmetric fashion, otherwise different
    digests will be generated.

    Note: When used as a context manager read/write operations outside the enclosed scope
    are illegal.
    """

    ALGORITHM = 'SHA512'

    def __init__(self, backing_fd, write):
        """
        :param backing_fd: file-like object all I/O is forwarded to
        :param write: stored as ``self.writing``; not otherwise used by this class
        """
        self.fd = backing_fd
        self.writing = write
        self.hash = hashlib.new(self.ALGORITHM)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Hash the current file position (only on a clean exit), then leave the backing file's context.
        """
        try:
            if exc_type is None:
                self.hash_length()
        finally:
            # Always leave the backing file's context, even if hashing the position failed,
            # so the underlying file is not left open.
            super().__exit__(exc_type, exc_val, exc_tb)

    def write(self, data):
        """
        Write *data* to backing file and update internal state.

        Returns whatever the backing file's ``write`` returned.
        """
        n = super().write(data)
        self.hash.update(data)
        return n

    def read(self, n=None):
        """
        Read *data* from backing file (*n* has the usual meaning) and update internal state.
        """
        data = super().read(n)
        self.hash.update(data)
        return data

    def hexdigest(self):
        """
        Return current digest bytes as hex-string.

        Note: this can be called multiple times.
        """
        return self.hash.hexdigest()

    def update(self, data: bytes):
        """Feed *data* into the hash without touching the backing file."""
        self.hash.update(data)

    def hash_length(self, seek_to_end=False):
        """
        Feed the decimal representation of the current file position into the hash.

        With *seek_to_end* the position is moved to the end of the file first, so the
        value hashed is the file length.
        """
        if seek_to_end:
            # Add length of file to the hash to avoid problems if only a prefix is read.
            self.seek(0, io.SEEK_END)
        self.hash.update(str(self.tell()).encode())


class FileIntegrityError(IntegrityError):
    """File failed integrity check: {}"""


class IntegrityCheckedFile(FileLikeWrapper):
    """
    File wrapper that hashes everything read/written and checks it against (or records it in)
    a JSON side-car file named ``<path>.integrity``.

    In write mode the digests are collected and written to the integrity file on a clean exit
    of the context manager. In read mode the stored digests are loaded on construction and
    compared in `hash_part` and on a clean exit; a mismatch raises `FileIntegrityError`.
    If no digests could be loaded (no integrity file, or unknown algorithm) no comparison is made.
    """

    def __init__(self, path, write, filename=None, override_fd=None):
        """
        :param path: path of the protected file; also determines the integrity file location
        :param write: True to open for writing ('wb') and record digests, False to read ('rb') and verify
        :param filename: name hashed into the digest instead of the basename of *path*
        :param override_fd: if truthy, use this file object instead of opening *path*
        :raises FileIntegrityError: in read mode, if the integrity file exists but cannot be used
        """
        self.path = path
        self.writing = write
        mode = 'wb' if write else 'rb'
        self.file_fd = override_fd or open(path, mode)
        try:
            self.fd = self.hasher = SHA512FileHashingWrapper(backing_fd=self.file_fd, write=write)

            self.hash_filename(filename)

            if write:
                self.digests = {}
            else:
                self.digests = self.read_integrity_file(path, self.hasher)
                # TODO: When we're reading but don't have any digests, i.e. no integrity file existed,
                # TODO: then we could just short-circuit.
        except BaseException:
            # Construction failed, so nobody will ever call __exit__ on this object.
            # Close the file if (and only if) we opened it ourselves; a caller-supplied
            # override_fd stays under the caller's control.
            if self.file_fd is not override_fd:
                self.file_fd.close()
            raise

    def hash_filename(self, filename=None):
        """Feed the length-prefixed basename of *filename* (default: ``self.path``) into the hash."""
        # Hash the name of the file, but only the basename, ie. not the path.
        # In Borg the name itself encodes the context (eg. index.N, cache, files),
        # while the path doesn't matter, and moving e.g. a repository or cache directory is supported.
        # Changing the name however imbues a change of context that is not permissible.
        filename = os.path.basename(filename or self.path)
        self.hasher.update(('%10d' % len(filename)).encode())
        self.hasher.update(filename.encode())

    @staticmethod
    def integrity_file_path(path):
        """Return the path of the integrity file belonging to *path*."""
        return path + '.integrity'

    @classmethod
    def read_integrity_file(cls, path, hasher):
        """
        Load the digests stored in the integrity file of *path*.

        :param path: path of the protected file (not of the integrity file)
        :param hasher: object whose ``ALGORITHM`` attribute is compared with the stored algorithm name
        :return: the stored ``digests`` object, or None if there is no integrity file
                 or it names a different algorithm
        :raises FileIntegrityError: if the integrity file cannot be read or parsed, or lacks
                 the ``algorithm`` / ``digests`` / ``final`` entries
        """
        try:
            with open(cls.integrity_file_path(path), 'r') as fd:
                integrity_file = json.load(fd)
                # Provisions for agility now, implementation later, but make sure the on-disk joint is oiled.
                algorithm = integrity_file['algorithm']
                if algorithm != hasher.ALGORITHM:
                    logger.warning('Cannot verify integrity of %s: Unknown algorithm %r', path, algorithm)
                    return None
                digests = integrity_file['digests']
                # Require at least presence of the final digest
                digests['final']
                return digests
        except FileNotFoundError:
            logger.info('No integrity file found for %s', path)
            return None
        except (OSError, ValueError, TypeError, KeyError) as e:
            logger.warning('Could not read integrity file for %s: %s', path, e)
            raise FileIntegrityError(path) from e

    def hash_part(self, partname, is_final=False):
        """
        Close the current part: hash *partname* and the file position, then record or verify the digest.

        When writing, the digest is stored under *partname*. When reading with digests available,
        it is compared with the stored one; without digests this is a no-op.

        :param partname: name of the part; hashed into the digest and used as the key in the digests
        :param is_final: if True, seek to the end of the file first so the full length is hashed
        :raises FileIntegrityError: when reading and the stored digest is missing, malformed or different
        """
        if not self.writing and not self.digests:
            return
        self.hasher.update(partname.encode())
        self.hasher.hash_length(seek_to_end=is_final)
        digest = self.hasher.hexdigest()
        if self.writing:
            self.digests[partname] = digest
            return
        try:
            matches = compare_digest(self.digests.get(partname, ''), digest)
        except TypeError:
            # The stored value came from JSON and may be a number, list, non-ASCII string, ...
            # compare_digest rejects those; treat a malformed digest like a wrong one.
            matches = False
        if not matches:
            raise FileIntegrityError(self.path)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        On a clean exit check/record the final digest, close the file and, when writing,
        write the integrity file. If the block raised, only close the file.
        """
        exception = exc_type is not None
        if not exception:
            try:
                self.hash_part('final', is_final=True)
            except BaseException as e:
                # The final check failed: still leave the hasher's context so the backing file
                # gets closed, passing the error along so no further hashing is attempted.
                self.hasher.__exit__(type(e), e, e.__traceback__)
                raise
        self.hasher.__exit__(exc_type, exc_val, exc_tb)
        if exception:
            return
        if self.writing:
            with open(self.integrity_file_path(self.path), 'w') as fd:
                json.dump({
                    'algorithm': self.hasher.ALGORITHM,
                    'digests': self.digests,
                }, fd)
        elif self.digests:
            logger.debug('Verified integrity of %s', self.path)
# File: src/borg/testsuite/file_integrity.py (new file)

import pytest

from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError

# Payloads written by the fixtures below.
_FIRST_CHUNK = b'foo and bar'
_SECOND_CHUNK = b' other data'


def _make_dummy_file(tmpdir, integrity=None):
    """
    Create ``file`` (content ``1234``) in *tmpdir* and, if *integrity* is given,
    a ``file.integrity`` next to it with that content. Return the path of ``file`` as str.
    """
    target = tmpdir.join('file')
    target.write('1234')
    if integrity is not None:
        tmpdir.join('file.integrity').write(integrity)
    return str(target)


def _append_bytes(path, extra):
    """Append *extra* to the file at *path*, bypassing any integrity bookkeeping."""
    with open(path, 'ab') as raw:
        raw.write(extra)


class TestReadIntegrityFile:
    """Tests for IntegrityCheckedFile.read_integrity_file in isolation."""

    def test_no_integrity(self, tmpdir):
        file_path = _make_dummy_file(tmpdir)
        result = IntegrityCheckedFile.read_integrity_file(file_path, None)
        assert result is None

    def test_truncated_integrity(self, tmpdir):
        file_path = _make_dummy_file(tmpdir, integrity='')
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(file_path, None)

    def test_unknown_algorithm(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOOHASH9000'

        file_path = _make_dummy_file(
            tmpdir, integrity='{"algorithm": "HMAC_SERIOUSHASH", "digests": "1234"}')
        result = IntegrityCheckedFile.read_integrity_file(file_path, SomeHasher())
        assert result is None

    @pytest.mark.parametrize('json', (
        '{"ALGORITHM": "HMAC_SERIOUSHASH", "digests": "1234"}',
        '[]',
        '1234.5',
        '"A string"',
        'Invalid JSON',
    ))
    def test_malformed(self, tmpdir, json):
        file_path = _make_dummy_file(tmpdir, integrity=json)
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(file_path, None)

    def test_valid(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOO1'

        file_path = _make_dummy_file(
            tmpdir, integrity='{"algorithm": "HMAC_FOO1", "digests": {"final": "1234"}}')
        result = IntegrityCheckedFile.read_integrity_file(file_path, SomeHasher())
        assert result == {'final': '1234'}


class TestIntegrityCheckedFile:
    """Round-trip tests: write a protected file, then read it back under various manipulations."""

    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as writer:
            writer.write(_FIRST_CHUNK)
        return path

    def test_simple(self, tmpdir, integrity_protected_file):
        for name in ('file', 'file.integrity'):
            assert tmpdir.join(name).check(file=True)
        with IntegrityCheckedFile(integrity_protected_file, write=False) as reader:
            assert reader.read() == b'foo and bar'

    def test_corrupted_file(self, integrity_protected_file):
        _append_bytes(integrity_protected_file, b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as reader:
                assert reader.read() == b'foo and bar extra data'

    def test_corrupted_file_partial_read(self, integrity_protected_file):
        _append_bytes(integrity_protected_file, b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as reader:
                # Only the original prefix is read; the appended tail is never consumed.
                prefix = b'foo and bar'
                assert reader.read(len(prefix)) == prefix

    @pytest.mark.parametrize('new_name', (
        'different_file',
        'different_file.different_ext',
    ))
    def test_renamed_file(self, tmpdir, integrity_protected_file, new_name):
        new_path = tmpdir.join(new_name)
        tmpdir.join('file').move(new_path)
        tmpdir.join('file.integrity').move(new_path + '.integrity')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(str(new_path), write=False) as reader:
                assert reader.read() == b'foo and bar'

    def test_moved_file(self, tmpdir, integrity_protected_file):
        new_dir = tmpdir.mkdir('another_directory')
        for name in ('file', 'file.integrity'):
            tmpdir.join(name).move(new_dir.join(name))
        new_path = str(new_dir.join('file'))
        with IntegrityCheckedFile(new_path, write=False) as reader:
            assert reader.read() == b'foo and bar'

    def test_no_integrity(self, tmpdir, integrity_protected_file):
        tmpdir.join('file.integrity').remove()
        with IntegrityCheckedFile(integrity_protected_file, write=False) as reader:
            assert reader.read() == b'foo and bar'


class TestIntegrityCheckedFileParts:
    """Tests for files that carry an intermediate part digest in addition to the final one."""

    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as writer:
            writer.write(_FIRST_CHUNK)
            writer.hash_part('foopart')
            writer.write(_SECOND_CHUNK)
        return path

    def test_simple(self, integrity_protected_file):
        with IntegrityCheckedFile(integrity_protected_file, write=False) as reader:
            head = b'foo and bar'
            assert reader.read(len(head)) == head
            reader.hash_part('foopart')
            assert reader.read() == b' other data'

    def test_wrong_part_name(self, integrity_protected_file):
        # A failed hash_part also makes the final digest fail, even when the first failure is caught.
        # That is on purpose: it keeps the code simple and guards against overly broad
        # exception handling in callers.
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as reader:
                head = b'foo and bar'
                assert reader.read(len(head)) == head
                with pytest.raises(FileIntegrityError):
                    # The mismatch is reported right here ...
                    reader.hash_part('barpart')
                # ... and again when the context is left.

    @pytest.mark.parametrize('partial_read', (False, True))
    def test_part_independence(self, integrity_protected_file, partial_read):
        _append_bytes(integrity_protected_file, b'some extra stuff that does not belong')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as reader:
                head = b'foo and bar'
                try:
                    assert reader.read(len(head)) == head
                    reader.hash_part('foopart')
                except FileIntegrityError:
                    assert False, 'This part must not raise, since this part is still valid.'
                if not partial_read:
                    reader.read()
                # The first part verified fine, yet the final digest still fails on exit.