mirror of https://github.com/borgbackup/borg.git
Implement IntegrityCheckedFile (#2502)
Implement IntegrityCheckedFile. This is based on much earlier work from October 2016 by me, but is overall simplified and uses changed terminology (from "signing" to hashing and integrity checking). See #1688 for the full history.
This commit is contained in:
parent
1dd53f0e03
commit
820066da5d
|
@ -0,0 +1,182 @@
|
|||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from hmac import compare_digest
|
||||
|
||||
from ..helpers import IntegrityError
|
||||
from ..logger import create_logger
|
||||
|
||||
logger = create_logger()
|
||||
|
||||
|
||||
class FileLikeWrapper:
    """Base proxy that forwards the common file-object protocol to ``self.fd``.

    Subclasses are expected to set ``self.fd`` to the underlying file-like
    object; this class delegates context management and the usual
    read/write/seek/flush calls to it.
    """

    def __enter__(self):
        # Enter the wrapped file's context, but hand back *this* wrapper so
        # callers keep operating through the proxy.
        self.fd.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Propagate context exit (and thereby closing) to the wrapped file.
        self.fd.__exit__(exc_type, exc_val, exc_tb)

    def tell(self):
        """Return the current position of the wrapped file."""
        return self.fd.tell()

    def seek(self, offset, whence=io.SEEK_SET):
        """Reposition the wrapped file; returns the new absolute position."""
        return self.fd.seek(offset, whence)

    def write(self, data):
        """Write *data* to the wrapped file; returns the count written."""
        return self.fd.write(data)

    def read(self, n=None):
        """Read up to *n* bytes (everything when *n* is None) from the wrapped file."""
        return self.fd.read(n)

    def flush(self):
        """Flush any buffered data of the wrapped file."""
        self.fd.flush()

    def fileno(self):
        """Return the OS-level file descriptor of the wrapped file."""
        return self.fd.fileno()
|
||||
|
||||
|
||||
class SHA512FileHashingWrapper(FileLikeWrapper):
    """
    Wraps a file-like object and folds everything read from or written to it
    into a SHA512 digest on the fly.

    WARNING: seek() must only be used to determine the size of the file, never
    to skip over data: skipped bytes are never read and therefore never hashed
    into the digest. Likewise, seeking ahead while writing to create sparse
    files is not supported.

    Reads and writes have to mirror each other exactly, otherwise the two
    sides end up with different digests.

    Note: when used as a context manager, read/write operations outside the
    enclosed scope are illegal.
    """

    ALGORITHM = 'SHA512'

    def __init__(self, backing_fd, write):
        self.fd = backing_fd  # the file object all I/O is delegated to
        self.writing = write  # True when opened for writing
        self.hash = hashlib.new(self.ALGORITHM)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # On a clean exit, seal the digest with the current position/length
        # before the file is closed.
        if exc_type is None:
            self.hash_length()
        super().__exit__(exc_type, exc_val, exc_tb)

    def write(self, data):
        """
        Write *data* through to the backing file and absorb it into the digest.
        """
        written = super().write(data)
        self.hash.update(data)
        return written

    def read(self, n=None):
        """
        Read from the backing file (*n* has the usual meaning) and absorb the
        returned bytes into the digest.
        """
        chunk = super().read(n)
        self.hash.update(chunk)
        return chunk

    def hexdigest(self):
        """
        Return the current digest as a hex string.

        Note: this may be called any number of times.
        """
        return self.hash.hexdigest()

    def update(self, data: bytes):
        """Feed extra out-of-band *data* (e.g. metadata) into the digest."""
        self.hash.update(data)

    def hash_length(self, seek_to_end=False):
        """Mix the current file position (i.e. length so far) into the digest."""
        if seek_to_end:
            # Add length of file to the hash to avoid problems if only a prefix is read.
            self.seek(0, io.SEEK_END)
        self.hash.update(str(self.tell()).encode())
|
||||
|
||||
|
||||
class FileIntegrityError(IntegrityError):
    # NOTE(review): the docstring doubles as the user-visible message template —
    # it is raised as FileIntegrityError(path), so {} presumably gets formatted
    # with the offending path by the IntegrityError machinery (confirm in
    # ..helpers). Do not reword it casually.
    """File failed integrity check: {}"""
|
||||
|
||||
|
||||
class IntegrityCheckedFile(FileLikeWrapper):
    """
    File wrapper that hashes all data flowing through it and pairs the file
    with a "<path>.integrity" JSON side-car document.

    When writing, digests (of the whole file and of named parts, see
    hash_part) are collected and stored in the side-car file on clean context
    exit. When reading, stored digests are compared against freshly computed
    ones; a mismatch raises FileIntegrityError. If no side-car file exists,
    reading proceeds unverified.
    """

    def __init__(self, path, write, filename=None, override_fd=None):
        """
        :param path: path of the protected file
        :param write: True to create/write, False to read/verify
        :param filename: logical name hashed into the digest; defaults to the
                         basename of *path* (useful together with *override_fd*)
        :param override_fd: already-open file object to use instead of opening *path*
        """
        self.path = path
        self.writing = write
        mode = 'wb' if write else 'rb'
        self.file_fd = override_fd or open(path, mode)

        # All reads/writes are routed through the hashing wrapper.
        self.fd = self.hasher = SHA512FileHashingWrapper(backing_fd=self.file_fd, write=write)

        self.hash_filename(filename)

        if write:
            self.digests = {}
        else:
            try:
                self.digests = self.read_integrity_file(path, self.hasher)
            except FileIntegrityError:
                # Bug fix: if the integrity metadata is unreadable/corrupt we
                # re-raise out of __init__, so __exit__ never runs — close the
                # file descriptor we opened ourselves instead of leaking it.
                # An override_fd is owned by the caller and is left open.
                if override_fd is None:
                    self.file_fd.close()
                raise
            # TODO: When we're reading but don't have any digests, i.e. no integrity file existed,
            # TODO: then we could just short-circuit.

    def hash_filename(self, filename=None):
        # Hash the name of the file, but only the basename, ie. not the path.
        # In Borg the name itself encodes the context (eg. index.N, cache, files),
        # while the path doesn't matter, and moving e.g. a repository or cache directory is supported.
        # Changing the name however imbues a change of context that is not permissible.
        filename = os.path.basename(filename or self.path)
        # Length-prefix the name so name/content boundaries stay unambiguous.
        self.hasher.update(('%10d' % len(filename)).encode())
        self.hasher.update(filename.encode())

    @staticmethod
    def integrity_file_path(path):
        """Return the path of the side-car integrity file for *path*."""
        return path + '.integrity'

    @classmethod
    def read_integrity_file(cls, path, hasher):
        """
        Load and validate the integrity side-car file for *path*.

        Returns the digests dict, or None when no side-car file exists or when
        it names an algorithm other than *hasher*'s (only logged, not fatal).
        Raises FileIntegrityError when the side-car file exists but is
        unreadable or malformed.
        """
        try:
            with open(cls.integrity_file_path(path), 'r') as fd:
                integrity_file = json.load(fd)
                # Provisions for agility now, implementation later, but make sure the on-disk joint is oiled.
                algorithm = integrity_file['algorithm']
                if algorithm != hasher.ALGORITHM:
                    logger.warning('Cannot verify integrity of %s: Unknown algorithm %r', path, algorithm)
                    return
                digests = integrity_file['digests']
                # Require at least presence of the final digest
                digests['final']
                return digests
        except FileNotFoundError:
            # No integrity data at all: not an error, verification is skipped.
            logger.info('No integrity file found for %s', path)
        except (OSError, ValueError, TypeError, KeyError) as e:
            logger.warning('Could not read integrity file for %s: %s', path, e)
            raise FileIntegrityError(path)

    def hash_part(self, partname, is_final=False):
        """
        Seal a named part of the file: record its digest when writing,
        verify it against the stored digest when reading.

        No-op when reading and no digests were stored at all.
        """
        if not self.writing and not self.digests:
            return
        # Mix the part name and the current length into the running digest so
        # parts can't be renamed or truncated undetected.
        self.hasher.update(partname.encode())
        self.hasher.hash_length(seek_to_end=is_final)
        digest = self.hasher.hexdigest()
        if self.writing:
            self.digests[partname] = digest
        elif self.digests and not compare_digest(self.digests.get(partname, ''), digest):
            raise FileIntegrityError(self.path)

    def __exit__(self, exc_type, exc_val, exc_tb):
        exception = exc_type is not None
        if not exception:
            # Seal the implicit 'final' part covering the whole file,
            # including its total length.
            self.hash_part('final', is_final=True)
        self.hasher.__exit__(exc_type, exc_val, exc_tb)
        if exception:
            return
        if self.writing:
            with open(self.integrity_file_path(self.path), 'w') as fd:
                json.dump({
                    'algorithm': self.hasher.ALGORITHM,
                    'digests': self.digests,
                }, fd)
        elif self.digests:
            logger.debug('Verified integrity of %s', self.path)
|
|
@ -0,0 +1,152 @@
|
|||
|
||||
import pytest
|
||||
|
||||
from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
|
||||
|
||||
|
||||
class TestReadIntegrityFile:
    """Unit tests for IntegrityCheckedFile.read_integrity_file."""

    def test_no_integrity(self, tmpdir):
        # No side-car file at all -> nothing to verify -> None.
        target = tmpdir.join('file')
        target.write('1234')
        assert IntegrityCheckedFile.read_integrity_file(str(target), None) is None

    def test_truncated_integrity(self, tmpdir):
        # An empty side-car file is malformed JSON and must be fatal.
        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write('')
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(str(target), None)

    def test_unknown_algorithm(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOOHASH9000'

        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_SERIOUSHASH", "digests": "1234"}')
        # Algorithm mismatch is only a warning: verification is skipped (None).
        assert IntegrityCheckedFile.read_integrity_file(str(target), SomeHasher()) is None

    @pytest.mark.parametrize('json', (
        '{"ALGORITHM": "HMAC_SERIOUSHASH", "digests": "1234"}',
        '[]',
        '1234.5',
        '"A string"',
        'Invalid JSON',
    ))
    def test_malformed(self, tmpdir, json):
        # Structurally wrong side-car contents always raise.
        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write(json)
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(str(target), None)

    def test_valid(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOO1'

        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_FOO1", "digests": {"final": "1234"}}')
        assert IntegrityCheckedFile.read_integrity_file(str(target), SomeHasher()) == {'final': '1234'}
|
||||
|
||||
|
||||
class TestIntegrityCheckedFile:
    """End-to-end tests for whole-file write/verify round trips."""

    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        # Produce 'file' plus its 'file.integrity' side-car on disk.
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as f:
            f.write(b'foo and bar')
        return path

    def test_simple(self, tmpdir, integrity_protected_file):
        assert tmpdir.join('file').check(file=True)
        assert tmpdir.join('file.integrity').check(file=True)
        with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
            assert f.read() == b'foo and bar'

    def test_corrupted_file(self, integrity_protected_file):
        with open(integrity_protected_file, 'ab') as f:
            f.write(b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                assert f.read() == b'foo and bar extra data'

    def test_corrupted_file_partial_read(self, integrity_protected_file):
        # Even reading only the original prefix must fail: the total length
        # is part of the final digest.
        with open(integrity_protected_file, 'ab') as f:
            f.write(b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                payload = b'foo and bar'
                assert f.read(len(payload)) == payload

    @pytest.mark.parametrize('new_name', (
        'different_file',
        'different_file.different_ext',
    ))
    def test_renamed_file(self, tmpdir, integrity_protected_file, new_name):
        # The basename is hashed into the digest, so renaming must fail.
        new_path = tmpdir.join(new_name)
        tmpdir.join('file').move(new_path)
        tmpdir.join('file.integrity').move(new_path + '.integrity')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(str(new_path), write=False) as f:
                assert f.read() == b'foo and bar'

    def test_moved_file(self, tmpdir, integrity_protected_file):
        # Only the basename matters; moving across directories is supported.
        target_dir = tmpdir.mkdir('another_directory')
        tmpdir.join('file').move(target_dir.join('file'))
        tmpdir.join('file.integrity').move(target_dir.join('file.integrity'))
        moved_path = str(target_dir.join('file'))
        with IntegrityCheckedFile(moved_path, write=False) as f:
            assert f.read() == b'foo and bar'

    def test_no_integrity(self, tmpdir, integrity_protected_file):
        # Without a side-car file, reading proceeds unverified.
        tmpdir.join('file.integrity').remove()
        with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
            assert f.read() == b'foo and bar'
|
||||
|
||||
|
||||
class TestIntegrityCheckedFileParts:
    """Tests for per-part digests (IntegrityCheckedFile.hash_part)."""

    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        # A file consisting of a named part followed by more data.
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as f:
            f.write(b'foo and bar')
            f.hash_part('foopart')
            f.write(b' other data')
        return path

    def test_simple(self, integrity_protected_file):
        with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
            first = b'foo and bar'
            assert f.read(len(first)) == first
            f.hash_part('foopart')
            assert f.read() == b' other data'

    def test_wrong_part_name(self, integrity_protected_file):
        with pytest.raises(FileIntegrityError):
            # Because some hash_part failed, the final digest will fail as well - again - even if we catch
            # the failing hash_part. This is intentional: (1) it makes the code simpler (2) it's a good fail-safe
            # against overly broad exception handling.
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                first = b'foo and bar'
                assert f.read(len(first)) == first
                with pytest.raises(FileIntegrityError):
                    # This specific bit raises it directly
                    f.hash_part('barpart')
                # Still explodes in the end.

    @pytest.mark.parametrize('partial_read', (False, True))
    def test_part_independence(self, integrity_protected_file, partial_read):
        with open(integrity_protected_file, 'ab') as f:
            f.write(b'some extra stuff that does not belong')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                first = b'foo and bar'
                try:
                    assert f.read(len(first)) == first
                    f.hash_part('foopart')
                except FileIntegrityError:
                    assert False, 'This part must not raise, since this part is still valid.'
                if not partial_read:
                    f.read()
                # But overall it explodes with the final digest. Neat, eh?
|
Loading…
Reference in New Issue