mirror of https://github.com/borgbackup/borg.git
Implement IntegrityCheckedFile (#2502)
Implement IntegrityCheckedFile. This is based on much earlier work from October 2016 by me, but is overall simplified and uses changed terminology (from "signing" to hashing and integrity checking). See #1688 for the full history.
This commit is contained in:
parent
1dd53f0e03
commit
820066da5d
|
@ -0,0 +1,182 @@
|
|||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from hmac import compare_digest
|
||||
|
||||
from ..helpers import IntegrityError
|
||||
from ..logger import create_logger
|
||||
|
||||
logger = create_logger()
|
||||
|
||||
|
||||
class FileLikeWrapper:
    """Base proxy that forwards the common file-object protocol to ``self.fd``.

    Subclasses are expected to set ``self.fd`` to the underlying file-like
    object; this class delegates context management and the usual
    read/write/seek/flush calls to it.
    """

    def __enter__(self):
        # Enter the wrapped file's context, but hand back *this* wrapper so
        # callers keep operating through the proxy.
        self.fd.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Propagate context exit (and thereby closing) to the wrapped file.
        self.fd.__exit__(exc_type, exc_val, exc_tb)

    def tell(self):
        """Return the current position of the wrapped file."""
        return self.fd.tell()

    def seek(self, offset, whence=io.SEEK_SET):
        """Reposition the wrapped file; returns the new absolute position."""
        return self.fd.seek(offset, whence)

    def write(self, data):
        """Write *data* to the wrapped file; returns the count written."""
        return self.fd.write(data)

    def read(self, n=None):
        """Read up to *n* bytes (everything when *n* is None) from the wrapped file."""
        return self.fd.read(n)

    def flush(self):
        """Flush any buffered data of the wrapped file."""
        self.fd.flush()

    def fileno(self):
        """Return the OS-level file descriptor of the wrapped file."""
        return self.fd.fileno()
|
||||
|
||||
|
||||
class SHA512FileHashingWrapper(FileLikeWrapper):
    """
    Wraps a file-like object and folds everything read from or written to it
    into a SHA512 digest on the fly.

    WARNING: seek() must only be used to determine the size of the file, never
    to skip over data: skipped bytes are never read and therefore never hashed
    into the digest. Likewise, seeking ahead while writing to create sparse
    files is not supported.

    Reads and writes have to mirror each other exactly, otherwise the two
    sides end up with different digests.

    Note: when used as a context manager, read/write operations outside the
    enclosed scope are illegal.
    """

    ALGORITHM = 'SHA512'

    def __init__(self, backing_fd, write):
        self.fd = backing_fd  # the file object all I/O is delegated to
        self.writing = write  # True when opened for writing
        self.hash = hashlib.new(self.ALGORITHM)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # On a clean exit, seal the digest with the current position/length
        # before the file is closed.
        if exc_type is None:
            self.hash_length()
        super().__exit__(exc_type, exc_val, exc_tb)

    def write(self, data):
        """
        Write *data* through to the backing file and absorb it into the digest.
        """
        written = super().write(data)
        self.hash.update(data)
        return written

    def read(self, n=None):
        """
        Read from the backing file (*n* has the usual meaning) and absorb the
        returned bytes into the digest.
        """
        chunk = super().read(n)
        self.hash.update(chunk)
        return chunk

    def hexdigest(self):
        """
        Return the current digest as a hex string.

        Note: this may be called any number of times.
        """
        return self.hash.hexdigest()

    def update(self, data: bytes):
        """Feed extra out-of-band *data* (e.g. metadata) into the digest."""
        self.hash.update(data)

    def hash_length(self, seek_to_end=False):
        """Mix the current file position (i.e. length so far) into the digest."""
        if seek_to_end:
            # Add length of file to the hash to avoid problems if only a prefix is read.
            self.seek(0, io.SEEK_END)
        self.hash.update(str(self.tell()).encode())
|
||||
|
||||
|
||||
class FileIntegrityError(IntegrityError):
    # NOTE(review): the docstring doubles as the user-visible message template —
    # it is raised as FileIntegrityError(path), so {} presumably gets formatted
    # with the offending path by the IntegrityError machinery (confirm in
    # ..helpers). Do not reword it casually.
    """File failed integrity check: {}"""
|
||||
|
||||
|
||||
class IntegrityCheckedFile(FileLikeWrapper):
    """
    File wrapper that hashes all data flowing through it and pairs the file
    with a "<path>.integrity" JSON side-car document.

    When writing, digests (of the whole file and of named parts, see
    hash_part) are collected and stored in the side-car file on clean context
    exit. When reading, stored digests are compared against freshly computed
    ones; a mismatch raises FileIntegrityError. If no side-car file exists,
    reading proceeds unverified.
    """

    def __init__(self, path, write, filename=None, override_fd=None):
        """
        :param path: path of the protected file
        :param write: True to create/write, False to read/verify
        :param filename: logical name hashed into the digest; defaults to the
                         basename of *path* (useful together with *override_fd*)
        :param override_fd: already-open file object to use instead of opening *path*
        """
        self.path = path
        self.writing = write
        mode = 'wb' if write else 'rb'
        self.file_fd = override_fd or open(path, mode)

        # All reads/writes are routed through the hashing wrapper.
        self.fd = self.hasher = SHA512FileHashingWrapper(backing_fd=self.file_fd, write=write)

        self.hash_filename(filename)

        if write:
            self.digests = {}
        else:
            try:
                self.digests = self.read_integrity_file(path, self.hasher)
            except FileIntegrityError:
                # Bug fix: if the integrity metadata is unreadable/corrupt we
                # re-raise out of __init__, so __exit__ never runs — close the
                # file descriptor we opened ourselves instead of leaking it.
                # An override_fd is owned by the caller and is left open.
                if override_fd is None:
                    self.file_fd.close()
                raise
            # TODO: When we're reading but don't have any digests, i.e. no integrity file existed,
            # TODO: then we could just short-circuit.

    def hash_filename(self, filename=None):
        # Hash the name of the file, but only the basename, ie. not the path.
        # In Borg the name itself encodes the context (eg. index.N, cache, files),
        # while the path doesn't matter, and moving e.g. a repository or cache directory is supported.
        # Changing the name however imbues a change of context that is not permissible.
        filename = os.path.basename(filename or self.path)
        # Length-prefix the name so name/content boundaries stay unambiguous.
        self.hasher.update(('%10d' % len(filename)).encode())
        self.hasher.update(filename.encode())

    @staticmethod
    def integrity_file_path(path):
        """Return the path of the side-car integrity file for *path*."""
        return path + '.integrity'

    @classmethod
    def read_integrity_file(cls, path, hasher):
        """
        Load and validate the integrity side-car file for *path*.

        Returns the digests dict, or None when no side-car file exists or when
        it names an algorithm other than *hasher*'s (only logged, not fatal).
        Raises FileIntegrityError when the side-car file exists but is
        unreadable or malformed.
        """
        try:
            with open(cls.integrity_file_path(path), 'r') as fd:
                integrity_file = json.load(fd)
                # Provisions for agility now, implementation later, but make sure the on-disk joint is oiled.
                algorithm = integrity_file['algorithm']
                if algorithm != hasher.ALGORITHM:
                    logger.warning('Cannot verify integrity of %s: Unknown algorithm %r', path, algorithm)
                    return
                digests = integrity_file['digests']
                # Require at least presence of the final digest
                digests['final']
                return digests
        except FileNotFoundError:
            # No integrity data at all: not an error, verification is skipped.
            logger.info('No integrity file found for %s', path)
        except (OSError, ValueError, TypeError, KeyError) as e:
            logger.warning('Could not read integrity file for %s: %s', path, e)
            raise FileIntegrityError(path)

    def hash_part(self, partname, is_final=False):
        """
        Seal a named part of the file: record its digest when writing,
        verify it against the stored digest when reading.

        No-op when reading and no digests were stored at all.
        """
        if not self.writing and not self.digests:
            return
        # Mix the part name and the current length into the running digest so
        # parts can't be renamed or truncated undetected.
        self.hasher.update(partname.encode())
        self.hasher.hash_length(seek_to_end=is_final)
        digest = self.hasher.hexdigest()
        if self.writing:
            self.digests[partname] = digest
        elif self.digests and not compare_digest(self.digests.get(partname, ''), digest):
            raise FileIntegrityError(self.path)

    def __exit__(self, exc_type, exc_val, exc_tb):
        exception = exc_type is not None
        if not exception:
            # Seal the implicit 'final' part covering the whole file,
            # including its total length.
            self.hash_part('final', is_final=True)
        self.hasher.__exit__(exc_type, exc_val, exc_tb)
        if exception:
            return
        if self.writing:
            with open(self.integrity_file_path(self.path), 'w') as fd:
                json.dump({
                    'algorithm': self.hasher.ALGORITHM,
                    'digests': self.digests,
                }, fd)
        elif self.digests:
            logger.debug('Verified integrity of %s', self.path)
|
|
@ -0,0 +1,152 @@
|
|||
|
||||
import pytest
|
||||
|
||||
from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
|
||||
|
||||
|
||||
class TestReadIntegrityFile:
    """Unit tests for IntegrityCheckedFile.read_integrity_file."""

    def test_no_integrity(self, tmpdir):
        # No side-car file at all -> nothing to verify -> None.
        target = tmpdir.join('file')
        target.write('1234')
        assert IntegrityCheckedFile.read_integrity_file(str(target), None) is None

    def test_truncated_integrity(self, tmpdir):
        # An empty side-car file is malformed JSON and must be fatal.
        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write('')
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(str(target), None)

    def test_unknown_algorithm(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOOHASH9000'

        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_SERIOUSHASH", "digests": "1234"}')
        # Algorithm mismatch is only a warning: verification is skipped (None).
        assert IntegrityCheckedFile.read_integrity_file(str(target), SomeHasher()) is None

    @pytest.mark.parametrize('json', (
        '{"ALGORITHM": "HMAC_SERIOUSHASH", "digests": "1234"}',
        '[]',
        '1234.5',
        '"A string"',
        'Invalid JSON',
    ))
    def test_malformed(self, tmpdir, json):
        # Structurally wrong side-car contents always raise.
        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write(json)
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(str(target), None)

    def test_valid(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOO1'

        target = tmpdir.join('file')
        target.write('1234')
        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_FOO1", "digests": {"final": "1234"}}')
        assert IntegrityCheckedFile.read_integrity_file(str(target), SomeHasher()) == {'final': '1234'}
|
||||
|
||||
|
||||
class TestIntegrityCheckedFile:
    """End-to-end tests for whole-file write/verify round trips."""

    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        # Produce 'file' plus its 'file.integrity' side-car on disk.
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as f:
            f.write(b'foo and bar')
        return path

    def test_simple(self, tmpdir, integrity_protected_file):
        assert tmpdir.join('file').check(file=True)
        assert tmpdir.join('file.integrity').check(file=True)
        with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
            assert f.read() == b'foo and bar'

    def test_corrupted_file(self, integrity_protected_file):
        with open(integrity_protected_file, 'ab') as f:
            f.write(b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                assert f.read() == b'foo and bar extra data'

    def test_corrupted_file_partial_read(self, integrity_protected_file):
        # Even reading only the original prefix must fail: the total length
        # is part of the final digest.
        with open(integrity_protected_file, 'ab') as f:
            f.write(b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                payload = b'foo and bar'
                assert f.read(len(payload)) == payload

    @pytest.mark.parametrize('new_name', (
        'different_file',
        'different_file.different_ext',
    ))
    def test_renamed_file(self, tmpdir, integrity_protected_file, new_name):
        # The basename is hashed into the digest, so renaming must fail.
        new_path = tmpdir.join(new_name)
        tmpdir.join('file').move(new_path)
        tmpdir.join('file.integrity').move(new_path + '.integrity')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(str(new_path), write=False) as f:
                assert f.read() == b'foo and bar'

    def test_moved_file(self, tmpdir, integrity_protected_file):
        # Only the basename matters; moving across directories is supported.
        target_dir = tmpdir.mkdir('another_directory')
        tmpdir.join('file').move(target_dir.join('file'))
        tmpdir.join('file.integrity').move(target_dir.join('file.integrity'))
        moved_path = str(target_dir.join('file'))
        with IntegrityCheckedFile(moved_path, write=False) as f:
            assert f.read() == b'foo and bar'

    def test_no_integrity(self, tmpdir, integrity_protected_file):
        # Without a side-car file, reading proceeds unverified.
        tmpdir.join('file.integrity').remove()
        with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
            assert f.read() == b'foo and bar'
|
||||
|
||||
|
||||
class TestIntegrityCheckedFileParts:
    """Tests for per-part digests (IntegrityCheckedFile.hash_part)."""

    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        # A file consisting of a named part followed by more data.
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as f:
            f.write(b'foo and bar')
            f.hash_part('foopart')
            f.write(b' other data')
        return path

    def test_simple(self, integrity_protected_file):
        with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
            first = b'foo and bar'
            assert f.read(len(first)) == first
            f.hash_part('foopart')
            assert f.read() == b' other data'

    def test_wrong_part_name(self, integrity_protected_file):
        with pytest.raises(FileIntegrityError):
            # Because some hash_part failed, the final digest will fail as well - again - even if we catch
            # the failing hash_part. This is intentional: (1) it makes the code simpler (2) it's a good fail-safe
            # against overly broad exception handling.
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                first = b'foo and bar'
                assert f.read(len(first)) == first
                with pytest.raises(FileIntegrityError):
                    # This specific bit raises it directly
                    f.hash_part('barpart')
                # Still explodes in the end.

    @pytest.mark.parametrize('partial_read', (False, True))
    def test_part_independence(self, integrity_protected_file, partial_read):
        with open(integrity_protected_file, 'ab') as f:
            f.write(b'some extra stuff that does not belong')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as f:
                first = b'foo and bar'
                try:
                    assert f.read(len(first)) == first
                    f.hash_part('foopart')
                except FileIntegrityError:
                    assert False, 'This part must not raise, since this part is still valid.'
                if not partial_read:
                    f.read()
                # But overall it explodes with the final digest. Neat, eh?
|
Loading…
Reference in New Issue