Implement IntegrityCheckedFile (#2502)

Implement IntegrityCheckedFile

This is based on much earlier work of mine from October 2016, but is
simplified overall and changes the terminology from "signing" to
hashing and integrity checking.

See #1688 for the full history.
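For reference, the integrity metadata ends up in a small JSON file stored
next to the protected file (written in IntegrityCheckedFile.__exit__,
below). A representative example, with the digest values abbreviated
(a SHA-512 hexdigest is 128 hex digits):

{
    "algorithm": "SHA512",
    "digests": {
        "foopart": "<128 hex digits>",
        "final": "<128 hex digits>"
    }
}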
enkore 2017-05-12 21:38:31 +02:00 committed by GitHub
parent 1dd53f0e03
commit 820066da5d
2 changed files with 334 additions and 0 deletions


@@ -0,0 +1,182 @@
import hashlib
import io
import json
import os
from hmac import compare_digest

from ..helpers import IntegrityError
from ..logger import create_logger

logger = create_logger()
class FileLikeWrapper:
    """Base class that forwards file operations to a backing file object (self.fd)."""

    def __enter__(self):
        self.fd.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.fd.__exit__(exc_type, exc_val, exc_tb)

    def tell(self):
        return self.fd.tell()

    def seek(self, offset, whence=io.SEEK_SET):
        return self.fd.seek(offset, whence)

    def write(self, data):
        return self.fd.write(data)

    def read(self, n=None):
        return self.fd.read(n)

    def flush(self):
        self.fd.flush()

    def fileno(self):
        return self.fd.fileno()
class SHA512FileHashingWrapper(FileLikeWrapper):
    """
    Wrapper for file-like objects that computes a hash on-the-fly while reading/writing.

    WARNING: Seeks should only be used to query the size of the file, not to
    skip data, because skipped data isn't read and therefore isn't hashed into
    the digest. Similarly, skipping while writing to create sparse files is not
    supported, either.

    Data has to be read/written in a symmetric fashion, otherwise different
    digests will be generated.

    Note: When used as a context manager, read/write operations outside the
    enclosed scope are illegal.
    """

    ALGORITHM = 'SHA512'

    def __init__(self, backing_fd, write):
        self.fd = backing_fd
        self.writing = write
        self.hash = hashlib.new(self.ALGORITHM)

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            self.hash_length()
        super().__exit__(exc_type, exc_val, exc_tb)

    def write(self, data):
        """
        Write *data* to the backing file and update the internal hash state.
        """
        n = super().write(data)
        self.hash.update(data)
        return n

    def read(self, n=None):
        """
        Read up to *n* bytes from the backing file (*n* has the usual meaning)
        and update the internal hash state.
        """
        data = super().read(n)
        self.hash.update(data)
        return data

    def hexdigest(self):
        """
        Return the current digest as a hex string.

        Note: this can be called multiple times.
        """
        return self.hash.hexdigest()

    def update(self, data: bytes):
        self.hash.update(data)

    def hash_length(self, seek_to_end=False):
        if seek_to_end:
            # Add the length of the file to the hash to avoid problems if only a prefix is read.
            self.seek(0, io.SEEK_END)
        self.hash.update(str(self.tell()).encode())
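A minimal sketch of the symmetric read/write property described in the
docstring above, using io.BytesIO stand-ins (illustration only, not part of
this commit): hashing the same bytes through write() and through read()
produces the same digest.

import io

writer = SHA512FileHashingWrapper(backing_fd=io.BytesIO(), write=True)
writer.write(b'foo and bar')

reader = SHA512FileHashingWrapper(backing_fd=io.BytesIO(b'foo and bar'), write=False)
reader.read()  # reads everything, updating the reader's hash state

assert writer.hexdigest() == reader.hexdigest()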
class FileIntegrityError(IntegrityError):
    """File failed integrity check: {}"""
class IntegrityCheckedFile(FileLikeWrapper):
    def __init__(self, path, write, filename=None, override_fd=None):
        self.path = path
        self.writing = write
        mode = 'wb' if write else 'rb'
        self.file_fd = override_fd or open(path, mode)
        self.fd = self.hasher = SHA512FileHashingWrapper(backing_fd=self.file_fd, write=write)

        self.hash_filename(filename)

        if write:
            self.digests = {}
        else:
            self.digests = self.read_integrity_file(path, self.hasher)
            # TODO: When we're reading but don't have any digests, i.e. no integrity file existed,
            # TODO: then we could just short-circuit.

    def hash_filename(self, filename=None):
        # Hash the name of the file, but only the basename, i.e. not the path.
        # In Borg the name itself encodes the context (e.g. index.N, cache, files),
        # while the path doesn't matter, and moving e.g. a repository or cache directory is supported.
        # Changing the name, however, implies a change of context that is not permissible.
        # The name is hashed with a length prefix, so a (name, contents) pair can't
        # be made to collide with another by shifting bytes between the two.
        filename = os.path.basename(filename or self.path)
        self.hasher.update(('%10d' % len(filename)).encode())
        self.hasher.update(filename.encode())
    @staticmethod
    def integrity_file_path(path):
        return path + '.integrity'

    @classmethod
    def read_integrity_file(cls, path, hasher):
        try:
            with open(cls.integrity_file_path(path), 'r') as fd:
                integrity_file = json.load(fd)
                # Provisions for algorithm agility now, implementation later,
                # but make sure the on-disk joint is oiled.
                algorithm = integrity_file['algorithm']
                if algorithm != hasher.ALGORITHM:
                    logger.warning('Cannot verify integrity of %s: Unknown algorithm %r', path, algorithm)
                    return
                digests = integrity_file['digests']
                # Require at least the presence of the final digest.
                digests['final']
                return digests
        except FileNotFoundError:
            logger.info('No integrity file found for %s', path)
        except (OSError, ValueError, TypeError, KeyError) as e:
            logger.warning('Could not read integrity file for %s: %s', path, e)
            raise FileIntegrityError(path)
    def hash_part(self, partname, is_final=False):
        if not self.writing and not self.digests:
            return
        self.hasher.update(partname.encode())
        self.hasher.hash_length(seek_to_end=is_final)
        digest = self.hasher.hexdigest()
        if self.writing:
            self.digests[partname] = digest
        elif self.digests and not compare_digest(self.digests.get(partname, ''), digest):
            raise FileIntegrityError(self.path)

    def __exit__(self, exc_type, exc_val, exc_tb):
        exception = exc_type is not None
        if not exception:
            self.hash_part('final', is_final=True)
        self.hasher.__exit__(exc_type, exc_val, exc_tb)
        if exception:
            return
        if self.writing:
            with open(self.integrity_file_path(self.path), 'w') as fd:
                json.dump({
                    'algorithm': self.hasher.ALGORITHM,
                    'digests': self.digests,
                }, fd)
        elif self.digests:
            logger.debug('Verified integrity of %s', self.path)
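Taken together, a typical round-trip looks like this sketch (the path is
hypothetical; the behaviour mirrors the tests in the next file):

with IntegrityCheckedFile('/tmp/demo', write=True) as fd:
    fd.write(b'foo and bar')
# /tmp/demo.integrity now records the algorithm and the 'final' digest

with IntegrityCheckedFile('/tmp/demo', write=False) as fd:
    assert fd.read() == b'foo and bar'
# tampering with /tmp/demo raises FileIntegrityError when the context exits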


@@ -0,0 +1,152 @@
import pytest

from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
class TestReadIntegrityFile:
    def test_no_integrity(self, tmpdir):
        protected_file = tmpdir.join('file')
        protected_file.write('1234')
        assert IntegrityCheckedFile.read_integrity_file(str(protected_file), None) is None

    def test_truncated_integrity(self, tmpdir):
        protected_file = tmpdir.join('file')
        protected_file.write('1234')
        tmpdir.join('file.integrity').write('')
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(str(protected_file), None)

    def test_unknown_algorithm(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOOHASH9000'

        protected_file = tmpdir.join('file')
        protected_file.write('1234')
        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_SERIOUSHASH", "digests": "1234"}')
        assert IntegrityCheckedFile.read_integrity_file(str(protected_file), SomeHasher()) is None

    @pytest.mark.parametrize('json', (
        '{"ALGORITHM": "HMAC_SERIOUSHASH", "digests": "1234"}',
        '[]',
        '1234.5',
        '"A string"',
        'Invalid JSON',
    ))
    def test_malformed(self, tmpdir, json):
        protected_file = tmpdir.join('file')
        protected_file.write('1234')
        tmpdir.join('file.integrity').write(json)
        with pytest.raises(FileIntegrityError):
            IntegrityCheckedFile.read_integrity_file(str(protected_file), None)

    def test_valid(self, tmpdir):
        class SomeHasher:
            ALGORITHM = 'HMAC_FOO1'

        protected_file = tmpdir.join('file')
        protected_file.write('1234')
        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_FOO1", "digests": {"final": "1234"}}')
        assert IntegrityCheckedFile.read_integrity_file(str(protected_file), SomeHasher()) == {'final': '1234'}
class TestIntegrityCheckedFile:
    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as fd:
            fd.write(b'foo and bar')
        return path

    def test_simple(self, tmpdir, integrity_protected_file):
        assert tmpdir.join('file').check(file=True)
        assert tmpdir.join('file.integrity').check(file=True)
        with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
            assert fd.read() == b'foo and bar'

    def test_corrupted_file(self, integrity_protected_file):
        with open(integrity_protected_file, 'ab') as fd:
            fd.write(b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
                assert fd.read() == b'foo and bar extra data'

    def test_corrupted_file_partial_read(self, integrity_protected_file):
        with open(integrity_protected_file, 'ab') as fd:
            fd.write(b' extra data')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
                data = b'foo and bar'
                assert fd.read(len(data)) == data

    @pytest.mark.parametrize('new_name', (
        'different_file',
        'different_file.different_ext',
    ))
    def test_renamed_file(self, tmpdir, integrity_protected_file, new_name):
        new_path = tmpdir.join(new_name)
        tmpdir.join('file').move(new_path)
        tmpdir.join('file.integrity').move(new_path + '.integrity')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(str(new_path), write=False) as fd:
                assert fd.read() == b'foo and bar'

    def test_moved_file(self, tmpdir, integrity_protected_file):
        new_dir = tmpdir.mkdir('another_directory')
        tmpdir.join('file').move(new_dir.join('file'))
        tmpdir.join('file.integrity').move(new_dir.join('file.integrity'))
        new_path = str(new_dir.join('file'))
        with IntegrityCheckedFile(new_path, write=False) as fd:
            assert fd.read() == b'foo and bar'

    def test_no_integrity(self, tmpdir, integrity_protected_file):
        tmpdir.join('file.integrity').remove()
        with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
            assert fd.read() == b'foo and bar'
class TestIntegrityCheckedFileParts:
    @pytest.fixture
    def integrity_protected_file(self, tmpdir):
        path = str(tmpdir.join('file'))
        with IntegrityCheckedFile(path, write=True) as fd:
            fd.write(b'foo and bar')
            fd.hash_part('foopart')
            fd.write(b' other data')
        return path

    def test_simple(self, integrity_protected_file):
        with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
            data1 = b'foo and bar'
            assert fd.read(len(data1)) == data1
            fd.hash_part('foopart')
            assert fd.read() == b' other data'

    def test_wrong_part_name(self, integrity_protected_file):
        with pytest.raises(FileIntegrityError):
            # Because some hash_part failed, the final digest will fail as well - again - even if we catch
            # the failing hash_part. This is intentional: (1) it makes the code simpler (2) it's a good fail-safe
            # against overly broad exception handling.
            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
                data1 = b'foo and bar'
                assert fd.read(len(data1)) == data1
                with pytest.raises(FileIntegrityError):
                    # This specific bit raises it directly
                    fd.hash_part('barpart')
                # Still explodes in the end.

    @pytest.mark.parametrize('partial_read', (False, True))
    def test_part_independence(self, integrity_protected_file, partial_read):
        with open(integrity_protected_file, 'ab') as fd:
            fd.write(b'some extra stuff that does not belong')
        with pytest.raises(FileIntegrityError):
            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
                data1 = b'foo and bar'
                try:
                    assert fd.read(len(data1)) == data1
                    fd.hash_part('foopart')
                except FileIntegrityError:
                    assert False, 'This part must not raise, since this part is still valid.'
                if not partial_read:
                    fd.read()
            # But overall it explodes with the final digest. Neat, eh?
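And the part-wise API from the last test class as a standalone sketch
(hypothetical path; the reader must call hash_part() with the same names at
the same offsets as the writer):

with IntegrityCheckedFile('/tmp/parts', write=True) as fd:
    fd.write(b'header')
    fd.hash_part('header')  # records a digest covering everything written so far
    fd.write(b'payload')

with IntegrityCheckedFile('/tmp/parts', write=False) as fd:
    assert fd.read(len(b'header')) == b'header'
    fd.hash_part('header')  # raises FileIntegrityError on mismatch
    assert fd.read() == b'payload'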