mirror of
https://github.com/morpheus65535/bazarr
synced 2025-01-02 21:15:41 +00:00
733 lines
27 KiB
Python
733 lines
27 KiB
Python
|
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
|
||
|
#
|
||
|
# This module is part of GitDB and is released under
|
||
|
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
|
||
|
|
||
|
from io import BytesIO
|
||
|
|
||
|
import mmap
|
||
|
import os
|
||
|
import sys
|
||
|
import zlib
|
||
|
|
||
|
from gitdb.fun import (
|
||
|
msb_size,
|
||
|
stream_copy,
|
||
|
apply_delta_data,
|
||
|
connect_deltas,
|
||
|
delta_types
|
||
|
)
|
||
|
|
||
|
from gitdb.util import (
|
||
|
allocate_memory,
|
||
|
LazyMixin,
|
||
|
make_sha,
|
||
|
write,
|
||
|
close,
|
||
|
)
|
||
|
|
||
|
from gitdb.const import NULL_BYTE, BYTE_SPACE
|
||
|
from gitdb.utils.compat import buffer
|
||
|
from gitdb.utils.encoding import force_bytes
|
||
|
|
||
|
has_perf_mod = False
|
||
|
PY26 = sys.version_info[:2] < (2, 7)
|
||
|
try:
|
||
|
from gitdb_speedups._perf import apply_delta as c_apply_delta
|
||
|
has_perf_mod = True
|
||
|
except ImportError:
|
||
|
pass
|
||
|
|
||
|
__all__ = ('DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader',
|
||
|
'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer',
|
||
|
'FDStream', 'NullStream')
|
||
|
|
||
|
|
||
|
#{ RO Streams
|
||
|
|
||
|
class DecompressMemMapReader(LazyMixin):
|
||
|
|
||
|
"""Reads data in chunks from a memory map and decompresses it. The client sees
|
||
|
only the uncompressed data, respective file-like read calls are handling on-demand
|
||
|
buffered decompression accordingly
|
||
|
|
||
|
A constraint on the total size of bytes is activated, simulating
|
||
|
a logical file within a possibly larger physical memory area
|
||
|
|
||
|
To read efficiently, you clearly don't want to read individual bytes, instead,
|
||
|
read a few kilobytes at least.
|
||
|
|
||
|
**Note:** The chunk-size should be carefully selected as it will involve quite a bit
|
||
|
of string copying due to the way the zlib is implemented. Its very wasteful,
|
||
|
hence we try to find a good tradeoff between allocation time and number of
|
||
|
times we actually allocate. An own zlib implementation would be good here
|
||
|
to better support streamed reading - it would only need to keep the mmap
|
||
|
and decompress it into chunks, that's all ... """
|
||
|
__slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close',
|
||
|
'_cbr', '_phi')
|
||
|
|
||
|
max_read_size = 512 * 1024 # currently unused
|
||
|
|
||
|
def __init__(self, m, close_on_deletion, size=None):
|
||
|
"""Initialize with mmap for stream reading
|
||
|
:param m: must be content data - use new if you have object data and no size"""
|
||
|
self._m = m
|
||
|
self._zip = zlib.decompressobj()
|
||
|
self._buf = None # buffer of decompressed bytes
|
||
|
self._buflen = 0 # length of bytes in buffer
|
||
|
if size is not None:
|
||
|
self._s = size # size of uncompressed data to read in total
|
||
|
self._br = 0 # num uncompressed bytes read
|
||
|
self._cws = 0 # start byte of compression window
|
||
|
self._cwe = 0 # end byte of compression window
|
||
|
self._cbr = 0 # number of compressed bytes read
|
||
|
self._phi = False # is True if we parsed the header info
|
||
|
self._close = close_on_deletion # close the memmap on deletion ?
|
||
|
|
||
|
def _set_cache_(self, attr):
|
||
|
assert attr == '_s'
|
||
|
# only happens for size, which is a marker to indicate we still
|
||
|
# have to parse the header from the stream
|
||
|
self._parse_header_info()
|
||
|
|
||
|
def __del__(self):
|
||
|
self.close()
|
||
|
|
||
|
def _parse_header_info(self):
|
||
|
"""If this stream contains object data, parse the header info and skip the
|
||
|
stream to a point where each read will yield object content
|
||
|
|
||
|
:return: parsed type_string, size"""
|
||
|
# read header
|
||
|
# should really be enough, cgit uses 8192 I believe
|
||
|
# And for good reason !! This needs to be that high for the header to be read correctly in all cases
|
||
|
maxb = 8192
|
||
|
self._s = maxb
|
||
|
hdr = self.read(maxb)
|
||
|
hdrend = hdr.find(NULL_BYTE)
|
||
|
typ, size = hdr[:hdrend].split(BYTE_SPACE)
|
||
|
size = int(size)
|
||
|
self._s = size
|
||
|
|
||
|
# adjust internal state to match actual header length that we ignore
|
||
|
# The buffer will be depleted first on future reads
|
||
|
self._br = 0
|
||
|
hdrend += 1
|
||
|
self._buf = BytesIO(hdr[hdrend:])
|
||
|
self._buflen = len(hdr) - hdrend
|
||
|
|
||
|
self._phi = True
|
||
|
|
||
|
return typ, size
|
||
|
|
||
|
#{ Interface
|
||
|
|
||
|
@classmethod
|
||
|
def new(self, m, close_on_deletion=False):
|
||
|
"""Create a new DecompressMemMapReader instance for acting as a read-only stream
|
||
|
This method parses the object header from m and returns the parsed
|
||
|
type and size, as well as the created stream instance.
|
||
|
|
||
|
:param m: memory map on which to operate. It must be object data ( header + contents )
|
||
|
:param close_on_deletion: if True, the memory map will be closed once we are
|
||
|
being deleted"""
|
||
|
inst = DecompressMemMapReader(m, close_on_deletion, 0)
|
||
|
typ, size = inst._parse_header_info()
|
||
|
return typ, size, inst
|
||
|
|
||
|
def data(self):
|
||
|
""":return: random access compatible data we are working on"""
|
||
|
return self._m
|
||
|
|
||
|
def close(self):
|
||
|
"""Close our underlying stream of compressed bytes if this was allowed during initialization
|
||
|
:return: True if we closed the underlying stream
|
||
|
:note: can be called safely
|
||
|
"""
|
||
|
if self._close:
|
||
|
if hasattr(self._m, 'close'):
|
||
|
self._m.close()
|
||
|
self._close = False
|
||
|
# END handle resource freeing
|
||
|
|
||
|
def compressed_bytes_read(self):
|
||
|
"""
|
||
|
:return: number of compressed bytes read. This includes the bytes it
|
||
|
took to decompress the header ( if there was one )"""
|
||
|
# ABSTRACT: When decompressing a byte stream, it can be that the first
|
||
|
# x bytes which were requested match the first x bytes in the loosely
|
||
|
# compressed datastream. This is the worst-case assumption that the reader
|
||
|
# does, it assumes that it will get at least X bytes from X compressed bytes
|
||
|
# in call cases.
|
||
|
# The caveat is that the object, according to our known uncompressed size,
|
||
|
# is already complete, but there are still some bytes left in the compressed
|
||
|
# stream that contribute to the amount of compressed bytes.
|
||
|
# How can we know that we are truly done, and have read all bytes we need
|
||
|
# to read ?
|
||
|
# Without help, we cannot know, as we need to obtain the status of the
|
||
|
# decompression. If it is not finished, we need to decompress more data
|
||
|
# until it is finished, to yield the actual number of compressed bytes
|
||
|
# belonging to the decompressed object
|
||
|
# We are using a custom zlib module for this, if its not present,
|
||
|
# we try to put in additional bytes up for decompression if feasible
|
||
|
# and check for the unused_data.
|
||
|
|
||
|
# Only scrub the stream forward if we are officially done with the
|
||
|
# bytes we were to have.
|
||
|
if self._br == self._s and not self._zip.unused_data:
|
||
|
# manipulate the bytes-read to allow our own read method to continue
|
||
|
# but keep the window at its current position
|
||
|
self._br = 0
|
||
|
if hasattr(self._zip, 'status'):
|
||
|
while self._zip.status == zlib.Z_OK:
|
||
|
self.read(mmap.PAGESIZE)
|
||
|
# END scrub-loop custom zlib
|
||
|
else:
|
||
|
# pass in additional pages, until we have unused data
|
||
|
while not self._zip.unused_data and self._cbr != len(self._m):
|
||
|
self.read(mmap.PAGESIZE)
|
||
|
# END scrub-loop default zlib
|
||
|
# END handle stream scrubbing
|
||
|
|
||
|
# reset bytes read, just to be sure
|
||
|
self._br = self._s
|
||
|
# END handle stream scrubbing
|
||
|
|
||
|
# unused data ends up in the unconsumed tail, which was removed
|
||
|
# from the count already
|
||
|
return self._cbr
|
||
|
|
||
|
#} END interface
|
||
|
|
||
|
def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)):
|
||
|
"""Allows to reset the stream to restart reading
|
||
|
:raise ValueError: If offset and whence are not 0"""
|
||
|
if offset != 0 or whence != getattr(os, 'SEEK_SET', 0):
|
||
|
raise ValueError("Can only seek to position 0")
|
||
|
# END handle offset
|
||
|
|
||
|
self._zip = zlib.decompressobj()
|
||
|
self._br = self._cws = self._cwe = self._cbr = 0
|
||
|
if self._phi:
|
||
|
self._phi = False
|
||
|
del(self._s) # trigger header parsing on first access
|
||
|
# END skip header
|
||
|
|
||
|
def read(self, size=-1):
|
||
|
if size < 1:
|
||
|
size = self._s - self._br
|
||
|
else:
|
||
|
size = min(size, self._s - self._br)
|
||
|
# END clamp size
|
||
|
|
||
|
if size == 0:
|
||
|
return bytes()
|
||
|
# END handle depletion
|
||
|
|
||
|
# deplete the buffer, then just continue using the decompress object
|
||
|
# which has an own buffer. We just need this to transparently parse the
|
||
|
# header from the zlib stream
|
||
|
dat = bytes()
|
||
|
if self._buf:
|
||
|
if self._buflen >= size:
|
||
|
# have enough data
|
||
|
dat = self._buf.read(size)
|
||
|
self._buflen -= size
|
||
|
self._br += size
|
||
|
return dat
|
||
|
else:
|
||
|
dat = self._buf.read() # ouch, duplicates data
|
||
|
size -= self._buflen
|
||
|
self._br += self._buflen
|
||
|
|
||
|
self._buflen = 0
|
||
|
self._buf = None
|
||
|
# END handle buffer len
|
||
|
# END handle buffer
|
||
|
|
||
|
# decompress some data
|
||
|
# Abstract: zlib needs to operate on chunks of our memory map ( which may
|
||
|
# be large ), as it will otherwise and always fill in the 'unconsumed_tail'
|
||
|
# attribute which possible reads our whole map to the end, forcing
|
||
|
# everything to be read from disk even though just a portion was requested.
|
||
|
# As this would be a nogo, we workaround it by passing only chunks of data,
|
||
|
# moving the window into the memory map along as we decompress, which keeps
|
||
|
# the tail smaller than our chunk-size. This causes 'only' the chunk to be
|
||
|
# copied once, and another copy of a part of it when it creates the unconsumed
|
||
|
# tail. We have to use it to hand in the appropriate amount of bytes during
|
||
|
# the next read.
|
||
|
tail = self._zip.unconsumed_tail
|
||
|
if tail:
|
||
|
# move the window, make it as large as size demands. For code-clarity,
|
||
|
# we just take the chunk from our map again instead of reusing the unconsumed
|
||
|
# tail. The latter one would safe some memory copying, but we could end up
|
||
|
# with not getting enough data uncompressed, so we had to sort that out as well.
|
||
|
# Now we just assume the worst case, hence the data is uncompressed and the window
|
||
|
# needs to be as large as the uncompressed bytes we want to read.
|
||
|
self._cws = self._cwe - len(tail)
|
||
|
self._cwe = self._cws + size
|
||
|
else:
|
||
|
cws = self._cws
|
||
|
self._cws = self._cwe
|
||
|
self._cwe = cws + size
|
||
|
# END handle tail
|
||
|
|
||
|
# if window is too small, make it larger so zip can decompress something
|
||
|
if self._cwe - self._cws < 8:
|
||
|
self._cwe = self._cws + 8
|
||
|
# END adjust winsize
|
||
|
|
||
|
# takes a slice, but doesn't copy the data, it says ...
|
||
|
indata = buffer(self._m, self._cws, self._cwe - self._cws)
|
||
|
|
||
|
# get the actual window end to be sure we don't use it for computations
|
||
|
self._cwe = self._cws + len(indata)
|
||
|
dcompdat = self._zip.decompress(indata, size)
|
||
|
# update the amount of compressed bytes read
|
||
|
# We feed possibly overlapping chunks, which is why the unconsumed tail
|
||
|
# has to be taken into consideration, as well as the unused data
|
||
|
# if we hit the end of the stream
|
||
|
# NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
|
||
|
# They are thorough, and I assume it is truly working.
|
||
|
# Why is this logic as convoluted as it is ? Please look at the table in
|
||
|
# https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
|
||
|
# Bascially, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
|
||
|
# will be the one that works.
|
||
|
# However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
|
||
|
# table in the github issue. This is it ... it was the only way I could make this work everywhere.
|
||
|
# IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
|
||
|
if PY26 or ((zlib.ZLIB_VERSION == '1.2.7' or zlib.ZLIB_VERSION == '1.2.5') and not sys.platform == 'darwin'):
|
||
|
unused_datalen = len(self._zip.unconsumed_tail)
|
||
|
else:
|
||
|
unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
|
||
|
# # end handle very special case ...
|
||
|
|
||
|
self._cbr += len(indata) - unused_datalen
|
||
|
self._br += len(dcompdat)
|
||
|
|
||
|
if dat:
|
||
|
dcompdat = dat + dcompdat
|
||
|
# END prepend our cached data
|
||
|
|
||
|
# it can happen, depending on the compression, that we get less bytes
|
||
|
# than ordered as it needs the final portion of the data as well.
|
||
|
# Recursively resolve that.
|
||
|
# Note: dcompdat can be empty even though we still appear to have bytes
|
||
|
# to read, if we are called by compressed_bytes_read - it manipulates
|
||
|
# us to empty the stream
|
||
|
if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s:
|
||
|
dcompdat += self.read(size - len(dcompdat))
|
||
|
# END handle special case
|
||
|
return dcompdat
|
||
|
|
||
|
|
||
|
class DeltaApplyReader(LazyMixin):
|
||
|
|
||
|
"""A reader which dynamically applies pack deltas to a base object, keeping the
|
||
|
memory demands to a minimum.
|
||
|
|
||
|
The size of the final object is only obtainable once all deltas have been
|
||
|
applied, unless it is retrieved from a pack index.
|
||
|
|
||
|
The uncompressed Delta has the following layout (MSB being a most significant
|
||
|
bit encoded dynamic size):
|
||
|
|
||
|
* MSB Source Size - the size of the base against which the delta was created
|
||
|
* MSB Target Size - the size of the resulting data after the delta was applied
|
||
|
* A list of one byte commands (cmd) which are followed by a specific protocol:
|
||
|
|
||
|
* cmd & 0x80 - copy delta_data[offset:offset+size]
|
||
|
|
||
|
* Followed by an encoded offset into the delta data
|
||
|
* Followed by an encoded size of the chunk to copy
|
||
|
|
||
|
* cmd & 0x7f - insert
|
||
|
|
||
|
* insert cmd bytes from the delta buffer into the output stream
|
||
|
|
||
|
* cmd == 0 - invalid operation ( or error in delta stream )
|
||
|
"""
|
||
|
__slots__ = (
|
||
|
"_bstream", # base stream to which to apply the deltas
|
||
|
"_dstreams", # tuple of delta stream readers
|
||
|
"_mm_target", # memory map of the delta-applied data
|
||
|
"_size", # actual number of bytes in _mm_target
|
||
|
"_br" # number of bytes read
|
||
|
)
|
||
|
|
||
|
#{ Configuration
|
||
|
k_max_memory_move = 250 * 1000 * 1000
|
||
|
#} END configuration
|
||
|
|
||
|
def __init__(self, stream_list):
|
||
|
"""Initialize this instance with a list of streams, the first stream being
|
||
|
the delta to apply on top of all following deltas, the last stream being the
|
||
|
base object onto which to apply the deltas"""
|
||
|
assert len(stream_list) > 1, "Need at least one delta and one base stream"
|
||
|
|
||
|
self._bstream = stream_list[-1]
|
||
|
self._dstreams = tuple(stream_list[:-1])
|
||
|
self._br = 0
|
||
|
|
||
|
def _set_cache_too_slow_without_c(self, attr):
|
||
|
# the direct algorithm is fastest and most direct if there is only one
|
||
|
# delta. Also, the extra overhead might not be worth it for items smaller
|
||
|
# than X - definitely the case in python, every function call costs
|
||
|
# huge amounts of time
|
||
|
# if len(self._dstreams) * self._bstream.size < self.k_max_memory_move:
|
||
|
if len(self._dstreams) == 1:
|
||
|
return self._set_cache_brute_(attr)
|
||
|
|
||
|
# Aggregate all deltas into one delta in reverse order. Hence we take
|
||
|
# the last delta, and reverse-merge its ancestor delta, until we receive
|
||
|
# the final delta data stream.
|
||
|
dcl = connect_deltas(self._dstreams)
|
||
|
|
||
|
# call len directly, as the (optional) c version doesn't implement the sequence
|
||
|
# protocol
|
||
|
if dcl.rbound() == 0:
|
||
|
self._size = 0
|
||
|
self._mm_target = allocate_memory(0)
|
||
|
return
|
||
|
# END handle empty list
|
||
|
|
||
|
self._size = dcl.rbound()
|
||
|
self._mm_target = allocate_memory(self._size)
|
||
|
|
||
|
bbuf = allocate_memory(self._bstream.size)
|
||
|
stream_copy(self._bstream.read, bbuf.write, self._bstream.size, 256 * mmap.PAGESIZE)
|
||
|
|
||
|
# APPLY CHUNKS
|
||
|
write = self._mm_target.write
|
||
|
dcl.apply(bbuf, write)
|
||
|
|
||
|
self._mm_target.seek(0)
|
||
|
|
||
|
def _set_cache_brute_(self, attr):
|
||
|
"""If we are here, we apply the actual deltas"""
|
||
|
# TODO: There should be a special case if there is only one stream
|
||
|
# Then the default-git algorithm should perform a tad faster, as the
|
||
|
# delta is not peaked into, causing less overhead.
|
||
|
buffer_info_list = list()
|
||
|
max_target_size = 0
|
||
|
for dstream in self._dstreams:
|
||
|
buf = dstream.read(512) # read the header information + X
|
||
|
offset, src_size = msb_size(buf)
|
||
|
offset, target_size = msb_size(buf, offset)
|
||
|
buffer_info_list.append((buffer(buf, offset), offset, src_size, target_size))
|
||
|
max_target_size = max(max_target_size, target_size)
|
||
|
# END for each delta stream
|
||
|
|
||
|
# sanity check - the first delta to apply should have the same source
|
||
|
# size as our actual base stream
|
||
|
base_size = self._bstream.size
|
||
|
target_size = max_target_size
|
||
|
|
||
|
# if we have more than 1 delta to apply, we will swap buffers, hence we must
|
||
|
# assure that all buffers we use are large enough to hold all the results
|
||
|
if len(self._dstreams) > 1:
|
||
|
base_size = target_size = max(base_size, max_target_size)
|
||
|
# END adjust buffer sizes
|
||
|
|
||
|
# Allocate private memory map big enough to hold the first base buffer
|
||
|
# We need random access to it
|
||
|
bbuf = allocate_memory(base_size)
|
||
|
stream_copy(self._bstream.read, bbuf.write, base_size, 256 * mmap.PAGESIZE)
|
||
|
|
||
|
# allocate memory map large enough for the largest (intermediate) target
|
||
|
# We will use it as scratch space for all delta ops. If the final
|
||
|
# target buffer is smaller than our allocated space, we just use parts
|
||
|
# of it upon return.
|
||
|
tbuf = allocate_memory(target_size)
|
||
|
|
||
|
# for each delta to apply, memory map the decompressed delta and
|
||
|
# work on the op-codes to reconstruct everything.
|
||
|
# For the actual copying, we use a seek and write pattern of buffer
|
||
|
# slices.
|
||
|
final_target_size = None
|
||
|
for (dbuf, offset, src_size, target_size), dstream in zip(reversed(buffer_info_list), reversed(self._dstreams)):
|
||
|
# allocate a buffer to hold all delta data - fill in the data for
|
||
|
# fast access. We do this as we know that reading individual bytes
|
||
|
# from our stream would be slower than necessary ( although possible )
|
||
|
# The dbuf buffer contains commands after the first two MSB sizes, the
|
||
|
# offset specifies the amount of bytes read to get the sizes.
|
||
|
ddata = allocate_memory(dstream.size - offset)
|
||
|
ddata.write(dbuf)
|
||
|
# read the rest from the stream. The size we give is larger than necessary
|
||
|
stream_copy(dstream.read, ddata.write, dstream.size, 256 * mmap.PAGESIZE)
|
||
|
|
||
|
#######################################################################
|
||
|
if 'c_apply_delta' in globals():
|
||
|
c_apply_delta(bbuf, ddata, tbuf)
|
||
|
else:
|
||
|
apply_delta_data(bbuf, src_size, ddata, len(ddata), tbuf.write)
|
||
|
#######################################################################
|
||
|
|
||
|
# finally, swap out source and target buffers. The target is now the
|
||
|
# base for the next delta to apply
|
||
|
bbuf, tbuf = tbuf, bbuf
|
||
|
bbuf.seek(0)
|
||
|
tbuf.seek(0)
|
||
|
final_target_size = target_size
|
||
|
# END for each delta to apply
|
||
|
|
||
|
# its already seeked to 0, constrain it to the actual size
|
||
|
# NOTE: in the end of the loop, it swaps buffers, hence our target buffer
|
||
|
# is not tbuf, but bbuf !
|
||
|
self._mm_target = bbuf
|
||
|
self._size = final_target_size
|
||
|
|
||
|
#{ Configuration
|
||
|
if not has_perf_mod:
|
||
|
_set_cache_ = _set_cache_brute_
|
||
|
else:
|
||
|
_set_cache_ = _set_cache_too_slow_without_c
|
||
|
|
||
|
#} END configuration
|
||
|
|
||
|
def read(self, count=0):
|
||
|
bl = self._size - self._br # bytes left
|
||
|
if count < 1 or count > bl:
|
||
|
count = bl
|
||
|
# NOTE: we could check for certain size limits, and possibly
|
||
|
# return buffers instead of strings to prevent byte copying
|
||
|
data = self._mm_target.read(count)
|
||
|
self._br += len(data)
|
||
|
return data
|
||
|
|
||
|
def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)):
|
||
|
"""Allows to reset the stream to restart reading
|
||
|
|
||
|
:raise ValueError: If offset and whence are not 0"""
|
||
|
if offset != 0 or whence != getattr(os, 'SEEK_SET', 0):
|
||
|
raise ValueError("Can only seek to position 0")
|
||
|
# END handle offset
|
||
|
self._br = 0
|
||
|
self._mm_target.seek(0)
|
||
|
|
||
|
#{ Interface
|
||
|
|
||
|
@classmethod
|
||
|
def new(cls, stream_list):
|
||
|
"""
|
||
|
Convert the given list of streams into a stream which resolves deltas
|
||
|
when reading from it.
|
||
|
|
||
|
:param stream_list: two or more stream objects, first stream is a Delta
|
||
|
to the object that you want to resolve, followed by N additional delta
|
||
|
streams. The list's last stream must be a non-delta stream.
|
||
|
|
||
|
:return: Non-Delta OPackStream object whose stream can be used to obtain
|
||
|
the decompressed resolved data
|
||
|
:raise ValueError: if the stream list cannot be handled"""
|
||
|
if len(stream_list) < 2:
|
||
|
raise ValueError("Need at least two streams")
|
||
|
# END single object special handling
|
||
|
|
||
|
if stream_list[-1].type_id in delta_types:
|
||
|
raise ValueError(
|
||
|
"Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type)
|
||
|
# END check stream
|
||
|
return cls(stream_list)
|
||
|
|
||
|
#} END interface
|
||
|
|
||
|
#{ OInfo like Interface
|
||
|
|
||
|
@property
|
||
|
def type(self):
|
||
|
return self._bstream.type
|
||
|
|
||
|
@property
|
||
|
def type_id(self):
|
||
|
return self._bstream.type_id
|
||
|
|
||
|
@property
|
||
|
def size(self):
|
||
|
""":return: number of uncompressed bytes in the stream"""
|
||
|
return self._size
|
||
|
|
||
|
#} END oinfo like interface
|
||
|
|
||
|
|
||
|
#} END RO streams
|
||
|
|
||
|
|
||
|
#{ W Streams
|
||
|
|
||
|
class Sha1Writer(object):
|
||
|
|
||
|
"""Simple stream writer which produces a sha whenever you like as it degests
|
||
|
everything it is supposed to write"""
|
||
|
__slots__ = "sha1"
|
||
|
|
||
|
def __init__(self):
|
||
|
self.sha1 = make_sha()
|
||
|
|
||
|
#{ Stream Interface
|
||
|
|
||
|
def write(self, data):
|
||
|
""":raise IOError: If not all bytes could be written
|
||
|
:param data: byte object
|
||
|
:return: length of incoming data"""
|
||
|
|
||
|
self.sha1.update(data)
|
||
|
|
||
|
return len(data)
|
||
|
|
||
|
# END stream interface
|
||
|
|
||
|
#{ Interface
|
||
|
|
||
|
def sha(self, as_hex=False):
|
||
|
""":return: sha so far
|
||
|
:param as_hex: if True, sha will be hex-encoded, binary otherwise"""
|
||
|
if as_hex:
|
||
|
return self.sha1.hexdigest()
|
||
|
return self.sha1.digest()
|
||
|
|
||
|
#} END interface
|
||
|
|
||
|
|
||
|
class FlexibleSha1Writer(Sha1Writer):
|
||
|
|
||
|
"""Writer producing a sha1 while passing on the written bytes to the given
|
||
|
write function"""
|
||
|
__slots__ = 'writer'
|
||
|
|
||
|
def __init__(self, writer):
|
||
|
Sha1Writer.__init__(self)
|
||
|
self.writer = writer
|
||
|
|
||
|
def write(self, data):
|
||
|
Sha1Writer.write(self, data)
|
||
|
self.writer(data)
|
||
|
|
||
|
|
||
|
class ZippedStoreShaWriter(Sha1Writer):
|
||
|
|
||
|
"""Remembers everything someone writes to it and generates a sha"""
|
||
|
__slots__ = ('buf', 'zip')
|
||
|
|
||
|
def __init__(self):
|
||
|
Sha1Writer.__init__(self)
|
||
|
self.buf = BytesIO()
|
||
|
self.zip = zlib.compressobj(zlib.Z_BEST_SPEED)
|
||
|
|
||
|
def __getattr__(self, attr):
|
||
|
return getattr(self.buf, attr)
|
||
|
|
||
|
def write(self, data):
|
||
|
alen = Sha1Writer.write(self, data)
|
||
|
self.buf.write(self.zip.compress(data))
|
||
|
|
||
|
return alen
|
||
|
|
||
|
def close(self):
|
||
|
self.buf.write(self.zip.flush())
|
||
|
|
||
|
def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)):
|
||
|
"""Seeking currently only supports to rewind written data
|
||
|
Multiple writes are not supported"""
|
||
|
if offset != 0 or whence != getattr(os, 'SEEK_SET', 0):
|
||
|
raise ValueError("Can only seek to position 0")
|
||
|
# END handle offset
|
||
|
self.buf.seek(0)
|
||
|
|
||
|
def getvalue(self):
|
||
|
""":return: string value from the current stream position to the end"""
|
||
|
return self.buf.getvalue()
|
||
|
|
||
|
|
||
|
class FDCompressedSha1Writer(Sha1Writer):
|
||
|
|
||
|
"""Digests data written to it, making the sha available, then compress the
|
||
|
data and write it to the file descriptor
|
||
|
|
||
|
**Note:** operates on raw file descriptors
|
||
|
**Note:** for this to work, you have to use the close-method of this instance"""
|
||
|
__slots__ = ("fd", "sha1", "zip")
|
||
|
|
||
|
# default exception
|
||
|
exc = IOError("Failed to write all bytes to filedescriptor")
|
||
|
|
||
|
def __init__(self, fd):
|
||
|
super(FDCompressedSha1Writer, self).__init__()
|
||
|
self.fd = fd
|
||
|
self.zip = zlib.compressobj(zlib.Z_BEST_SPEED)
|
||
|
|
||
|
#{ Stream Interface
|
||
|
|
||
|
def write(self, data):
|
||
|
""":raise IOError: If not all bytes could be written
|
||
|
:return: length of incoming data"""
|
||
|
self.sha1.update(data)
|
||
|
cdata = self.zip.compress(data)
|
||
|
bytes_written = write(self.fd, cdata)
|
||
|
|
||
|
if bytes_written != len(cdata):
|
||
|
raise self.exc
|
||
|
|
||
|
return len(data)
|
||
|
|
||
|
def close(self):
|
||
|
remainder = self.zip.flush()
|
||
|
if write(self.fd, remainder) != len(remainder):
|
||
|
raise self.exc
|
||
|
return close(self.fd)
|
||
|
|
||
|
#} END stream interface
|
||
|
|
||
|
|
||
|
class FDStream(object):
|
||
|
|
||
|
"""A simple wrapper providing the most basic functions on a file descriptor
|
||
|
with the fileobject interface. Cannot use os.fdopen as the resulting stream
|
||
|
takes ownership"""
|
||
|
__slots__ = ("_fd", '_pos')
|
||
|
|
||
|
def __init__(self, fd):
|
||
|
self._fd = fd
|
||
|
self._pos = 0
|
||
|
|
||
|
def write(self, data):
|
||
|
self._pos += len(data)
|
||
|
os.write(self._fd, data)
|
||
|
|
||
|
def read(self, count=0):
|
||
|
if count == 0:
|
||
|
count = os.path.getsize(self._filepath)
|
||
|
# END handle read everything
|
||
|
|
||
|
bytes = os.read(self._fd, count)
|
||
|
self._pos += len(bytes)
|
||
|
return bytes
|
||
|
|
||
|
def fileno(self):
|
||
|
return self._fd
|
||
|
|
||
|
def tell(self):
|
||
|
return self._pos
|
||
|
|
||
|
def close(self):
|
||
|
close(self._fd)
|
||
|
|
||
|
|
||
|
class NullStream(object):
|
||
|
|
||
|
"""A stream that does nothing but providing a stream interface.
|
||
|
Use it like /dev/null"""
|
||
|
__slots__ = tuple()
|
||
|
|
||
|
def read(self, size=0):
|
||
|
return ''
|
||
|
|
||
|
def close(self):
|
||
|
pass
|
||
|
|
||
|
def write(self, data):
|
||
|
return len(data)
|
||
|
|
||
|
|
||
|
#} END W streams
|