author    | morpheus65535 <[email protected]> | 2018-09-16 20:27:00 -0400
committer | morpheus65535 <[email protected]> | 2018-09-16 20:33:04 -0400
commit    | 0f061f21226f91883c841f85ceef31b30981277a (patch)
tree      | a1350723ae688ccbae4d4ca564cc4175ccc73996 /libs/gitdb
parent    | 8b681d8a151a3b41d3aaa5bfdd7a082bdda7896c (diff)
download  | bazarr-0f061f21226f91883c841f85ceef31b30981277a.tar.gz
          | bazarr-0f061f21226f91883c841f85ceef31b30981277a.zip
Include dependencies and remove requirements.txt
Diffstat (limited to 'libs/gitdb')
-rw-r--r-- | libs/gitdb/__init__.py | 39
-rw-r--r-- | libs/gitdb/base.py | 315
-rw-r--r-- | libs/gitdb/const.py | 4
-rw-r--r-- | libs/gitdb/db/__init__.py | 11
-rw-r--r-- | libs/gitdb/db/base.py | 273
-rw-r--r-- | libs/gitdb/db/git.py | 85
-rw-r--r-- | libs/gitdb/db/loose.py | 262
-rw-r--r-- | libs/gitdb/db/mem.py | 112
-rw-r--r-- | libs/gitdb/db/pack.py | 207
-rw-r--r-- | libs/gitdb/db/ref.py | 82
-rw-r--r-- | libs/gitdb/exc.py | 46
-rw-r--r-- | libs/gitdb/fun.py | 781
-rw-r--r-- | libs/gitdb/pack.py | 1033
-rw-r--r-- | libs/gitdb/stream.py | 732
-rw-r--r-- | libs/gitdb/test/__init__.py | 4
-rw-r--r-- | libs/gitdb/test/lib.py | 208
-rw-r--r-- | libs/gitdb/test/test_base.py | 105
-rw-r--r-- | libs/gitdb/test/test_example.py | 43
-rw-r--r-- | libs/gitdb/test/test_pack.py | 255
-rw-r--r-- | libs/gitdb/test/test_stream.py | 164
-rw-r--r-- | libs/gitdb/test/test_util.py | 100
-rw-r--r-- | libs/gitdb/typ.py | 10
-rw-r--r-- | libs/gitdb/util.py | 401
-rw-r--r-- | libs/gitdb/utils/__init__.py | 0
-rw-r--r-- | libs/gitdb/utils/compat.py | 43
-rw-r--r-- | libs/gitdb/utils/encoding.py | 31
26 files changed, 5346 insertions, 0 deletions
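
The files added below are the upstream gitdb 2.0.4 object-database library, vendored wholesale. As a rough illustration of the API this commit pulls in (not part of the diff itself), reading and writing loose objects through the top-level `GitDB` class might look like the following sketch; the repository path is a placeholder and assumes a standard `.git/objects` layout.

```python
from io import BytesIO

from gitdb import GitDB, IStream

# Open a git-style object database rooted at the 'objects' directory
# (placeholder path -- point this at a real repository).
odb = GitDB("/path/to/repo/.git/objects")

# Write: wrap raw content in an IStream; store() fills in the binary sha.
data = b"hello gitdb"
istream = odb.store(IStream(b"blob", len(data), BytesIO(data)))
print(istream.hexsha.decode("ascii"))  # 40-character hex sha of the stored blob

# Read back: query methods take the 20-byte binary sha.
info = odb.info(istream.binsha)      # OInfo: binsha, type, size
stream = odb.stream(istream.binsha)  # OStream: same fields plus read()
assert info.type == b"blob" and stream.read() == data
```
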
diff --git a/libs/gitdb/__init__.py b/libs/gitdb/__init__.py new file mode 100644 index 000000000..344c3de01 --- /dev/null +++ b/libs/gitdb/__init__.py @@ -0,0 +1,39 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Initialize the object database module""" + +import sys +import os + +#{ Initialization + + +def _init_externals(): + """Initialize external projects by putting them into the path""" + for module in ('smmap',): + sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', module)) + + try: + __import__(module) + except ImportError: + raise ImportError("'%s' could not be imported, assure it is located in your PYTHONPATH" % module) + # END verify import + # END handel imports + +#} END initialization + +_init_externals() + +__author__ = "Sebastian Thiel" +__contact__ = "[email protected]" +__homepage__ = "https://github.com/gitpython-developers/gitdb" +version_info = (2, 0, 4) +__version__ = '.'.join(str(i) for i in version_info) + + +# default imports +from gitdb.base import * +from gitdb.db import * +from gitdb.stream import * diff --git a/libs/gitdb/base.py b/libs/gitdb/base.py new file mode 100644 index 000000000..42e71d0fa --- /dev/null +++ b/libs/gitdb/base.py @@ -0,0 +1,315 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Module with basic data structures - they are designed to be lightweight and fast""" +from gitdb.util import bin_to_hex + +from gitdb.fun import ( + type_id_to_type_map, + type_to_type_id_map +) + +__all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo', + 'OStream', 'OPackStream', 'ODeltaPackStream', + 'IStream', 'InvalidOInfo', 'InvalidOStream') + +#{ ODB Bases + + +class OInfo(tuple): + + """Carries information about an object in an ODB, providing information + about the binary sha of the object, the type_string as well as the uncompressed size + in bytes. + + It can be accessed using tuple notation and using attribute access notation:: + + assert dbi[0] == dbi.binsha + assert dbi[1] == dbi.type + assert dbi[2] == dbi.size + + The type is designed to be as lightweight as possible.""" + __slots__ = tuple() + + def __new__(cls, sha, type, size): + return tuple.__new__(cls, (sha, type, size)) + + def __init__(self, *args): + tuple.__init__(self) + + #{ Interface + @property + def binsha(self): + """:return: our sha as binary, 20 bytes""" + return self[0] + + @property + def hexsha(self): + """:return: our sha, hex encoded, 40 bytes""" + return bin_to_hex(self[0]) + + @property + def type(self): + return self[1] + + @property + def type_id(self): + return type_to_type_id_map[self[1]] + + @property + def size(self): + return self[2] + #} END interface + + +class OPackInfo(tuple): + + """As OInfo, but provides a type_id property to retrieve the numerical type id, and + does not include a sha. + + Additionally, the pack_offset is the absolute offset into the packfile at which + all object information is located. 
The data_offset property points to the absolute + location in the pack at which that actual data stream can be found.""" + __slots__ = tuple() + + def __new__(cls, packoffset, type, size): + return tuple.__new__(cls, (packoffset, type, size)) + + def __init__(self, *args): + tuple.__init__(self) + + #{ Interface + + @property + def pack_offset(self): + return self[0] + + @property + def type(self): + return type_id_to_type_map[self[1]] + + @property + def type_id(self): + return self[1] + + @property + def size(self): + return self[2] + + #} END interface + + +class ODeltaPackInfo(OPackInfo): + + """Adds delta specific information, + Either the 20 byte sha which points to some object in the database, + or the negative offset from the pack_offset, so that pack_offset - delta_info yields + the pack offset of the base object""" + __slots__ = tuple() + + def __new__(cls, packoffset, type, size, delta_info): + return tuple.__new__(cls, (packoffset, type, size, delta_info)) + + #{ Interface + @property + def delta_info(self): + return self[3] + #} END interface + + +class OStream(OInfo): + + """Base for object streams retrieved from the database, providing additional + information about the stream. + Generally, ODB streams are read-only as objects are immutable""" + __slots__ = tuple() + + def __new__(cls, sha, type, size, stream, *args, **kwargs): + """Helps with the initialization of subclasses""" + return tuple.__new__(cls, (sha, type, size, stream)) + + def __init__(self, *args, **kwargs): + tuple.__init__(self) + + #{ Stream Reader Interface + + def read(self, size=-1): + return self[3].read(size) + + @property + def stream(self): + return self[3] + + #} END stream reader interface + + +class ODeltaStream(OStream): + + """Uses size info of its stream, delaying reads""" + + def __new__(cls, sha, type, size, stream, *args, **kwargs): + """Helps with the initialization of subclasses""" + return tuple.__new__(cls, (sha, type, size, stream)) + + #{ Stream Reader Interface + + @property + def size(self): + return self[3].size + + #} END stream reader interface + + +class OPackStream(OPackInfo): + + """Next to pack object information, a stream outputting an undeltified base object + is provided""" + __slots__ = tuple() + + def __new__(cls, packoffset, type, size, stream, *args): + """Helps with the initialization of subclasses""" + return tuple.__new__(cls, (packoffset, type, size, stream)) + + #{ Stream Reader Interface + def read(self, size=-1): + return self[3].read(size) + + @property + def stream(self): + return self[3] + #} END stream reader interface + + +class ODeltaPackStream(ODeltaPackInfo): + + """Provides a stream outputting the uncompressed offset delta information""" + __slots__ = tuple() + + def __new__(cls, packoffset, type, size, delta_info, stream): + return tuple.__new__(cls, (packoffset, type, size, delta_info, stream)) + + #{ Stream Reader Interface + def read(self, size=-1): + return self[4].read(size) + + @property + def stream(self): + return self[4] + #} END stream reader interface + + +class IStream(list): + + """Represents an input content stream to be fed into the ODB. It is mutable to allow + the ODB to record information about the operations outcome right in this instance. + + It provides interfaces for the OStream and a StreamReader to allow the instance + to blend in without prior conversion. 
+ + The only method your content stream must support is 'read'""" + __slots__ = tuple() + + def __new__(cls, type, size, stream, sha=None): + return list.__new__(cls, (sha, type, size, stream, None)) + + def __init__(self, type, size, stream, sha=None): + list.__init__(self, (sha, type, size, stream, None)) + + #{ Interface + @property + def hexsha(self): + """:return: our sha, hex encoded, 40 bytes""" + return bin_to_hex(self[0]) + + def _error(self): + """:return: the error that occurred when processing the stream, or None""" + return self[4] + + def _set_error(self, exc): + """Set this input stream to the given exc, may be None to reset the error""" + self[4] = exc + + error = property(_error, _set_error) + + #} END interface + + #{ Stream Reader Interface + + def read(self, size=-1): + """Implements a simple stream reader interface, passing the read call on + to our internal stream""" + return self[3].read(size) + + #} END stream reader interface + + #{ interface + + def _set_binsha(self, binsha): + self[0] = binsha + + def _binsha(self): + return self[0] + + binsha = property(_binsha, _set_binsha) + + def _type(self): + return self[1] + + def _set_type(self, type): + self[1] = type + + type = property(_type, _set_type) + + def _size(self): + return self[2] + + def _set_size(self, size): + self[2] = size + + size = property(_size, _set_size) + + def _stream(self): + return self[3] + + def _set_stream(self, stream): + self[3] = stream + + stream = property(_stream, _set_stream) + + #} END odb info interface + + +class InvalidOInfo(tuple): + + """Carries information about a sha identifying an object which is invalid in + the queried database. The exception attribute provides more information about + the cause of the issue""" + __slots__ = tuple() + + def __new__(cls, sha, exc): + return tuple.__new__(cls, (sha, exc)) + + def __init__(self, sha, exc): + tuple.__init__(self, (sha, exc)) + + @property + def binsha(self): + return self[0] + + @property + def hexsha(self): + return bin_to_hex(self[0]) + + @property + def error(self): + """:return: exception instance explaining the failure""" + return self[1] + + +class InvalidOStream(InvalidOInfo): + + """Carries information about an invalid ODB stream""" + __slots__ = tuple() + +#} END ODB Bases diff --git a/libs/gitdb/const.py b/libs/gitdb/const.py new file mode 100644 index 000000000..6391d796f --- /dev/null +++ b/libs/gitdb/const.py @@ -0,0 +1,4 @@ +BYTE_SPACE = b' ' +NULL_BYTE = b'\0' +NULL_HEX_SHA = "0" * 40 +NULL_BIN_SHA = NULL_BYTE * 20 diff --git a/libs/gitdb/db/__init__.py b/libs/gitdb/db/__init__.py new file mode 100644 index 000000000..0a2a46a64 --- /dev/null +++ b/libs/gitdb/db/__init__.py @@ -0,0 +1,11 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php + +from gitdb.db.base import * +from gitdb.db.loose import * +from gitdb.db.mem import * +from gitdb.db.pack import * +from gitdb.db.git import * +from gitdb.db.ref import * diff --git a/libs/gitdb/db/base.py b/libs/gitdb/db/base.py new file mode 100644 index 000000000..2d7b9fa8d --- /dev/null +++ b/libs/gitdb/db/base.py @@ -0,0 +1,273 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Contains implementations of database retrieveing objects""" +from 
gitdb.util import ( + join, + LazyMixin, + hex_to_bin +) + +from gitdb.utils.encoding import force_text +from gitdb.exc import ( + BadObject, + AmbiguousObjectName +) + +from itertools import chain +from functools import reduce + + +__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'CompoundDB', 'CachingDB') + + +class ObjectDBR(object): + + """Defines an interface for object database lookup. + Objects are identified either by their 20 byte bin sha""" + + def __contains__(self, sha): + return self.has_obj + + #{ Query Interface + def has_object(self, sha): + """ + :return: True if the object identified by the given 20 bytes + binary sha is contained in the database""" + raise NotImplementedError("To be implemented in subclass") + + def info(self, sha): + """ :return: OInfo instance + :param sha: bytes binary sha + :raise BadObject:""" + raise NotImplementedError("To be implemented in subclass") + + def stream(self, sha): + """:return: OStream instance + :param sha: 20 bytes binary sha + :raise BadObject:""" + raise NotImplementedError("To be implemented in subclass") + + def size(self): + """:return: amount of objects in this database""" + raise NotImplementedError() + + def sha_iter(self): + """Return iterator yielding 20 byte shas for all objects in this data base""" + raise NotImplementedError() + + #} END query interface + + +class ObjectDBW(object): + + """Defines an interface to create objects in the database""" + + def __init__(self, *args, **kwargs): + self._ostream = None + + #{ Edit Interface + def set_ostream(self, stream): + """ + Adjusts the stream to which all data should be sent when storing new objects + + :param stream: if not None, the stream to use, if None the default stream + will be used. + :return: previously installed stream, or None if there was no override + :raise TypeError: if the stream doesn't have the supported functionality""" + cstream = self._ostream + self._ostream = stream + return cstream + + def ostream(self): + """ + :return: overridden output stream this instance will write to, or None + if it will write to the default stream""" + return self._ostream + + def store(self, istream): + """ + Create a new object in the database + :return: the input istream object with its sha set to its corresponding value + + :param istream: IStream compatible instance. If its sha is already set + to a value, the object will just be stored in the our database format, + in which case the input stream is expected to be in object format ( header + contents ). 
+ :raise IOError: if data could not be written""" + raise NotImplementedError("To be implemented in subclass") + + #} END edit interface + + +class FileDBBase(object): + + """Provides basic facilities to retrieve files of interest, including + caching facilities to help mapping hexsha's to objects""" + + def __init__(self, root_path): + """Initialize this instance to look for its files at the given root path + All subsequent operations will be relative to this path + :raise InvalidDBRoot: + **Note:** The base will not perform any accessablity checking as the base + might not yet be accessible, but become accessible before the first + access.""" + super(FileDBBase, self).__init__() + self._root_path = root_path + + #{ Interface + def root_path(self): + """:return: path at which this db operates""" + return self._root_path + + def db_path(self, rela_path): + """ + :return: the given relative path relative to our database root, allowing + to pontentially access datafiles""" + return join(self._root_path, force_text(rela_path)) + #} END interface + + +class CachingDB(object): + + """A database which uses caches to speed-up access""" + + #{ Interface + def update_cache(self, force=False): + """ + Call this method if the underlying data changed to trigger an update + of the internal caching structures. + + :param force: if True, the update must be performed. Otherwise the implementation + may decide not to perform an update if it thinks nothing has changed. + :return: True if an update was performed as something change indeed""" + + # END interface + + +def _databases_recursive(database, output): + """Fill output list with database from db, in order. Deals with Loose, Packed + and compound databases.""" + if isinstance(database, CompoundDB): + dbs = database.databases() + output.extend(db for db in dbs if not isinstance(db, CompoundDB)) + for cdb in (db for db in dbs if isinstance(db, CompoundDB)): + _databases_recursive(cdb, output) + else: + output.append(database) + # END handle database type + + +class CompoundDB(ObjectDBR, LazyMixin, CachingDB): + + """A database which delegates calls to sub-databases. + + Databases are stored in the lazy-loaded _dbs attribute. 
+ Define _set_cache_ to update it with your databases""" + + def _set_cache_(self, attr): + if attr == '_dbs': + self._dbs = list() + elif attr == '_db_cache': + self._db_cache = dict() + else: + super(CompoundDB, self)._set_cache_(attr) + + def _db_query(self, sha): + """:return: database containing the given 20 byte sha + :raise BadObject:""" + # most databases use binary representations, prevent converting + # it every time a database is being queried + try: + return self._db_cache[sha] + except KeyError: + pass + # END first level cache + + for db in self._dbs: + if db.has_object(sha): + self._db_cache[sha] = db + return db + # END for each database + raise BadObject(sha) + + #{ ObjectDBR interface + + def has_object(self, sha): + try: + self._db_query(sha) + return True + except BadObject: + return False + # END handle exceptions + + def info(self, sha): + return self._db_query(sha).info(sha) + + def stream(self, sha): + return self._db_query(sha).stream(sha) + + def size(self): + """:return: total size of all contained databases""" + return reduce(lambda x, y: x + y, (db.size() for db in self._dbs), 0) + + def sha_iter(self): + return chain(*(db.sha_iter() for db in self._dbs)) + + #} END object DBR Interface + + #{ Interface + + def databases(self): + """:return: tuple of database instances we use for lookups""" + return tuple(self._dbs) + + def update_cache(self, force=False): + # something might have changed, clear everything + self._db_cache.clear() + stat = False + for db in self._dbs: + if isinstance(db, CachingDB): + stat |= db.update_cache(force) + # END if is caching db + # END for each database to update + return stat + + def partial_to_complete_sha_hex(self, partial_hexsha): + """ + :return: 20 byte binary sha1 from the given less-than-40 byte hexsha (bytes or str) + :param partial_hexsha: hexsha with less than 40 byte + :raise AmbiguousObjectName: """ + databases = list() + _databases_recursive(self, databases) + partial_hexsha = force_text(partial_hexsha) + len_partial_hexsha = len(partial_hexsha) + if len_partial_hexsha % 2 != 0: + partial_binsha = hex_to_bin(partial_hexsha + "0") + else: + partial_binsha = hex_to_bin(partial_hexsha) + # END assure successful binary conversion + + candidate = None + for db in databases: + full_bin_sha = None + try: + if hasattr(db, 'partial_to_complete_sha_hex'): + full_bin_sha = db.partial_to_complete_sha_hex(partial_hexsha) + else: + full_bin_sha = db.partial_to_complete_sha(partial_binsha, len_partial_hexsha) + # END handle database type + except BadObject: + continue + # END ignore bad objects + if full_bin_sha: + if candidate and candidate != full_bin_sha: + raise AmbiguousObjectName(partial_hexsha) + candidate = full_bin_sha + # END handle candidate + # END for each db + if not candidate: + raise BadObject(partial_binsha) + return candidate + + #} END interface diff --git a/libs/gitdb/db/git.py b/libs/gitdb/db/git.py new file mode 100644 index 000000000..7a43d7235 --- /dev/null +++ b/libs/gitdb/db/git.py @@ -0,0 +1,85 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +from gitdb.db.base import ( + CompoundDB, + ObjectDBW, + FileDBBase +) + +from gitdb.db.loose import LooseObjectDB +from gitdb.db.pack import PackedDB +from gitdb.db.ref import ReferenceDB + +from gitdb.exc import InvalidDBRoot + +import os + +__all__ = ('GitDB', ) + + +class GitDB(FileDBBase, ObjectDBW, 
CompoundDB): + + """A git-style object database, which contains all objects in the 'objects' + subdirectory + + ``IMPORTANT``: The usage of this implementation is highly discouraged as it fails to release file-handles. + This can be a problem with long-running processes and/or big repositories. + """ + # Configuration + PackDBCls = PackedDB + LooseDBCls = LooseObjectDB + ReferenceDBCls = ReferenceDB + + # Directories + packs_dir = 'pack' + loose_dir = '' + alternates_dir = os.path.join('info', 'alternates') + + def __init__(self, root_path): + """Initialize ourselves on a git objects directory""" + super(GitDB, self).__init__(root_path) + + def _set_cache_(self, attr): + if attr == '_dbs' or attr == '_loose_db': + self._dbs = list() + loose_db = None + for subpath, dbcls in ((self.packs_dir, self.PackDBCls), + (self.loose_dir, self.LooseDBCls), + (self.alternates_dir, self.ReferenceDBCls)): + path = self.db_path(subpath) + if os.path.exists(path): + self._dbs.append(dbcls(path)) + if dbcls is self.LooseDBCls: + loose_db = self._dbs[-1] + # END remember loose db + # END check path exists + # END for each db type + + # should have at least one subdb + if not self._dbs: + raise InvalidDBRoot(self.root_path()) + # END handle error + + # we the first one should have the store method + assert loose_db is not None and hasattr(loose_db, 'store'), "First database needs store functionality" + + # finally set the value + self._loose_db = loose_db + else: + super(GitDB, self)._set_cache_(attr) + # END handle attrs + + #{ ObjectDBW interface + + def store(self, istream): + return self._loose_db.store(istream) + + def ostream(self): + return self._loose_db.ostream() + + def set_ostream(self, ostream): + return self._loose_db.set_ostream(ostream) + + #} END objectdbw interface diff --git a/libs/gitdb/db/loose.py b/libs/gitdb/db/loose.py new file mode 100644 index 000000000..192c524af --- /dev/null +++ b/libs/gitdb/db/loose.py @@ -0,0 +1,262 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +from gitdb.db.base import ( + FileDBBase, + ObjectDBR, + ObjectDBW +) + + +from gitdb.exc import ( + BadObject, + AmbiguousObjectName +) + +from gitdb.stream import ( + DecompressMemMapReader, + FDCompressedSha1Writer, + FDStream, + Sha1Writer +) + +from gitdb.base import ( + OStream, + OInfo +) + +from gitdb.util import ( + file_contents_ro_filepath, + ENOENT, + hex_to_bin, + bin_to_hex, + exists, + chmod, + isdir, + isfile, + remove, + mkdir, + rename, + dirname, + basename, + join +) + +from gitdb.fun import ( + chunk_size, + loose_object_header_info, + write_object, + stream_copy +) + +from gitdb.utils.compat import MAXSIZE +from gitdb.utils.encoding import force_bytes + +import tempfile +import os + + +__all__ = ('LooseObjectDB', ) + + +class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): + + """A database which operates on loose object files""" + + # CONFIGURATION + # chunks in which data will be copied between streams + stream_chunk_size = chunk_size + + # On windows we need to keep it writable, otherwise it cannot be removed + # either + new_objects_mode = int("444", 8) + if os.name == 'nt': + new_objects_mode = int("644", 8) + + def __init__(self, root_path): + super(LooseObjectDB, self).__init__(root_path) + self._hexsha_to_file = dict() + # Additional Flags - might be set to 0 after the first failure + # Depending on the root, this might work 
for some mounts, for others not, which + # is why it is per instance + self._fd_open_flags = getattr(os, 'O_NOATIME', 0) + + #{ Interface + def object_path(self, hexsha): + """ + :return: path at which the object with the given hexsha would be stored, + relative to the database root""" + return join(hexsha[:2], hexsha[2:]) + + def readable_db_object_path(self, hexsha): + """ + :return: readable object path to the object identified by hexsha + :raise BadObject: If the object file does not exist""" + try: + return self._hexsha_to_file[hexsha] + except KeyError: + pass + # END ignore cache misses + + # try filesystem + path = self.db_path(self.object_path(hexsha)) + if exists(path): + self._hexsha_to_file[hexsha] = path + return path + # END handle cache + raise BadObject(hexsha) + + def partial_to_complete_sha_hex(self, partial_hexsha): + """:return: 20 byte binary sha1 string which matches the given name uniquely + :param name: hexadecimal partial name (bytes or ascii string) + :raise AmbiguousObjectName: + :raise BadObject: """ + candidate = None + for binsha in self.sha_iter(): + if bin_to_hex(binsha).startswith(force_bytes(partial_hexsha)): + # it can't ever find the same object twice + if candidate is not None: + raise AmbiguousObjectName(partial_hexsha) + candidate = binsha + # END for each object + if candidate is None: + raise BadObject(partial_hexsha) + return candidate + + #} END interface + + def _map_loose_object(self, sha): + """ + :return: memory map of that file to allow random read access + :raise BadObject: if object could not be located""" + db_path = self.db_path(self.object_path(bin_to_hex(sha))) + try: + return file_contents_ro_filepath(db_path, flags=self._fd_open_flags) + except OSError as e: + if e.errno != ENOENT: + # try again without noatime + try: + return file_contents_ro_filepath(db_path) + except OSError: + raise BadObject(sha) + # didn't work because of our flag, don't try it again + self._fd_open_flags = 0 + else: + raise BadObject(sha) + # END handle error + # END exception handling + + def set_ostream(self, stream): + """:raise TypeError: if the stream does not support the Sha1Writer interface""" + if stream is not None and not isinstance(stream, Sha1Writer): + raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) + return super(LooseObjectDB, self).set_ostream(stream) + + def info(self, sha): + m = self._map_loose_object(sha) + try: + typ, size = loose_object_header_info(m) + return OInfo(sha, typ, size) + finally: + if hasattr(m, 'close'): + m.close() + # END assure release of system resources + + def stream(self, sha): + m = self._map_loose_object(sha) + type, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True) + return OStream(sha, type, size, stream) + + def has_object(self, sha): + try: + self.readable_db_object_path(bin_to_hex(sha)) + return True + except BadObject: + return False + # END check existence + + def store(self, istream): + """note: The sha we produce will be hex by nature""" + tmp_path = None + writer = self.ostream() + if writer is None: + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + + if istream.binsha is None: + writer = FDCompressedSha1Writer(fd) + else: + writer = FDStream(fd) + # END handle direct stream copies + # END handle custom writer + + try: + try: + if istream.binsha is not None: + # copy as much as possible, the actual uncompressed item size might + # be smaller than the compressed version + 
stream_copy(istream.read, writer.write, MAXSIZE, self.stream_chunk_size) + else: + # write object with header, we have to make a new one + write_object(istream.type, istream.size, istream.read, writer.write, + chunk_size=self.stream_chunk_size) + # END handle direct stream copies + finally: + if tmp_path: + writer.close() + # END assure target stream is closed + except: + if tmp_path: + os.remove(tmp_path) + raise + # END assure tmpfile removal on error + + hexsha = None + if istream.binsha: + hexsha = istream.hexsha + else: + hexsha = writer.sha(as_hex=True) + # END handle sha + + if tmp_path: + obj_path = self.db_path(self.object_path(hexsha)) + obj_dir = dirname(obj_path) + if not isdir(obj_dir): + mkdir(obj_dir) + # END handle destination directory + # rename onto existing doesn't work on windows + if os.name == 'nt': + if isfile(obj_path): + remove(tmp_path) + else: + rename(tmp_path, obj_path) + # end rename only if needed + else: + rename(tmp_path, obj_path) + # END handle win32 + + # make sure its readable for all ! It started out as rw-- tmp file + # but needs to be rwrr + chmod(obj_path, self.new_objects_mode) + # END handle dry_run + + istream.binsha = hex_to_bin(hexsha) + return istream + + def sha_iter(self): + # find all files which look like an object, extract sha from there + for root, dirs, files in os.walk(self.root_path()): + root_base = basename(root) + if len(root_base) != 2: + continue + + for f in files: + if len(f) != 38: + continue + yield hex_to_bin(root_base + f) + # END for each file + # END for each walk iteration + + def size(self): + return len(tuple(self.sha_iter())) diff --git a/libs/gitdb/db/mem.py b/libs/gitdb/db/mem.py new file mode 100644 index 000000000..871133468 --- /dev/null +++ b/libs/gitdb/db/mem.py @@ -0,0 +1,112 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Contains the MemoryDatabase implementation""" +from gitdb.db.loose import LooseObjectDB +from gitdb.db.base import ( + ObjectDBR, + ObjectDBW +) + +from gitdb.base import ( + OStream, + IStream, +) + +from gitdb.exc import ( + BadObject, + UnsupportedOperation +) + +from gitdb.stream import ( + ZippedStoreShaWriter, + DecompressMemMapReader, +) + +from io import BytesIO + +__all__ = ("MemoryDB", ) + + +class MemoryDB(ObjectDBR, ObjectDBW): + + """A memory database stores everything to memory, providing fast IO and object + retrieval. 
It should be used to buffer results and obtain SHAs before writing + it to the actual physical storage, as it allows to query whether object already + exists in the target storage before introducing actual IO""" + + def __init__(self): + super(MemoryDB, self).__init__() + self._db = LooseObjectDB("path/doesnt/matter") + + # maps 20 byte shas to their OStream objects + self._cache = dict() + + def set_ostream(self, stream): + raise UnsupportedOperation("MemoryDB's always stream into memory") + + def store(self, istream): + zstream = ZippedStoreShaWriter() + self._db.set_ostream(zstream) + + istream = self._db.store(istream) + zstream.close() # close to flush + zstream.seek(0) + + # don't provide a size, the stream is written in object format, hence the + # header needs decompression + decomp_stream = DecompressMemMapReader(zstream.getvalue(), close_on_deletion=False) + self._cache[istream.binsha] = OStream(istream.binsha, istream.type, istream.size, decomp_stream) + + return istream + + def has_object(self, sha): + return sha in self._cache + + def info(self, sha): + # we always return streams, which are infos as well + return self.stream(sha) + + def stream(self, sha): + try: + ostream = self._cache[sha] + # rewind stream for the next one to read + ostream.stream.seek(0) + return ostream + except KeyError: + raise BadObject(sha) + # END exception handling + + def size(self): + return len(self._cache) + + def sha_iter(self): + try: + return self._cache.iterkeys() + except AttributeError: + return self._cache.keys() + + #{ Interface + def stream_copy(self, sha_iter, odb): + """Copy the streams as identified by sha's yielded by sha_iter into the given odb + The streams will be copied directly + **Note:** the object will only be written if it did not exist in the target db + :return: amount of streams actually copied into odb. 
If smaller than the amount + of input shas, one or more objects did already exist in odb""" + count = 0 + for sha in sha_iter: + if odb.has_object(sha): + continue + # END check object existence + + ostream = self.stream(sha) + # compressed data including header + sio = BytesIO(ostream.stream.data()) + istream = IStream(ostream.type, ostream.size, sio, sha) + + odb.store(istream) + count += 1 + # END for each sha + return count + #} END interface diff --git a/libs/gitdb/db/pack.py b/libs/gitdb/db/pack.py new file mode 100644 index 000000000..6b03d8383 --- /dev/null +++ b/libs/gitdb/db/pack.py @@ -0,0 +1,207 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Module containing a database to deal with packs""" +from gitdb.db.base import ( + FileDBBase, + ObjectDBR, + CachingDB +) + +from gitdb.util import LazyMixin + +from gitdb.exc import ( + BadObject, + UnsupportedOperation, + AmbiguousObjectName +) + +from gitdb.pack import PackEntity +from gitdb.utils.compat import xrange + +from functools import reduce + +import os +import glob + +__all__ = ('PackedDB', ) + +#{ Utilities + + +class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin): + + """A database operating on a set of object packs""" + + # sort the priority list every N queries + # Higher values are better, performance tests don't show this has + # any effect, but it should have one + _sort_interval = 500 + + def __init__(self, root_path): + super(PackedDB, self).__init__(root_path) + # list of lists with three items: + # * hits - number of times the pack was hit with a request + # * entity - Pack entity instance + # * sha_to_index - PackIndexFile.sha_to_index method for direct cache query + # self._entities = list() # lazy loaded list + self._hit_count = 0 # amount of hits + self._st_mtime = 0 # last modification data of our root path + + def _set_cache_(self, attr): + if attr == '_entities': + self._entities = list() + self.update_cache(force=True) + # END handle entities initialization + + def _sort_entities(self): + self._entities.sort(key=lambda l: l[0], reverse=True) + + def _pack_info(self, sha): + """:return: tuple(entity, index) for an item at the given sha + :param sha: 20 or 40 byte sha + :raise BadObject: + **Note:** This method is not thread-safe, but may be hit in multi-threaded + operation. The worst thing that can happen though is a counter that + was not incremented, or the list being in wrong order. So we safe + the time for locking here, lets see how that goes""" + # presort ? 
+ if self._hit_count % self._sort_interval == 0: + self._sort_entities() + # END update sorting + + for item in self._entities: + index = item[2](sha) + if index is not None: + item[0] += 1 # one hit for you + self._hit_count += 1 # general hit count + return (item[1], index) + # END index found in pack + # END for each item + + # no hit, see whether we have to update packs + # NOTE: considering packs don't change very often, we safe this call + # and leave it to the super-caller to trigger that + raise BadObject(sha) + + #{ Object DB Read + + def has_object(self, sha): + try: + self._pack_info(sha) + return True + except BadObject: + return False + # END exception handling + + def info(self, sha): + entity, index = self._pack_info(sha) + return entity.info_at_index(index) + + def stream(self, sha): + entity, index = self._pack_info(sha) + return entity.stream_at_index(index) + + def sha_iter(self): + for entity in self.entities(): + index = entity.index() + sha_by_index = index.sha + for index in xrange(index.size()): + yield sha_by_index(index) + # END for each index + # END for each entity + + def size(self): + sizes = [item[1].index().size() for item in self._entities] + return reduce(lambda x, y: x + y, sizes, 0) + + #} END object db read + + #{ object db write + + def store(self, istream): + """Storing individual objects is not feasible as a pack is designed to + hold multiple objects. Writing or rewriting packs for single objects is + inefficient""" + raise UnsupportedOperation() + + #} END object db write + + #{ Interface + + def update_cache(self, force=False): + """ + Update our cache with the acutally existing packs on disk. Add new ones, + and remove deleted ones. We keep the unchanged ones + + :param force: If True, the cache will be updated even though the directory + does not appear to have changed according to its modification timestamp. + :return: True if the packs have been updated so there is new information, + False if there was no change to the pack database""" + stat = os.stat(self.root_path()) + if not force and stat.st_mtime <= self._st_mtime: + return False + # END abort early on no change + self._st_mtime = stat.st_mtime + + # packs are supposed to be prefixed with pack- by git-convention + # get all pack files, figure out what changed + pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack"))) + our_pack_files = set(item[1].pack().path() for item in self._entities) + + # new packs + for pack_file in (pack_files - our_pack_files): + # init the hit-counter/priority with the size, a good measure for hit- + # probability. 
Its implemented so that only 12 bytes will be read + entity = PackEntity(pack_file) + self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index]) + # END for each new packfile + + # removed packs + for pack_file in (our_pack_files - pack_files): + del_index = -1 + for i, item in enumerate(self._entities): + if item[1].pack().path() == pack_file: + del_index = i + break + # END found index + # END for each entity + assert del_index != -1 + del(self._entities[del_index]) + # END for each removed pack + + # reinitialize prioritiess + self._sort_entities() + return True + + def entities(self): + """:return: list of pack entities operated upon by this database""" + return [item[1] for item in self._entities] + + def partial_to_complete_sha(self, partial_binsha, canonical_length): + """:return: 20 byte sha as inferred by the given partial binary sha + :param partial_binsha: binary sha with less than 20 bytes + :param canonical_length: length of the corresponding canonical representation. + It is required as binary sha's cannot display whether the original hex sha + had an odd or even number of characters + :raise AmbiguousObjectName: + :raise BadObject: """ + candidate = None + for item in self._entities: + item_index = item[1].index().partial_sha_to_index(partial_binsha, canonical_length) + if item_index is not None: + sha = item[1].index().sha(item_index) + if candidate and candidate != sha: + raise AmbiguousObjectName(partial_binsha) + candidate = sha + # END handle full sha could be found + # END for each entity + + if candidate: + return candidate + + # still not found ? + raise BadObject(partial_binsha) + + #} END interface diff --git a/libs/gitdb/db/ref.py b/libs/gitdb/db/ref.py new file mode 100644 index 000000000..94a2f01f8 --- /dev/null +++ b/libs/gitdb/db/ref.py @@ -0,0 +1,82 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +import codecs +from gitdb.db.base import ( + CompoundDB, +) + +__all__ = ('ReferenceDB', ) + + +class ReferenceDB(CompoundDB): + + """A database consisting of database referred to in a file""" + + # Configuration + # Specifies the object database to use for the paths found in the alternates + # file. 
If None, it defaults to the GitDB + ObjectDBCls = None + + def __init__(self, ref_file): + super(ReferenceDB, self).__init__() + self._ref_file = ref_file + + def _set_cache_(self, attr): + if attr == '_dbs': + self._dbs = list() + self._update_dbs_from_ref_file() + else: + super(ReferenceDB, self)._set_cache_(attr) + # END handle attrs + + def _update_dbs_from_ref_file(self): + dbcls = self.ObjectDBCls + if dbcls is None: + # late import + from gitdb.db.git import GitDB + dbcls = GitDB + # END get db type + + # try to get as many as possible, don't fail if some are unavailable + ref_paths = list() + try: + with codecs.open(self._ref_file, 'r', encoding="utf-8") as f: + ref_paths = [l.strip() for l in f] + except (OSError, IOError): + pass + # END handle alternates + + ref_paths_set = set(ref_paths) + cur_ref_paths_set = set(db.root_path() for db in self._dbs) + + # remove existing + for path in (cur_ref_paths_set - ref_paths_set): + for i, db in enumerate(self._dbs[:]): + if db.root_path() == path: + del(self._dbs[i]) + continue + # END del matching db + # END for each path to remove + + # add new + # sort them to maintain order + added_paths = sorted(ref_paths_set - cur_ref_paths_set, key=lambda p: ref_paths.index(p)) + for path in added_paths: + try: + db = dbcls(path) + # force an update to verify path + if isinstance(db, CompoundDB): + db.databases() + # END verification + self._dbs.append(db) + except Exception: + # ignore invalid paths or issues + pass + # END for each path to add + + def update_cache(self, force=False): + # re-read alternates and update databases + self._update_dbs_from_ref_file() + return super(ReferenceDB, self).update_cache(force) diff --git a/libs/gitdb/exc.py b/libs/gitdb/exc.py new file mode 100644 index 000000000..947e5d8bf --- /dev/null +++ b/libs/gitdb/exc.py @@ -0,0 +1,46 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Module with common exceptions""" +from gitdb.util import to_hex_sha + + +class ODBError(Exception): + """All errors thrown by the object database""" + + +class InvalidDBRoot(ODBError): + """Thrown if an object database cannot be initialized at the given path""" + + +class BadObject(ODBError): + """The object with the given SHA does not exist. 
Instantiate with the + failed sha""" + + def __str__(self): + return "BadObject: %s" % to_hex_sha(self.args[0]) + + +class BadName(ODBError): + """A name provided to rev_parse wasn't understood""" + + def __str__(self): + return "Ref '%s' did not resolve to an object" % self.args[0] + + +class ParseError(ODBError): + """Thrown if the parsing of a file failed due to an invalid format""" + + +class AmbiguousObjectName(ODBError): + """Thrown if a possibly shortened name does not uniquely represent a single object + in the database""" + + +class BadObjectType(ODBError): + """The object had an unsupported type""" + + +class UnsupportedOperation(ODBError): + """Thrown if the given operation cannot be supported by the object database""" diff --git a/libs/gitdb/fun.py b/libs/gitdb/fun.py new file mode 100644 index 000000000..8ca38c867 --- /dev/null +++ b/libs/gitdb/fun.py @@ -0,0 +1,781 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Contains basic c-functions which usually contain performance critical code +Keeping this code separate from the beginning makes it easier to out-source +it into c later, if required""" + +import zlib +from gitdb.util import byte_ord +decompressobj = zlib.decompressobj + +import mmap +from itertools import islice +from functools import reduce + +from gitdb.const import NULL_BYTE, BYTE_SPACE +from gitdb.utils.encoding import force_text +from gitdb.utils.compat import izip, buffer, xrange, PY3 +from gitdb.typ import ( + str_blob_type, + str_commit_type, + str_tree_type, + str_tag_type, +) + +from io import StringIO + +# INVARIANTS +OFS_DELTA = 6 +REF_DELTA = 7 +delta_types = (OFS_DELTA, REF_DELTA) + +type_id_to_type_map = { + 0: b'', # EXT 1 + 1: str_commit_type, + 2: str_tree_type, + 3: str_blob_type, + 4: str_tag_type, + 5: b'', # EXT 2 + OFS_DELTA: "OFS_DELTA", # OFFSET DELTA + REF_DELTA: "REF_DELTA" # REFERENCE DELTA +} + +type_to_type_id_map = { + str_commit_type: 1, + str_tree_type: 2, + str_blob_type: 3, + str_tag_type: 4, + "OFS_DELTA": OFS_DELTA, + "REF_DELTA": REF_DELTA, +} + +# used when dealing with larger streams +chunk_size = 1000 * mmap.PAGESIZE + +__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info', + 'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data', + 'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header') + + +#{ Structures + +def _set_delta_rbound(d, size): + """Truncate the given delta to the given size + :param size: size relative to our target offset, may not be 0, must be smaller or equal + to our size + :return: d""" + d.ts = size + + # NOTE: data is truncated automatically when applying the delta + # MUST NOT DO THIS HERE + return d + + +def _move_delta_lbound(d, bytes): + """Move the delta by the given amount of bytes, reducing its size so that its + right bound stays static + :param bytes: amount of bytes to move, must be smaller than delta size + :return: d""" + if bytes == 0: + return + + d.to += bytes + d.so += bytes + d.ts -= bytes + if d.data is not None: + d.data = d.data[bytes:] + # END handle data + + return d + + +def delta_duplicate(src): + return DeltaChunk(src.to, src.ts, src.so, src.data) + + +def delta_chunk_apply(dc, bbuf, write): + """Apply own data to the target buffer + :param bbuf: buffer providing source bytes for copy operations + :param write: write 
method to call with data to write""" + if dc.data is None: + # COPY DATA FROM SOURCE + write(buffer(bbuf, dc.so, dc.ts)) + else: + # APPEND DATA + # whats faster: if + 4 function calls or just a write with a slice ? + # Considering data can be larger than 127 bytes now, it should be worth it + if dc.ts < len(dc.data): + write(dc.data[:dc.ts]) + else: + write(dc.data) + # END handle truncation + # END handle chunk mode + + +class DeltaChunk(object): + + """Represents a piece of a delta, it can either add new data, or copy existing + one from a source buffer""" + __slots__ = ( + 'to', # start offset in the target buffer in bytes + 'ts', # size of this chunk in the target buffer in bytes + 'so', # start offset in the source buffer in bytes or None + 'data', # chunk of bytes to be added to the target buffer, + # DeltaChunkList to use as base, or None + ) + + def __init__(self, to, ts, so, data): + self.to = to + self.ts = ts + self.so = so + self.data = data + + def __repr__(self): + return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "") + + #{ Interface + + def rbound(self): + return self.to + self.ts + + def has_data(self): + """:return: True if the instance has data to add to the target stream""" + return self.data is not None + + #} END interface + + +def _closest_index(dcl, absofs): + """:return: index at which the given absofs should be inserted. The index points + to the DeltaChunk with a target buffer absofs that equals or is greater than + absofs. + **Note:** global method for performance only, it belongs to DeltaChunkList""" + lo = 0 + hi = len(dcl) + while lo < hi: + mid = (lo + hi) / 2 + dc = dcl[mid] + if dc.to > absofs: + hi = mid + elif dc.rbound() > absofs or dc.to == absofs: + return mid + else: + lo = mid + 1 + # END handle bound + # END for each delta absofs + return len(dcl) - 1 + + +def delta_list_apply(dcl, bbuf, write): + """Apply the chain's changes and write the final result using the passed + write function. + :param bbuf: base buffer containing the base of all deltas contained in this + list. It will only be used if the chunk in question does not have a base + chain. + :param write: function taking a string of bytes to write to the output""" + for dc in dcl: + delta_chunk_apply(dc, bbuf, write) + # END for each dc + + +def delta_list_slice(dcl, absofs, size, ndcl): + """:return: Subsection of this list at the given absolute offset, with the given + size in bytes. + :return: None""" + cdi = _closest_index(dcl, absofs) # delta start index + cd = dcl[cdi] + slen = len(dcl) + lappend = ndcl.append + + if cd.to != absofs: + tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data) + _move_delta_lbound(tcd, absofs - cd.to) + tcd.ts = min(tcd.ts, size) + lappend(tcd) + size -= tcd.ts + cdi += 1 + # END lbound overlap handling + + while cdi < slen and size: + # are we larger than the current block + cd = dcl[cdi] + if cd.ts <= size: + lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data)) + size -= cd.ts + else: + tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data) + tcd.ts = size + lappend(tcd) + size -= tcd.ts + break + # END hadle size + cdi += 1 + # END for each chunk + + +class DeltaChunkList(list): + + """List with special functionality to deal with DeltaChunks. + There are two types of lists we represent. The one was created bottom-up, working + towards the latest delta, the other kind was created top-down, working from the + latest delta down to the earliest ancestor. 
This attribute is queryable + after all processing with is_reversed.""" + + __slots__ = tuple() + + def rbound(self): + """:return: rightmost extend in bytes, absolute""" + if len(self) == 0: + return 0 + return self[-1].rbound() + + def lbound(self): + """:return: leftmost byte at which this chunklist starts""" + if len(self) == 0: + return 0 + return self[0].to + + def size(self): + """:return: size of bytes as measured by our delta chunks""" + return self.rbound() - self.lbound() + + def apply(self, bbuf, write): + """Only used by public clients, internally we only use the global routines + for performance""" + return delta_list_apply(self, bbuf, write) + + def compress(self): + """Alter the list to reduce the amount of nodes. Currently we concatenate + add-chunks + :return: self""" + slen = len(self) + if slen < 2: + return self + i = 0 + + first_data_index = None + while i < slen: + dc = self[i] + i += 1 + if dc.data is None: + if first_data_index is not None and i - 2 - first_data_index > 1: + # if first_data_index is not None: + nd = StringIO() # new data + so = self[first_data_index].to # start offset in target buffer + for x in xrange(first_data_index, i - 1): + xdc = self[x] + nd.write(xdc.data[:xdc.ts]) + # END collect data + + del(self[first_data_index:i - 1]) + buf = nd.getvalue() + self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf)) + + slen = len(self) + i = first_data_index + 1 + + # END concatenate data + first_data_index = None + continue + # END skip non-data chunks + + if first_data_index is None: + first_data_index = i - 1 + # END iterate list + + # if slen_orig != len(self): + # print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100) + return self + + def check_integrity(self, target_size=-1): + """Verify the list has non-overlapping chunks only, and the total size matches + target_size + :param target_size: if not -1, the total size of the chain must be target_size + :raise AssertionError: if the size doen't match""" + if target_size > -1: + assert self[-1].rbound() == target_size + assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size + # END target size verification + + if len(self) < 2: + return + + # check data + for dc in self: + assert dc.ts > 0 + if dc.has_data(): + assert len(dc.data) >= dc.ts + # END for each dc + + left = islice(self, 0, len(self) - 1) + right = iter(self) + right.next() + # this is very pythonic - we might have just use index based access here, + # but this could actually be faster + for lft, rgt in izip(left, right): + assert lft.rbound() == rgt.to + assert lft.to + lft.ts == rgt.to + # END for each pair + + +class TopdownDeltaChunkList(DeltaChunkList): + + """Represents a list which is generated by feeding its ancestor streams one by + one""" + __slots__ = tuple() + + def connect_with_next_base(self, bdcl): + """Connect this chain with the next level of our base delta chunklist. + The goal in this game is to mark as many of our chunks rigid, hence they + cannot be changed by any of the upcoming bases anymore. Once all our + chunks are marked like that, we can stop all processing + :param bdcl: data chunk list being one of our bases. They must be fed in + consequtively and in order, towards the earliest ancestor delta + :return: True if processing was done. 
Use it to abort processing of + remaining streams if False is returned""" + nfc = 0 # number of frozen chunks + dci = 0 # delta chunk index + slen = len(self) # len of self + ccl = list() # temporary list + while dci < slen: + dc = self[dci] + dci += 1 + + # all add-chunks which are already topmost don't need additional processing + if dc.data is not None: + nfc += 1 + continue + # END skip add chunks + + # copy chunks + # integrate the portion of the base list into ourselves. Lists + # dont support efficient insertion ( just one at a time ), but for now + # we live with it. Internally, its all just a 32/64bit pointer, and + # the portions of moved memory should be smallish. Maybe we just rebuild + # ourselves in order to reduce the amount of insertions ... + del(ccl[:]) + delta_list_slice(bdcl, dc.so, dc.ts, ccl) + + # move the target bounds into place to match with our chunk + ofs = dc.to - dc.so + for cdc in ccl: + cdc.to += ofs + # END update target bounds + + if len(ccl) == 1: + self[dci - 1] = ccl[0] + else: + # maybe try to compute the expenses here, and pick the right algorithm + # It would normally be faster than copying everything physically though + # TODO: Use a deque here, and decide by the index whether to extend + # or extend left ! + post_dci = self[dci:] + del(self[dci - 1:]) # include deletion of dc + self.extend(ccl) + self.extend(post_dci) + + slen = len(self) + dci += len(ccl) - 1 # deleted dc, added rest + + # END handle chunk replacement + # END for each chunk + + if nfc == slen: + return False + # END handle completeness + return True + + +#} END structures + +#{ Routines + +def is_loose_object(m): + """ + :return: True the file contained in memory map m appears to be a loose object. + Only the first two bytes are needed""" + b0, b1 = map(ord, m[:2]) + word = (b0 << 8) + b1 + return b0 == 0x78 and (word % 31) == 0 + + +def loose_object_header_info(m): + """ + :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the + object as well as its uncompressed size in bytes. + :param m: memory map from which to read the compressed object data""" + decompress_size = 8192 # is used in cgit as well + hdr = decompressobj().decompress(m, decompress_size) + type_name, size = hdr[:hdr.find(NULL_BYTE)].split(BYTE_SPACE) + + return type_name, int(size) + + +def pack_object_header_info(data): + """ + :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset) + The type_id should be interpreted according to the ``type_id_to_type_map`` map + The byte-offset specifies the start of the actual zlib compressed datastream + :param m: random-access memory, like a string or memory map""" + c = byte_ord(data[0]) # first byte + i = 1 # next char to read + type_id = (c >> 4) & 7 # numeric type + size = c & 15 # starting size + s = 4 # starting bit-shift size + if PY3: + while c & 0x80: + c = byte_ord(data[i]) + i += 1 + size += (c & 0x7f) << s + s += 7 + # END character loop + else: + while c & 0x80: + c = ord(data[i]) + i += 1 + size += (c & 0x7f) << s + s += 7 + # END character loop + # end performance at expense of maintenance ... 
+ return (type_id, size, i) + + +def create_pack_object_header(obj_type, obj_size): + """ + :return: string defining the pack header comprised of the object type + and its incompressed size in bytes + + :param obj_type: pack type_id of the object + :param obj_size: uncompressed size in bytes of the following object stream""" + c = 0 # 1 byte + if PY3: + hdr = bytearray() # output string + + c = (obj_type << 4) | (obj_size & 0xf) + obj_size >>= 4 + while obj_size: + hdr.append(c | 0x80) + c = obj_size & 0x7f + obj_size >>= 7 + # END until size is consumed + hdr.append(c) + else: + hdr = bytes() # output string + + c = (obj_type << 4) | (obj_size & 0xf) + obj_size >>= 4 + while obj_size: + hdr += chr(c | 0x80) + c = obj_size & 0x7f + obj_size >>= 7 + # END until size is consumed + hdr += chr(c) + # end handle interpreter + return hdr + + +def msb_size(data, offset=0): + """ + :return: tuple(read_bytes, size) read the msb size from the given random + access data starting at the given byte offset""" + size = 0 + i = 0 + l = len(data) + hit_msb = False + if PY3: + while i < l: + c = data[i + offset] + size |= (c & 0x7f) << i * 7 + i += 1 + if not c & 0x80: + hit_msb = True + break + # END check msb bit + # END while in range + else: + while i < l: + c = ord(data[i + offset]) + size |= (c & 0x7f) << i * 7 + i += 1 + if not c & 0x80: + hit_msb = True + break + # END check msb bit + # END while in range + # end performance ... + if not hit_msb: + raise AssertionError("Could not find terminating MSB byte in data stream") + return i + offset, size + + +def loose_object_header(type, size): + """ + :return: bytes representing the loose object header, which is immediately + followed by the content stream of size 'size'""" + return ('%s %i\0' % (force_text(type), size)).encode('ascii') + + +def write_object(type, size, read, write, chunk_size=chunk_size): + """ + Write the object as identified by type, size and source_stream into the + target_stream + + :param type: type string of the object + :param size: amount of bytes to write from source_stream + :param read: read method of a stream providing the content data + :param write: write method of the output stream + :param close_target_stream: if True, the target stream will be closed when + the routine exits, even if an error is thrown + :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" + tbw = 0 # total num bytes written + + # WRITE HEADER: type SP size NULL + tbw += write(loose_object_header(type, size)) + tbw += stream_copy(read, write, size, chunk_size) + + return tbw + + +def stream_copy(read, write, size, chunk_size): + """ + Copy a stream up to size bytes using the provided read and write methods, + in chunks of chunk_size + + **Note:** its much like stream_copy utility, but operates just using methods""" + dbw = 0 # num data bytes written + + # WRITE ALL DATA UP TO SIZE + while True: + cs = min(chunk_size, size - dbw) + # NOTE: not all write methods return the amount of written bytes, like + # mmap.write. Its bad, but we just deal with it ... 
perhaps its not + # even less efficient + # data_len = write(read(cs)) + # dbw += data_len + data = read(cs) + data_len = len(data) + dbw += data_len + write(data) + if data_len < cs or dbw == size: + break + # END check for stream end + # END duplicate data + return dbw + + +def connect_deltas(dstreams): + """ + Read the condensed delta chunk information from dstream and merge its information + into a list of existing delta chunks + + :param dstreams: iterable of delta stream objects, the delta to be applied last + comes first, then all its ancestors in order + :return: DeltaChunkList, containing all operations to apply""" + tdcl = None # topmost dcl + + dcl = tdcl = TopdownDeltaChunkList() + for dsi, ds in enumerate(dstreams): + # print "Stream", dsi + db = ds.read() + delta_buf_size = ds.size + + # read header + i, base_size = msb_size(db) + i, target_size = msb_size(db, i) + + # interpret opcodes + tbw = 0 # amount of target bytes written + while i < delta_buf_size: + c = ord(db[i]) + i += 1 + if c & 0x80: + cp_off, cp_size = 0, 0 + if (c & 0x01): + cp_off = ord(db[i]) + i += 1 + if (c & 0x02): + cp_off |= (ord(db[i]) << 8) + i += 1 + if (c & 0x04): + cp_off |= (ord(db[i]) << 16) + i += 1 + if (c & 0x08): + cp_off |= (ord(db[i]) << 24) + i += 1 + if (c & 0x10): + cp_size = ord(db[i]) + i += 1 + if (c & 0x20): + cp_size |= (ord(db[i]) << 8) + i += 1 + if (c & 0x40): + cp_size |= (ord(db[i]) << 16) + i += 1 + + if not cp_size: + cp_size = 0x10000 + + rbound = cp_off + cp_size + if (rbound < cp_size or + rbound > base_size): + break + + dcl.append(DeltaChunk(tbw, cp_size, cp_off, None)) + tbw += cp_size + elif c: + # NOTE: in C, the data chunks should probably be concatenated here. + # In python, we do it as a post-process + dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c])) + i += c + tbw += c + else: + raise ValueError("unexpected delta opcode 0") + # END handle command byte + # END while processing delta data + + dcl.compress() + + # merge the lists ! 
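+        # Sketch of what the merge does (hypothetical numbers): a copy chunk of
+        # the newer delta such as DeltaChunk(to=0, ts=20, so=100, data=None) is
+        # replaced by connect_with_next_base() with the chunks of the older
+        # delta that produce bytes [100, 120) of *its* target, rebased by
+        # ofs = to - so == -100 so they now write to [0, 20) of the final target.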
+ if dsi > 0: + if not tdcl.connect_with_next_base(dcl): + break + # END handle merge + + # prepare next base + dcl = DeltaChunkList() + # END for each delta stream + + return tdcl + + +def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write): + """ + Apply data from a delta buffer using a source buffer to the target file + + :param src_buf: random access data from which the delta was created + :param src_buf_size: size of the source buffer in bytes + :param delta_buf_size: size fo the delta buffer in bytes + :param delta_buf: random access delta data + :param write: write method taking a chunk of bytes + + **Note:** transcribed to python from the similar routine in patch-delta.c""" + i = 0 + db = delta_buf + if PY3: + while i < delta_buf_size: + c = db[i] + i += 1 + if c & 0x80: + cp_off, cp_size = 0, 0 + if (c & 0x01): + cp_off = db[i] + i += 1 + if (c & 0x02): + cp_off |= (db[i] << 8) + i += 1 + if (c & 0x04): + cp_off |= (db[i] << 16) + i += 1 + if (c & 0x08): + cp_off |= (db[i] << 24) + i += 1 + if (c & 0x10): + cp_size = db[i] + i += 1 + if (c & 0x20): + cp_size |= (db[i] << 8) + i += 1 + if (c & 0x40): + cp_size |= (db[i] << 16) + i += 1 + + if not cp_size: + cp_size = 0x10000 + + rbound = cp_off + cp_size + if (rbound < cp_size or + rbound > src_buf_size): + break + write(buffer(src_buf, cp_off, cp_size)) + elif c: + write(db[i:i + c]) + i += c + else: + raise ValueError("unexpected delta opcode 0") + # END handle command byte + # END while processing delta data + else: + while i < delta_buf_size: + c = ord(db[i]) + i += 1 + if c & 0x80: + cp_off, cp_size = 0, 0 + if (c & 0x01): + cp_off = ord(db[i]) + i += 1 + if (c & 0x02): + cp_off |= (ord(db[i]) << 8) + i += 1 + if (c & 0x04): + cp_off |= (ord(db[i]) << 16) + i += 1 + if (c & 0x08): + cp_off |= (ord(db[i]) << 24) + i += 1 + if (c & 0x10): + cp_size = ord(db[i]) + i += 1 + if (c & 0x20): + cp_size |= (ord(db[i]) << 8) + i += 1 + if (c & 0x40): + cp_size |= (ord(db[i]) << 16) + i += 1 + + if not cp_size: + cp_size = 0x10000 + + rbound = cp_off + cp_size + if (rbound < cp_size or + rbound > src_buf_size): + break + write(buffer(src_buf, cp_off, cp_size)) + elif c: + write(db[i:i + c]) + i += c + else: + raise ValueError("unexpected delta opcode 0") + # END handle command byte + # END while processing delta data + # end save byte_ord call and prevent performance regression in py2 + + # yes, lets use the exact same error message that git uses :) + assert i == delta_buf_size, "delta replay has gone wild" + + +def is_equal_canonical_sha(canonical_length, match, sha1): + """ + :return: True if the given lhs and rhs 20 byte binary shas + The comparison will take the canonical_length of the match sha into account, + hence the comparison will only use the last 4 bytes for uneven canonical representations + :param match: less than 20 byte sha + :param sha1: 20 byte sha""" + binary_length = canonical_length // 2 + if match[:binary_length] != sha1[:binary_length]: + return False + + if canonical_length - binary_length and \ + (byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])) & 0xf0: + return False + # END handle uneven canonnical length + return True + +#} END routines + + +try: + from gitdb_speedups._perf import connect_deltas +except ImportError: + pass diff --git a/libs/gitdb/pack.py b/libs/gitdb/pack.py new file mode 100644 index 000000000..115d94365 --- /dev/null +++ b/libs/gitdb/pack.py @@ -0,0 +1,1033 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part 
of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Contains PackIndexFile and PackFile implementations""" +import zlib + +from gitdb.exc import ( + BadObject, + AmbiguousObjectName, + UnsupportedOperation, + ParseError +) + +from gitdb.util import ( + mman, + LazyMixin, + unpack_from, + bin_to_hex, + byte_ord, +) + +from gitdb.fun import ( + create_pack_object_header, + pack_object_header_info, + is_equal_canonical_sha, + type_id_to_type_map, + write_object, + stream_copy, + chunk_size, + delta_types, + OFS_DELTA, + REF_DELTA, + msb_size +) + +try: + from gitdb_speedups._perf import PackIndexFile_sha_to_index +except ImportError: + pass +# END try c module + +from gitdb.base import ( # Amazing ! + OInfo, + OStream, + OPackInfo, + OPackStream, + ODeltaStream, + ODeltaPackInfo, + ODeltaPackStream, +) + +from gitdb.stream import ( + DecompressMemMapReader, + DeltaApplyReader, + Sha1Writer, + NullStream, + FlexibleSha1Writer +) + +from struct import pack +from binascii import crc32 + +from gitdb.const import NULL_BYTE +from gitdb.utils.compat import ( + izip, + buffer, + xrange, + to_bytes +) + +import tempfile +import array +import os +import sys + +__all__ = ('PackIndexFile', 'PackFile', 'PackEntity') + + +#{ Utilities + +def pack_object_at(cursor, offset, as_stream): + """ + :return: Tuple(abs_data_offset, PackInfo|PackStream) + an object of the correct type according to the type_id of the object. + If as_stream is True, the object will contain a stream, allowing the + data to be read decompressed. + :param data: random accessible data containing all required information + :parma offset: offset in to the data at which the object information is located + :param as_stream: if True, a stream object will be returned that can read + the data, otherwise you receive an info object only""" + data = cursor.use_region(offset).buffer() + type_id, uncomp_size, data_rela_offset = pack_object_header_info(data) + total_rela_offset = None # set later, actual offset until data stream begins + delta_info = None + + # OFFSET DELTA + if type_id == OFS_DELTA: + i = data_rela_offset + c = byte_ord(data[i]) + i += 1 + delta_offset = c & 0x7f + while c & 0x80: + c = byte_ord(data[i]) + i += 1 + delta_offset += 1 + delta_offset = (delta_offset << 7) + (c & 0x7f) + # END character loop + delta_info = delta_offset + total_rela_offset = i + # REF DELTA + elif type_id == REF_DELTA: + total_rela_offset = data_rela_offset + 20 + delta_info = data[data_rela_offset:total_rela_offset] + # BASE OBJECT + else: + # assume its a base object + total_rela_offset = data_rela_offset + # END handle type id + abs_data_offset = offset + total_rela_offset + if as_stream: + stream = DecompressMemMapReader(buffer(data, total_rela_offset), False, uncomp_size) + if delta_info is None: + return abs_data_offset, OPackStream(offset, type_id, uncomp_size, stream) + else: + return abs_data_offset, ODeltaPackStream(offset, type_id, uncomp_size, delta_info, stream) + else: + if delta_info is None: + return abs_data_offset, OPackInfo(offset, type_id, uncomp_size) + else: + return abs_data_offset, ODeltaPackInfo(offset, type_id, uncomp_size, delta_info) + # END handle info + # END handle stream + + +def write_stream_to_pack(read, write, zstream, base_crc=None): + """Copy a stream as read from read function, zip it, and write the result. 
+ Count the number of written bytes and return it + :param base_crc: if not None, the crc will be the base for all compressed data + we consecutively write and generate a crc32 from. If None, no crc will be generated + :return: tuple(no bytes read, no bytes written, crc32) crc might be 0 if base_crc + was false""" + br = 0 # bytes read + bw = 0 # bytes written + want_crc = base_crc is not None + crc = 0 + if want_crc: + crc = base_crc + # END initialize crc + + while True: + chunk = read(chunk_size) + br += len(chunk) + compressed = zstream.compress(chunk) + bw += len(compressed) + write(compressed) # cannot assume return value + + if want_crc: + crc = crc32(compressed, crc) + # END handle crc + + if len(chunk) != chunk_size: + break + # END copy loop + + compressed = zstream.flush() + bw += len(compressed) + write(compressed) + if want_crc: + crc = crc32(compressed, crc) + # END handle crc + + return (br, bw, crc) + + +#} END utilities + + +class IndexWriter(object): + + """Utility to cache index information, allowing to write all information later + in one go to the given stream + **Note:** currently only writes v2 indices""" + __slots__ = '_objs' + + def __init__(self): + self._objs = list() + + def append(self, binsha, crc, offset): + """Append one piece of object information""" + self._objs.append((binsha, crc, offset)) + + def write(self, pack_sha, write): + """Write the index file using the given write method + :param pack_sha: binary sha over the whole pack that we index + :return: sha1 binary sha over all index file contents""" + # sort for sha1 hash + self._objs.sort(key=lambda o: o[0]) + + sha_writer = FlexibleSha1Writer(write) + sha_write = sha_writer.write + sha_write(PackIndexFile.index_v2_signature) + sha_write(pack(">L", PackIndexFile.index_version_default)) + + # fanout + tmplist = list((0,) * 256) # fanout or list with 64 bit offsets + for t in self._objs: + tmplist[byte_ord(t[0][0])] += 1 + # END prepare fanout + for i in xrange(255): + v = tmplist[i] + sha_write(pack('>L', v)) + tmplist[i + 1] += v + # END write each fanout entry + sha_write(pack('>L', tmplist[255])) + + # sha1 ordered + # save calls, that is push them into c + sha_write(b''.join(t[0] for t in self._objs)) + + # crc32 + for t in self._objs: + sha_write(pack('>L', t[1] & 0xffffffff)) + # END for each crc + + tmplist = list() + # offset 32 + for t in self._objs: + ofs = t[2] + if ofs > 0x7fffffff: + tmplist.append(ofs) + ofs = 0x80000000 + len(tmplist) - 1 + # END hande 64 bit offsets + sha_write(pack('>L', ofs & 0xffffffff)) + # END for each offset + + # offset 64 + for ofs in tmplist: + sha_write(pack(">Q", ofs)) + # END for each offset + + # trailer + assert(len(pack_sha) == 20) + sha_write(pack_sha) + sha = sha_writer.sha(as_hex=False) + write(sha) + return sha + + +class PackIndexFile(LazyMixin): + + """A pack index provides offsets into the corresponding pack, allowing to find + locations for offsets faster.""" + + # Dont use slots as we dynamically bind functions for each version, need a dict for this + # The slots you see here are just to keep track of our instance variables + # __slots__ = ('_indexpath', '_fanout_table', '_cursor', '_version', + # '_sha_list_offset', '_crc_list_offset', '_pack_offset', '_pack_64_offset') + + # used in v2 indices + _sha_list_offset = 8 + 1024 + index_v2_signature = b'\xfftOc' + index_version_default = 2 + + def __init__(self, indexpath): + super(PackIndexFile, self).__init__() + self._indexpath = indexpath + + def close(self): + 
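+        """Close our memory map cursor and ask the memory manager to drop any
+        mapped handles for the index file (chiefly needed on Windows, where open
+        maps keep the file locked)."""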
mman.force_map_handle_removal_win(self._indexpath) + self._cursor = None + + def _set_cache_(self, attr): + if attr == "_packfile_checksum": + self._packfile_checksum = self._cursor.map()[-40:-20] + elif attr == "_packfile_checksum": + self._packfile_checksum = self._cursor.map()[-20:] + elif attr == "_cursor": + # Note: We don't lock the file when reading as we cannot be sure + # that we can actually write to the location - it could be a read-only + # alternate for instance + self._cursor = mman.make_cursor(self._indexpath).use_region() + # We will assume that the index will always fully fit into memory ! + if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size(): + raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % ( + self._indexpath, self._cursor.file_size(), mman.window_size())) + # END assert window size + else: + # now its time to initialize everything - if we are here, someone wants + # to access the fanout table or related properties + + # CHECK VERSION + mmap = self._cursor.map() + self._version = (mmap[:4] == self.index_v2_signature and 2) or 1 + if self._version == 2: + version_id = unpack_from(">L", mmap, 4)[0] + assert version_id == self._version, "Unsupported index version: %i" % version_id + # END assert version + + # SETUP FUNCTIONS + # setup our functions according to the actual version + for fname in ('entry', 'offset', 'sha', 'crc'): + setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version))) + # END for each function to initialize + + # INITIALIZE DATA + # byte offset is 8 if version is 2, 0 otherwise + self._initialize() + # END handle attributes + + #{ Access V1 + + def _entry_v1(self, i): + """:return: tuple(offset, binsha, 0)""" + return unpack_from(">L20s", self._cursor.map(), 1024 + i * 24) + (0, ) + + def _offset_v1(self, i): + """see ``_offset_v2``""" + return unpack_from(">L", self._cursor.map(), 1024 + i * 24)[0] + + def _sha_v1(self, i): + """see ``_sha_v2``""" + base = 1024 + (i * 24) + 4 + return self._cursor.map()[base:base + 20] + + def _crc_v1(self, i): + """unsupported""" + return 0 + + #} END access V1 + + #{ Access V2 + def _entry_v2(self, i): + """:return: tuple(offset, binsha, crc)""" + return (self._offset_v2(i), self._sha_v2(i), self._crc_v2(i)) + + def _offset_v2(self, i): + """:return: 32 or 64 byte offset into pack files. 64 byte offsets will only + be returned if the pack is larger than 4 GiB, or 2^32""" + offset = unpack_from(">L", self._cursor.map(), self._pack_offset + i * 4)[0] + + # if the high-bit is set, this indicates that we have to lookup the offset + # in the 64 bit region of the file. 
The current offset ( lower 31 bits ) + # are the index into it + if offset & 0x80000000: + offset = unpack_from(">Q", self._cursor.map(), self._pack_64_offset + (offset & ~0x80000000) * 8)[0] + # END handle 64 bit offset + + return offset + + def _sha_v2(self, i): + """:return: sha at the given index of this file index instance""" + base = self._sha_list_offset + i * 20 + return self._cursor.map()[base:base + 20] + + def _crc_v2(self, i): + """:return: 4 bytes crc for the object at index i""" + return unpack_from(">L", self._cursor.map(), self._crc_list_offset + i * 4)[0] + + #} END access V2 + + #{ Initialization + + def _initialize(self): + """initialize base data""" + self._fanout_table = self._read_fanout((self._version == 2) * 8) + + if self._version == 2: + self._crc_list_offset = self._sha_list_offset + self.size() * 20 + self._pack_offset = self._crc_list_offset + self.size() * 4 + self._pack_64_offset = self._pack_offset + self.size() * 4 + # END setup base + + def _read_fanout(self, byte_offset): + """Generate a fanout table from our data""" + d = self._cursor.map() + out = list() + append = out.append + for i in xrange(256): + append(unpack_from('>L', d, byte_offset + i * 4)[0]) + # END for each entry + return out + + #} END initialization + + #{ Properties + def version(self): + return self._version + + def size(self): + """:return: amount of objects referred to by this index""" + return self._fanout_table[255] + + def path(self): + """:return: path to the packindexfile""" + return self._indexpath + + def packfile_checksum(self): + """:return: 20 byte sha representing the sha1 hash of the pack file""" + return self._cursor.map()[-40:-20] + + def indexfile_checksum(self): + """:return: 20 byte sha representing the sha1 hash of this index file""" + return self._cursor.map()[-20:] + + def offsets(self): + """:return: sequence of all offsets in the order in which they were written + + **Note:** return value can be random accessed, but may be immmutable""" + if self._version == 2: + # read stream to array, convert to tuple + a = array.array('I') # 4 byte unsigned int, long are 8 byte on 64 bit it appears + a.fromstring(buffer(self._cursor.map(), self._pack_offset, self._pack_64_offset - self._pack_offset)) + + # networkbyteorder to something array likes more + if sys.byteorder == 'little': + a.byteswap() + return a + else: + return tuple(self.offset(index) for index in xrange(self.size())) + # END handle version + + def sha_to_index(self, sha): + """ + :return: index usable with the ``offset`` or ``entry`` method, or None + if the sha was not found in this pack index + :param sha: 20 byte sha to lookup""" + first_byte = byte_ord(sha[0]) + get_sha = self.sha + lo = 0 # lower index, the left bound of the bisection + if first_byte != 0: + lo = self._fanout_table[first_byte - 1] + hi = self._fanout_table[first_byte] # the upper, right bound of the bisection + + # bisect until we have the sha + while lo < hi: + mid = (lo + hi) // 2 + mid_sha = get_sha(mid) + if sha < mid_sha: + hi = mid + elif sha == mid_sha: + return mid + else: + lo = mid + 1 + # END handle midpoint + # END bisect + return None + + def partial_sha_to_index(self, partial_bin_sha, canonical_length): + """ + :return: index as in `sha_to_index` or None if the sha was not found in this + index file + :param partial_bin_sha: an at least two bytes of a partial binary sha as bytes + :param canonical_length: length of the original hexadecimal representation of the + given partial binary sha + :raise AmbiguousObjectName:""" + if 
len(partial_bin_sha) < 2: + raise ValueError("Require at least 2 bytes of partial sha") + + assert isinstance(partial_bin_sha, bytes), "partial_bin_sha must be bytes" + first_byte = byte_ord(partial_bin_sha[0]) + + get_sha = self.sha + lo = 0 # lower index, the left bound of the bisection + if first_byte != 0: + lo = self._fanout_table[first_byte - 1] + hi = self._fanout_table[first_byte] # the upper, right bound of the bisection + + # fill the partial to full 20 bytes + filled_sha = partial_bin_sha + NULL_BYTE * (20 - len(partial_bin_sha)) + + # find lowest + while lo < hi: + mid = (lo + hi) // 2 + mid_sha = get_sha(mid) + if filled_sha < mid_sha: + hi = mid + elif filled_sha == mid_sha: + # perfect match + lo = mid + break + else: + lo = mid + 1 + # END handle midpoint + # END bisect + + if lo < self.size(): + cur_sha = get_sha(lo) + if is_equal_canonical_sha(canonical_length, partial_bin_sha, cur_sha): + next_sha = None + if lo + 1 < self.size(): + next_sha = get_sha(lo + 1) + if next_sha and next_sha == cur_sha: + raise AmbiguousObjectName(partial_bin_sha) + return lo + # END if we have a match + # END if we found something + return None + + if 'PackIndexFile_sha_to_index' in globals(): + # NOTE: Its just about 25% faster, the major bottleneck might be the attr + # accesses + def sha_to_index(self, sha): + return PackIndexFile_sha_to_index(self, sha) + # END redefine heavy-hitter with c version + + #} END properties + + +class PackFile(LazyMixin): + + """A pack is a file written according to the Version 2 for git packs + + As we currently use memory maps, it could be assumed that the maximum size of + packs therefor is 32 bit on 32 bit systems. On 64 bit systems, this should be + fine though. + + **Note:** at some point, this might be implemented using streams as well, or + streams are an alternate path in the case memory maps cannot be created + for some reason - one clearly doesn't want to read 10GB at once in that + case""" + + __slots__ = ('_packpath', '_cursor', '_size', '_version') + pack_signature = 0x5041434b # 'PACK' + pack_version_default = 2 + + # offset into our data at which the first object starts + first_object_offset = 3 * 4 # header bytes + footer_size = 20 # final sha + + def __init__(self, packpath): + self._packpath = packpath + + def close(self): + mman.force_map_handle_removal_win(self._packpath) + self._cursor = None + + def _set_cache_(self, attr): + # we fill the whole cache, whichever attribute gets queried first + self._cursor = mman.make_cursor(self._packpath).use_region() + + # read the header information + type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0) + + # TODO: figure out whether we should better keep the lock, or maybe + # add a .keep file instead ? 
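+
+        # For reference, the 12 byte pack header is just b'PACK', a version and the
+        # object count, each a big-endian 32 bit integer. With invented values:
+        #   unpack_from(">LLL", b'PACK\x00\x00\x00\x02\x00\x00\x00\x2a', 0)
+        #   == (0x5041434b, 2, 42)   # 'PACK' signature, version 2, 42 objects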
+ if type_id != self.pack_signature: + raise ParseError("Invalid pack signature: %i" % type_id) + + def _iter_objects(self, start_offset, as_stream=True): + """Handle the actual iteration of objects within this pack""" + c = self._cursor + content_size = c.file_size() - self.footer_size + cur_offset = start_offset or self.first_object_offset + + null = NullStream() + while cur_offset < content_size: + data_offset, ostream = pack_object_at(c, cur_offset, True) + # scrub the stream to the end - this decompresses the object, but yields + # the amount of compressed bytes we need to get to the next offset + + stream_copy(ostream.read, null.write, ostream.size, chunk_size) + assert ostream.stream._br == ostream.size + cur_offset += (data_offset - ostream.pack_offset) + ostream.stream.compressed_bytes_read() + + # if a stream is requested, reset it beforehand + # Otherwise return the Stream object directly, its derived from the + # info object + if as_stream: + ostream.stream.seek(0) + yield ostream + # END until we have read everything + + #{ Pack Information + + def size(self): + """:return: The amount of objects stored in this pack""" + return self._size + + def version(self): + """:return: the version of this pack""" + return self._version + + def data(self): + """ + :return: read-only data of this pack. It provides random access and usually + is a memory map. + :note: This method is unsafe as it returns a window into a file which might be larger than than the actual window size""" + # can use map as we are starting at offset 0. Otherwise we would have to use buffer() + return self._cursor.use_region().map() + + def checksum(self): + """:return: 20 byte sha1 hash on all object sha's contained in this file""" + return self._cursor.use_region(self._cursor.file_size() - 20).buffer()[:] + + def path(self): + """:return: path to the packfile""" + return self._packpath + #} END pack information + + #{ Pack Specific + + def collect_streams(self, offset): + """ + :return: list of pack streams which are required to build the object + at the given offset. The first entry of the list is the object at offset, + the last one is either a full object, or a REF_Delta stream. The latter + type needs its reference object to be locked up in an ODB to form a valid + delta chain. + If the object at offset is no delta, the size of the list is 1. + :param offset: specifies the first byte of the object within this pack""" + out = list() + c = self._cursor + while True: + ostream = pack_object_at(c, offset, True)[1] + out.append(ostream) + if ostream.type_id == OFS_DELTA: + offset = ostream.pack_offset - ostream.delta_info + else: + # the only thing we can lookup are OFFSET deltas. 
Everything + # else is either an object, or a ref delta, in the latter + # case someone else has to find it + break + # END handle type + # END while chaining streams + return out + + #} END pack specific + + #{ Read-Database like Interface + + def info(self, offset): + """Retrieve information about the object at the given file-absolute offset + + :param offset: byte offset + :return: OPackInfo instance, the actual type differs depending on the type_id attribute""" + return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1] + + def stream(self, offset): + """Retrieve an object at the given file-relative offset as stream along with its information + + :param offset: byte offset + :return: OPackStream instance, the actual type differs depending on the type_id attribute""" + return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1] + + def stream_iter(self, start_offset=0): + """ + :return: iterator yielding OPackStream compatible instances, allowing + to access the data in the pack directly. + :param start_offset: offset to the first object to iterate. If 0, iteration + starts at the very first object in the pack. + + **Note:** Iterating a pack directly is costly as the datastream has to be decompressed + to determine the bounds between the objects""" + return self._iter_objects(start_offset, as_stream=True) + + #} END Read-Database like Interface + + +class PackEntity(LazyMixin): + + """Combines the PackIndexFile and the PackFile into one, allowing the + actual objects to be resolved and iterated""" + + __slots__ = ('_index', # our index file + '_pack', # our pack file + '_offset_map' # on demand dict mapping one offset to the next consecutive one + ) + + IndexFileCls = PackIndexFile + PackFileCls = PackFile + + def __init__(self, pack_or_index_path): + """Initialize ourselves with the path to the respective pack or index file""" + basename, ext = os.path.splitext(pack_or_index_path) + self._index = self.IndexFileCls("%s.idx" % basename) # PackIndexFile instance + self._pack = self.PackFileCls("%s.pack" % basename) # corresponding PackFile instance + + def close(self): + self._index.close() + self._pack.close() + + def _set_cache_(self, attr): + # currently this can only be _offset_map + # TODO: make this a simple sorted offset array which can be bisected + # to find the respective entry, from which we can take a +1 easily + # This might be slower, but should also be much lighter in memory ! 
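+        # What gets built here, with made-up numbers: for sorted offsets
+        # [12, 190, 310] in a 4000 byte pack, the map becomes
+        # {12: 190, 190: 310, 310: 3980} - each entry points at the first byte of
+        # the next object, 3980 being the file size minus the 20 byte trailing sha.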
+ offsets_sorted = sorted(self._index.offsets()) + last_offset = len(self._pack.data()) - self._pack.footer_size + assert offsets_sorted, "Cannot handle empty indices" + + offset_map = None + if len(offsets_sorted) == 1: + offset_map = {offsets_sorted[0]: last_offset} + else: + iter_offsets = iter(offsets_sorted) + iter_offsets_plus_one = iter(offsets_sorted) + next(iter_offsets_plus_one) + consecutive = izip(iter_offsets, iter_offsets_plus_one) + + offset_map = dict(consecutive) + + # the last offset is not yet set + offset_map[offsets_sorted[-1]] = last_offset + # END handle offset amount + self._offset_map = offset_map + + def _sha_to_index(self, sha): + """:return: index for the given sha, or raise""" + index = self._index.sha_to_index(sha) + if index is None: + raise BadObject(sha) + return index + + def _iter_objects(self, as_stream): + """Iterate over all objects in our index and yield their OInfo or OStream instences""" + _sha = self._index.sha + _object = self._object + for index in xrange(self._index.size()): + yield _object(_sha(index), as_stream, index) + # END for each index + + def _object(self, sha, as_stream, index=-1): + """:return: OInfo or OStream object providing information about the given sha + :param index: if not -1, its assumed to be the sha's index in the IndexFile""" + # its a little bit redundant here, but it needs to be efficient + if index < 0: + index = self._sha_to_index(sha) + if sha is None: + sha = self._index.sha(index) + # END assure sha is present ( in output ) + offset = self._index.offset(index) + type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer()) + if as_stream: + if type_id not in delta_types: + packstream = self._pack.stream(offset) + return OStream(sha, packstream.type, packstream.size, packstream.stream) + # END handle non-deltas + + # produce a delta stream containing all info + # To prevent it from applying the deltas when querying the size, + # we extract it from the delta stream ourselves + streams = self.collect_streams_at_offset(offset) + dstream = DeltaApplyReader.new(streams) + + return ODeltaStream(sha, dstream.type, None, dstream) + else: + if type_id not in delta_types: + return OInfo(sha, type_id_to_type_map[type_id], uncomp_size) + # END handle non-deltas + + # deltas are a little tougher - unpack the first bytes to obtain + # the actual target size, as opposed to the size of the delta data + streams = self.collect_streams_at_offset(offset) + buf = streams[0].read(512) + offset, src_size = msb_size(buf) + offset, target_size = msb_size(buf, offset) + + # collect the streams to obtain the actual object type + if streams[-1].type_id in delta_types: + raise BadObject(sha, "Could not resolve delta object") + return OInfo(sha, streams[-1].type, target_size) + # END handle stream + + #{ Read-Database like Interface + + def info(self, sha): + """Retrieve information about the object identified by the given sha + + :param sha: 20 byte sha1 + :raise BadObject: + :return: OInfo instance, with 20 byte sha""" + return self._object(sha, False) + + def stream(self, sha): + """Retrieve an object stream along with its information as identified by the given sha + + :param sha: 20 byte sha1 + :raise BadObject: + :return: OStream instance, with 20 byte sha""" + return self._object(sha, True) + + def info_at_index(self, index): + """As ``info``, but uses a PackIndexFile compatible index to refer to the object""" + return self._object(None, False, index) + + def stream_at_index(self, index): 
+ """As ``stream``, but uses a PackIndexFile compatible index to refer to the + object""" + return self._object(None, True, index) + + #} END Read-Database like Interface + + #{ Interface + + def pack(self): + """:return: the underlying pack file instance""" + return self._pack + + def index(self): + """:return: the underlying pack index file instance""" + return self._index + + def is_valid_stream(self, sha, use_crc=False): + """ + Verify that the stream at the given sha is valid. + + :param use_crc: if True, the index' crc is run over the compressed stream of + the object, which is much faster than checking the sha1. It is also + more prone to unnoticed corruption or manipulation. + :param sha: 20 byte sha1 of the object whose stream to verify + whether the compressed stream of the object is valid. If it is + a delta, this only verifies that the delta's data is valid, not the + data of the actual undeltified object, as it depends on more than + just this stream. + If False, the object will be decompressed and the sha generated. It must + match the given sha + + :return: True if the stream is valid + :raise UnsupportedOperation: If the index is version 1 only + :raise BadObject: sha was not found""" + if use_crc: + if self._index.version() < 2: + raise UnsupportedOperation("Version 1 indices do not contain crc's, verify by sha instead") + # END handle index version + + index = self._sha_to_index(sha) + offset = self._index.offset(index) + next_offset = self._offset_map[offset] + crc_value = self._index.crc(index) + + # create the current crc value, on the compressed object data + # Read it in chunks, without copying the data + crc_update = zlib.crc32 + pack_data = self._pack.data() + cur_pos = offset + this_crc_value = 0 + while cur_pos < next_offset: + rbound = min(cur_pos + chunk_size, next_offset) + size = rbound - cur_pos + this_crc_value = crc_update(buffer(pack_data, cur_pos, size), this_crc_value) + cur_pos += size + # END window size loop + + # crc returns signed 32 bit numbers, the AND op forces it into unsigned + # mode ... wow, sneaky, from dulwich. + return (this_crc_value & 0xffffffff) == crc_value + else: + shawriter = Sha1Writer() + stream = self._object(sha, as_stream=True) + # write a loose object, which is the basis for the sha + write_object(stream.type, stream.size, stream.read, shawriter.write) + + assert shawriter.sha(as_hex=False) == sha + return shawriter.sha(as_hex=False) == sha + # END handle crc/sha verification + return True + + def info_iter(self): + """ + :return: Iterator over all objects in this pack. The iterator yields + OInfo instances""" + return self._iter_objects(as_stream=False) + + def stream_iter(self): + """ + :return: iterator over all objects in this pack. The iterator yields + OStream instances""" + return self._iter_objects(as_stream=True) + + def collect_streams_at_offset(self, offset): + """ + As the version in the PackFile, but can resolve REF deltas within this pack + For more info, see ``collect_streams`` + + :param offset: offset into the pack file at which the object can be found""" + streams = self._pack.collect_streams(offset) + + # try to resolve the last one if needed. 
It is assumed to be either + # a REF delta, or a base object, as OFFSET deltas are resolved by the pack + if streams[-1].type_id == REF_DELTA: + stream = streams[-1] + while stream.type_id in delta_types: + if stream.type_id == REF_DELTA: + sindex = self._index.sha_to_index(to_bytes(stream.delta_info)) + if sindex is None: + break + stream = self._pack.stream(self._index.offset(sindex)) + streams.append(stream) + else: + # must be another OFS DELTA - this could happen if a REF + # delta we resolve previously points to an OFS delta. Who + # would do that ;) ? We can handle it though + stream = self._pack.stream(stream.delta_info) + streams.append(stream) + # END handle ref delta + # END resolve ref streams + # END resolve streams + + return streams + + def collect_streams(self, sha): + """ + As ``PackFile.collect_streams``, but takes a sha instead of an offset. + Additionally, ref_delta streams will be resolved within this pack. + If this is not possible, the stream will be left alone, hence it is adivsed + to check for unresolved ref-deltas and resolve them before attempting to + construct a delta stream. + + :param sha: 20 byte sha1 specifying the object whose related streams you want to collect + :return: list of streams, first being the actual object delta, the last being + a possibly unresolved base object. + :raise BadObject:""" + return self.collect_streams_at_offset(self._index.offset(self._sha_to_index(sha))) + + @classmethod + def write_pack(cls, object_iter, pack_write, index_write=None, + object_count=None, zlib_compression=zlib.Z_BEST_SPEED): + """ + Create a new pack by putting all objects obtained by the object_iterator + into a pack which is written using the pack_write method. + The respective index is produced as well if index_write is not Non. + + :param object_iter: iterator yielding odb output objects + :param pack_write: function to receive strings to write into the pack stream + :param indx_write: if not None, the function writes the index file corresponding + to the pack. + :param object_count: if you can provide the amount of objects in your iteration, + this would be the place to put it. Otherwise we have to pre-iterate and store + all items into a list to get the number, which uses more memory than necessary. + :param zlib_compression: the zlib compression level to use + :return: tuple(pack_sha, index_binsha) binary sha over all the contents of the pack + and over all contents of the index. If index_write was None, index_binsha will be None + + **Note:** The destination of the write functions is up to the user. 
It could + be a socket, or a file for instance + + **Note:** writes only undeltified objects""" + objs = object_iter + if not object_count: + if not isinstance(object_iter, (tuple, list)): + objs = list(object_iter) + # END handle list type + object_count = len(objs) + # END handle object + + pack_writer = FlexibleSha1Writer(pack_write) + pwrite = pack_writer.write + ofs = 0 # current offset into the pack file + index = None + wants_index = index_write is not None + + # write header + pwrite(pack('>LLL', PackFile.pack_signature, PackFile.pack_version_default, object_count)) + ofs += 12 + + if wants_index: + index = IndexWriter() + # END handle index header + + actual_count = 0 + for obj in objs: + actual_count += 1 + crc = 0 + + # object header + hdr = create_pack_object_header(obj.type_id, obj.size) + if index_write: + crc = crc32(hdr) + else: + crc = None + # END handle crc + pwrite(hdr) + + # data stream + zstream = zlib.compressobj(zlib_compression) + ostream = obj.stream + br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc=crc) + assert(br == obj.size) + if wants_index: + index.append(obj.binsha, crc, ofs) + # END handle index + + ofs += len(hdr) + bw + if actual_count == object_count: + break + # END abort once we are done + # END for each object + + if actual_count != object_count: + raise ValueError( + "Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) + # END count assertion + + # write footer + pack_sha = pack_writer.sha(as_hex=False) + assert len(pack_sha) == 20 + pack_write(pack_sha) + ofs += len(pack_sha) # just for completeness ;) + + index_sha = None + if wants_index: + index_sha = index.write(pack_sha, index_write) + # END handle index + + return pack_sha, index_sha + + @classmethod + def create(cls, object_iter, base_dir, object_count=None, zlib_compression=zlib.Z_BEST_SPEED): + """Create a new on-disk entity comprised of a properly named pack file and a properly named + and corresponding index file. The pack contains all OStream objects contained in object iter. 
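+
+        A usage sketch only - ``object_streams`` stands in for a list of
+        undeltified OStream objects you already have::
+
+            entity = PackEntity.create(object_streams, '/tmp/packs',
+                                       object_count=len(object_streams))
+            assert entity.index().size() == len(object_streams)
+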
+ :param base_dir: directory which is to contain the files + :return: PackEntity instance initialized with the new pack + + **Note:** for more information on the other parameters see the write_pack method""" + pack_fd, pack_path = tempfile.mkstemp('', 'pack', base_dir) + index_fd, index_path = tempfile.mkstemp('', 'index', base_dir) + pack_write = lambda d: os.write(pack_fd, d) + index_write = lambda d: os.write(index_fd, d) + + pack_binsha, index_binsha = cls.write_pack(object_iter, pack_write, index_write, object_count, zlib_compression) + os.close(pack_fd) + os.close(index_fd) + + fmt = "pack-%s.%s" + new_pack_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'pack')) + new_index_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'idx')) + os.rename(pack_path, new_pack_path) + os.rename(index_path, new_index_path) + + return cls(new_pack_path) + + #} END interface diff --git a/libs/gitdb/stream.py b/libs/gitdb/stream.py new file mode 100644 index 000000000..2f4c12dfb --- /dev/null +++ b/libs/gitdb/stream.py @@ -0,0 +1,732 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php + +from io import BytesIO + +import mmap +import os +import sys +import zlib + +from gitdb.fun import ( + msb_size, + stream_copy, + apply_delta_data, + connect_deltas, + delta_types +) + +from gitdb.util import ( + allocate_memory, + LazyMixin, + make_sha, + write, + close, +) + +from gitdb.const import NULL_BYTE, BYTE_SPACE +from gitdb.utils.compat import buffer +from gitdb.utils.encoding import force_bytes + +has_perf_mod = False +PY26 = sys.version_info[:2] < (2, 7) +try: + from gitdb_speedups._perf import apply_delta as c_apply_delta + has_perf_mod = True +except ImportError: + pass + +__all__ = ('DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader', + 'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer', + 'FDStream', 'NullStream') + + +#{ RO Streams + +class DecompressMemMapReader(LazyMixin): + + """Reads data in chunks from a memory map and decompresses it. The client sees + only the uncompressed data, respective file-like read calls are handling on-demand + buffered decompression accordingly + + A constraint on the total size of bytes is activated, simulating + a logical file within a possibly larger physical memory area + + To read efficiently, you clearly don't want to read individual bytes, instead, + read a few kilobytes at least. + + **Note:** The chunk-size should be carefully selected as it will involve quite a bit + of string copying due to the way the zlib is implemented. Its very wasteful, + hence we try to find a good tradeoff between allocation time and number of + times we actually allocate. An own zlib implementation would be good here + to better support streamed reading - it would only need to keep the mmap + and decompress it into chunks, that's all ... 
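+
+    A rough usage sketch, assuming ``raw`` is a buffer or memory map holding a
+    zlib-compressed loose object (header plus content)::
+
+        type_name, size, stream = DecompressMemMapReader.new(raw)
+        content = stream.read(size)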
""" + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close', + '_cbr', '_phi') + + max_read_size = 512 * 1024 # currently unused + + def __init__(self, m, close_on_deletion, size=None): + """Initialize with mmap for stream reading + :param m: must be content data - use new if you have object data and no size""" + self._m = m + self._zip = zlib.decompressobj() + self._buf = None # buffer of decompressed bytes + self._buflen = 0 # length of bytes in buffer + if size is not None: + self._s = size # size of uncompressed data to read in total + self._br = 0 # num uncompressed bytes read + self._cws = 0 # start byte of compression window + self._cwe = 0 # end byte of compression window + self._cbr = 0 # number of compressed bytes read + self._phi = False # is True if we parsed the header info + self._close = close_on_deletion # close the memmap on deletion ? + + def _set_cache_(self, attr): + assert attr == '_s' + # only happens for size, which is a marker to indicate we still + # have to parse the header from the stream + self._parse_header_info() + + def __del__(self): + self.close() + + def _parse_header_info(self): + """If this stream contains object data, parse the header info and skip the + stream to a point where each read will yield object content + + :return: parsed type_string, size""" + # read header + # should really be enough, cgit uses 8192 I believe + # And for good reason !! This needs to be that high for the header to be read correctly in all cases + maxb = 8192 + self._s = maxb + hdr = self.read(maxb) + hdrend = hdr.find(NULL_BYTE) + typ, size = hdr[:hdrend].split(BYTE_SPACE) + size = int(size) + self._s = size + + # adjust internal state to match actual header length that we ignore + # The buffer will be depleted first on future reads + self._br = 0 + hdrend += 1 + self._buf = BytesIO(hdr[hdrend:]) + self._buflen = len(hdr) - hdrend + + self._phi = True + + return typ, size + + #{ Interface + + @classmethod + def new(self, m, close_on_deletion=False): + """Create a new DecompressMemMapReader instance for acting as a read-only stream + This method parses the object header from m and returns the parsed + type and size, as well as the created stream instance. + + :param m: memory map on which to operate. It must be object data ( header + contents ) + :param close_on_deletion: if True, the memory map will be closed once we are + being deleted""" + inst = DecompressMemMapReader(m, close_on_deletion, 0) + typ, size = inst._parse_header_info() + return typ, size, inst + + def data(self): + """:return: random access compatible data we are working on""" + return self._m + + def close(self): + """Close our underlying stream of compressed bytes if this was allowed during initialization + :return: True if we closed the underlying stream + :note: can be called safely + """ + if self._close: + if hasattr(self._m, 'close'): + self._m.close() + self._close = False + # END handle resource freeing + + def compressed_bytes_read(self): + """ + :return: number of compressed bytes read. This includes the bytes it + took to decompress the header ( if there was one )""" + # ABSTRACT: When decompressing a byte stream, it can be that the first + # x bytes which were requested match the first x bytes in the loosely + # compressed datastream. This is the worst-case assumption that the reader + # does, it assumes that it will get at least X bytes from X compressed bytes + # in call cases. 
+ # The caveat is that the object, according to our known uncompressed size, + # is already complete, but there are still some bytes left in the compressed + # stream that contribute to the amount of compressed bytes. + # How can we know that we are truly done, and have read all bytes we need + # to read ? + # Without help, we cannot know, as we need to obtain the status of the + # decompression. If it is not finished, we need to decompress more data + # until it is finished, to yield the actual number of compressed bytes + # belonging to the decompressed object + # We are using a custom zlib module for this, if its not present, + # we try to put in additional bytes up for decompression if feasible + # and check for the unused_data. + + # Only scrub the stream forward if we are officially done with the + # bytes we were to have. + if self._br == self._s and not self._zip.unused_data: + # manipulate the bytes-read to allow our own read method to continue + # but keep the window at its current position + self._br = 0 + if hasattr(self._zip, 'status'): + while self._zip.status == zlib.Z_OK: + self.read(mmap.PAGESIZE) + # END scrub-loop custom zlib + else: + # pass in additional pages, until we have unused data + while not self._zip.unused_data and self._cbr != len(self._m): + self.read(mmap.PAGESIZE) + # END scrub-loop default zlib + # END handle stream scrubbing + + # reset bytes read, just to be sure + self._br = self._s + # END handle stream scrubbing + + # unused data ends up in the unconsumed tail, which was removed + # from the count already + return self._cbr + + #} END interface + + def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): + """Allows to reset the stream to restart reading + :raise ValueError: If offset and whence are not 0""" + if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): + raise ValueError("Can only seek to position 0") + # END handle offset + + self._zip = zlib.decompressobj() + self._br = self._cws = self._cwe = self._cbr = 0 + if self._phi: + self._phi = False + del(self._s) # trigger header parsing on first access + # END skip header + + def read(self, size=-1): + if size < 1: + size = self._s - self._br + else: + size = min(size, self._s - self._br) + # END clamp size + + if size == 0: + return bytes() + # END handle depletion + + # deplete the buffer, then just continue using the decompress object + # which has an own buffer. We just need this to transparently parse the + # header from the zlib stream + dat = bytes() + if self._buf: + if self._buflen >= size: + # have enough data + dat = self._buf.read(size) + self._buflen -= size + self._br += size + return dat + else: + dat = self._buf.read() # ouch, duplicates data + size -= self._buflen + self._br += self._buflen + + self._buflen = 0 + self._buf = None + # END handle buffer len + # END handle buffer + + # decompress some data + # Abstract: zlib needs to operate on chunks of our memory map ( which may + # be large ), as it will otherwise and always fill in the 'unconsumed_tail' + # attribute which possible reads our whole map to the end, forcing + # everything to be read from disk even though just a portion was requested. + # As this would be a nogo, we workaround it by passing only chunks of data, + # moving the window into the memory map along as we decompress, which keeps + # the tail smaller than our chunk-size. This causes 'only' the chunk to be + # copied once, and another copy of a part of it when it creates the unconsumed + # tail. 
We have to use it to hand in the appropriate amount of bytes during + # the next read. + tail = self._zip.unconsumed_tail + if tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would safe some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. + # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. + self._cws = self._cwe - len(tail) + self._cwe = self._cws + size + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + size + # END handle tail + + # if window is too small, make it larger so zip can decompress something + if self._cwe - self._cws < 8: + self._cwe = self._cws + 8 + # END adjust winsize + + # takes a slice, but doesn't copy the data, it says ... + indata = buffer(self._m, self._cws, self._cwe - self._cws) + + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) + dcompdat = self._zip.decompress(indata, size) + # update the amount of compressed bytes read + # We feed possibly overlapping chunks, which is why the unconsumed tail + # has to be taken into consideration, as well as the unused data + # if we hit the end of the stream + # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. + # They are thorough, and I assume it is truly working. + # Why is this logic as convoluted as it is ? Please look at the table in + # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. + # Bascially, on py2.6, you want to use branch 1, whereas on all other python version, the second branch + # will be the one that works. + # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the + # table in the github issue. This is it ... it was the only way I could make this work everywhere. + # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . + if PY26 or ((zlib.ZLIB_VERSION == '1.2.7' or zlib.ZLIB_VERSION == '1.2.5') and not sys.platform == 'darwin'): + unused_datalen = len(self._zip.unconsumed_tail) + else: + unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) + # # end handle very special case ... + + self._cbr += len(indata) - unused_datalen + self._br += len(dcompdat) + + if dat: + dcompdat = dat + dcompdat + # END prepend our cached data + + # it can happen, depending on the compression, that we get less bytes + # than ordered as it needs the final portion of the data as well. + # Recursively resolve that. + # Note: dcompdat can be empty even though we still appear to have bytes + # to read, if we are called by compressed_bytes_read - it manipulates + # us to empty the stream + if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s: + dcompdat += self.read(size - len(dcompdat)) + # END handle special case + return dcompdat + + +class DeltaApplyReader(LazyMixin): + + """A reader which dynamically applies pack deltas to a base object, keeping the + memory demands to a minimum. + + The size of the final object is only obtainable once all deltas have been + applied, unless it is retrieved from a pack index. 
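+
+    A usage sketch, assuming ``streams`` is a list such as the one returned by
+    ``PackEntity.collect_streams`` (deltas first, a fully resolved base object
+    last)::
+
+        reader = DeltaApplyReader.new(streams)
+        data = reader.read(reader.size)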
+ + The uncompressed Delta has the following layout (MSB being a most significant + bit encoded dynamic size): + + * MSB Source Size - the size of the base against which the delta was created + * MSB Target Size - the size of the resulting data after the delta was applied + * A list of one byte commands (cmd) which are followed by a specific protocol: + + * cmd & 0x80 - copy delta_data[offset:offset+size] + + * Followed by an encoded offset into the delta data + * Followed by an encoded size of the chunk to copy + + * cmd & 0x7f - insert + + * insert cmd bytes from the delta buffer into the output stream + + * cmd == 0 - invalid operation ( or error in delta stream ) + """ + __slots__ = ( + "_bstream", # base stream to which to apply the deltas + "_dstreams", # tuple of delta stream readers + "_mm_target", # memory map of the delta-applied data + "_size", # actual number of bytes in _mm_target + "_br" # number of bytes read + ) + + #{ Configuration + k_max_memory_move = 250 * 1000 * 1000 + #} END configuration + + def __init__(self, stream_list): + """Initialize this instance with a list of streams, the first stream being + the delta to apply on top of all following deltas, the last stream being the + base object onto which to apply the deltas""" + assert len(stream_list) > 1, "Need at least one delta and one base stream" + + self._bstream = stream_list[-1] + self._dstreams = tuple(stream_list[:-1]) + self._br = 0 + + def _set_cache_too_slow_without_c(self, attr): + # the direct algorithm is fastest and most direct if there is only one + # delta. Also, the extra overhead might not be worth it for items smaller + # than X - definitely the case in python, every function call costs + # huge amounts of time + # if len(self._dstreams) * self._bstream.size < self.k_max_memory_move: + if len(self._dstreams) == 1: + return self._set_cache_brute_(attr) + + # Aggregate all deltas into one delta in reverse order. Hence we take + # the last delta, and reverse-merge its ancestor delta, until we receive + # the final delta data stream. + dcl = connect_deltas(self._dstreams) + + # call len directly, as the (optional) c version doesn't implement the sequence + # protocol + if dcl.rbound() == 0: + self._size = 0 + self._mm_target = allocate_memory(0) + return + # END handle empty list + + self._size = dcl.rbound() + self._mm_target = allocate_memory(self._size) + + bbuf = allocate_memory(self._bstream.size) + stream_copy(self._bstream.read, bbuf.write, self._bstream.size, 256 * mmap.PAGESIZE) + + # APPLY CHUNKS + write = self._mm_target.write + dcl.apply(bbuf, write) + + self._mm_target.seek(0) + + def _set_cache_brute_(self, attr): + """If we are here, we apply the actual deltas""" + # TODO: There should be a special case if there is only one stream + # Then the default-git algorithm should perform a tad faster, as the + # delta is not peaked into, causing less overhead. 
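+
+        # The two leading varints of each delta are decoded by msb_size(); as an
+        # arbitrary example, msb_size(b'\x90\x01') == (2, 144): the low 7 bits of
+        # 0x90 give 16, its set MSB pulls in 0x01 which adds 1 << 7, so this first
+        # varint decodes to 144 and parsing continues at offset 2, where the
+        # target-size varint follows.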
+ buffer_info_list = list() + max_target_size = 0 + for dstream in self._dstreams: + buf = dstream.read(512) # read the header information + X + offset, src_size = msb_size(buf) + offset, target_size = msb_size(buf, offset) + buffer_info_list.append((buffer(buf, offset), offset, src_size, target_size)) + max_target_size = max(max_target_size, target_size) + # END for each delta stream + + # sanity check - the first delta to apply should have the same source + # size as our actual base stream + base_size = self._bstream.size + target_size = max_target_size + + # if we have more than 1 delta to apply, we will swap buffers, hence we must + # assure that all buffers we use are large enough to hold all the results + if len(self._dstreams) > 1: + base_size = target_size = max(base_size, max_target_size) + # END adjust buffer sizes + + # Allocate private memory map big enough to hold the first base buffer + # We need random access to it + bbuf = allocate_memory(base_size) + stream_copy(self._bstream.read, bbuf.write, base_size, 256 * mmap.PAGESIZE) + + # allocate memory map large enough for the largest (intermediate) target + # We will use it as scratch space for all delta ops. If the final + # target buffer is smaller than our allocated space, we just use parts + # of it upon return. + tbuf = allocate_memory(target_size) + + # for each delta to apply, memory map the decompressed delta and + # work on the op-codes to reconstruct everything. + # For the actual copying, we use a seek and write pattern of buffer + # slices. + final_target_size = None + for (dbuf, offset, src_size, target_size), dstream in zip(reversed(buffer_info_list), reversed(self._dstreams)): + # allocate a buffer to hold all delta data - fill in the data for + # fast access. We do this as we know that reading individual bytes + # from our stream would be slower than necessary ( although possible ) + # The dbuf buffer contains commands after the first two MSB sizes, the + # offset specifies the amount of bytes read to get the sizes. + ddata = allocate_memory(dstream.size - offset) + ddata.write(dbuf) + # read the rest from the stream. The size we give is larger than necessary + stream_copy(dstream.read, ddata.write, dstream.size, 256 * mmap.PAGESIZE) + + ####################################################################### + if 'c_apply_delta' in globals(): + c_apply_delta(bbuf, ddata, tbuf) + else: + apply_delta_data(bbuf, src_size, ddata, len(ddata), tbuf.write) + ####################################################################### + + # finally, swap out source and target buffers. The target is now the + # base for the next delta to apply + bbuf, tbuf = tbuf, bbuf + bbuf.seek(0) + tbuf.seek(0) + final_target_size = target_size + # END for each delta to apply + + # its already seeked to 0, constrain it to the actual size + # NOTE: in the end of the loop, it swaps buffers, hence our target buffer + # is not tbuf, but bbuf ! 
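+        # Concretely (hypothetical chain): with deltas [D2, D1] and base B, the
+        # loop first applies D1 to B writing into tbuf, swaps, then applies D2 to
+        # that result - so after the final swap the finished data sits in bbuf.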
+ self._mm_target = bbuf + self._size = final_target_size + + #{ Configuration + if not has_perf_mod: + _set_cache_ = _set_cache_brute_ + else: + _set_cache_ = _set_cache_too_slow_without_c + + #} END configuration + + def read(self, count=0): + bl = self._size - self._br # bytes left + if count < 1 or count > bl: + count = bl + # NOTE: we could check for certain size limits, and possibly + # return buffers instead of strings to prevent byte copying + data = self._mm_target.read(count) + self._br += len(data) + return data + + def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): + """Allows to reset the stream to restart reading + + :raise ValueError: If offset and whence are not 0""" + if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): + raise ValueError("Can only seek to position 0") + # END handle offset + self._br = 0 + self._mm_target.seek(0) + + #{ Interface + + @classmethod + def new(cls, stream_list): + """ + Convert the given list of streams into a stream which resolves deltas + when reading from it. + + :param stream_list: two or more stream objects, first stream is a Delta + to the object that you want to resolve, followed by N additional delta + streams. The list's last stream must be a non-delta stream. + + :return: Non-Delta OPackStream object whose stream can be used to obtain + the decompressed resolved data + :raise ValueError: if the stream list cannot be handled""" + if len(stream_list) < 2: + raise ValueError("Need at least two streams") + # END single object special handling + + if stream_list[-1].type_id in delta_types: + raise ValueError( + "Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type) + # END check stream + return cls(stream_list) + + #} END interface + + #{ OInfo like Interface + + @property + def type(self): + return self._bstream.type + + @property + def type_id(self): + return self._bstream.type_id + + @property + def size(self): + """:return: number of uncompressed bytes in the stream""" + return self._size + + #} END oinfo like interface + + +#} END RO streams + + +#{ W Streams + +class Sha1Writer(object): + + """Simple stream writer which produces a sha whenever you like as it degests + everything it is supposed to write""" + __slots__ = "sha1" + + def __init__(self): + self.sha1 = make_sha() + + #{ Stream Interface + + def write(self, data): + """:raise IOError: If not all bytes could be written + :param data: byte object + :return: length of incoming data""" + + self.sha1.update(data) + + return len(data) + + # END stream interface + + #{ Interface + + def sha(self, as_hex=False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + #} END interface + + +class FlexibleSha1Writer(Sha1Writer): + + """Writer producing a sha1 while passing on the written bytes to the given + write function""" + __slots__ = 'writer' + + def __init__(self, writer): + Sha1Writer.__init__(self) + self.writer = writer + + def write(self, data): + Sha1Writer.write(self, data) + self.writer(data) + + +class ZippedStoreShaWriter(Sha1Writer): + + """Remembers everything someone writes to it and generates a sha""" + __slots__ = ('buf', 'zip') + + def __init__(self): + Sha1Writer.__init__(self) + self.buf = BytesIO() + self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) + + def __getattr__(self, attr): + return getattr(self.buf, attr) + + def write(self, data): + alen = Sha1Writer.write(self, data) + 
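# -----------------------------------------------------------------------------
# Editorial sketch, not part of the original diff: how DeltaApplyReader.new()
# documented above is typically driven, mirroring the pack tests further down
# (PackFile.collect_streams() followed by DeltaApplyReader.new()). The path
# and function name are hypothetical; new() raises ValueError for chains it
# cannot resolve, e.g. ref-deltas without an index.

from gitdb.pack import PackFile
from gitdb.stream import DeltaApplyReader

def resolved_object_data(pack_path, pack_offset):
    """Return the fully undeltified data of the object at pack_offset."""
    pack = PackFile(pack_path)                    # an existing .pack file
    streams = pack.collect_streams(pack_offset)   # delta chain, base stream last
    if len(streams) < 2:
        return streams[0].read()                  # not a delta at all
    return DeltaApplyReader.new(streams).read()
# -----------------------------------------------------------------------------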
self.buf.write(self.zip.compress(data)) + + return alen + + def close(self): + self.buf.write(self.zip.flush()) + + def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): + """Seeking currently only supports to rewind written data + Multiple writes are not supported""" + if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): + raise ValueError("Can only seek to position 0") + # END handle offset + self.buf.seek(0) + + def getvalue(self): + """:return: string value from the current stream position to the end""" + return self.buf.getvalue() + + +class FDCompressedSha1Writer(Sha1Writer): + + """Digests data written to it, making the sha available, then compress the + data and write it to the file descriptor + + **Note:** operates on raw file descriptors + **Note:** for this to work, you have to use the close-method of this instance""" + __slots__ = ("fd", "sha1", "zip") + + # default exception + exc = IOError("Failed to write all bytes to filedescriptor") + + def __init__(self, fd): + super(FDCompressedSha1Writer, self).__init__() + self.fd = fd + self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) + + #{ Stream Interface + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: length of incoming data""" + self.sha1.update(data) + cdata = self.zip.compress(data) + bytes_written = write(self.fd, cdata) + + if bytes_written != len(cdata): + raise self.exc + + return len(data) + + def close(self): + remainder = self.zip.flush() + if write(self.fd, remainder) != len(remainder): + raise self.exc + return close(self.fd) + + #} END stream interface + + +class FDStream(object): + + """A simple wrapper providing the most basic functions on a file descriptor + with the fileobject interface. Cannot use os.fdopen as the resulting stream + takes ownership""" + __slots__ = ("_fd", '_pos') + + def __init__(self, fd): + self._fd = fd + self._pos = 0 + + def write(self, data): + self._pos += len(data) + os.write(self._fd, data) + + def read(self, count=0): + if count == 0: + count = os.path.getsize(self._filepath) + # END handle read everything + + bytes = os.read(self._fd, count) + self._pos += len(bytes) + return bytes + + def fileno(self): + return self._fd + + def tell(self): + return self._pos + + def close(self): + close(self._fd) + + +class NullStream(object): + + """A stream that does nothing but providing a stream interface. 
+ Use it like /dev/null""" + __slots__ = tuple() + + def read(self, size=0): + return '' + + def close(self): + pass + + def write(self, data): + return len(data) + + +#} END W streams diff --git a/libs/gitdb/test/__init__.py b/libs/gitdb/test/__init__.py new file mode 100644 index 000000000..8a681e428 --- /dev/null +++ b/libs/gitdb/test/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php diff --git a/libs/gitdb/test/lib.py b/libs/gitdb/test/lib.py new file mode 100644 index 000000000..bbdd241cd --- /dev/null +++ b/libs/gitdb/test/lib.py @@ -0,0 +1,208 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Utilities used in ODB testing""" +from gitdb import OStream +from gitdb.utils.compat import xrange + +import sys +import random +from array import array + +from io import BytesIO + +import glob +import unittest +import tempfile +import shutil +import os +import gc +import logging +from functools import wraps + + +#{ Bases + +class TestBase(unittest.TestCase): + """Base class for all tests + + TestCase providing access to readonly repositories using the following member variables. + + * gitrepopath + + * read-only base path of the git source repository, i.e. .../git/.git + """ + + #{ Invvariants + k_env_git_repo = "GITDB_TEST_GIT_REPO_BASE" + #} END invariants + + @classmethod + def setUpClass(cls): + try: + super(TestBase, cls).setUpClass() + except AttributeError: + pass + + cls.gitrepopath = os.environ.get(cls.k_env_git_repo) + if not cls.gitrepopath: + logging.info( + "You can set the %s environment variable to a .git repository of your choice - defaulting to the gitdb repository", cls.k_env_git_repo) + ospd = os.path.dirname + cls.gitrepopath = os.path.join(ospd(ospd(ospd(__file__))), '.git') + # end assure gitrepo is set + assert cls.gitrepopath.endswith('.git') + + +#} END bases + +#{ Decorators + +def skip_on_travis_ci(func): + """All tests decorated with this one will raise SkipTest when run on travis ci. + Use it to workaround difficult to solve issues + NOTE: copied from bcore (https://github.com/Byron/bcore)""" + @wraps(func) + def wrapper(self, *args, **kwargs): + if 'TRAVIS' in os.environ: + import nose + raise nose.SkipTest("Cannot run on travis-ci") + # end check for travis ci + return func(self, *args, **kwargs) + # end wrapper + return wrapper + + +def with_rw_directory(func): + """Create a temporary directory which can be written to, remove it if the + test succeeds, but leave it otherwise to aid additional debugging""" + + def wrapper(self): + path = tempfile.mktemp(prefix=func.__name__) + os.mkdir(path) + keep = False + try: + try: + return func(self, path) + except Exception: + sys.stderr.write("Test %s.%s failed, output is at %r\n" % (type(self).__name__, func.__name__, path)) + keep = True + raise + finally: + # Need to collect here to be sure all handles have been closed. It appears + # a windows-only issue. In fact things should be deleted, as well as + # memory maps closed, once objects go out of scope. For some reason + # though this is not the case here unless we collect explicitly. 
+ if not keep: + gc.collect() + shutil.rmtree(path) + # END handle exception + # END wrapper + + wrapper.__name__ = func.__name__ + return wrapper + + +def with_packs_rw(func): + """Function that provides a path into which the packs for testing should be + copied. Will pass on the path to the actual function afterwards""" + + def wrapper(self, path): + src_pack_glob = fixture_path('packs/*') + copy_files_globbed(src_pack_glob, path, hard_link_ok=True) + return func(self, path) + # END wrapper + + wrapper.__name__ = func.__name__ + return wrapper + +#} END decorators + +#{ Routines + + +def fixture_path(relapath=''): + """:return: absolute path into the fixture directory + :param relapath: relative path into the fixtures directory, or '' + to obtain the fixture directory itself""" + return os.path.join(os.path.dirname(__file__), 'fixtures', relapath) + + +def copy_files_globbed(source_glob, target_dir, hard_link_ok=False): + """Copy all files found according to the given source glob into the target directory + :param hard_link_ok: if True, hard links will be created if possible. Otherwise + the files will be copied""" + for src_file in glob.glob(source_glob): + if hard_link_ok and hasattr(os, 'link'): + target = os.path.join(target_dir, os.path.basename(src_file)) + try: + os.link(src_file, target) + except OSError: + shutil.copy(src_file, target_dir) + # END handle cross device links ( and resulting failure ) + else: + shutil.copy(src_file, target_dir) + # END try hard link + # END for each file to copy + + +def make_bytes(size_in_bytes, randomize=False): + """:return: string with given size in bytes + :param randomize: try to produce a very random stream""" + actual_size = size_in_bytes // 4 + producer = xrange(actual_size) + if randomize: + producer = list(producer) + random.shuffle(producer) + # END randomize + a = array('i', producer) + return a.tostring() + + +def make_object(type, data): + """:return: bytes resembling an uncompressed object""" + odata = "blob %i\0" % len(data) + return odata.encode("ascii") + data + + +def make_memory_file(size_in_bytes, randomize=False): + """:return: tuple(size_of_stream, stream) + :param randomize: try to produce a very random stream""" + d = make_bytes(size_in_bytes, randomize) + return len(d), BytesIO(d) + +#} END routines + +#{ Stream Utilities + + +class DummyStream(object): + + def __init__(self): + self.was_read = False + self.bytes = 0 + self.closed = False + + def read(self, size): + self.was_read = True + self.bytes = size + + def close(self): + self.closed = True + + def _assert(self): + assert self.was_read + + +class DeriveTest(OStream): + + def __init__(self, sha, type, size, stream, *args, **kwargs): + self.myarg = kwargs.pop('myarg') + self.args = args + + def _assert(self): + assert self.args + assert self.myarg + +#} END stream utilitiess diff --git a/libs/gitdb/test/test_base.py b/libs/gitdb/test/test_base.py new file mode 100644 index 000000000..519cdfdc1 --- /dev/null +++ b/libs/gitdb/test/test_base.py @@ -0,0 +1,105 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Test for object db""" +from gitdb.test.lib import ( + TestBase, + DummyStream, + DeriveTest, +) + +from gitdb import ( + OInfo, + OPackInfo, + ODeltaPackInfo, + OStream, + OPackStream, + ODeltaPackStream, + IStream +) +from gitdb.util import ( + NULL_BIN_SHA +) + +from gitdb.typ import ( + 
str_blob_type +) + + +class TestBaseTypes(TestBase): + + def test_streams(self): + # test info + sha = NULL_BIN_SHA + s = 20 + blob_id = 3 + + info = OInfo(sha, str_blob_type, s) + assert info.binsha == sha + assert info.type == str_blob_type + assert info.type_id == blob_id + assert info.size == s + + # test pack info + # provides type_id + pinfo = OPackInfo(0, blob_id, s) + assert pinfo.type == str_blob_type + assert pinfo.type_id == blob_id + assert pinfo.pack_offset == 0 + + dpinfo = ODeltaPackInfo(0, blob_id, s, sha) + assert dpinfo.type == str_blob_type + assert dpinfo.type_id == blob_id + assert dpinfo.delta_info == sha + assert dpinfo.pack_offset == 0 + + # test ostream + stream = DummyStream() + ostream = OStream(*(info + (stream, ))) + assert ostream.stream is stream + ostream.read(15) + stream._assert() + assert stream.bytes == 15 + ostream.read(20) + assert stream.bytes == 20 + + # test packstream + postream = OPackStream(*(pinfo + (stream, ))) + assert postream.stream is stream + postream.read(10) + stream._assert() + assert stream.bytes == 10 + + # test deltapackstream + dpostream = ODeltaPackStream(*(dpinfo + (stream, ))) + dpostream.stream is stream + dpostream.read(5) + stream._assert() + assert stream.bytes == 5 + + # derive with own args + DeriveTest(sha, str_blob_type, s, stream, 'mine', myarg=3)._assert() + + # test istream + istream = IStream(str_blob_type, s, stream) + assert istream.binsha == None + istream.binsha = sha + assert istream.binsha == sha + + assert len(istream.binsha) == 20 + assert len(istream.hexsha) == 40 + + assert istream.size == s + istream.size = s * 2 + istream.size == s * 2 + assert istream.type == str_blob_type + istream.type = "something" + assert istream.type == "something" + assert istream.stream is stream + istream.stream = None + assert istream.stream is None + + assert istream.error is None + istream.error = Exception() + assert isinstance(istream.error, Exception) diff --git a/libs/gitdb/test/test_example.py b/libs/gitdb/test/test_example.py new file mode 100644 index 000000000..6e80bf5c6 --- /dev/null +++ b/libs/gitdb/test/test_example.py @@ -0,0 +1,43 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Module with examples from the tutorial section of the docs""" +import os +from gitdb.test.lib import TestBase +from gitdb import IStream +from gitdb.db import LooseObjectDB + +from io import BytesIO + + +class TestExamples(TestBase): + + def test_base(self): + ldb = LooseObjectDB(os.path.join(self.gitrepopath, 'objects')) + + for sha1 in ldb.sha_iter(): + oinfo = ldb.info(sha1) + ostream = ldb.stream(sha1) + assert oinfo[:3] == ostream[:3] + + assert len(ostream.read()) == ostream.size + assert ldb.has_object(oinfo.binsha) + # END for each sha in database + # assure we close all files + try: + del(ostream) + del(oinfo) + except UnboundLocalError: + pass + # END ignore exception if there are no loose objects + + data = "my data".encode("ascii") + istream = IStream("blob", len(data), BytesIO(data)) + + # the object does not yet have a sha + assert istream.binsha is None + ldb.store(istream) + # now the sha is set + assert len(istream.binsha) == 20 + assert ldb.has_object(istream.binsha) diff --git a/libs/gitdb/test/test_pack.py b/libs/gitdb/test/test_pack.py new file mode 100644 index 000000000..24e2a3134 --- /dev/null +++ b/libs/gitdb/test/test_pack.py @@ -0,0 +1,255 @@ +# 
Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Test everything about packs reading and writing""" +from gitdb.test.lib import ( + TestBase, + with_rw_directory, + fixture_path +) + +from gitdb.stream import DeltaApplyReader + +from gitdb.pack import ( + PackEntity, + PackIndexFile, + PackFile +) + +from gitdb.base import ( + OInfo, + OStream, +) + +from gitdb.fun import delta_types +from gitdb.exc import UnsupportedOperation +from gitdb.util import to_bin_sha +from gitdb.utils.compat import xrange + +try: + from itertools import izip +except ImportError: + izip = zip + +from nose import SkipTest + +import os +import tempfile + + +#{ Utilities +def bin_sha_from_filename(filename): + return to_bin_sha(os.path.splitext(os.path.basename(filename))[0][5:]) +#} END utilities + + +class TestPack(TestBase): + + packindexfile_v1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx'), 1, 67) + packindexfile_v2 = (fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx'), 2, 30) + packindexfile_v2_3_ascii = (fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx'), 2, 42) + packfile_v2_1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack'), 2, packindexfile_v1[2]) + packfile_v2_2 = (fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack'), 2, packindexfile_v2[2]) + packfile_v2_3_ascii = ( + fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack'), 2, packindexfile_v2_3_ascii[2]) + + def _assert_index_file(self, index, version, size): + assert index.packfile_checksum() != index.indexfile_checksum() + assert len(index.packfile_checksum()) == 20 + assert len(index.indexfile_checksum()) == 20 + assert index.version() == version + assert index.size() == size + assert len(index.offsets()) == size + + # get all data of all objects + for oidx in xrange(index.size()): + sha = index.sha(oidx) + assert oidx == index.sha_to_index(sha) + + entry = index.entry(oidx) + assert len(entry) == 3 + + assert entry[0] == index.offset(oidx) + assert entry[1] == sha + assert entry[2] == index.crc(oidx) + + # verify partial sha + for l in (4, 8, 11, 17, 20): + assert index.partial_sha_to_index(sha[:l], l * 2) == oidx + + # END for each object index in indexfile + self.failUnlessRaises(ValueError, index.partial_sha_to_index, "\0", 2) + + def _assert_pack_file(self, pack, version, size): + assert pack.version() == 2 + assert pack.size() == size + assert len(pack.checksum()) == 20 + + num_obj = 0 + for obj in pack.stream_iter(): + num_obj += 1 + info = pack.info(obj.pack_offset) + stream = pack.stream(obj.pack_offset) + + assert info.pack_offset == stream.pack_offset + assert info.type_id == stream.type_id + assert hasattr(stream, 'read') + + # it should be possible to read from both streams + assert obj.read() == stream.read() + + streams = pack.collect_streams(obj.pack_offset) + assert streams + + # read the stream + try: + dstream = DeltaApplyReader.new(streams) + except ValueError: + # ignore these, old git versions use only ref deltas, + # which we havent resolved ( as we are without an index ) + # Also ignore non-delta streams + continue + # END get deltastream + + # read all + data = dstream.read() + assert len(data) == dstream.size + + # test seek + dstream.seek(0) + assert dstream.read() == data + + # read chunks + # NOTE: the current implementation is 
safe, it basically transfers + # all calls to the underlying memory map + + # END for each object + assert num_obj == size + + def test_pack_index(self): + # check version 1 and 2 + for indexfile, version, size in (self.packindexfile_v1, self.packindexfile_v2): + index = PackIndexFile(indexfile) + self._assert_index_file(index, version, size) + # END run tests + + def test_pack(self): + # there is this special version 3, but apparently its like 2 ... + for packfile, version, size in (self.packfile_v2_3_ascii, self.packfile_v2_1, self.packfile_v2_2): + pack = PackFile(packfile) + self._assert_pack_file(pack, version, size) + # END for each pack to test + + @with_rw_directory + def test_pack_entity(self, rw_dir): + pack_objs = list() + for packinfo, indexinfo in ((self.packfile_v2_1, self.packindexfile_v1), + (self.packfile_v2_2, self.packindexfile_v2), + (self.packfile_v2_3_ascii, self.packindexfile_v2_3_ascii)): + packfile, version, size = packinfo + indexfile, version, size = indexinfo + entity = PackEntity(packfile) + assert entity.pack().path() == packfile + assert entity.index().path() == indexfile + pack_objs.extend(entity.stream_iter()) + + count = 0 + for info, stream in izip(entity.info_iter(), entity.stream_iter()): + count += 1 + assert info.binsha == stream.binsha + assert len(info.binsha) == 20 + assert info.type_id == stream.type_id + assert info.size == stream.size + + # we return fully resolved items, which is implied by the sha centric access + assert not info.type_id in delta_types + + # try all calls + assert len(entity.collect_streams(info.binsha)) + oinfo = entity.info(info.binsha) + assert isinstance(oinfo, OInfo) + assert oinfo.binsha is not None + ostream = entity.stream(info.binsha) + assert isinstance(ostream, OStream) + assert ostream.binsha is not None + + # verify the stream + try: + assert entity.is_valid_stream(info.binsha, use_crc=True) + except UnsupportedOperation: + pass + # END ignore version issues + assert entity.is_valid_stream(info.binsha, use_crc=False) + # END for each info, stream tuple + assert count == size + + # END for each entity + + # pack writing - write all packs into one + # index path can be None + pack_path1 = tempfile.mktemp('', "pack1", rw_dir) + pack_path2 = tempfile.mktemp('', "pack2", rw_dir) + index_path = tempfile.mktemp('', 'index', rw_dir) + iteration = 0 + + def rewind_streams(): + for obj in pack_objs: + obj.stream.seek(0) + # END utility + for ppath, ipath, num_obj in zip((pack_path1, pack_path2), + (index_path, None), + (len(pack_objs), None)): + iwrite = None + if ipath: + ifile = open(ipath, 'wb') + iwrite = ifile.write + # END handle ip + + # make sure we rewind the streams ... 
we work on the same objects over and over again + if iteration > 0: + rewind_streams() + # END rewind streams + iteration += 1 + + with open(ppath, 'wb') as pfile: + pack_sha, index_sha = PackEntity.write_pack(pack_objs, pfile.write, iwrite, object_count=num_obj) + assert os.path.getsize(ppath) > 100 + + # verify pack + pf = PackFile(ppath) + assert pf.size() == len(pack_objs) + assert pf.version() == PackFile.pack_version_default + assert pf.checksum() == pack_sha + pf.close() + + # verify index + if ipath is not None: + ifile.close() + assert os.path.getsize(ipath) > 100 + idx = PackIndexFile(ipath) + assert idx.version() == PackIndexFile.index_version_default + assert idx.packfile_checksum() == pack_sha + assert idx.indexfile_checksum() == index_sha + assert idx.size() == len(pack_objs) + idx.close() + # END verify files exist + # END for each packpath, indexpath pair + + # verify the packs thoroughly + rewind_streams() + entity = PackEntity.create(pack_objs, rw_dir) + count = 0 + for info in entity.info_iter(): + count += 1 + for use_crc in range(2): + assert entity.is_valid_stream(info.binsha, use_crc) + # END for each crc mode + # END for each info + assert count == len(pack_objs) + entity.close() + + def test_pack_64(self): + # TODO: hex-edit a pack helping us to verify that we can handle 64 byte offsets + # of course without really needing such a huge pack + raise SkipTest() diff --git a/libs/gitdb/test/test_stream.py b/libs/gitdb/test/test_stream.py new file mode 100644 index 000000000..96268252d --- /dev/null +++ b/libs/gitdb/test/test_stream.py @@ -0,0 +1,164 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Test for object db""" + +from gitdb.test.lib import ( + TestBase, + DummyStream, + make_bytes, + make_object, + fixture_path +) + +from gitdb import ( + DecompressMemMapReader, + FDCompressedSha1Writer, + LooseObjectDB, + Sha1Writer, + MemoryDB, + IStream, +) +from gitdb.util import hex_to_bin + +import zlib +from gitdb.typ import ( + str_blob_type +) + +import tempfile +import os +from io import BytesIO + + +class TestStream(TestBase): + + """Test stream classes""" + + data_sizes = (15, 10000, 1000 * 1024 + 512) + + def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None): + """Make stream tests - the orig_stream is seekable, allowing it to be + rewound and reused + :param cdata: the data we expect to read from stream, the contents + :param rewind_stream: function called to rewind the stream to make it ready + for reuse""" + ns = 10 + assert len(cdata) > ns - 1, "Data must be larger than %i, was %i" % (ns, len(cdata)) + + # read in small steps + ss = len(cdata) // ns + for i in range(ns): + data = stream.read(ss) + chunk = cdata[i * ss:(i + 1) * ss] + assert data == chunk + # END for each step + rest = stream.read() + if rest: + assert rest == cdata[-len(rest):] + # END handle rest + + if isinstance(stream, DecompressMemMapReader): + assert len(stream.data()) == stream.compressed_bytes_read() + # END handle special type + + rewind_stream(stream) + + # read everything + rdata = stream.read() + assert rdata == cdata + + if isinstance(stream, DecompressMemMapReader): + assert len(stream.data()) == stream.compressed_bytes_read() + # END handle special type + + def test_decompress_reader(self): + for close_on_deletion in range(2): + for with_size in range(2): + for ds in self.data_sizes: + 
cdata = make_bytes(ds, randomize=False) + + # zdata = zipped actual data + # cdata = original content data + + # create reader + if with_size: + # need object data + zdata = zlib.compress(make_object(str_blob_type, cdata)) + typ, size, reader = DecompressMemMapReader.new(zdata, close_on_deletion) + assert size == len(cdata) + assert typ == str_blob_type + + # even if we don't set the size, it will be set automatically on first read + test_reader = DecompressMemMapReader(zdata, close_on_deletion=False) + assert test_reader._s == len(cdata) + else: + # here we need content data + zdata = zlib.compress(cdata) + reader = DecompressMemMapReader(zdata, close_on_deletion, len(cdata)) + assert reader._s == len(cdata) + # END get reader + + self._assert_stream_reader(reader, cdata, lambda r: r.seek(0)) + + # put in a dummy stream for closing + dummy = DummyStream() + reader._m = dummy + + assert not dummy.closed + del(reader) + assert dummy.closed == close_on_deletion + # END for each datasize + # END whether size should be used + # END whether stream should be closed when deleted + + def test_sha_writer(self): + writer = Sha1Writer() + assert 2 == writer.write("hi".encode("ascii")) + assert len(writer.sha(as_hex=1)) == 40 + assert len(writer.sha(as_hex=0)) == 20 + + # make sure it does something ;) + prev_sha = writer.sha() + writer.write("hi again".encode("ascii")) + assert writer.sha() != prev_sha + + def test_compressed_writer(self): + for ds in self.data_sizes: + fd, path = tempfile.mkstemp() + ostream = FDCompressedSha1Writer(fd) + data = make_bytes(ds, randomize=False) + + # for now, just a single write, code doesn't care about chunking + assert len(data) == ostream.write(data) + ostream.close() + + # its closed already + self.failUnlessRaises(OSError, os.close, fd) + + # read everything back, compare to data we zip + fd = os.open(path, os.O_RDONLY | getattr(os, 'O_BINARY', 0)) + written_data = os.read(fd, os.path.getsize(path)) + assert len(written_data) == os.path.getsize(path) + os.close(fd) + assert written_data == zlib.compress(data, 1) # best speed + + os.remove(path) + # END for each os + + def test_decompress_reader_special_case(self): + odb = LooseObjectDB(fixture_path('objects')) + mdb = MemoryDB() + for sha in (b'888401851f15db0eed60eb1bc29dec5ddcace911', + b'7bb839852ed5e3a069966281bb08d50012fb309b',): + ostream = odb.stream(hex_to_bin(sha)) + + # if there is a bug, we will be missing one byte exactly ! 
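# -----------------------------------------------------------------------------
# Editorial sketch, not part of the original diff: the hexshas asserted below
# are ordinary git object ids - SHA-1 over a "<type> <size>\0" header followed
# by the raw data (compare make_object() in gitdb.test.lib). A minimal
# stand-alone illustration, assuming nothing beyond hashlib:

import hashlib

def loose_object_hexsha(type_string, data):
    header = ("%s %i\0" % (type_string, len(data))).encode("ascii")
    return hashlib.sha1(header + data).hexdigest()

assert len(loose_object_hexsha("blob", b"my data")) == 40
# -----------------------------------------------------------------------------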
+ data = ostream.read() + assert len(data) == ostream.size + + # Putting it back in should yield nothing new - after all, we have + dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data))) + assert dump.hexsha == sha + # end for each loose object sha to test diff --git a/libs/gitdb/test/test_util.py b/libs/gitdb/test/test_util.py new file mode 100644 index 000000000..847bdab5e --- /dev/null +++ b/libs/gitdb/test/test_util.py @@ -0,0 +1,100 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Test for object db""" +import tempfile +import os + +from gitdb.test.lib import TestBase +from gitdb.util import ( + to_hex_sha, + to_bin_sha, + NULL_HEX_SHA, + LockedFD +) + + +class TestUtils(TestBase): + + def test_basics(self): + assert to_hex_sha(NULL_HEX_SHA) == NULL_HEX_SHA + assert len(to_bin_sha(NULL_HEX_SHA)) == 20 + assert to_hex_sha(to_bin_sha(NULL_HEX_SHA)) == NULL_HEX_SHA.encode("ascii") + + def _cmp_contents(self, file_path, data): + # raise if data from file at file_path + # does not match data string + with open(file_path, "rb") as fp: + assert fp.read() == data.encode("ascii") + + def test_lockedfd(self): + my_file = tempfile.mktemp() + orig_data = "hello" + new_data = "world" + with open(my_file, "wb") as my_file_fp: + my_file_fp.write(orig_data.encode("ascii")) + + try: + lfd = LockedFD(my_file) + lockfilepath = lfd._lockfilepath() + + # cannot end before it was started + self.failUnlessRaises(AssertionError, lfd.rollback) + self.failUnlessRaises(AssertionError, lfd.commit) + + # open for writing + assert not os.path.isfile(lockfilepath) + wfd = lfd.open(write=True) + assert lfd._fd is wfd + assert os.path.isfile(lockfilepath) + + # write data and fail + os.write(wfd, new_data.encode("ascii")) + lfd.rollback() + assert lfd._fd is None + self._cmp_contents(my_file, orig_data) + assert not os.path.isfile(lockfilepath) + + # additional call doesn't fail + lfd.commit() + lfd.rollback() + + # test reading + lfd = LockedFD(my_file) + rfd = lfd.open(write=False) + assert os.read(rfd, len(orig_data)) == orig_data.encode("ascii") + + assert os.path.isfile(lockfilepath) + # deletion rolls back + del(lfd) + assert not os.path.isfile(lockfilepath) + + # write data - concurrently + lfd = LockedFD(my_file) + olfd = LockedFD(my_file) + assert not os.path.isfile(lockfilepath) + wfdstream = lfd.open(write=True, stream=True) # this time as stream + assert os.path.isfile(lockfilepath) + # another one fails + self.failUnlessRaises(IOError, olfd.open) + + wfdstream.write(new_data.encode("ascii")) + lfd.commit() + assert not os.path.isfile(lockfilepath) + self._cmp_contents(my_file, new_data) + + # could test automatic _end_writing on destruction + finally: + os.remove(my_file) + # END final cleanup + + # try non-existing file for reading + lfd = LockedFD(tempfile.mktemp()) + try: + lfd.open(write=False) + except OSError: + assert not os.path.exists(lfd._lockfilepath()) + else: + self.fail("expected OSError") + # END handle exceptions diff --git a/libs/gitdb/typ.py b/libs/gitdb/typ.py new file mode 100644 index 000000000..98d15f3ec --- /dev/null +++ b/libs/gitdb/typ.py @@ -0,0 +1,10 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +"""Module containing information 
about types known to the database""" + +str_blob_type = b'blob' +str_commit_type = b'commit' +str_tree_type = b'tree' +str_tag_type = b'tag' diff --git a/libs/gitdb/util.py b/libs/gitdb/util.py new file mode 100644 index 000000000..8a1819b6d --- /dev/null +++ b/libs/gitdb/util.py @@ -0,0 +1,401 @@ +# Copyright (C) 2010, 2011 Sebastian Thiel ([email protected]) and contributors +# +# This module is part of GitDB and is released under +# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +import binascii +import os +import mmap +import sys +import time +import errno + +from io import BytesIO + +from smmap import ( + StaticWindowMapManager, + SlidingWindowMapManager, + SlidingWindowMapBuffer +) + +# initialize our global memory manager instance +# Use it to free cached (and unused) resources. +if sys.version_info < (2, 6): + mman = StaticWindowMapManager() +else: + mman = SlidingWindowMapManager() +# END handle mman + +import hashlib + +try: + from struct import unpack_from +except ImportError: + from struct import unpack, calcsize + __calcsize_cache = dict() + + def unpack_from(fmt, data, offset=0): + try: + size = __calcsize_cache[fmt] + except KeyError: + size = calcsize(fmt) + __calcsize_cache[fmt] = size + # END exception handling + return unpack(fmt, data[offset: offset + size]) + # END own unpack_from implementation + + +#{ Aliases + +hex_to_bin = binascii.a2b_hex +bin_to_hex = binascii.b2a_hex + +# errors +ENOENT = errno.ENOENT + +# os shortcuts +exists = os.path.exists +mkdir = os.mkdir +chmod = os.chmod +isdir = os.path.isdir +isfile = os.path.isfile +rename = os.rename +dirname = os.path.dirname +basename = os.path.basename +join = os.path.join +read = os.read +write = os.write +close = os.close +fsync = os.fsync + + +def _retry(func, *args, **kwargs): + # Wrapper around functions, that are problematic on "Windows". Sometimes + # the OS or someone else has still a handle to the file + if sys.platform == "win32": + for _ in range(10): + try: + return func(*args, **kwargs) + except Exception: + time.sleep(0.1) + return func(*args, **kwargs) + else: + return func(*args, **kwargs) + + +def remove(*args, **kwargs): + return _retry(os.remove, *args, **kwargs) + + +# Backwards compatibility imports +from gitdb.const import ( + NULL_BIN_SHA, + NULL_HEX_SHA +) + +#} END Aliases + +#{ compatibility stuff ... + + +class _RandomAccessBytesIO(object): + + """Wrapper to provide required functionality in case memory maps cannot or may + not be used. This is only really required in python 2.4""" + __slots__ = '_sio' + + def __init__(self, buf=''): + self._sio = BytesIO(buf) + + def __getattr__(self, attr): + return getattr(self._sio, attr) + + def __len__(self): + return len(self.getvalue()) + + def __getitem__(self, i): + return self.getvalue()[i] + + def __getslice__(self, start, end): + return self.getvalue()[start:end] + + +def byte_ord(b): + """ + Return the integer representation of the byte string. This supports Python + 3 byte arrays as well as standard strings. + """ + try: + return ord(b) + except TypeError: + return b + +#} END compatibility stuff ... 
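# -----------------------------------------------------------------------------
# Editorial note, not part of the original diff: the hex_to_bin / bin_to_hex
# aliases above are plain binascii conversions between the 40-character hex
# form and the 20-byte binary form of an object id, e.g.:

from binascii import a2b_hex, b2a_hex   # the functions the aliases point at

assert b2a_hex(a2b_hex(b"0" * 40)) == b"0" * 40   # hex -> bin -> hex round trip
assert len(a2b_hex(b"0" * 40)) == 20              # binary shas are 20 bytes
# -----------------------------------------------------------------------------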
+ +#{ Routines + + +def make_sha(source=''.encode("ascii")): + """A python2.4 workaround for the sha/hashlib module fiasco + + **Note** From the dulwich project """ + try: + return hashlib.sha1(source) + except NameError: + import sha + sha1 = sha.sha(source) + return sha1 + + +def allocate_memory(size): + """:return: a file-protocol accessible memory block of the given size""" + if size == 0: + return _RandomAccessBytesIO(b'') + # END handle empty chunks gracefully + + try: + return mmap.mmap(-1, size) # read-write by default + except EnvironmentError: + # setup real memory instead + # this of course may fail if the amount of memory is not available in + # one chunk - would only be the case in python 2.4, being more likely on + # 32 bit systems. + return _RandomAccessBytesIO(b"\0" * size) + # END handle memory allocation + + +def file_contents_ro(fd, stream=False, allow_mmap=True): + """:return: read-only contents of the file represented by the file descriptor fd + + :param fd: file descriptor opened for reading + :param stream: if False, random access is provided, otherwise the stream interface + is provided. + :param allow_mmap: if True, its allowed to map the contents into memory, which + allows large files to be handled and accessed efficiently. The file-descriptor + will change its position if this is False""" + try: + if allow_mmap: + # supports stream and random access + try: + return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) + except EnvironmentError: + # python 2.4 issue, 0 wants to be the actual size + return mmap.mmap(fd, os.fstat(fd).st_size, access=mmap.ACCESS_READ) + # END handle python 2.4 + except OSError: + pass + # END exception handling + + # read manully + contents = os.read(fd, os.fstat(fd).st_size) + if stream: + return _RandomAccessBytesIO(contents) + return contents + + +def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0): + """Get the file contents at filepath as fast as possible + + :return: random access compatible memory of the given filepath + :param stream: see ``file_contents_ro`` + :param allow_mmap: see ``file_contents_ro`` + :param flags: additional flags to pass to os.open + :raise OSError: If the file could not be opened + + **Note** for now we don't try to use O_NOATIME directly as the right value needs to be + shared per database in fact. It only makes a real difference for loose object + databases anyway, and they use it with the help of the ``flags`` parameter""" + fd = os.open(filepath, os.O_RDONLY | getattr(os, 'O_BINARY', 0) | flags) + try: + return file_contents_ro(fd, stream, allow_mmap) + finally: + close(fd) + # END assure file is closed + + +def sliding_ro_buffer(filepath, flags=0): + """ + :return: a buffer compatible object which uses our mapped memory manager internally + ready to read the whole given filepath""" + return SlidingWindowMapBuffer(mman.make_cursor(filepath), flags=flags) + + +def to_hex_sha(sha): + """:return: hexified version of sha""" + if len(sha) == 40: + return sha + return bin_to_hex(sha) + + +def to_bin_sha(sha): + if len(sha) == 20: + return sha + return hex_to_bin(sha) + + +#} END routines + + +#{ Utilities + +class LazyMixin(object): + + """ + Base class providing an interface to lazily retrieve attribute values upon + first access. If slots are used, memory will only be reserved once the attribute + is actually accessed and retrieved the first time. All future accesses will + return the cached value as stored in the Instance's dict or slot. 
+ """ + + __slots__ = tuple() + + def __getattr__(self, attr): + """ + Whenever an attribute is requested that we do not know, we allow it + to be created and set. Next time the same attribute is reqeusted, it is simply + returned from our dict/slots. """ + self._set_cache_(attr) + # will raise in case the cache was not created + return object.__getattribute__(self, attr) + + def _set_cache_(self, attr): + """ + This method should be overridden in the derived class. + It should check whether the attribute named by attr can be created + and cached. Do nothing if you do not know the attribute or call your subclass + + The derived class may create as many additional attributes as it deems + necessary in case a git command returns more information than represented + in the single attribute.""" + pass + + +class LockedFD(object): + + """ + This class facilitates a safe read and write operation to a file on disk. + If we write to 'file', we obtain a lock file at 'file.lock' and write to + that instead. If we succeed, the lock file will be renamed to overwrite + the original file. + + When reading, we obtain a lock file, but to prevent other writers from + succeeding while we are reading the file. + + This type handles error correctly in that it will assure a consistent state + on destruction. + + **note** with this setup, parallel reading is not possible""" + __slots__ = ("_filepath", '_fd', '_write') + + def __init__(self, filepath): + """Initialize an instance with the givne filepath""" + self._filepath = filepath + self._fd = None + self._write = None # if True, we write a file + + def __del__(self): + # will do nothing if the file descriptor is already closed + if self._fd is not None: + self.rollback() + + def _lockfilepath(self): + return "%s.lock" % self._filepath + + def open(self, write=False, stream=False): + """ + Open the file descriptor for reading or writing, both in binary mode. + + :param write: if True, the file descriptor will be opened for writing. Other + wise it will be opened read-only. + :param stream: if True, the file descriptor will be wrapped into a simple stream + object which supports only reading or writing + :return: fd to read from or write to. 
It is still maintained by this instance + and must not be closed directly + :raise IOError: if the lock could not be retrieved + :raise OSError: If the actual file could not be opened for reading + + **note** must only be called once""" + if self._write is not None: + raise AssertionError("Called %s multiple times" % self.open) + + self._write = write + + # try to open the lock file + binary = getattr(os, 'O_BINARY', 0) + lockmode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | binary + try: + fd = os.open(self._lockfilepath(), lockmode, int("600", 8)) + if not write: + os.close(fd) + else: + self._fd = fd + # END handle file descriptor + except OSError: + raise IOError("Lock at %r could not be obtained" % self._lockfilepath()) + # END handle lock retrieval + + # open actual file if required + if self._fd is None: + # we could specify exlusive here, as we obtained the lock anyway + try: + self._fd = os.open(self._filepath, os.O_RDONLY | binary) + except: + # assure we release our lockfile + remove(self._lockfilepath()) + raise + # END handle lockfile + # END open descriptor for reading + + if stream: + # need delayed import + from gitdb.stream import FDStream + return FDStream(self._fd) + else: + return self._fd + # END handle stream + + def commit(self): + """When done writing, call this function to commit your changes into the + actual file. + The file descriptor will be closed, and the lockfile handled. + + **Note** can be called multiple times""" + self._end_writing(successful=True) + + def rollback(self): + """Abort your operation without any changes. The file descriptor will be + closed, and the lock released. + + **Note** can be called multiple times""" + self._end_writing(successful=False) + + def _end_writing(self, successful=True): + """Handle the lock according to the write mode """ + if self._write is None: + raise AssertionError("Cannot end operation if it wasn't started yet") + + if self._fd is None: + return + + os.close(self._fd) + self._fd = None + + lockfile = self._lockfilepath() + if self._write and successful: + # on windows, rename does not silently overwrite the existing one + if sys.platform == "win32": + if isfile(self._filepath): + remove(self._filepath) + # END remove if exists + # END win32 special handling + os.rename(lockfile, self._filepath) + + # assure others can at least read the file - the tmpfile left it at rw-- + # We may also write that file, on windows that boils down to a remove- + # protection as well + chmod(self._filepath, int("644", 8)) + else: + # just delete the file so far, we failed + remove(lockfile) + # END successful handling + +#} END utilities diff --git a/libs/gitdb/utils/__init__.py b/libs/gitdb/utils/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/libs/gitdb/utils/__init__.py diff --git a/libs/gitdb/utils/compat.py b/libs/gitdb/utils/compat.py new file mode 100644 index 000000000..a7899cb14 --- /dev/null +++ b/libs/gitdb/utils/compat.py @@ -0,0 +1,43 @@ +import sys + +PY3 = sys.version_info[0] == 3 + +try: + from itertools import izip + xrange = xrange +except ImportError: + # py3 + izip = zip + xrange = range +# end handle python version + +try: + # Python 2 + buffer = buffer + memoryview = buffer + # Assume no memory view ... 
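# -----------------------------------------------------------------------------
# Editorial sketch, not part of the original diff: how the LazyMixin class
# defined in gitdb/util.py above is meant to be used - _set_cache_(attr) runs
# on first access of an unknown attribute, stores the value, and ordinary
# attribute lookup serves it from then on. The ExpensiveInfo class and its
# attribute are hypothetical.

from gitdb.util import LazyMixin

class ExpensiveInfo(LazyMixin):

    def _set_cache_(self, attr):
        if attr == 'answer':
            self.answer = 41 + 1   # pretend this is costly to compute
        else:
            super(ExpensiveInfo, self)._set_cache_(attr)

info = ExpensiveInfo()
assert info.answer == 42           # computed and cached on first access
assert 'answer' in info.__dict__   # later accesses bypass __getattr__
# -----------------------------------------------------------------------------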
+ def to_bytes(i): + return i +except NameError: + # Python 3 has no `buffer`; only `memoryview` + # However, it's faster to just slice the object directly, maybe it keeps a view internally + def buffer(obj, offset, size=None): + if size is None: + # return memoryview(obj)[offset:] + return obj[offset:] + else: + # return memoryview(obj)[offset:offset+size] + return obj[offset:offset + size] + # end buffer reimplementation + # smmap can return memory view objects, which can't be compared as buffers/bytes can ... + def to_bytes(i): + if isinstance(i, memoryview): + return i.tobytes() + return i + + memoryview = memoryview + +try: + MAXSIZE = sys.maxint +except AttributeError: + MAXSIZE = sys.maxsize diff --git a/libs/gitdb/utils/encoding.py b/libs/gitdb/utils/encoding.py new file mode 100644 index 000000000..4d270af9d --- /dev/null +++ b/libs/gitdb/utils/encoding.py @@ -0,0 +1,31 @@ +from gitdb.utils import compat + +if compat.PY3: + string_types = (str, ) + text_type = str +else: + string_types = (basestring, ) + text_type = unicode + + +def force_bytes(data, encoding="ascii"): + if isinstance(data, bytes): + return data + + if isinstance(data, string_types): + return data.encode(encoding) + + return data + + +def force_text(data, encoding="utf-8"): + if isinstance(data, text_type): + return data + + if isinstance(data, bytes): + return data.decode(encoding) + + if compat.PY3: + return text_type(data, encoding) + else: + return text_type(data) |
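A short illustration of the two helpers above (an editorial addition, not part of the commit): force_bytes() encodes text and passes bytes through, force_text() does the reverse.

from gitdb.utils.encoding import force_bytes, force_text

assert force_bytes("abc") == b"abc"                       # text -> bytes (ascii)
assert force_bytes(b"abc") == b"abc"                      # bytes pass through
assert force_text(b"s\xc3\xa9", "utf-8") == u"s\u00e9"    # bytes -> text
assert force_text(u"plain text") == u"plain text"         # text passes through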