# HG changeset patch # User Martin von Zweigbergk # Date 1487007856 28800 # Node ID 455677a7667f6658be5c6e3f3d7b81a29cf5f9a2 # Parent 72f25e17af9d6a206ea374c30f229ae9513f3f23# Parent f3807a135e43b0499db12dc79697e0b9589dd095 merge with stable diff -r f3807a135e43 -r 455677a7667f contrib/check-code.py --- a/contrib/check-code.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/check-code.py Mon Feb 13 09:44:16 2017 -0800 @@ -237,7 +237,7 @@ (r'lambda\s*\(.*,.*\)', "tuple parameter unpacking not available in Python 3+"), (r'(?`_ compression library. A C extension -and CFFI interface is provided. +and CFFI interface are provided. -The primary goal of the extension is to provide a Pythonic interface to -the underlying C API. This means exposing most of the features and flexibility +The primary goal of the project is to provide a rich interface to the +underlying C API through a Pythonic interface while not sacrificing +performance. This means exposing most of the features and flexibility of the C API while not sacrificing usability or safety that Python provides. The canonical home for this project is @@ -23,6 +24,9 @@ may be some backwards incompatible changes before 1.0. Though the author does not intend to make any major changes to the Python API. +This project is vendored and distributed with Mercurial 4.1, where it is +used in a production capacity. + There is continuous integration for Python versions 2.6, 2.7, and 3.3+ on Linux x86_x64 and Windows x86 and x86_64. The author is reasonably confident the extension is stable and works as advertised on these @@ -48,14 +52,15 @@ support compression without the framing headers. But the author doesn't believe it a high priority at this time. -The CFFI bindings are half-baked and need to be finished. +The CFFI bindings are feature complete and all tests run against both +the C extension and CFFI bindings to ensure behavior parity. Requirements ============ -This extension is designed to run with Python 2.6, 2.7, 3.3, 3.4, and 3.5 -on common platforms (Linux, Windows, and OS X). Only x86_64 is currently -well-tested as an architecture. +This extension is designed to run with Python 2.6, 2.7, 3.3, 3.4, 3.5, and +3.6 on common platforms (Linux, Windows, and OS X). Only x86_64 is +currently well-tested as an architecture. Installing ========== @@ -106,15 +111,11 @@ Comparison to Other Python Bindings =================================== -https://pypi.python.org/pypi/zstd is an alternative Python binding to +https://pypi.python.org/pypi/zstd is an alternate Python binding to Zstandard. At the time this was written, the latest release of that -package (1.0.0.2) had the following significant differences from this package: - -* It only exposes the simple API for compression and decompression operations. - This extension exposes the streaming API, dictionary training, and more. -* It adds a custom framing header to compressed data and there is no way to - disable it. This means that data produced with that module cannot be used by - other Zstandard implementations. +package (1.1.2) only exposed the simple APIs for compression and decompression. +This package exposes much more of the zstd API, including streaming and +dictionary compression. This package also has CFFI support. Bundling of Zstandard Source Code ================================= @@ -260,6 +261,10 @@ compressor's internal state into the output object. This may result in 0 or more ``write()`` calls to the output object. +Both ``write()`` and ``flush()`` return the number of bytes written to the +object's ``write()``. In many cases, small inputs do not accumulate enough +data to cause a write and ``write()`` will return ``0``. + If the size of the data being fed to this streaming compressor is known, you can declare it before compression begins:: @@ -476,6 +481,10 @@ the decompressor by calling ``write(data)`` and decompressed output is written to the output object by calling its ``write(data)`` method. +Calls to ``write()`` will return the number of bytes written to the output +object. Not all inputs will result in bytes being written, so return values +of ``0`` are possible. + The size of chunks being ``write()`` to the destination can be specified:: dctx = zstd.ZstdDecompressor() @@ -576,6 +585,53 @@ data = dobj.decompress(compressed_chunk_0) data = dobj.decompress(compressed_chunk_1) +Content-Only Dictionary Chain Decompression +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``decompress_content_dict_chain(frames)`` performs decompression of a list of +zstd frames produced using chained *content-only* dictionary compression. Such +a list of frames is produced by compressing discrete inputs where each +non-initial input is compressed with a *content-only* dictionary consisting +of the content of the previous input. + +For example, say you have the following inputs:: + + inputs = [b'input 1', b'input 2', b'input 3'] + +The zstd frame chain consists of: + +1. ``b'input 1'`` compressed in standalone/discrete mode +2. ``b'input 2'`` compressed using ``b'input 1'`` as a *content-only* dictionary +3. ``b'input 3'`` compressed using ``b'input 2'`` as a *content-only* dictionary + +Each zstd frame **must** have the content size written. + +The following Python code can be used to produce a *content-only dictionary +chain*:: + + def make_chain(inputs): + frames = [] + + # First frame is compressed in standalone/discrete mode. + zctx = zstd.ZstdCompressor(write_content_size=True) + frames.append(zctx.compress(inputs[0])) + + # Subsequent frames use the previous fulltext as a content-only dictionary + for i, raw in enumerate(inputs[1:]): + dict_data = zstd.ZstdCompressionDict(inputs[i]) + zctx = zstd.ZstdCompressor(write_content_size=True, dict_data=dict_data) + frames.append(zctx.compress(raw)) + + return frames + +``decompress_content_dict_chain()`` returns the uncompressed data of the last +element in the input chain. + +It is possible to implement *content-only dictionary chain* decompression +on top of other Python APIs. However, this function will likely be significantly +faster, especially for long input chains, as it avoids the overhead of +instantiating and passing around intermediate objects between C and Python. + Choosing an API --------------- @@ -634,6 +690,13 @@ dict_data = zstd.ZstdCompressionDict(data) +It is possible to construct a dictionary from *any* data. Unless the +data begins with a magic header, the dictionary will be treated as +*content-only*. *Content-only* dictionaries allow compression operations +that follow to reference raw data within the content. For one use of +*content-only* dictionaries, see +``ZstdDecompressor.decompress_content_dict_chain()``. + More interestingly, instances can be created by *training* on sample data:: dict_data = zstd.train_dictionary(size, samples) @@ -700,19 +763,57 @@ cctx = zstd.ZstdCompressor(compression_params=params) -The members of the ``CompressionParameters`` tuple are as follows:: +The members/attributes of ``CompressionParameters`` instances are as follows:: -* 0 - Window log -* 1 - Chain log -* 2 - Hash log -* 3 - Search log -* 4 - Search length -* 5 - Target length -* 6 - Strategy (one of the ``zstd.STRATEGY_`` constants) +* window_log +* chain_log +* hash_log +* search_log +* search_length +* target_length +* strategy + +This is the order the arguments are passed to the constructor if not using +named arguments. You'll need to read the Zstandard documentation for what these parameters do. +Frame Inspection +---------------- + +Data emitted from zstd compression is encapsulated in a *frame*. This frame +begins with a 4 byte *magic number* header followed by 2 to 14 bytes describing +the frame in more detail. For more info, see +https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md. + +``zstd.get_frame_parameters(data)`` parses a zstd *frame* header from a bytes +instance and return a ``FrameParameters`` object describing the frame. + +Depending on which fields are present in the frame and their values, the +length of the frame parameters varies. If insufficient bytes are passed +in to fully parse the frame parameters, ``ZstdError`` is raised. To ensure +frame parameters can be parsed, pass in at least 18 bytes. + +``FrameParameters`` instances have the following attributes: + +content_size + Integer size of original, uncompressed content. This will be ``0`` if the + original content size isn't written to the frame (controlled with the + ``write_content_size`` argument to ``ZstdCompressor``) or if the input + content size was ``0``. + +window_size + Integer size of maximum back-reference distance in compressed data. + +dict_id + Integer of dictionary ID used for compression. ``0`` if no dictionary + ID was used or if the dictionary ID was ``0``. + +has_checksum + Bool indicating whether a 4 byte content checksum is stored at the end + of the frame. + Misc Functionality ------------------ @@ -776,19 +877,32 @@ TARGETLENGTH_MAX Maximum value for compression parameter STRATEGY_FAST - Compression strategory + Compression strategy STRATEGY_DFAST - Compression strategory + Compression strategy STRATEGY_GREEDY - Compression strategory + Compression strategy STRATEGY_LAZY - Compression strategory + Compression strategy STRATEGY_LAZY2 - Compression strategory + Compression strategy STRATEGY_BTLAZY2 - Compression strategory + Compression strategy STRATEGY_BTOPT - Compression strategory + Compression strategy + +Performance Considerations +-------------------------- + +The ``ZstdCompressor`` and ``ZstdDecompressor`` types maintain state to a +persistent compression or decompression *context*. Reusing a ``ZstdCompressor`` +or ``ZstdDecompressor`` instance for multiple operations is faster than +instantiating a new ``ZstdCompressor`` or ``ZstdDecompressor`` for each +operation. The differences are magnified as the size of data decreases. For +example, the difference between *context* reuse and non-reuse for 100,000 +100 byte inputs will be significant (possiby over 10x faster to reuse contexts) +whereas 10 1,000,000 byte inputs will be more similar in speed (because the +time spent doing compression dwarfs time spent creating new *contexts*). Note on Zstandard's *Experimental* API ====================================== diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/compressiondict.c --- a/contrib/python-zstandard/c-ext/compressiondict.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/compressiondict.c Mon Feb 13 09:44:16 2017 -0800 @@ -28,7 +28,8 @@ void* dict; ZstdCompressionDict* result; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|O!", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|O!:train_dictionary", + kwlist, &capacity, &PyList_Type, &samples, (PyObject*)&DictParametersType, ¶meters)) { @@ -57,7 +58,6 @@ sampleItem = PyList_GetItem(samples, sampleIndex); if (!PyBytes_Check(sampleItem)) { PyErr_SetString(PyExc_ValueError, "samples must be bytes"); - /* TODO probably need to perform DECREF here */ return NULL; } samplesSize += PyBytes_GET_SIZE(sampleItem); @@ -133,10 +133,11 @@ self->dictSize = 0; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict", #else - if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict", #endif + &source, &sourceSize)) { return -1; } diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/compressionparams.c --- a/contrib/python-zstandard/c-ext/compressionparams.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/compressionparams.c Mon Feb 13 09:44:16 2017 -0800 @@ -25,7 +25,8 @@ ZSTD_compressionParameters params; CompressionParametersObject* result; - if (!PyArg_ParseTuple(args, "i|Kn", &compressionLevel, &sourceSize, &dictSize)) { + if (!PyArg_ParseTuple(args, "i|Kn:get_compression_parameters", + &compressionLevel, &sourceSize, &dictSize)) { return NULL; } @@ -47,12 +48,85 @@ return result; } +static int CompressionParameters_init(CompressionParametersObject* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "window_log", + "chain_log", + "hash_log", + "search_log", + "search_length", + "target_length", + "strategy", + NULL + }; + + unsigned windowLog; + unsigned chainLog; + unsigned hashLog; + unsigned searchLog; + unsigned searchLength; + unsigned targetLength; + unsigned strategy; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "IIIIIII:CompressionParameters", + kwlist, &windowLog, &chainLog, &hashLog, &searchLog, &searchLength, + &targetLength, &strategy)) { + return -1; + } + + if (windowLog < ZSTD_WINDOWLOG_MIN || windowLog > ZSTD_WINDOWLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid window log value"); + return -1; + } + + if (chainLog < ZSTD_CHAINLOG_MIN || chainLog > ZSTD_CHAINLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid chain log value"); + return -1; + } + + if (hashLog < ZSTD_HASHLOG_MIN || hashLog > ZSTD_HASHLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid hash log value"); + return -1; + } + + if (searchLog < ZSTD_SEARCHLOG_MIN || searchLog > ZSTD_SEARCHLOG_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid search log value"); + return -1; + } + + if (searchLength < ZSTD_SEARCHLENGTH_MIN || searchLength > ZSTD_SEARCHLENGTH_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid search length value"); + return -1; + } + + if (targetLength < ZSTD_TARGETLENGTH_MIN || targetLength > ZSTD_TARGETLENGTH_MAX) { + PyErr_SetString(PyExc_ValueError, "invalid target length value"); + return -1; + } + + if (strategy < ZSTD_fast || strategy > ZSTD_btopt) { + PyErr_SetString(PyExc_ValueError, "invalid strategy value"); + return -1; + } + + self->windowLog = windowLog; + self->chainLog = chainLog; + self->hashLog = hashLog; + self->searchLog = searchLog; + self->searchLength = searchLength; + self->targetLength = targetLength; + self->strategy = strategy; + + return 0; +} + PyObject* estimate_compression_context_size(PyObject* self, PyObject* args) { CompressionParametersObject* params; ZSTD_compressionParameters zparams; PyObject* result; - if (!PyArg_ParseTuple(args, "O!", &CompressionParametersType, ¶ms)) { + if (!PyArg_ParseTuple(args, "O!:estimate_compression_context_size", + &CompressionParametersType, ¶ms)) { return NULL; } @@ -64,113 +138,33 @@ PyDoc_STRVAR(CompressionParameters__doc__, "CompressionParameters: low-level control over zstd compression"); -static PyObject* CompressionParameters_new(PyTypeObject* subtype, PyObject* args, PyObject* kwargs) { - CompressionParametersObject* self; - unsigned windowLog; - unsigned chainLog; - unsigned hashLog; - unsigned searchLog; - unsigned searchLength; - unsigned targetLength; - unsigned strategy; - - if (!PyArg_ParseTuple(args, "IIIIIII", &windowLog, &chainLog, &hashLog, &searchLog, - &searchLength, &targetLength, &strategy)) { - return NULL; - } - - if (windowLog < ZSTD_WINDOWLOG_MIN || windowLog > ZSTD_WINDOWLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid window log value"); - return NULL; - } - - if (chainLog < ZSTD_CHAINLOG_MIN || chainLog > ZSTD_CHAINLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid chain log value"); - return NULL; - } - - if (hashLog < ZSTD_HASHLOG_MIN || hashLog > ZSTD_HASHLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid hash log value"); - return NULL; - } - - if (searchLog < ZSTD_SEARCHLOG_MIN || searchLog > ZSTD_SEARCHLOG_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid search log value"); - return NULL; - } - - if (searchLength < ZSTD_SEARCHLENGTH_MIN || searchLength > ZSTD_SEARCHLENGTH_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid search length value"); - return NULL; - } - - if (targetLength < ZSTD_TARGETLENGTH_MIN || targetLength > ZSTD_TARGETLENGTH_MAX) { - PyErr_SetString(PyExc_ValueError, "invalid target length value"); - return NULL; - } - - if (strategy < ZSTD_fast || strategy > ZSTD_btopt) { - PyErr_SetString(PyExc_ValueError, "invalid strategy value"); - return NULL; - } - - self = (CompressionParametersObject*)subtype->tp_alloc(subtype, 1); - if (!self) { - return NULL; - } - - self->windowLog = windowLog; - self->chainLog = chainLog; - self->hashLog = hashLog; - self->searchLog = searchLog; - self->searchLength = searchLength; - self->targetLength = targetLength; - self->strategy = strategy; - - return (PyObject*)self; -} - static void CompressionParameters_dealloc(PyObject* self) { PyObject_Del(self); } -static Py_ssize_t CompressionParameters_length(PyObject* self) { - return 7; -} - -static PyObject* CompressionParameters_item(PyObject* o, Py_ssize_t i) { - CompressionParametersObject* self = (CompressionParametersObject*)o; - - switch (i) { - case 0: - return PyLong_FromLong(self->windowLog); - case 1: - return PyLong_FromLong(self->chainLog); - case 2: - return PyLong_FromLong(self->hashLog); - case 3: - return PyLong_FromLong(self->searchLog); - case 4: - return PyLong_FromLong(self->searchLength); - case 5: - return PyLong_FromLong(self->targetLength); - case 6: - return PyLong_FromLong(self->strategy); - default: - PyErr_SetString(PyExc_IndexError, "index out of range"); - return NULL; - } -} - -static PySequenceMethods CompressionParameters_sq = { - CompressionParameters_length, /* sq_length */ - 0, /* sq_concat */ - 0, /* sq_repeat */ - CompressionParameters_item, /* sq_item */ - 0, /* sq_ass_item */ - 0, /* sq_contains */ - 0, /* sq_inplace_concat */ - 0 /* sq_inplace_repeat */ +static PyMemberDef CompressionParameters_members[] = { + { "window_log", T_UINT, + offsetof(CompressionParametersObject, windowLog), READONLY, + "window log" }, + { "chain_log", T_UINT, + offsetof(CompressionParametersObject, chainLog), READONLY, + "chain log" }, + { "hash_log", T_UINT, + offsetof(CompressionParametersObject, hashLog), READONLY, + "hash log" }, + { "search_log", T_UINT, + offsetof(CompressionParametersObject, searchLog), READONLY, + "search log" }, + { "search_length", T_UINT, + offsetof(CompressionParametersObject, searchLength), READONLY, + "search length" }, + { "target_length", T_UINT, + offsetof(CompressionParametersObject, targetLength), READONLY, + "target length" }, + { "strategy", T_INT, + offsetof(CompressionParametersObject, strategy), READONLY, + "strategy" }, + { NULL } }; PyTypeObject CompressionParametersType = { @@ -185,7 +179,7 @@ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ - &CompressionParameters_sq, /* tp_as_sequence */ + 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ @@ -193,7 +187,7 @@ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ CompressionParameters__doc__, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ @@ -202,16 +196,16 @@ 0, /* tp_iter */ 0, /* tp_iternext */ 0, /* tp_methods */ - 0, /* tp_members */ + CompressionParameters_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ + (initproc)CompressionParameters_init, /* tp_init */ 0, /* tp_alloc */ - CompressionParameters_new, /* tp_new */ + PyType_GenericNew, /* tp_new */ }; void compressionparams_module_init(PyObject* mod) { diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/compressionwriter.c --- a/contrib/python-zstandard/c-ext/compressionwriter.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/compressionwriter.c Mon Feb 13 09:44:16 2017 -0800 @@ -52,7 +52,7 @@ ZSTD_outBuffer output; PyObject* res; - if (!PyArg_ParseTuple(args, "OOO", &exc_type, &exc_value, &exc_tb)) { + if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) { return NULL; } @@ -119,11 +119,12 @@ ZSTD_inBuffer input; ZSTD_outBuffer output; PyObject* res; + Py_ssize_t totalWrite = 0; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "y#:write", &source, &sourceSize)) { #else - if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "s#:write", &source, &sourceSize)) { #endif return NULL; } @@ -164,20 +165,21 @@ #endif output.dst, output.pos); Py_XDECREF(res); + totalWrite += output.pos; } output.pos = 0; } PyMem_Free(output.dst); - /* TODO return bytes written */ - Py_RETURN_NONE; + return PyLong_FromSsize_t(totalWrite); } static PyObject* ZstdCompressionWriter_flush(ZstdCompressionWriter* self, PyObject* args) { size_t zresult; ZSTD_outBuffer output; PyObject* res; + Py_ssize_t totalWrite = 0; if (!self->entered) { PyErr_SetString(ZstdError, "flush must be called from an active context manager"); @@ -215,14 +217,14 @@ #endif output.dst, output.pos); Py_XDECREF(res); + totalWrite += output.pos; } output.pos = 0; } PyMem_Free(output.dst); - /* TODO return bytes written */ - Py_RETURN_NONE; + return PyLong_FromSsize_t(totalWrite); } static PyMethodDef ZstdCompressionWriter_methods[] = { diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/compressobj.c --- a/contrib/python-zstandard/c-ext/compressobj.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/compressobj.c Mon Feb 13 09:44:16 2017 -0800 @@ -42,9 +42,9 @@ } #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "y#:compress", &source, &sourceSize)) { #else - if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "s#:compress", &source, &sourceSize)) { #endif return NULL; } @@ -98,7 +98,7 @@ PyObject* result = NULL; Py_ssize_t resultSize = 0; - if (!PyArg_ParseTuple(args, "|i", &flushMode)) { + if (!PyArg_ParseTuple(args, "|i:flush", &flushMode)) { return NULL; } diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/compressor.c --- a/contrib/python-zstandard/c-ext/compressor.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/compressor.c Mon Feb 13 09:44:16 2017 -0800 @@ -16,7 +16,7 @@ Py_BEGIN_ALLOW_THREADS memset(&zmem, 0, sizeof(zmem)); compressor->cdict = ZSTD_createCDict_advanced(compressor->dict->dictData, - compressor->dict->dictSize, *zparams, zmem); + compressor->dict->dictSize, 1, *zparams, zmem); Py_END_ALLOW_THREADS if (!compressor->cdict) { @@ -128,8 +128,8 @@ self->cparams = NULL; self->cdict = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOO", kwlist, - &level, &ZstdCompressionDictType, &dict, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!O!OOO:ZstdCompressor", + kwlist, &level, &ZstdCompressionDictType, &dict, &CompressionParametersType, ¶ms, &writeChecksum, &writeContentSize, &writeDictID)) { return -1; @@ -243,8 +243,8 @@ PyObject* totalReadPy; PyObject* totalWritePy; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nkk", kwlist, &source, &dest, &sourceSize, - &inSize, &outSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nkk:copy_stream", kwlist, + &source, &dest, &sourceSize, &inSize, &outSize)) { return NULL; } @@ -402,9 +402,9 @@ ZSTD_parameters zparams; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|O", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|O:compress", #else - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|O", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|O:compress", #endif kwlist, &source, &sourceSize, &allowEmpty)) { return NULL; @@ -512,7 +512,7 @@ return NULL; } - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", kwlist, &inSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n:compressobj", kwlist, &inSize)) { return NULL; } @@ -574,8 +574,8 @@ size_t outSize = ZSTD_CStreamOutSize(); ZstdCompressorIterator* result; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nkk", kwlist, &reader, &sourceSize, - &inSize, &outSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nkk:read_from", kwlist, + &reader, &sourceSize, &inSize, &outSize)) { return NULL; } @@ -693,8 +693,8 @@ Py_ssize_t sourceSize = 0; size_t outSize = ZSTD_CStreamOutSize(); - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nk", kwlist, &writer, &sourceSize, - &outSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nk:write_to", kwlist, + &writer, &sourceSize, &outSize)) { return NULL; } diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/decompressionwriter.c --- a/contrib/python-zstandard/c-ext/decompressionwriter.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/decompressionwriter.c Mon Feb 13 09:44:16 2017 -0800 @@ -71,11 +71,12 @@ ZSTD_inBuffer input; ZSTD_outBuffer output; PyObject* res; + Py_ssize_t totalWrite = 0; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "y#:write", &source, &sourceSize)) { #else - if (!PyArg_ParseTuple(args, "s#", &source, &sourceSize)) { + if (!PyArg_ParseTuple(args, "s#:write", &source, &sourceSize)) { #endif return NULL; } @@ -116,15 +117,15 @@ #endif output.dst, output.pos); Py_XDECREF(res); + totalWrite += output.pos; output.pos = 0; } } PyMem_Free(output.dst); - /* TODO return bytes written */ - Py_RETURN_NONE; - } + return PyLong_FromSsize_t(totalWrite); +} static PyMethodDef ZstdDecompressionWriter_methods[] = { { "__enter__", (PyCFunction)ZstdDecompressionWriter_enter, METH_NOARGS, diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/decompressobj.c --- a/contrib/python-zstandard/c-ext/decompressobj.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/decompressobj.c Mon Feb 13 09:44:16 2017 -0800 @@ -41,9 +41,9 @@ } #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#", + if (!PyArg_ParseTuple(args, "y#:decompress", #else - if (!PyArg_ParseTuple(args, "s#", + if (!PyArg_ParseTuple(args, "s#:decompress", #endif &source, &sourceSize)) { return NULL; diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/decompressor.c --- a/contrib/python-zstandard/c-ext/decompressor.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/decompressor.c Mon Feb 13 09:44:16 2017 -0800 @@ -59,23 +59,19 @@ ZstdCompressionDict* dict = NULL; - self->refdctx = NULL; + self->dctx = NULL; self->dict = NULL; self->ddict = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!:ZstdDecompressor", kwlist, &ZstdCompressionDictType, &dict)) { return -1; } - /* Instead of creating a ZSTD_DCtx for every decompression operation, - we create an instance at object creation time and recycle it via - ZSTD_copyDCTx() on each use. This means each use is a malloc+memcpy - instead of a malloc+init. */ /* TODO lazily initialize the reference ZSTD_DCtx on first use since not instances of ZstdDecompressor will use a ZSTD_DCtx. */ - self->refdctx = ZSTD_createDCtx(); - if (!self->refdctx) { + self->dctx = ZSTD_createDCtx(); + if (!self->dctx) { PyErr_NoMemory(); goto except; } @@ -88,17 +84,17 @@ return 0; except: - if (self->refdctx) { - ZSTD_freeDCtx(self->refdctx); - self->refdctx = NULL; + if (self->dctx) { + ZSTD_freeDCtx(self->dctx); + self->dctx = NULL; } return -1; } static void Decompressor_dealloc(ZstdDecompressor* self) { - if (self->refdctx) { - ZSTD_freeDCtx(self->refdctx); + if (self->dctx) { + ZSTD_freeDCtx(self->dctx); } Py_XDECREF(self->dict); @@ -150,8 +146,8 @@ PyObject* totalReadPy; PyObject* totalWritePy; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk", kwlist, &source, - &dest, &inSize, &outSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk:copy_stream", kwlist, + &source, &dest, &inSize, &outSize)) { return NULL; } @@ -243,7 +239,7 @@ Py_DecRef(totalReadPy); Py_DecRef(totalWritePy); - finally: +finally: if (output.dst) { PyMem_Free(output.dst); } @@ -291,28 +287,19 @@ unsigned long long decompressedSize; size_t destCapacity; PyObject* result = NULL; - ZSTD_DCtx* dctx = NULL; void* dictData = NULL; size_t dictSize = 0; size_t zresult; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n:decompress", #else - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n:decompress", #endif - &source, &sourceSize, &maxOutputSize)) { + kwlist, &source, &sourceSize, &maxOutputSize)) { return NULL; } - dctx = PyMem_Malloc(ZSTD_sizeof_DCtx(self->refdctx)); - if (!dctx) { - PyErr_NoMemory(); - return NULL; - } - - ZSTD_copyDCtx(dctx, self->refdctx); - if (self->dict) { dictData = self->dict->dictData; dictSize = self->dict->dictSize; @@ -320,12 +307,12 @@ if (dictData && !self->ddict) { Py_BEGIN_ALLOW_THREADS - self->ddict = ZSTD_createDDict(dictData, dictSize); + self->ddict = ZSTD_createDDict_byReference(dictData, dictSize); Py_END_ALLOW_THREADS if (!self->ddict) { PyErr_SetString(ZstdError, "could not create decompression dict"); - goto except; + return NULL; } } @@ -335,7 +322,7 @@ if (0 == maxOutputSize) { PyErr_SetString(ZstdError, "input data invalid or missing content size " "in frame header"); - goto except; + return NULL; } else { result = PyBytes_FromStringAndSize(NULL, maxOutputSize); @@ -348,45 +335,39 @@ } if (!result) { - goto except; + return NULL; } Py_BEGIN_ALLOW_THREADS if (self->ddict) { - zresult = ZSTD_decompress_usingDDict(dctx, PyBytes_AsString(result), destCapacity, + zresult = ZSTD_decompress_usingDDict(self->dctx, + PyBytes_AsString(result), destCapacity, source, sourceSize, self->ddict); } else { - zresult = ZSTD_decompressDCtx(dctx, PyBytes_AsString(result), destCapacity, source, sourceSize); + zresult = ZSTD_decompressDCtx(self->dctx, + PyBytes_AsString(result), destCapacity, source, sourceSize); } Py_END_ALLOW_THREADS if (ZSTD_isError(zresult)) { PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult)); - goto except; + Py_DecRef(result); + return NULL; } else if (decompressedSize && zresult != decompressedSize) { PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu", zresult, decompressedSize); - goto except; + Py_DecRef(result); + return NULL; } else if (zresult < destCapacity) { if (_PyBytes_Resize(&result, zresult)) { - goto except; + Py_DecRef(result); + return NULL; } } - goto finally; - -except: - Py_DecRef(result); - result = NULL; - -finally: - if (dctx) { - PyMem_FREE(dctx); - } - return result; } @@ -455,8 +436,8 @@ ZstdDecompressorIterator* result; size_t skipBytes = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk", kwlist, &reader, - &inSize, &outSize, &skipBytes)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_from", kwlist, + &reader, &inSize, &outSize, &skipBytes)) { return NULL; } @@ -534,19 +515,14 @@ goto finally; except: - if (result->reader) { - Py_DECREF(result->reader); - result->reader = NULL; - } + Py_CLEAR(result->reader); if (result->buffer) { PyBuffer_Release(result->buffer); - Py_DECREF(result->buffer); - result->buffer = NULL; + Py_CLEAR(result->buffer); } - Py_DECREF(result); - result = NULL; + Py_CLEAR(result); finally: @@ -577,7 +553,8 @@ size_t outSize = ZSTD_DStreamOutSize(); ZstdDecompressionWriter* result; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k", kwlist, &writer, &outSize)) { + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:write_to", kwlist, + &writer, &outSize)) { return NULL; } @@ -605,6 +582,200 @@ return result; } +PyDoc_STRVAR(Decompressor_decompress_content_dict_chain__doc__, +"Decompress a series of chunks using the content dictionary chaining technique\n" +); + +static PyObject* Decompressor_decompress_content_dict_chain(PyObject* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "frames", + NULL + }; + + PyObject* chunks; + Py_ssize_t chunksLen; + Py_ssize_t chunkIndex; + char parity = 0; + PyObject* chunk; + char* chunkData; + Py_ssize_t chunkSize; + ZSTD_DCtx* dctx = NULL; + size_t zresult; + ZSTD_frameParams frameParams; + void* buffer1 = NULL; + size_t buffer1Size = 0; + size_t buffer1ContentSize = 0; + void* buffer2 = NULL; + size_t buffer2Size = 0; + size_t buffer2ContentSize = 0; + void* destBuffer = NULL; + PyObject* result = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:decompress_content_dict_chain", + kwlist, &PyList_Type, &chunks)) { + return NULL; + } + + chunksLen = PyList_Size(chunks); + if (!chunksLen) { + PyErr_SetString(PyExc_ValueError, "empty input chain"); + return NULL; + } + + /* The first chunk should not be using a dictionary. We handle it specially. */ + chunk = PyList_GetItem(chunks, 0); + if (!PyBytes_Check(chunk)) { + PyErr_SetString(PyExc_ValueError, "chunk 0 must be bytes"); + return NULL; + } + + /* We require that all chunks be zstd frames and that they have content size set. */ + PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize); + zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize); + if (ZSTD_isError(zresult)) { + PyErr_SetString(PyExc_ValueError, "chunk 0 is not a valid zstd frame"); + return NULL; + } + else if (zresult) { + PyErr_SetString(PyExc_ValueError, "chunk 0 is too small to contain a zstd frame"); + return NULL; + } + + if (0 == frameParams.frameContentSize) { + PyErr_SetString(PyExc_ValueError, "chunk 0 missing content size in frame"); + return NULL; + } + + dctx = ZSTD_createDCtx(); + if (!dctx) { + PyErr_NoMemory(); + goto finally; + } + + buffer1Size = frameParams.frameContentSize; + buffer1 = PyMem_Malloc(buffer1Size); + if (!buffer1) { + goto finally; + } + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompressDCtx(dctx, buffer1, buffer1Size, chunkData, chunkSize); + Py_END_ALLOW_THREADS + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not decompress chunk 0: %s", ZSTD_getErrorName(zresult)); + goto finally; + } + + buffer1ContentSize = zresult; + + /* Special case of a simple chain. */ + if (1 == chunksLen) { + result = PyBytes_FromStringAndSize(buffer1, buffer1Size); + goto finally; + } + + /* This should ideally look at next chunk. But this is slightly simpler. */ + buffer2Size = frameParams.frameContentSize; + buffer2 = PyMem_Malloc(buffer2Size); + if (!buffer2) { + goto finally; + } + + /* For each subsequent chunk, use the previous fulltext as a content dictionary. + Our strategy is to have 2 buffers. One holds the previous fulltext (to be + used as a content dictionary) and the other holds the new fulltext. The + buffers grow when needed but never decrease in size. This limits the + memory allocator overhead. + */ + for (chunkIndex = 1; chunkIndex < chunksLen; chunkIndex++) { + chunk = PyList_GetItem(chunks, chunkIndex); + if (!PyBytes_Check(chunk)) { + PyErr_Format(PyExc_ValueError, "chunk %zd must be bytes", chunkIndex); + goto finally; + } + + PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize); + zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize); + if (ZSTD_isError(zresult)) { + PyErr_Format(PyExc_ValueError, "chunk %zd is not a valid zstd frame", chunkIndex); + goto finally; + } + else if (zresult) { + PyErr_Format(PyExc_ValueError, "chunk %zd is too small to contain a zstd frame", chunkIndex); + goto finally; + } + + if (0 == frameParams.frameContentSize) { + PyErr_Format(PyExc_ValueError, "chunk %zd missing content size in frame", chunkIndex); + goto finally; + } + + parity = chunkIndex % 2; + + /* This could definitely be abstracted to reduce code duplication. */ + if (parity) { + /* Resize destination buffer to hold larger content. */ + if (buffer2Size < frameParams.frameContentSize) { + buffer2Size = frameParams.frameContentSize; + destBuffer = PyMem_Realloc(buffer2, buffer2Size); + if (!destBuffer) { + goto finally; + } + buffer2 = destBuffer; + } + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompress_usingDict(dctx, buffer2, buffer2Size, + chunkData, chunkSize, buffer1, buffer1ContentSize); + Py_END_ALLOW_THREADS + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not decompress chunk %zd: %s", + chunkIndex, ZSTD_getErrorName(zresult)); + goto finally; + } + buffer2ContentSize = zresult; + } + else { + if (buffer1Size < frameParams.frameContentSize) { + buffer1Size = frameParams.frameContentSize; + destBuffer = PyMem_Realloc(buffer1, buffer1Size); + if (!destBuffer) { + goto finally; + } + buffer1 = destBuffer; + } + + Py_BEGIN_ALLOW_THREADS + zresult = ZSTD_decompress_usingDict(dctx, buffer1, buffer1Size, + chunkData, chunkSize, buffer2, buffer2ContentSize); + Py_END_ALLOW_THREADS + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "could not decompress chunk %zd: %s", + chunkIndex, ZSTD_getErrorName(zresult)); + goto finally; + } + buffer1ContentSize = zresult; + } + } + + result = PyBytes_FromStringAndSize(parity ? buffer2 : buffer1, + parity ? buffer2ContentSize : buffer1ContentSize); + +finally: + if (buffer2) { + PyMem_Free(buffer2); + } + if (buffer1) { + PyMem_Free(buffer1); + } + + if (dctx) { + ZSTD_freeDCtx(dctx); + } + + return result; +} + static PyMethodDef Decompressor_methods[] = { { "copy_stream", (PyCFunction)Decompressor_copy_stream, METH_VARARGS | METH_KEYWORDS, Decompressor_copy_stream__doc__ }, @@ -616,6 +787,8 @@ Decompressor_read_from__doc__ }, { "write_to", (PyCFunction)Decompressor_write_to, METH_VARARGS | METH_KEYWORDS, Decompressor_write_to__doc__ }, + { "decompress_content_dict_chain", (PyCFunction)Decompressor_decompress_content_dict_chain, + METH_VARARGS | METH_KEYWORDS, Decompressor_decompress_content_dict_chain__doc__ }, { NULL, NULL } }; diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/dictparams.c --- a/contrib/python-zstandard/c-ext/dictparams.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/dictparams.c Mon Feb 13 09:44:16 2017 -0800 @@ -18,8 +18,8 @@ unsigned notificationLevel; unsigned dictID; - if (!PyArg_ParseTuple(args, "IiII", &selectivityLevel, &compressionLevel, - ¬ificationLevel, &dictID)) { + if (!PyArg_ParseTuple(args, "IiII:DictParameters", + &selectivityLevel, &compressionLevel, ¬ificationLevel, &dictID)) { return NULL; } @@ -40,6 +40,22 @@ PyObject_Del(self); } +static PyMemberDef DictParameters_members[] = { + { "selectivity_level", T_UINT, + offsetof(DictParametersObject, selectivityLevel), READONLY, + "selectivity level" }, + { "compression_level", T_INT, + offsetof(DictParametersObject, compressionLevel), READONLY, + "compression level" }, + { "notification_level", T_UINT, + offsetof(DictParametersObject, notificationLevel), READONLY, + "notification level" }, + { "dict_id", T_UINT, + offsetof(DictParametersObject, dictID), READONLY, + "dictionary ID" }, + { NULL } +}; + static Py_ssize_t DictParameters_length(PyObject* self) { return 4; } @@ -102,7 +118,7 @@ 0, /* tp_iter */ 0, /* tp_iternext */ 0, /* tp_methods */ - 0, /* tp_members */ + DictParameters_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/frameparams.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/c-ext/frameparams.c Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,132 @@ +/** +* Copyright (c) 2017-present, Gregory Szorc +* All rights reserved. +* +* This software may be modified and distributed under the terms +* of the BSD license. See the LICENSE file for details. +*/ + +#include "python-zstandard.h" + +extern PyObject* ZstdError; + +PyDoc_STRVAR(FrameParameters__doc__, + "FrameParameters: information about a zstd frame"); + +FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args) { + const char* source; + Py_ssize_t sourceSize; + ZSTD_frameParams params; + FrameParametersObject* result = NULL; + size_t zresult; + +#if PY_MAJOR_VERSION >= 3 + if (!PyArg_ParseTuple(args, "y#:get_frame_parameters", +#else + if (!PyArg_ParseTuple(args, "s#:get_frame_parameters", +#endif + &source, &sourceSize)) { + return NULL; + } + + /* Needed for Python 2 to reject unicode */ + if (!PyBytes_Check(PyTuple_GET_ITEM(args, 0))) { + PyErr_SetString(PyExc_TypeError, "argument must be bytes"); + return NULL; + } + + zresult = ZSTD_getFrameParams(¶ms, (void*)source, sourceSize); + + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "cannot get frame parameters: %s", ZSTD_getErrorName(zresult)); + return NULL; + } + + if (zresult) { + PyErr_Format(ZstdError, "not enough data for frame parameters; need %zu bytes", zresult); + return NULL; + } + + result = PyObject_New(FrameParametersObject, &FrameParametersType); + if (!result) { + return NULL; + } + + result->frameContentSize = params.frameContentSize; + result->windowSize = params.windowSize; + result->dictID = params.dictID; + result->checksumFlag = params.checksumFlag ? 1 : 0; + + return result; +} + +static void FrameParameters_dealloc(PyObject* self) { + PyObject_Del(self); +} + +static PyMemberDef FrameParameters_members[] = { + { "content_size", T_ULONGLONG, + offsetof(FrameParametersObject, frameContentSize), READONLY, + "frame content size" }, + { "window_size", T_UINT, + offsetof(FrameParametersObject, windowSize), READONLY, + "window size" }, + { "dict_id", T_UINT, + offsetof(FrameParametersObject, dictID), READONLY, + "dictionary ID" }, + { "has_checksum", T_BOOL, + offsetof(FrameParametersObject, checksumFlag), READONLY, + "checksum flag" }, + { NULL } +}; + +PyTypeObject FrameParametersType = { + PyVarObject_HEAD_INIT(NULL, 0) + "FrameParameters", /* tp_name */ + sizeof(FrameParametersObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)FrameParameters_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + FrameParameters__doc__, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + FrameParameters_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + +void frameparams_module_init(PyObject* mod) { + Py_TYPE(&FrameParametersType) = &PyType_Type; + if (PyType_Ready(&FrameParametersType) < 0) { + return; + } + + Py_IncRef((PyObject*)&FrameParametersType); + PyModule_AddObject(mod, "FrameParameters", (PyObject*)&FrameParametersType); +} diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/c-ext/python-zstandard.h --- a/contrib/python-zstandard/c-ext/python-zstandard.h Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/c-ext/python-zstandard.h Mon Feb 13 09:44:16 2017 -0800 @@ -8,6 +8,7 @@ #define PY_SSIZE_T_CLEAN #include +#include "structmember.h" #define ZSTD_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY @@ -15,7 +16,7 @@ #include "zstd.h" #include "zdict.h" -#define PYTHON_ZSTANDARD_VERSION "0.6.0" +#define PYTHON_ZSTANDARD_VERSION "0.7.0" typedef enum { compressorobj_flush_finish, @@ -37,6 +38,16 @@ typedef struct { PyObject_HEAD + unsigned long long frameContentSize; + unsigned windowSize; + unsigned dictID; + char checksumFlag; +} FrameParametersObject; + +extern PyTypeObject FrameParametersType; + +typedef struct { + PyObject_HEAD unsigned selectivityLevel; int compressionLevel; unsigned notificationLevel; @@ -115,7 +126,7 @@ typedef struct { PyObject_HEAD - ZSTD_DCtx* refdctx; + ZSTD_DCtx* dctx; ZstdCompressionDict* dict; ZSTD_DDict* ddict; @@ -172,6 +183,7 @@ void ztopy_compression_parameters(CompressionParametersObject* params, ZSTD_compressionParameters* zparams); CompressionParametersObject* get_compression_parameters(PyObject* self, PyObject* args); +FrameParametersObject* get_frame_parameters(PyObject* self, PyObject* args); PyObject* estimate_compression_context_size(PyObject* self, PyObject* args); ZSTD_CStream* CStream_from_ZstdCompressor(ZstdCompressor* compressor, Py_ssize_t sourceSize); ZSTD_DStream* DStream_from_ZstdDecompressor(ZstdDecompressor* decompressor); diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/make_cffi.py --- a/contrib/python-zstandard/make_cffi.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/make_cffi.py Mon Feb 13 09:44:16 2017 -0800 @@ -9,6 +9,7 @@ import cffi import distutils.ccompiler import os +import re import subprocess import tempfile @@ -19,6 +20,8 @@ 'common/entropy_common.c', 'common/error_private.c', 'common/fse_decompress.c', + 'common/pool.c', + 'common/threading.c', 'common/xxhash.c', 'common/zstd_common.c', 'compress/fse_compress.c', @@ -26,10 +29,17 @@ 'compress/zstd_compress.c', 'decompress/huf_decompress.c', 'decompress/zstd_decompress.c', + 'dictBuilder/cover.c', 'dictBuilder/divsufsort.c', 'dictBuilder/zdict.c', )] +HEADERS = [os.path.join(HERE, 'zstd', *p) for p in ( + ('zstd.h',), + ('common', 'pool.h'), + ('dictBuilder', 'zdict.h'), +)] + INCLUDE_DIRS = [os.path.join(HERE, d) for d in ( 'zstd', 'zstd/common', @@ -53,56 +63,92 @@ args.extend([ '-E', '-DZSTD_STATIC_LINKING_ONLY', + '-DZDICT_STATIC_LINKING_ONLY', ]) elif compiler.compiler_type == 'msvc': args = [compiler.cc] args.extend([ '/EP', '/DZSTD_STATIC_LINKING_ONLY', + '/DZDICT_STATIC_LINKING_ONLY', ]) else: raise Exception('unsupported compiler type: %s' % compiler.compiler_type) -# zstd.h includes , which is also included by cffi's boilerplate. -# This can lead to duplicate declarations. So we strip this include from the -# preprocessor invocation. +def preprocess(path): + # zstd.h includes , which is also included by cffi's boilerplate. + # This can lead to duplicate declarations. So we strip this include from the + # preprocessor invocation. + with open(path, 'rb') as fh: + lines = [l for l in fh if not l.startswith(b'#include ')] -with open(os.path.join(HERE, 'zstd', 'zstd.h'), 'rb') as fh: - lines = [l for l in fh if not l.startswith(b'#include ')] - -fd, input_file = tempfile.mkstemp(suffix='.h') -os.write(fd, b''.join(lines)) -os.close(fd) + fd, input_file = tempfile.mkstemp(suffix='.h') + os.write(fd, b''.join(lines)) + os.close(fd) -args.append(input_file) + try: + process = subprocess.Popen(args + [input_file], stdout=subprocess.PIPE) + output = process.communicate()[0] + ret = process.poll() + if ret: + raise Exception('preprocessor exited with error') -try: - process = subprocess.Popen(args, stdout=subprocess.PIPE) - output = process.communicate()[0] - ret = process.poll() - if ret: - raise Exception('preprocessor exited with error') -finally: - os.unlink(input_file) + return output + finally: + os.unlink(input_file) -def normalize_output(): + +def normalize_output(output): lines = [] for line in output.splitlines(): # CFFI's parser doesn't like __attribute__ on UNIX compilers. if line.startswith(b'__attribute__ ((visibility ("default"))) '): line = line[len(b'__attribute__ ((visibility ("default"))) '):] + if line.startswith(b'__attribute__((deprecated('): + continue + elif b'__declspec(deprecated(' in line: + continue + lines.append(line) return b'\n'.join(lines) + ffi = cffi.FFI() ffi.set_source('_zstd_cffi', ''' +#include "mem.h" #define ZSTD_STATIC_LINKING_ONLY #include "zstd.h" +#define ZDICT_STATIC_LINKING_ONLY +#include "pool.h" +#include "zdict.h" ''', sources=SOURCES, include_dirs=INCLUDE_DIRS) -ffi.cdef(normalize_output().decode('latin1')) +DEFINE = re.compile(b'^\\#define ([a-zA-Z0-9_]+) ') + +sources = [] + +for header in HEADERS: + preprocessed = preprocess(header) + sources.append(normalize_output(preprocessed)) + + # Do another pass over source and find constants that were preprocessed + # away. + with open(header, 'rb') as fh: + for line in fh: + line = line.strip() + m = DEFINE.match(line) + if not m: + continue + + # The parser doesn't like some constants with complex values. + if m.group(1) in (b'ZSTD_LIB_VERSION', b'ZSTD_VERSION_STRING'): + continue + + sources.append(m.group(0) + b' ...') + +ffi.cdef(u'\n'.join(s.decode('latin1') for s in sources)) if __name__ == '__main__': ffi.compile() diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/setup.py --- a/contrib/python-zstandard/setup.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/setup.py Mon Feb 13 09:44:16 2017 -0800 @@ -62,6 +62,7 @@ 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', ], keywords='zstandard zstd compression', ext_modules=extensions, diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/setup_zstd.py --- a/contrib/python-zstandard/setup_zstd.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/setup_zstd.py Mon Feb 13 09:44:16 2017 -0800 @@ -12,6 +12,8 @@ 'common/entropy_common.c', 'common/error_private.c', 'common/fse_decompress.c', + 'common/pool.c', + 'common/threading.c', 'common/xxhash.c', 'common/zstd_common.c', 'compress/fse_compress.c', @@ -19,11 +21,13 @@ 'compress/zstd_compress.c', 'decompress/huf_decompress.c', 'decompress/zstd_decompress.c', + 'dictBuilder/cover.c', 'dictBuilder/divsufsort.c', 'dictBuilder/zdict.c', )] zstd_sources_legacy = ['zstd/%s' % p for p in ( + 'deprecated/zbuff_common.c', 'deprecated/zbuff_compress.c', 'deprecated/zbuff_decompress.c', 'legacy/zstd_v01.c', @@ -63,6 +67,7 @@ 'c-ext/decompressoriterator.c', 'c-ext/decompressionwriter.c', 'c-ext/dictparams.c', + 'c-ext/frameparams.c', ] zstd_depends = [ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/common.py --- a/contrib/python-zstandard/tests/common.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/common.py Mon Feb 13 09:44:16 2017 -0800 @@ -1,4 +1,50 @@ +import inspect import io +import types + + +def make_cffi(cls): + """Decorator to add CFFI versions of each test method.""" + + try: + import zstd_cffi + except ImportError: + return cls + + # If CFFI version is available, dynamically construct test methods + # that use it. + + for attr in dir(cls): + fn = getattr(cls, attr) + if not inspect.ismethod(fn) and not inspect.isfunction(fn): + continue + + if not fn.__name__.startswith('test_'): + continue + + name = '%s_cffi' % fn.__name__ + + # Replace the "zstd" symbol with the CFFI module instance. Then copy + # the function object and install it in a new attribute. + if isinstance(fn, types.FunctionType): + globs = dict(fn.__globals__) + globs['zstd'] = zstd_cffi + new_fn = types.FunctionType(fn.__code__, globs, name, + fn.__defaults__, fn.__closure__) + new_method = new_fn + else: + globs = dict(fn.__func__.func_globals) + globs['zstd'] = zstd_cffi + new_fn = types.FunctionType(fn.__func__.func_code, globs, name, + fn.__func__.func_defaults, + fn.__func__.func_closure) + new_method = types.UnboundMethodType(new_fn, fn.im_self, + fn.im_class) + + setattr(cls, name, new_method) + + return cls + class OpCountingBytesIO(io.BytesIO): def __init__(self, *args, **kwargs): diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_cffi.py --- a/contrib/python-zstandard/tests/test_cffi.py Fri Feb 10 18:20:58 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -import io - -try: - import unittest2 as unittest -except ImportError: - import unittest - -import zstd - -try: - import zstd_cffi -except ImportError: - raise unittest.SkipTest('cffi version of zstd not available') - - -class TestCFFIWriteToToCDecompressor(unittest.TestCase): - def test_simple(self): - orig = io.BytesIO() - orig.write(b'foo') - orig.write(b'bar') - orig.write(b'foobar' * 16384) - - dest = io.BytesIO() - cctx = zstd_cffi.ZstdCompressor() - with cctx.write_to(dest) as compressor: - compressor.write(orig.getvalue()) - - uncompressed = io.BytesIO() - dctx = zstd.ZstdDecompressor() - with dctx.write_to(uncompressed) as decompressor: - decompressor.write(dest.getvalue()) - - self.assertEqual(uncompressed.getvalue(), orig.getvalue()) - - diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_compressor.py --- a/contrib/python-zstandard/tests/test_compressor.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/test_compressor.py Mon Feb 13 09:44:16 2017 -0800 @@ -10,7 +10,10 @@ import zstd -from .common import OpCountingBytesIO +from .common import ( + make_cffi, + OpCountingBytesIO, +) if sys.version_info[0] >= 3: @@ -19,6 +22,7 @@ next = lambda it: it.next() +@make_cffi class TestCompressor(unittest.TestCase): def test_level_bounds(self): with self.assertRaises(ValueError): @@ -28,18 +32,17 @@ zstd.ZstdCompressor(level=23) +@make_cffi class TestCompressor_compress(unittest.TestCase): def test_compress_empty(self): cctx = zstd.ZstdCompressor(level=1) - cctx.compress(b'') - - cctx = zstd.ZstdCompressor(level=22) - cctx.compress(b'') - - def test_compress_empty(self): - cctx = zstd.ZstdCompressor(level=1) - self.assertEqual(cctx.compress(b''), - b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + result = cctx.compress(b'') + self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + params = zstd.get_frame_parameters(result) + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 524288) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum, 0) # TODO should be temporary until https://github.com/facebook/zstd/issues/506 # is fixed. @@ -59,6 +62,13 @@ self.assertEqual(len(result), 999) self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd') + # This matches the test for read_from() below. + cctx = zstd.ZstdCompressor(level=1) + result = cctx.compress(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE + b'o') + self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x40\x54\x00\x00' + b'\x10\x66\x66\x01\x00\xfb\xff\x39\xc0' + b'\x02\x09\x00\x00\x6f') + def test_write_checksum(self): cctx = zstd.ZstdCompressor(level=1) no_checksum = cctx.compress(b'foobar') @@ -67,6 +77,12 @@ self.assertEqual(len(with_checksum), len(no_checksum) + 4) + no_params = zstd.get_frame_parameters(no_checksum) + with_params = zstd.get_frame_parameters(with_checksum) + + self.assertFalse(no_params.has_checksum) + self.assertTrue(with_params.has_checksum) + def test_write_content_size(self): cctx = zstd.ZstdCompressor(level=1) no_size = cctx.compress(b'foobar' * 256) @@ -75,6 +91,11 @@ self.assertEqual(len(with_size), len(no_size) + 1) + no_params = zstd.get_frame_parameters(no_size) + with_params = zstd.get_frame_parameters(with_size) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 1536) + def test_no_dict_id(self): samples = [] for i in range(128): @@ -92,6 +113,11 @@ self.assertEqual(len(with_dict_id), len(no_dict_id) + 4) + no_params = zstd.get_frame_parameters(no_dict_id) + with_params = zstd.get_frame_parameters(with_dict_id) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, 1584102229) + def test_compress_dict_multiple(self): samples = [] for i in range(128): @@ -107,6 +133,7 @@ cctx.compress(b'foo bar foobar foo bar foobar') +@make_cffi class TestCompressor_compressobj(unittest.TestCase): def test_compressobj_empty(self): cctx = zstd.ZstdCompressor(level=1) @@ -127,6 +154,12 @@ self.assertEqual(len(result), 999) self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd') + params = zstd.get_frame_parameters(result) + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 1048576) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + def test_write_checksum(self): cctx = zstd.ZstdCompressor(level=1) cobj = cctx.compressobj() @@ -135,6 +168,15 @@ cobj = cctx.compressobj() with_checksum = cobj.compress(b'foobar') + cobj.flush() + no_params = zstd.get_frame_parameters(no_checksum) + with_params = zstd.get_frame_parameters(with_checksum) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, 0) + self.assertFalse(no_params.has_checksum) + self.assertTrue(with_params.has_checksum) + self.assertEqual(len(with_checksum), len(no_checksum) + 4) def test_write_content_size(self): @@ -145,6 +187,15 @@ cobj = cctx.compressobj(size=len(b'foobar' * 256)) with_size = cobj.compress(b'foobar' * 256) + cobj.flush() + no_params = zstd.get_frame_parameters(no_size) + with_params = zstd.get_frame_parameters(with_size) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 1536) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, 0) + self.assertFalse(no_params.has_checksum) + self.assertFalse(with_params.has_checksum) + self.assertEqual(len(with_size), len(no_size) + 1) def test_compress_after_finished(self): @@ -187,6 +238,7 @@ self.assertEqual(header, b'\x01\x00\x00') +@make_cffi class TestCompressor_copy_stream(unittest.TestCase): def test_no_read(self): source = object() @@ -229,6 +281,12 @@ self.assertEqual(r, 255 * 16384) self.assertEqual(w, 999) + params = zstd.get_frame_parameters(dest.getvalue()) + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 1048576) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + def test_write_checksum(self): source = io.BytesIO(b'foobar') no_checksum = io.BytesIO() @@ -244,6 +302,15 @@ self.assertEqual(len(with_checksum.getvalue()), len(no_checksum.getvalue()) + 4) + no_params = zstd.get_frame_parameters(no_checksum.getvalue()) + with_params = zstd.get_frame_parameters(with_checksum.getvalue()) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, 0) + self.assertFalse(no_params.has_checksum) + self.assertTrue(with_params.has_checksum) + def test_write_content_size(self): source = io.BytesIO(b'foobar' * 256) no_size = io.BytesIO() @@ -268,6 +335,15 @@ self.assertEqual(len(with_size.getvalue()), len(no_size.getvalue()) + 1) + no_params = zstd.get_frame_parameters(no_size.getvalue()) + with_params = zstd.get_frame_parameters(with_size.getvalue()) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 1536) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, 0) + self.assertFalse(no_params.has_checksum) + self.assertFalse(with_params.has_checksum) + def test_read_write_size(self): source = OpCountingBytesIO(b'foobarfoobar') dest = OpCountingBytesIO() @@ -288,18 +364,25 @@ return buffer.getvalue() +@make_cffi class TestCompressor_write_to(unittest.TestCase): def test_empty(self): - self.assertEqual(compress(b'', 1), - b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + result = compress(b'', 1) + self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00') + + params = zstd.get_frame_parameters(result) + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 524288) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) def test_multiple_compress(self): buffer = io.BytesIO() cctx = zstd.ZstdCompressor(level=5) with cctx.write_to(buffer) as compressor: - compressor.write(b'foo') - compressor.write(b'bar') - compressor.write(b'x' * 8192) + self.assertEqual(compressor.write(b'foo'), 0) + self.assertEqual(compressor.write(b'bar'), 0) + self.assertEqual(compressor.write(b'x' * 8192), 0) result = buffer.getvalue() self.assertEqual(result, @@ -318,11 +401,23 @@ buffer = io.BytesIO() cctx = zstd.ZstdCompressor(level=9, dict_data=d) with cctx.write_to(buffer) as compressor: - compressor.write(b'foo') - compressor.write(b'bar') - compressor.write(b'foo' * 16384) + self.assertEqual(compressor.write(b'foo'), 0) + self.assertEqual(compressor.write(b'bar'), 0) + self.assertEqual(compressor.write(b'foo' * 16384), 634) compressed = buffer.getvalue() + + params = zstd.get_frame_parameters(compressed) + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 1024) + self.assertEqual(params.dict_id, d.dict_id()) + self.assertFalse(params.has_checksum) + + self.assertEqual(compressed[0:32], + b'\x28\xb5\x2f\xfd\x03\x00\x55\x7b\x6b\x5e\x54\x00' + b'\x00\x00\x02\xfc\xf4\xa5\xba\x23\x3f\x85\xb3\x54' + b'\x00\x00\x18\x6f\x6f\x66\x01\x00') + h = hashlib.sha1(compressed).hexdigest() self.assertEqual(h, '1c5bcd25181bcd8c1a73ea8773323e0056129f92') @@ -332,11 +427,18 @@ buffer = io.BytesIO() cctx = zstd.ZstdCompressor(compression_params=params) with cctx.write_to(buffer) as compressor: - compressor.write(b'foo') - compressor.write(b'bar') - compressor.write(b'foobar' * 16384) + self.assertEqual(compressor.write(b'foo'), 0) + self.assertEqual(compressor.write(b'bar'), 0) + self.assertEqual(compressor.write(b'foobar' * 16384), 0) compressed = buffer.getvalue() + + params = zstd.get_frame_parameters(compressed) + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 1048576) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + h = hashlib.sha1(compressed).hexdigest() self.assertEqual(h, '1ae31f270ed7de14235221a604b31ecd517ebd99') @@ -344,12 +446,21 @@ no_checksum = io.BytesIO() cctx = zstd.ZstdCompressor(level=1) with cctx.write_to(no_checksum) as compressor: - compressor.write(b'foobar') + self.assertEqual(compressor.write(b'foobar'), 0) with_checksum = io.BytesIO() cctx = zstd.ZstdCompressor(level=1, write_checksum=True) with cctx.write_to(with_checksum) as compressor: - compressor.write(b'foobar') + self.assertEqual(compressor.write(b'foobar'), 0) + + no_params = zstd.get_frame_parameters(no_checksum.getvalue()) + with_params = zstd.get_frame_parameters(with_checksum.getvalue()) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, 0) + self.assertFalse(no_params.has_checksum) + self.assertTrue(with_params.has_checksum) self.assertEqual(len(with_checksum.getvalue()), len(no_checksum.getvalue()) + 4) @@ -358,12 +469,12 @@ no_size = io.BytesIO() cctx = zstd.ZstdCompressor(level=1) with cctx.write_to(no_size) as compressor: - compressor.write(b'foobar' * 256) + self.assertEqual(compressor.write(b'foobar' * 256), 0) with_size = io.BytesIO() cctx = zstd.ZstdCompressor(level=1, write_content_size=True) with cctx.write_to(with_size) as compressor: - compressor.write(b'foobar' * 256) + self.assertEqual(compressor.write(b'foobar' * 256), 0) # Source size is not known in streaming mode, so header not # written. @@ -373,7 +484,16 @@ # Declaring size will write the header. with_size = io.BytesIO() with cctx.write_to(with_size, size=len(b'foobar' * 256)) as compressor: - compressor.write(b'foobar' * 256) + self.assertEqual(compressor.write(b'foobar' * 256), 0) + + no_params = zstd.get_frame_parameters(no_size.getvalue()) + with_params = zstd.get_frame_parameters(with_size.getvalue()) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 1536) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, 0) + self.assertFalse(no_params.has_checksum) + self.assertFalse(with_params.has_checksum) self.assertEqual(len(with_size.getvalue()), len(no_size.getvalue()) + 1) @@ -390,12 +510,21 @@ with_dict_id = io.BytesIO() cctx = zstd.ZstdCompressor(level=1, dict_data=d) with cctx.write_to(with_dict_id) as compressor: - compressor.write(b'foobarfoobar') + self.assertEqual(compressor.write(b'foobarfoobar'), 0) cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False) no_dict_id = io.BytesIO() with cctx.write_to(no_dict_id) as compressor: - compressor.write(b'foobarfoobar') + self.assertEqual(compressor.write(b'foobarfoobar'), 0) + + no_params = zstd.get_frame_parameters(no_dict_id.getvalue()) + with_params = zstd.get_frame_parameters(with_dict_id.getvalue()) + self.assertEqual(no_params.content_size, 0) + self.assertEqual(with_params.content_size, 0) + self.assertEqual(no_params.dict_id, 0) + self.assertEqual(with_params.dict_id, d.dict_id()) + self.assertFalse(no_params.has_checksum) + self.assertFalse(with_params.has_checksum) self.assertEqual(len(with_dict_id.getvalue()), len(no_dict_id.getvalue()) + 4) @@ -412,9 +541,9 @@ cctx = zstd.ZstdCompressor(level=3) dest = OpCountingBytesIO() with cctx.write_to(dest, write_size=1) as compressor: - compressor.write(b'foo') - compressor.write(b'bar') - compressor.write(b'foobar') + self.assertEqual(compressor.write(b'foo'), 0) + self.assertEqual(compressor.write(b'bar'), 0) + self.assertEqual(compressor.write(b'foobar'), 0) self.assertEqual(len(dest.getvalue()), dest._write_count) @@ -422,15 +551,15 @@ cctx = zstd.ZstdCompressor(level=3) dest = OpCountingBytesIO() with cctx.write_to(dest) as compressor: - compressor.write(b'foo') + self.assertEqual(compressor.write(b'foo'), 0) self.assertEqual(dest._write_count, 0) - compressor.flush() + self.assertEqual(compressor.flush(), 12) self.assertEqual(dest._write_count, 1) - compressor.write(b'bar') + self.assertEqual(compressor.write(b'bar'), 0) self.assertEqual(dest._write_count, 1) - compressor.flush() + self.assertEqual(compressor.flush(), 6) self.assertEqual(dest._write_count, 2) - compressor.write(b'baz') + self.assertEqual(compressor.write(b'baz'), 0) self.assertEqual(dest._write_count, 3) @@ -438,10 +567,10 @@ cctx = zstd.ZstdCompressor(level=3, write_checksum=True) dest = OpCountingBytesIO() with cctx.write_to(dest) as compressor: - compressor.write(b'foobar' * 8192) + self.assertEqual(compressor.write(b'foobar' * 8192), 0) count = dest._write_count offset = dest.tell() - compressor.flush() + self.assertEqual(compressor.flush(), 23) self.assertGreater(dest._write_count, count) self.assertGreater(dest.tell(), offset) offset = dest.tell() @@ -456,18 +585,22 @@ self.assertEqual(header, b'\x01\x00\x00') +@make_cffi class TestCompressor_read_from(unittest.TestCase): def test_type_validation(self): cctx = zstd.ZstdCompressor() # Object with read() works. - cctx.read_from(io.BytesIO()) + for chunk in cctx.read_from(io.BytesIO()): + pass # Buffer protocol works. - cctx.read_from(b'foobar') + for chunk in cctx.read_from(b'foobar'): + pass with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'): - cctx.read_from(True) + for chunk in cctx.read_from(True): + pass def test_read_empty(self): cctx = zstd.ZstdCompressor(level=1) @@ -521,6 +654,12 @@ # We should get the same output as the one-shot compression mechanism. self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue())) + params = zstd.get_frame_parameters(b''.join(chunks)) + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 262144) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + # Now check the buffer protocol. it = cctx.read_from(source.getvalue()) chunks = list(it) diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_data_structures.py --- a/contrib/python-zstandard/tests/test_data_structures.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/test_data_structures.py Mon Feb 13 09:44:16 2017 -0800 @@ -13,6 +13,12 @@ import zstd +from . common import ( + make_cffi, +) + + +@make_cffi class TestCompressionParameters(unittest.TestCase): def test_init_bad_arg_type(self): with self.assertRaises(TypeError): @@ -42,7 +48,81 @@ p = zstd.get_compression_parameters(1) self.assertIsInstance(p, zstd.CompressionParameters) - self.assertEqual(p[0], 19) + self.assertEqual(p.window_log, 19) + + def test_members(self): + p = zstd.CompressionParameters(10, 6, 7, 4, 5, 8, 1) + self.assertEqual(p.window_log, 10) + self.assertEqual(p.chain_log, 6) + self.assertEqual(p.hash_log, 7) + self.assertEqual(p.search_log, 4) + self.assertEqual(p.search_length, 5) + self.assertEqual(p.target_length, 8) + self.assertEqual(p.strategy, 1) + + +@make_cffi +class TestFrameParameters(unittest.TestCase): + def test_invalid_type(self): + with self.assertRaises(TypeError): + zstd.get_frame_parameters(None) + + with self.assertRaises(TypeError): + zstd.get_frame_parameters(u'foobarbaz') + + def test_invalid_input_sizes(self): + with self.assertRaisesRegexp(zstd.ZstdError, 'not enough data for frame'): + zstd.get_frame_parameters(b'') + + with self.assertRaisesRegexp(zstd.ZstdError, 'not enough data for frame'): + zstd.get_frame_parameters(zstd.FRAME_HEADER) + + def test_invalid_frame(self): + with self.assertRaisesRegexp(zstd.ZstdError, 'Unknown frame descriptor'): + zstd.get_frame_parameters(b'foobarbaz') + + def test_attributes(self): + params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x00') + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 1024) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + + # Lowest 2 bits indicate a dictionary and length. Here, the dict id is 1 byte. + params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x01\x00\xff') + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 1024) + self.assertEqual(params.dict_id, 255) + self.assertFalse(params.has_checksum) + + # Lowest 3rd bit indicates if checksum is present. + params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x04\x00') + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 1024) + self.assertEqual(params.dict_id, 0) + self.assertTrue(params.has_checksum) + + # Upper 2 bits indicate content size. + params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x40\x00\xff\x00') + self.assertEqual(params.content_size, 511) + self.assertEqual(params.window_size, 1024) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + + # Window descriptor is 2nd byte after frame header. + params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x00\x40') + self.assertEqual(params.content_size, 0) + self.assertEqual(params.window_size, 262144) + self.assertEqual(params.dict_id, 0) + self.assertFalse(params.has_checksum) + + # Set multiple things. + params = zstd.get_frame_parameters(zstd.FRAME_HEADER + b'\x45\x40\x0f\x10\x00') + self.assertEqual(params.content_size, 272) + self.assertEqual(params.window_size, 262144) + self.assertEqual(params.dict_id, 15) + self.assertTrue(params.has_checksum) + if hypothesis: s_windowlog = strategies.integers(min_value=zstd.WINDOWLOG_MIN, @@ -65,6 +145,8 @@ zstd.STRATEGY_BTLAZY2, zstd.STRATEGY_BTOPT)) + + @make_cffi class TestCompressionParametersHypothesis(unittest.TestCase): @hypothesis.given(s_windowlog, s_chainlog, s_hashlog, s_searchlog, s_searchlength, s_targetlength, s_strategy) @@ -73,9 +155,6 @@ p = zstd.CompressionParameters(windowlog, chainlog, hashlog, searchlog, searchlength, targetlength, strategy) - self.assertEqual(tuple(p), - (windowlog, chainlog, hashlog, searchlog, - searchlength, targetlength, strategy)) # Verify we can instantiate a compressor with the supplied values. # ZSTD_checkCParams moves the goal posts on us from what's advertised diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_decompressor.py --- a/contrib/python-zstandard/tests/test_decompressor.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/test_decompressor.py Mon Feb 13 09:44:16 2017 -0800 @@ -10,7 +10,10 @@ import zstd -from .common import OpCountingBytesIO +from .common import ( + make_cffi, + OpCountingBytesIO, +) if sys.version_info[0] >= 3: @@ -19,6 +22,7 @@ next = lambda it: it.next() +@make_cffi class TestDecompressor_decompress(unittest.TestCase): def test_empty_input(self): dctx = zstd.ZstdDecompressor() @@ -119,6 +123,7 @@ self.assertEqual(decompressed, sources[i]) +@make_cffi class TestDecompressor_copy_stream(unittest.TestCase): def test_no_read(self): source = object() @@ -180,6 +185,7 @@ self.assertEqual(dest._write_count, len(dest.getvalue())) +@make_cffi class TestDecompressor_decompressobj(unittest.TestCase): def test_simple(self): data = zstd.ZstdCompressor(level=1).compress(b'foobar') @@ -207,6 +213,7 @@ return buffer.getvalue() +@make_cffi class TestDecompressor_write_to(unittest.TestCase): def test_empty_roundtrip(self): cctx = zstd.ZstdCompressor() @@ -256,14 +263,14 @@ buffer = io.BytesIO() cctx = zstd.ZstdCompressor(dict_data=d) with cctx.write_to(buffer) as compressor: - compressor.write(orig) + self.assertEqual(compressor.write(orig), 1544) compressed = buffer.getvalue() buffer = io.BytesIO() dctx = zstd.ZstdDecompressor(dict_data=d) with dctx.write_to(buffer) as decompressor: - decompressor.write(compressed) + self.assertEqual(decompressor.write(compressed), len(orig)) self.assertEqual(buffer.getvalue(), orig) @@ -291,6 +298,7 @@ self.assertEqual(dest._write_count, len(dest.getvalue())) +@make_cffi class TestDecompressor_read_from(unittest.TestCase): def test_type_validation(self): dctx = zstd.ZstdDecompressor() @@ -302,7 +310,7 @@ dctx.read_from(b'foobar') with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'): - dctx.read_from(True) + b''.join(dctx.read_from(True)) def test_empty_input(self): dctx = zstd.ZstdDecompressor() @@ -351,7 +359,7 @@ dctx = zstd.ZstdDecompressor() with self.assertRaisesRegexp(ValueError, 'skip_bytes must be smaller than read_size'): - dctx.read_from(b'', skip_bytes=1, read_size=1) + b''.join(dctx.read_from(b'', skip_bytes=1, read_size=1)) with self.assertRaisesRegexp(ValueError, 'skip_bytes larger than first input chunk'): b''.join(dctx.read_from(b'foobar', skip_bytes=10)) @@ -476,3 +484,94 @@ self.assertEqual(len(chunk), 1) self.assertEqual(source._read_count, len(source.getvalue())) + + +@make_cffi +class TestDecompressor_content_dict_chain(unittest.TestCase): + def test_bad_inputs_simple(self): + dctx = zstd.ZstdDecompressor() + + with self.assertRaises(TypeError): + dctx.decompress_content_dict_chain(b'foo') + + with self.assertRaises(TypeError): + dctx.decompress_content_dict_chain((b'foo', b'bar')) + + with self.assertRaisesRegexp(ValueError, 'empty input chain'): + dctx.decompress_content_dict_chain([]) + + with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'): + dctx.decompress_content_dict_chain([u'foo']) + + with self.assertRaisesRegexp(ValueError, 'chunk 0 must be bytes'): + dctx.decompress_content_dict_chain([True]) + + with self.assertRaisesRegexp(ValueError, 'chunk 0 is too small to contain a zstd frame'): + dctx.decompress_content_dict_chain([zstd.FRAME_HEADER]) + + with self.assertRaisesRegexp(ValueError, 'chunk 0 is not a valid zstd frame'): + dctx.decompress_content_dict_chain([b'foo' * 8]) + + no_size = zstd.ZstdCompressor().compress(b'foo' * 64) + + with self.assertRaisesRegexp(ValueError, 'chunk 0 missing content size in frame'): + dctx.decompress_content_dict_chain([no_size]) + + # Corrupt first frame. + frame = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64) + frame = frame[0:12] + frame[15:] + with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 0'): + dctx.decompress_content_dict_chain([frame]) + + def test_bad_subsequent_input(self): + initial = zstd.ZstdCompressor(write_content_size=True).compress(b'foo' * 64) + + dctx = zstd.ZstdDecompressor() + + with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'): + dctx.decompress_content_dict_chain([initial, u'foo']) + + with self.assertRaisesRegexp(ValueError, 'chunk 1 must be bytes'): + dctx.decompress_content_dict_chain([initial, None]) + + with self.assertRaisesRegexp(ValueError, 'chunk 1 is too small to contain a zstd frame'): + dctx.decompress_content_dict_chain([initial, zstd.FRAME_HEADER]) + + with self.assertRaisesRegexp(ValueError, 'chunk 1 is not a valid zstd frame'): + dctx.decompress_content_dict_chain([initial, b'foo' * 8]) + + no_size = zstd.ZstdCompressor().compress(b'foo' * 64) + + with self.assertRaisesRegexp(ValueError, 'chunk 1 missing content size in frame'): + dctx.decompress_content_dict_chain([initial, no_size]) + + # Corrupt second frame. + cctx = zstd.ZstdCompressor(write_content_size=True, dict_data=zstd.ZstdCompressionDict(b'foo' * 64)) + frame = cctx.compress(b'bar' * 64) + frame = frame[0:12] + frame[15:] + + with self.assertRaisesRegexp(zstd.ZstdError, 'could not decompress chunk 1'): + dctx.decompress_content_dict_chain([initial, frame]) + + def test_simple(self): + original = [ + b'foo' * 64, + b'foobar' * 64, + b'baz' * 64, + b'foobaz' * 64, + b'foobarbaz' * 64, + ] + + chunks = [] + chunks.append(zstd.ZstdCompressor(write_content_size=True).compress(original[0])) + for i, chunk in enumerate(original[1:]): + d = zstd.ZstdCompressionDict(original[i]) + cctx = zstd.ZstdCompressor(dict_data=d, write_content_size=True) + chunks.append(cctx.compress(chunk)) + + for i in range(1, len(original)): + chain = chunks[0:i] + expected = original[i - 1] + dctx = zstd.ZstdDecompressor() + decompressed = dctx.decompress_content_dict_chain(chain) + self.assertEqual(decompressed, expected) diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_estimate_sizes.py --- a/contrib/python-zstandard/tests/test_estimate_sizes.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/test_estimate_sizes.py Mon Feb 13 09:44:16 2017 -0800 @@ -5,7 +5,12 @@ import zstd +from . common import ( + make_cffi, +) + +@make_cffi class TestSizes(unittest.TestCase): def test_decompression_size(self): size = zstd.estimate_decompression_context_size() diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_module_attributes.py --- a/contrib/python-zstandard/tests/test_module_attributes.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/test_module_attributes.py Mon Feb 13 09:44:16 2017 -0800 @@ -7,9 +7,15 @@ import zstd +from . common import ( + make_cffi, +) + + +@make_cffi class TestModuleAttributes(unittest.TestCase): def test_version(self): - self.assertEqual(zstd.ZSTD_VERSION, (1, 1, 2)) + self.assertEqual(zstd.ZSTD_VERSION, (1, 1, 3)) def test_constants(self): self.assertEqual(zstd.MAX_COMPRESSION_LEVEL, 22) @@ -45,4 +51,4 @@ ) for a in attrs: - self.assertTrue(hasattr(zstd, a)) + self.assertTrue(hasattr(zstd, a), a) diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_roundtrip.py --- a/contrib/python-zstandard/tests/test_roundtrip.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/test_roundtrip.py Mon Feb 13 09:44:16 2017 -0800 @@ -13,10 +13,14 @@ import zstd +from .common import ( + make_cffi, +) compression_levels = strategies.integers(min_value=1, max_value=22) +@make_cffi class TestRoundTrip(unittest.TestCase): @hypothesis.given(strategies.binary(), compression_levels) def test_compress_write_to(self, data, level): diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/tests/test_train_dictionary.py --- a/contrib/python-zstandard/tests/test_train_dictionary.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/tests/test_train_dictionary.py Mon Feb 13 09:44:16 2017 -0800 @@ -7,6 +7,9 @@ import zstd +from . common import ( + make_cffi, +) if sys.version_info[0] >= 3: int_type = int @@ -14,6 +17,7 @@ int_type = long +@make_cffi class TestTrainDictionary(unittest.TestCase): def test_no_args(self): with self.assertRaises(TypeError): diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd.c --- a/contrib/python-zstandard/zstd.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd.c Mon Feb 13 09:44:16 2017 -0800 @@ -34,6 +34,11 @@ "Obtains a ``CompressionParameters`` instance from a compression level and\n" "optional input size and dictionary size"); +PyDoc_STRVAR(get_frame_parameters__doc__, +"get_frame_parameters(data)\n" +"\n" +"Obtains a ``FrameParameters`` instance by parsing data.\n"); + PyDoc_STRVAR(train_dictionary__doc__, "train_dictionary(dict_size, samples)\n" "\n" @@ -53,6 +58,8 @@ METH_NOARGS, estimate_decompression_context_size__doc__ }, { "get_compression_parameters", (PyCFunction)get_compression_parameters, METH_VARARGS, get_compression_parameters__doc__ }, + { "get_frame_parameters", (PyCFunction)get_frame_parameters, + METH_VARARGS, get_frame_parameters__doc__ }, { "train_dictionary", (PyCFunction)train_dictionary, METH_VARARGS | METH_KEYWORDS, train_dictionary__doc__ }, { NULL, NULL } @@ -70,6 +77,7 @@ void decompressobj_module_init(PyObject* mod); void decompressionwriter_module_init(PyObject* mod); void decompressoriterator_module_init(PyObject* mod); +void frameparams_module_init(PyObject* mod); void zstd_module_init(PyObject* m) { /* python-zstandard relies on unstable zstd C API features. This means @@ -87,7 +95,7 @@ We detect this mismatch here and refuse to load the module if this scenario is detected. */ - if (ZSTD_VERSION_NUMBER != 10102 || ZSTD_versionNumber() != 10102) { + if (ZSTD_VERSION_NUMBER != 10103 || ZSTD_versionNumber() != 10103) { PyErr_SetString(PyExc_ImportError, "zstd C API mismatch; Python bindings not compiled against expected zstd version"); return; } @@ -104,6 +112,7 @@ decompressobj_module_init(m); decompressionwriter_module_init(m); decompressoriterator_module_init(m); + frameparams_module_init(m); } #if PY_MAJOR_VERSION >= 3 diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/mem.h --- a/contrib/python-zstandard/zstd/common/mem.h Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/common/mem.h Mon Feb 13 09:44:16 2017 -0800 @@ -39,7 +39,7 @@ #endif /* code only tested on 32 and 64 bits systems */ -#define MEM_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(int)(!!(c)) }; } +#define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; } MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/pool.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/common/pool.c Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,194 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + + +/* ====== Dependencies ======= */ +#include /* size_t */ +#include /* malloc, calloc, free */ +#include "pool.h" + +/* ====== Compiler specifics ====== */ +#if defined(_MSC_VER) +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +#endif + + +#ifdef ZSTD_MULTITHREAD + +#include "threading.h" /* pthread adaptation */ + +/* A job is a function and an opaque argument */ +typedef struct POOL_job_s { + POOL_function function; + void *opaque; +} POOL_job; + +struct POOL_ctx_s { + /* Keep track of the threads */ + pthread_t *threads; + size_t numThreads; + + /* The queue is a circular buffer */ + POOL_job *queue; + size_t queueHead; + size_t queueTail; + size_t queueSize; + /* The mutex protects the queue */ + pthread_mutex_t queueMutex; + /* Condition variable for pushers to wait on when the queue is full */ + pthread_cond_t queuePushCond; + /* Condition variables for poppers to wait on when the queue is empty */ + pthread_cond_t queuePopCond; + /* Indicates if the queue is shutting down */ + int shutdown; +}; + +/* POOL_thread() : + Work thread for the thread pool. + Waits for jobs and executes them. + @returns : NULL on failure else non-null. +*/ +static void* POOL_thread(void* opaque) { + POOL_ctx* const ctx = (POOL_ctx*)opaque; + if (!ctx) { return NULL; } + for (;;) { + /* Lock the mutex and wait for a non-empty queue or until shutdown */ + pthread_mutex_lock(&ctx->queueMutex); + while (ctx->queueHead == ctx->queueTail && !ctx->shutdown) { + pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); + } + /* empty => shutting down: so stop */ + if (ctx->queueHead == ctx->queueTail) { + pthread_mutex_unlock(&ctx->queueMutex); + return opaque; + } + /* Pop a job off the queue */ + { POOL_job const job = ctx->queue[ctx->queueHead]; + ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize; + /* Unlock the mutex, signal a pusher, and run the job */ + pthread_mutex_unlock(&ctx->queueMutex); + pthread_cond_signal(&ctx->queuePushCond); + job.function(job.opaque); + } + } + /* Unreachable */ +} + +POOL_ctx *POOL_create(size_t numThreads, size_t queueSize) { + POOL_ctx *ctx; + /* Check the parameters */ + if (!numThreads || !queueSize) { return NULL; } + /* Allocate the context and zero initialize */ + ctx = (POOL_ctx *)calloc(1, sizeof(POOL_ctx)); + if (!ctx) { return NULL; } + /* Initialize the job queue. + * It needs one extra space since one space is wasted to differentiate empty + * and full queues. + */ + ctx->queueSize = queueSize + 1; + ctx->queue = (POOL_job *)malloc(ctx->queueSize * sizeof(POOL_job)); + ctx->queueHead = 0; + ctx->queueTail = 0; + pthread_mutex_init(&ctx->queueMutex, NULL); + pthread_cond_init(&ctx->queuePushCond, NULL); + pthread_cond_init(&ctx->queuePopCond, NULL); + ctx->shutdown = 0; + /* Allocate space for the thread handles */ + ctx->threads = (pthread_t *)malloc(numThreads * sizeof(pthread_t)); + ctx->numThreads = 0; + /* Check for errors */ + if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; } + /* Initialize the threads */ + { size_t i; + for (i = 0; i < numThreads; ++i) { + if (pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) { + ctx->numThreads = i; + POOL_free(ctx); + return NULL; + } } + ctx->numThreads = numThreads; + } + return ctx; +} + +/*! POOL_join() : + Shutdown the queue, wake any sleeping threads, and join all of the threads. +*/ +static void POOL_join(POOL_ctx *ctx) { + /* Shut down the queue */ + pthread_mutex_lock(&ctx->queueMutex); + ctx->shutdown = 1; + pthread_mutex_unlock(&ctx->queueMutex); + /* Wake up sleeping threads */ + pthread_cond_broadcast(&ctx->queuePushCond); + pthread_cond_broadcast(&ctx->queuePopCond); + /* Join all of the threads */ + { size_t i; + for (i = 0; i < ctx->numThreads; ++i) { + pthread_join(ctx->threads[i], NULL); + } } +} + +void POOL_free(POOL_ctx *ctx) { + if (!ctx) { return; } + POOL_join(ctx); + pthread_mutex_destroy(&ctx->queueMutex); + pthread_cond_destroy(&ctx->queuePushCond); + pthread_cond_destroy(&ctx->queuePopCond); + if (ctx->queue) free(ctx->queue); + if (ctx->threads) free(ctx->threads); + free(ctx); +} + +void POOL_add(void *ctxVoid, POOL_function function, void *opaque) { + POOL_ctx *ctx = (POOL_ctx *)ctxVoid; + if (!ctx) { return; } + + pthread_mutex_lock(&ctx->queueMutex); + { POOL_job const job = {function, opaque}; + /* Wait until there is space in the queue for the new job */ + size_t newTail = (ctx->queueTail + 1) % ctx->queueSize; + while (ctx->queueHead == newTail && !ctx->shutdown) { + pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); + newTail = (ctx->queueTail + 1) % ctx->queueSize; + } + /* The queue is still going => there is space */ + if (!ctx->shutdown) { + ctx->queue[ctx->queueTail] = job; + ctx->queueTail = newTail; + } + } + pthread_mutex_unlock(&ctx->queueMutex); + pthread_cond_signal(&ctx->queuePopCond); +} + +#else /* ZSTD_MULTITHREAD not defined */ +/* No multi-threading support */ + +/* We don't need any data, but if it is empty malloc() might return NULL. */ +struct POOL_ctx_s { + int data; +}; + +POOL_ctx *POOL_create(size_t numThreads, size_t queueSize) { + (void)numThreads; + (void)queueSize; + return (POOL_ctx *)malloc(sizeof(POOL_ctx)); +} + +void POOL_free(POOL_ctx *ctx) { + if (ctx) free(ctx); +} + +void POOL_add(void *ctx, POOL_function function, void *opaque) { + (void)ctx; + function(opaque); +} + +#endif /* ZSTD_MULTITHREAD */ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/pool.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/common/pool.h Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,56 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ +#ifndef POOL_H +#define POOL_H + +#if defined (__cplusplus) +extern "C" { +#endif + + +#include /* size_t */ + +typedef struct POOL_ctx_s POOL_ctx; + +/*! POOL_create() : + Create a thread pool with at most `numThreads` threads. + `numThreads` must be at least 1. + The maximum number of queued jobs before blocking is `queueSize`. + `queueSize` must be at least 1. + @return : The POOL_ctx pointer on success else NULL. +*/ +POOL_ctx *POOL_create(size_t numThreads, size_t queueSize); + +/*! POOL_free() : + Free a thread pool returned by POOL_create(). +*/ +void POOL_free(POOL_ctx *ctx); + +/*! POOL_function : + The function type that can be added to a thread pool. +*/ +typedef void (*POOL_function)(void *); +/*! POOL_add_function : + The function type for a generic thread pool add function. +*/ +typedef void (*POOL_add_function)(void *, POOL_function, void *); + +/*! POOL_add() : + Add the job `function(opaque)` to the thread pool. + Possibly blocks until there is room in the queue. + Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed. +*/ +void POOL_add(void *ctx, POOL_function function, void *opaque); + + +#if defined (__cplusplus) +} +#endif + +#endif diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/threading.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/common/threading.c Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,79 @@ + +/** + * Copyright (c) 2016 Tino Reichardt + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + */ + +/** + * This file will hold wrapper for systems, which do not support pthreads + */ + +/* ====== Compiler specifics ====== */ +#if defined(_MSC_VER) +# pragma warning(disable : 4206) /* disable: C4206: translation unit is empty (when ZSTD_MULTITHREAD is not defined) */ +#endif + + +#if defined(ZSTD_MULTITHREAD) && defined(_WIN32) + +/** + * Windows minimalist Pthread Wrapper, based on : + * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html + */ + + +/* === Dependencies === */ +#include +#include +#include "threading.h" + + +/* === Implementation === */ + +static unsigned __stdcall worker(void *arg) +{ + pthread_t* const thread = (pthread_t*) arg; + thread->arg = thread->start_routine(thread->arg); + return 0; +} + +int pthread_create(pthread_t* thread, const void* unused, + void* (*start_routine) (void*), void* arg) +{ + (void)unused; + thread->arg = arg; + thread->start_routine = start_routine; + thread->handle = (HANDLE) _beginthreadex(NULL, 0, worker, thread, 0, NULL); + + if (!thread->handle) + return errno; + else + return 0; +} + +int _pthread_join(pthread_t * thread, void **value_ptr) +{ + DWORD result; + + if (!thread->handle) return 0; + + result = WaitForSingleObject(thread->handle, INFINITE); + switch (result) { + case WAIT_OBJECT_0: + if (value_ptr) *value_ptr = thread->arg; + return 0; + case WAIT_ABANDONED: + return EINVAL; + default: + return GetLastError(); + } +} + +#endif /* ZSTD_MULTITHREAD */ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/threading.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/common/threading.h Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,104 @@ + +/** + * Copyright (c) 2016 Tino Reichardt + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + */ + +#ifndef THREADING_H_938743 +#define THREADING_H_938743 + +#if defined (__cplusplus) +extern "C" { +#endif + +#if defined(ZSTD_MULTITHREAD) && defined(_WIN32) + +/** + * Windows minimalist Pthread Wrapper, based on : + * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html + */ +#ifdef WINVER +# undef WINVER +#endif +#define WINVER 0x0600 + +#ifdef _WIN32_WINNT +# undef _WIN32_WINNT +#endif +#define _WIN32_WINNT 0x0600 + +#ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif + +#include + +/* mutex */ +#define pthread_mutex_t CRITICAL_SECTION +#define pthread_mutex_init(a,b) InitializeCriticalSection((a)) +#define pthread_mutex_destroy(a) DeleteCriticalSection((a)) +#define pthread_mutex_lock(a) EnterCriticalSection((a)) +#define pthread_mutex_unlock(a) LeaveCriticalSection((a)) + +/* condition variable */ +#define pthread_cond_t CONDITION_VARIABLE +#define pthread_cond_init(a, b) InitializeConditionVariable((a)) +#define pthread_cond_destroy(a) /* No delete */ +#define pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) +#define pthread_cond_signal(a) WakeConditionVariable((a)) +#define pthread_cond_broadcast(a) WakeAllConditionVariable((a)) + +/* pthread_create() and pthread_join() */ +typedef struct { + HANDLE handle; + void* (*start_routine)(void*); + void* arg; +} pthread_t; + +int pthread_create(pthread_t* thread, const void* unused, + void* (*start_routine) (void*), void* arg); + +#define pthread_join(a, b) _pthread_join(&(a), (b)) +int _pthread_join(pthread_t* thread, void** value_ptr); + +/** + * add here more wrappers as required + */ + + +#elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection mathod */ +/* === POSIX Systems === */ +# include + +#else /* ZSTD_MULTITHREAD not defined */ +/* No multithreading support */ + +#define pthread_mutex_t int /* #define rather than typedef, as sometimes pthread support is implicit, resulting in duplicated symbols */ +#define pthread_mutex_init(a,b) +#define pthread_mutex_destroy(a) +#define pthread_mutex_lock(a) +#define pthread_mutex_unlock(a) + +#define pthread_cond_t int +#define pthread_cond_init(a,b) +#define pthread_cond_destroy(a) +#define pthread_cond_wait(a,b) +#define pthread_cond_signal(a) +#define pthread_cond_broadcast(a) + +/* do not use pthread_t */ + +#endif /* ZSTD_MULTITHREAD */ + +#if defined (__cplusplus) +} +#endif + +#endif /* THREADING_H_938743 */ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/zstd_common.c --- a/contrib/python-zstandard/zstd/common/zstd_common.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/common/zstd_common.c Mon Feb 13 09:44:16 2017 -0800 @@ -43,10 +43,6 @@ * provides error code string from enum */ const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorName(code); } -/* --- ZBUFF Error Management (deprecated) --- */ -unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); } -const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } - /*=************************************************************** * Custom allocator diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/zstd_errors.h --- a/contrib/python-zstandard/zstd/common/zstd_errors.h Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/common/zstd_errors.h Mon Feb 13 09:44:16 2017 -0800 @@ -18,6 +18,20 @@ #include /* size_t */ +/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default"))) +#else +# define ZSTDERRORLIB_VISIBILITY +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +#endif + /*-**************************************** * error codes list ******************************************/ @@ -49,8 +63,8 @@ /*! ZSTD_getErrorCode() : convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, which can be used to compare directly with enum list published into "error_public.h" */ -ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); -const char* ZSTD_getErrorString(ZSTD_ErrorCode code); +ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); +ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); #if defined (__cplusplus) diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/common/zstd_internal.h --- a/contrib/python-zstandard/zstd/common/zstd_internal.h Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/common/zstd_internal.h Mon Feb 13 09:44:16 2017 -0800 @@ -267,4 +267,13 @@ } +/* hidden functions */ + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx); + + #endif /* ZSTD_CCOMMON_H_MODULE */ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/compress/zstd_compress.c --- a/contrib/python-zstandard/zstd/compress/zstd_compress.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/compress/zstd_compress.c Mon Feb 13 09:44:16 2017 -0800 @@ -51,8 +51,7 @@ /*-************************************* * Context memory management ***************************************/ -struct ZSTD_CCtx_s -{ +struct ZSTD_CCtx_s { const BYTE* nextSrc; /* next block here to continue on current prefix */ const BYTE* base; /* All regular indexes relative to this position */ const BYTE* dictBase; /* extDict indexes relative to this position */ @@ -61,10 +60,11 @@ U32 nextToUpdate; /* index from which to continue dictionary update */ U32 nextToUpdate3; /* index from which to continue dictionary update */ U32 hashLog3; /* dispatch table : larger == faster, more memory */ - U32 loadedDictEnd; + U32 loadedDictEnd; /* index of end of dictionary */ + U32 forceWindow; /* force back-references to respect limit of 1<customMem), &customMem, sizeof(customMem)); + cctx->customMem = customMem; return cctx; } @@ -119,6 +119,15 @@ return sizeof(*cctx) + cctx->workSpaceSize; } +size_t ZSTD_setCCtxParameter(ZSTD_CCtx* cctx, ZSTD_CCtxParameter param, unsigned value) +{ + switch(param) + { + case ZSTD_p_forceWindow : cctx->forceWindow = value>0; cctx->loadedDictEnd = 0; return 0; + default: return ERROR(parameter_unknown); + } +} + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) /* hidden interface */ { return &(ctx->seqStore); @@ -318,6 +327,14 @@ } } +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) { + int i; + for (i=0; irep[i] = 0; +} /*! ZSTD_copyCCtx() : * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. @@ -735,12 +752,19 @@ if ((size_t)(op-ostart) >= maxCSize) return 0; } /* confirm repcodes */ - { int i; for (i=0; irep[i] = zc->savedRep[i]; } + { int i; for (i=0; irep[i] = zc->repToConfirm[i]; } return op - ostart; } +#if 0 /* for debug */ +# define STORESEQ_DEBUG +#include /* fprintf */ +U32 g_startDebug = 0; +const BYTE* g_start = NULL; +#endif + /*! ZSTD_storeSeq() : Store a sequence (literal length, literals, offset code and match length code) into seqStore_t. `offsetCode` : distance to match, or 0 == repCode. @@ -748,13 +772,14 @@ */ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, U32 offsetCode, size_t matchCode) { -#if 0 /* for debug */ - static const BYTE* g_start = NULL; - const U32 pos = (U32)((const BYTE*)literals - g_start); - if (g_start==NULL) g_start = (const BYTE*)literals; - //if ((pos > 1) && (pos < 50000)) - printf("Cpos %6u :%5u literals & match %3u bytes at distance %6u \n", - pos, (U32)litLength, (U32)matchCode+MINMATCH, (U32)offsetCode); +#ifdef STORESEQ_DEBUG + if (g_startDebug) { + const U32 pos = (U32)((const BYTE*)literals - g_start); + if (g_start==NULL) g_start = (const BYTE*)literals; + if ((pos > 1895000) && (pos < 1895300)) + fprintf(stderr, "Cpos %6u :%5u literals & match %3u bytes at distance %6u \n", + pos, (U32)litLength, (U32)matchCode+MINMATCH, (U32)offsetCode); + } #endif /* copy Literals */ ZSTD_wildcopy(seqStorePtr->lit, literals, litLength); @@ -1004,8 +1029,8 @@ } } } /* save reps for next block */ - cctx->savedRep[0] = offset_1 ? offset_1 : offsetSaved; - cctx->savedRep[1] = offset_2 ? offset_2 : offsetSaved; + cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved; + cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved; /* Last Literals */ { size_t const lastLLSize = iend - anchor; @@ -1119,7 +1144,7 @@ } } } /* save reps for next block */ - ctx->savedRep[0] = offset_1; ctx->savedRep[1] = offset_2; + ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2; /* Last Literals */ { size_t const lastLLSize = iend - anchor; @@ -1273,8 +1298,8 @@ } } } /* save reps for next block */ - cctx->savedRep[0] = offset_1 ? offset_1 : offsetSaved; - cctx->savedRep[1] = offset_2 ? offset_2 : offsetSaved; + cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved; + cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved; /* Last Literals */ { size_t const lastLLSize = iend - anchor; @@ -1423,7 +1448,7 @@ } } } /* save reps for next block */ - ctx->savedRep[0] = offset_1; ctx->savedRep[1] = offset_2; + ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2; /* Last Literals */ { size_t const lastLLSize = iend - anchor; @@ -1955,8 +1980,8 @@ } } /* Save reps for next block */ - ctx->savedRep[0] = offset_1 ? offset_1 : savedOffset; - ctx->savedRep[1] = offset_2 ? offset_2 : savedOffset; + ctx->repToConfirm[0] = offset_1 ? offset_1 : savedOffset; + ctx->repToConfirm[1] = offset_2 ? offset_2 : savedOffset; /* Last Literals */ { size_t const lastLLSize = iend - anchor; @@ -2150,7 +2175,7 @@ } } /* Save reps for next block */ - ctx->savedRep[0] = offset_1; ctx->savedRep[1] = offset_2; + ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2; /* Last Literals */ { size_t const lastLLSize = iend - anchor; @@ -2409,12 +2434,14 @@ cctx->nextSrc = ip + srcSize; - { size_t const cSize = frame ? + if (srcSize) { + size_t const cSize = frame ? ZSTD_compress_generic (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize); if (ZSTD_isError(cSize)) return cSize; return cSize + fhSize; - } + } else + return fhSize; } @@ -2450,7 +2477,7 @@ zc->dictBase = zc->base; zc->base += ip - zc->nextSrc; zc->nextToUpdate = zc->dictLimit; - zc->loadedDictEnd = (U32)(iend - zc->base); + zc->loadedDictEnd = zc->forceWindow ? 0 : (U32)(iend - zc->base); zc->nextSrc = iend; if (srcSize <= HASH_READ_SIZE) return 0; @@ -2557,9 +2584,9 @@ } if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted); - cctx->rep[0] = MEM_readLE32(dictPtr+0); if (cctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted); - cctx->rep[1] = MEM_readLE32(dictPtr+4); if (cctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted); - cctx->rep[2] = MEM_readLE32(dictPtr+8); if (cctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted); + cctx->rep[0] = MEM_readLE32(dictPtr+0); if (cctx->rep[0] == 0 || cctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted); + cctx->rep[1] = MEM_readLE32(dictPtr+4); if (cctx->rep[1] == 0 || cctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted); + cctx->rep[2] = MEM_readLE32(dictPtr+8); if (cctx->rep[2] == 0 || cctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted); dictPtr += 12; { U32 offcodeMax = MaxOff; @@ -2594,7 +2621,6 @@ } } - /*! ZSTD_compressBegin_internal() : * @return : 0, or an error code */ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, @@ -2626,9 +2652,9 @@ } -size_t ZSTD_compressBegin(ZSTD_CCtx* zc, int compressionLevel) +size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) { - return ZSTD_compressBegin_usingDict(zc, NULL, 0, compressionLevel); + return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); } @@ -2733,7 +2759,8 @@ /* ===== Dictionary API ===== */ struct ZSTD_CDict_s { - void* dictContent; + void* dictBuffer; + const void* dictContent; size_t dictContentSize; ZSTD_CCtx* refContext; }; /* typedef'd tp ZSTD_CDict within "zstd.h" */ @@ -2741,39 +2768,45 @@ size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) { if (cdict==NULL) return 0; /* support sizeof on NULL */ - return ZSTD_sizeof_CCtx(cdict->refContext) + cdict->dictContentSize; + return ZSTD_sizeof_CCtx(cdict->refContext) + (cdict->dictBuffer ? cdict->dictContentSize : 0) + sizeof(*cdict); } -ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, ZSTD_parameters params, ZSTD_customMem customMem) +ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, unsigned byReference, + ZSTD_parameters params, ZSTD_customMem customMem) { if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem; if (!customMem.customAlloc || !customMem.customFree) return NULL; { ZSTD_CDict* const cdict = (ZSTD_CDict*) ZSTD_malloc(sizeof(ZSTD_CDict), customMem); - void* const dictContent = ZSTD_malloc(dictSize, customMem); ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(customMem); - if (!dictContent || !cdict || !cctx) { - ZSTD_free(dictContent, customMem); + if (!cdict || !cctx) { ZSTD_free(cdict, customMem); ZSTD_free(cctx, customMem); return NULL; } - if (dictSize) { - memcpy(dictContent, dict, dictSize); + if ((byReference) || (!dictBuffer) || (!dictSize)) { + cdict->dictBuffer = NULL; + cdict->dictContent = dictBuffer; + } else { + void* const internalBuffer = ZSTD_malloc(dictSize, customMem); + if (!internalBuffer) { ZSTD_free(cctx, customMem); ZSTD_free(cdict, customMem); return NULL; } + memcpy(internalBuffer, dictBuffer, dictSize); + cdict->dictBuffer = internalBuffer; + cdict->dictContent = internalBuffer; } - { size_t const errorCode = ZSTD_compressBegin_advanced(cctx, dictContent, dictSize, params, 0); + + { size_t const errorCode = ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0); if (ZSTD_isError(errorCode)) { - ZSTD_free(dictContent, customMem); + ZSTD_free(cdict->dictBuffer, customMem); + ZSTD_free(cctx, customMem); ZSTD_free(cdict, customMem); - ZSTD_free(cctx, customMem); return NULL; } } - cdict->dictContent = dictContent; + cdict->refContext = cctx; cdict->dictContentSize = dictSize; - cdict->refContext = cctx; return cdict; } } @@ -2783,7 +2816,15 @@ ZSTD_customMem const allocator = { NULL, NULL, NULL }; ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize); params.fParams.contentSizeFlag = 1; - return ZSTD_createCDict_advanced(dict, dictSize, params, allocator); + return ZSTD_createCDict_advanced(dict, dictSize, 0, params, allocator); +} + +ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize); + params.fParams.contentSizeFlag = 1; + return ZSTD_createCDict_advanced(dict, dictSize, 1, params, allocator); } size_t ZSTD_freeCDict(ZSTD_CDict* cdict) @@ -2791,7 +2832,7 @@ if (cdict==NULL) return 0; /* support free on NULL */ { ZSTD_customMem const cMem = cdict->refContext->customMem; ZSTD_freeCCtx(cdict->refContext); - ZSTD_free(cdict->dictContent, cMem); + ZSTD_free(cdict->dictBuffer, cMem); ZSTD_free(cdict, cMem); return 0; } @@ -2801,7 +2842,7 @@ return ZSTD_getParamsFromCCtx(cdict->refContext); } -size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, U64 pledgedSrcSize) +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize) { if (cdict->dictContentSize) CHECK_F(ZSTD_copyCCtx(cctx, cdict->refContext, pledgedSrcSize)) else CHECK_F(ZSTD_compressBegin_advanced(cctx, NULL, 0, cdict->refContext->params, pledgedSrcSize)); @@ -2900,7 +2941,7 @@ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize) { - if (zcs->inBuffSize==0) return ERROR(stage_wrong); /* zcs has not been init at least once */ + if (zcs->inBuffSize==0) return ERROR(stage_wrong); /* zcs has not been init at least once => can't reset */ if (zcs->cdict) CHECK_F(ZSTD_compressBegin_usingCDict(zcs->cctx, zcs->cdict, pledgedSrcSize)) else CHECK_F(ZSTD_compressBegin_advanced(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize)); @@ -2937,9 +2978,9 @@ if (zcs->outBuff == NULL) return ERROR(memory_allocation); } - if (dict) { + if (dict && dictSize >= 8) { ZSTD_freeCDict(zcs->cdictLocal); - zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, params, zcs->customMem); + zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0, params, zcs->customMem); if (zcs->cdictLocal == NULL) return ERROR(memory_allocation); zcs->cdict = zcs->cdictLocal; } else zcs->cdict = NULL; @@ -2956,6 +2997,7 @@ ZSTD_parameters const params = ZSTD_getParamsFromCDict(cdict); size_t const initError = ZSTD_initCStream_advanced(zcs, NULL, 0, params, 0); zcs->cdict = cdict; + zcs->cctx->dictID = params.fParams.noDictIDFlag ? 0 : cdict->refContext->dictID; return initError; } @@ -2967,7 +3009,8 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0); + ZSTD_parameters params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0); + if (pledgedSrcSize) params.fParams.contentSizeFlag = 1; return ZSTD_initCStream_advanced(zcs, NULL, 0, params, pledgedSrcSize); } diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/compress/zstd_opt.h --- a/contrib/python-zstandard/zstd/compress/zstd_opt.h Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/compress/zstd_opt.h Mon Feb 13 09:44:16 2017 -0800 @@ -38,7 +38,7 @@ ssPtr->cachedLiterals = NULL; ssPtr->cachedPrice = ssPtr->cachedLitLength = 0; - ssPtr->staticPrices = 0; + ssPtr->staticPrices = 0; if (ssPtr->litLengthSum == 0) { if (srcSize <= 1024) ssPtr->staticPrices = 1; @@ -56,7 +56,7 @@ for (u=0; u<=MaxLit; u++) { ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u]>>ZSTD_FREQ_DIV); - ssPtr->litSum += ssPtr->litFreq[u]; + ssPtr->litSum += ssPtr->litFreq[u]; } for (u=0; u<=MaxLL; u++) ssPtr->litLengthFreq[u] = 1; @@ -634,7 +634,7 @@ } } /* for (cur=0; cur < last_pos; ) */ /* Save reps for next block */ - { int i; for (i=0; isavedRep[i] = rep[i]; } + { int i; for (i=0; irepToConfirm[i] = rep[i]; } /* Last Literals */ { size_t const lastLLSize = iend - anchor; @@ -825,7 +825,7 @@ match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, inr, iend, maxSearches, mls, matches, minMatch); - if (match_num > 0 && matches[match_num-1].len > sufficient_len) { + if (match_num > 0 && (matches[match_num-1].len > sufficient_len || cur + matches[match_num-1].len >= ZSTD_OPT_NUM)) { best_mlen = matches[match_num-1].len; best_off = matches[match_num-1].off; last_pos = cur + 1; @@ -835,7 +835,7 @@ /* set prices using matches at position = cur */ for (u = 0; u < match_num; u++) { mlen = (u>0) ? matches[u-1].len+1 : best_mlen; - best_mlen = (cur + matches[u].len < ZSTD_OPT_NUM) ? matches[u].len : ZSTD_OPT_NUM - cur; + best_mlen = matches[u].len; while (mlen <= best_mlen) { if (opt[cur].mlen == 1) { @@ -907,7 +907,7 @@ } } /* for (cur=0; cur < last_pos; ) */ /* Save reps for next block */ - { int i; for (i=0; isavedRep[i] = rep[i]; } + { int i; for (i=0; irepToConfirm[i] = rep[i]; } /* Last Literals */ { size_t lastLLSize = iend - anchor; diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/compress/zstdmt_compress.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstdmt_compress.c Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,740 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + + +/* ====== Tuning parameters ====== */ +#define ZSTDMT_NBTHREADS_MAX 128 + + +/* ====== Compiler specifics ====== */ +#if defined(_MSC_VER) +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +#endif + + +/* ====== Dependencies ====== */ +#include /* malloc */ +#include /* memcpy */ +#include "pool.h" /* threadpool */ +#include "threading.h" /* mutex */ +#include "zstd_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ +#include "zstdmt_compress.h" +#define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ +#include "xxhash.h" + + +/* ====== Debug ====== */ +#if 0 + +# include +# include +# include + static unsigned g_debugLevel = 3; +# define DEBUGLOGRAW(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __VA_ARGS__); } +# define DEBUGLOG(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __FILE__ ": "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, " \n"); } + +# define DEBUG_PRINTHEX(l,p,n) { \ + unsigned debug_u; \ + for (debug_u=0; debug_u<(n); debug_u++) \ + DEBUGLOGRAW(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ + DEBUGLOGRAW(l, " \n"); \ +} + +static unsigned long long GetCurrentClockTimeMicroseconds() +{ + static clock_t _ticksPerSecond = 0; + if (_ticksPerSecond <= 0) _ticksPerSecond = sysconf(_SC_CLK_TCK); + + struct tms junk; clock_t newTicks = (clock_t) times(&junk); + return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond); +} + +#define MUTEX_WAIT_TIME_DLEVEL 5 +#define PTHREAD_MUTEX_LOCK(mutex) \ +if (g_debugLevel>=MUTEX_WAIT_TIME_DLEVEL) { \ + unsigned long long beforeTime = GetCurrentClockTimeMicroseconds(); \ + pthread_mutex_lock(mutex); \ + unsigned long long afterTime = GetCurrentClockTimeMicroseconds(); \ + unsigned long long elapsedTime = (afterTime-beforeTime); \ + if (elapsedTime > 1000) { /* or whatever threshold you like; I'm using 1 millisecond here */ \ + DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \ + elapsedTime, #mutex); \ + } \ +} else pthread_mutex_lock(mutex); + +#else + +# define DEBUGLOG(l, ...) {} /* disabled */ +# define PTHREAD_MUTEX_LOCK(m) pthread_mutex_lock(m) +# define DEBUG_PRINTHEX(l,p,n) {} + +#endif + + +/* ===== Buffer Pool ===== */ + +typedef struct buffer_s { + void* start; + size_t size; +} buffer_t; + +static const buffer_t g_nullBuffer = { NULL, 0 }; + +typedef struct ZSTDMT_bufferPool_s { + unsigned totalBuffers; + unsigned nbBuffers; + buffer_t bTable[1]; /* variable size */ +} ZSTDMT_bufferPool; + +static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbThreads) +{ + unsigned const maxNbBuffers = 2*nbThreads + 2; + ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)calloc(1, sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t)); + if (bufPool==NULL) return NULL; + bufPool->totalBuffers = maxNbBuffers; + bufPool->nbBuffers = 0; + return bufPool; +} + +static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) +{ + unsigned u; + if (!bufPool) return; /* compatibility with free on NULL */ + for (u=0; utotalBuffers; u++) + free(bufPool->bTable[u].start); + free(bufPool); +} + +/* assumption : invocation from main thread only ! */ +static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* pool, size_t bSize) +{ + if (pool->nbBuffers) { /* try to use an existing buffer */ + buffer_t const buf = pool->bTable[--(pool->nbBuffers)]; + size_t const availBufferSize = buf.size; + if ((availBufferSize >= bSize) & (availBufferSize <= 10*bSize)) /* large enough, but not too much */ + return buf; + free(buf.start); /* size conditions not respected : scratch this buffer and create a new one */ + } + /* create new buffer */ + { buffer_t buffer; + void* const start = malloc(bSize); + if (start==NULL) bSize = 0; + buffer.start = start; /* note : start can be NULL if malloc fails ! */ + buffer.size = bSize; + return buffer; + } +} + +/* store buffer for later re-use, up to pool capacity */ +static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* pool, buffer_t buf) +{ + if (buf.start == NULL) return; /* release on NULL */ + if (pool->nbBuffers < pool->totalBuffers) { + pool->bTable[pool->nbBuffers++] = buf; /* store for later re-use */ + return; + } + /* Reached bufferPool capacity (should not happen) */ + free(buf.start); +} + + +/* ===== CCtx Pool ===== */ + +typedef struct { + unsigned totalCCtx; + unsigned availCCtx; + ZSTD_CCtx* cctx[1]; /* variable size */ +} ZSTDMT_CCtxPool; + +/* assumption : CCtxPool invocation only from main thread */ + +/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */ +static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) +{ + unsigned u; + for (u=0; utotalCCtx; u++) + ZSTD_freeCCtx(pool->cctx[u]); /* note : compatible with free on NULL */ + free(pool); +} + +/* ZSTDMT_createCCtxPool() : + * implies nbThreads >= 1 , checked by caller ZSTDMT_createCCtx() */ +static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(unsigned nbThreads) +{ + ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) calloc(1, sizeof(ZSTDMT_CCtxPool) + (nbThreads-1)*sizeof(ZSTD_CCtx*)); + if (!cctxPool) return NULL; + cctxPool->totalCCtx = nbThreads; + cctxPool->availCCtx = 1; /* at least one cctx for single-thread mode */ + cctxPool->cctx[0] = ZSTD_createCCtx(); + if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; } + DEBUGLOG(1, "cctxPool created, with %u threads", nbThreads); + return cctxPool; +} + +static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* pool) +{ + if (pool->availCCtx) { + pool->availCCtx--; + return pool->cctx[pool->availCCtx]; + } + return ZSTD_createCCtx(); /* note : can be NULL, when creation fails ! */ +} + +static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return; /* compatibility with release on NULL */ + if (pool->availCCtx < pool->totalCCtx) + pool->cctx[pool->availCCtx++] = cctx; + else + /* pool overflow : should not happen, since totalCCtx==nbThreads */ + ZSTD_freeCCtx(cctx); +} + + +/* ===== Thread worker ===== */ + +typedef struct { + buffer_t buffer; + size_t filled; +} inBuff_t; + +typedef struct { + ZSTD_CCtx* cctx; + buffer_t src; + const void* srcStart; + size_t srcSize; + size_t dictSize; + buffer_t dstBuff; + size_t cSize; + size_t dstFlushed; + unsigned firstChunk; + unsigned lastChunk; + unsigned jobCompleted; + unsigned jobScanned; + pthread_mutex_t* jobCompleted_mutex; + pthread_cond_t* jobCompleted_cond; + ZSTD_parameters params; + ZSTD_CDict* cdict; + unsigned long long fullFrameSize; +} ZSTDMT_jobDescription; + +/* ZSTDMT_compressChunk() : POOL_function type */ +void ZSTDMT_compressChunk(void* jobDescription) +{ + ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription; + const void* const src = (const char*)job->srcStart + job->dictSize; + buffer_t const dstBuff = job->dstBuff; + DEBUGLOG(3, "job (first:%u) (last:%u) : dictSize %u, srcSize %u", job->firstChunk, job->lastChunk, (U32)job->dictSize, (U32)job->srcSize); + if (job->cdict) { + size_t const initError = ZSTD_compressBegin_usingCDict(job->cctx, job->cdict, job->fullFrameSize); + if (job->cdict) DEBUGLOG(3, "using CDict "); + if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; } + } else { + size_t const initError = ZSTD_compressBegin_advanced(job->cctx, job->srcStart, job->dictSize, job->params, job->fullFrameSize); + if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; } + ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceWindow, 1); + } + if (!job->firstChunk) { /* flush frame header */ + size_t const hSize = ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, 0); + if (ZSTD_isError(hSize)) { job->cSize = hSize; goto _endJob; } + ZSTD_invalidateRepCodes(job->cctx); + } + + DEBUGLOG(4, "Compressing : "); + DEBUG_PRINTHEX(4, job->srcStart, 12); + job->cSize = (job->lastChunk) ? /* last chunk signal */ + ZSTD_compressEnd (job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize) : + ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize); + DEBUGLOG(3, "compressed %u bytes into %u bytes (first:%u) (last:%u)", (unsigned)job->srcSize, (unsigned)job->cSize, job->firstChunk, job->lastChunk); + +_endJob: + PTHREAD_MUTEX_LOCK(job->jobCompleted_mutex); + job->jobCompleted = 1; + job->jobScanned = 0; + pthread_cond_signal(job->jobCompleted_cond); + pthread_mutex_unlock(job->jobCompleted_mutex); +} + + +/* ------------------------------------------ */ +/* ===== Multi-threaded compression ===== */ +/* ------------------------------------------ */ + +struct ZSTDMT_CCtx_s { + POOL_ctx* factory; + ZSTDMT_bufferPool* buffPool; + ZSTDMT_CCtxPool* cctxPool; + pthread_mutex_t jobCompleted_mutex; + pthread_cond_t jobCompleted_cond; + size_t targetSectionSize; + size_t marginSize; + size_t inBuffSize; + size_t dictSize; + size_t targetDictSize; + inBuff_t inBuff; + ZSTD_parameters params; + XXH64_state_t xxhState; + unsigned nbThreads; + unsigned jobIDMask; + unsigned doneJobID; + unsigned nextJobID; + unsigned frameEnded; + unsigned allJobsCompleted; + unsigned overlapRLog; + unsigned long long frameContentSize; + size_t sectionSize; + ZSTD_CDict* cdict; + ZSTD_CStream* cstream; + ZSTDMT_jobDescription jobs[1]; /* variable size (must lies at the end) */ +}; + +ZSTDMT_CCtx *ZSTDMT_createCCtx(unsigned nbThreads) +{ + ZSTDMT_CCtx* cctx; + U32 const minNbJobs = nbThreads + 2; + U32 const nbJobsLog2 = ZSTD_highbit32(minNbJobs) + 1; + U32 const nbJobs = 1 << nbJobsLog2; + DEBUGLOG(5, "nbThreads : %u ; minNbJobs : %u ; nbJobsLog2 : %u ; nbJobs : %u \n", + nbThreads, minNbJobs, nbJobsLog2, nbJobs); + if ((nbThreads < 1) | (nbThreads > ZSTDMT_NBTHREADS_MAX)) return NULL; + cctx = (ZSTDMT_CCtx*) calloc(1, sizeof(ZSTDMT_CCtx) + nbJobs*sizeof(ZSTDMT_jobDescription)); + if (!cctx) return NULL; + cctx->nbThreads = nbThreads; + cctx->jobIDMask = nbJobs - 1; + cctx->allJobsCompleted = 1; + cctx->sectionSize = 0; + cctx->overlapRLog = 3; + cctx->factory = POOL_create(nbThreads, 1); + cctx->buffPool = ZSTDMT_createBufferPool(nbThreads); + cctx->cctxPool = ZSTDMT_createCCtxPool(nbThreads); + if (!cctx->factory | !cctx->buffPool | !cctx->cctxPool) { /* one object was not created */ + ZSTDMT_freeCCtx(cctx); + return NULL; + } + if (nbThreads==1) { + cctx->cstream = ZSTD_createCStream(); + if (!cctx->cstream) { + ZSTDMT_freeCCtx(cctx); return NULL; + } } + pthread_mutex_init(&cctx->jobCompleted_mutex, NULL); /* Todo : check init function return */ + pthread_cond_init(&cctx->jobCompleted_cond, NULL); + DEBUGLOG(4, "mt_cctx created, for %u threads \n", nbThreads); + return cctx; +} + +/* ZSTDMT_releaseAllJobResources() : + * Ensure all workers are killed first. */ +static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx) +{ + unsigned jobID; + for (jobID=0; jobID <= mtctx->jobIDMask; jobID++) { + ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[jobID].dstBuff); + mtctx->jobs[jobID].dstBuff = g_nullBuffer; + ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[jobID].src); + mtctx->jobs[jobID].src = g_nullBuffer; + ZSTDMT_releaseCCtx(mtctx->cctxPool, mtctx->jobs[jobID].cctx); + mtctx->jobs[jobID].cctx = NULL; + } + memset(mtctx->jobs, 0, (mtctx->jobIDMask+1)*sizeof(ZSTDMT_jobDescription)); + ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->inBuff.buffer); + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->allJobsCompleted = 1; +} + +size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx) +{ + if (mtctx==NULL) return 0; /* compatible with free on NULL */ + POOL_free(mtctx->factory); + if (!mtctx->allJobsCompleted) ZSTDMT_releaseAllJobResources(mtctx); /* stop workers first */ + ZSTDMT_freeBufferPool(mtctx->buffPool); /* release job resources into pools first */ + ZSTDMT_freeCCtxPool(mtctx->cctxPool); + ZSTD_freeCDict(mtctx->cdict); + ZSTD_freeCStream(mtctx->cstream); + pthread_mutex_destroy(&mtctx->jobCompleted_mutex); + pthread_cond_destroy(&mtctx->jobCompleted_cond); + free(mtctx); + return 0; +} + +size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSDTMT_parameter parameter, unsigned value) +{ + switch(parameter) + { + case ZSTDMT_p_sectionSize : + mtctx->sectionSize = value; + return 0; + case ZSTDMT_p_overlapSectionLog : + DEBUGLOG(4, "ZSTDMT_p_overlapSectionLog : %u", value); + mtctx->overlapRLog = (value >= 9) ? 0 : 9 - value; + return 0; + default : + return ERROR(compressionParameter_unsupported); + } +} + + +/* ------------------------------------------ */ +/* ===== Multi-threaded compression ===== */ +/* ------------------------------------------ */ + +size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, 0); + size_t const chunkTargetSize = (size_t)1 << (params.cParams.windowLog + 2); + unsigned const nbChunksMax = (unsigned)(srcSize / chunkTargetSize) + (srcSize < chunkTargetSize) /* min 1 */; + unsigned nbChunks = MIN(nbChunksMax, mtctx->nbThreads); + size_t const proposedChunkSize = (srcSize + (nbChunks-1)) / nbChunks; + size_t const avgChunkSize = ((proposedChunkSize & 0x1FFFF) < 0xFFFF) ? proposedChunkSize + 0xFFFF : proposedChunkSize; /* avoid too small last block */ + size_t remainingSrcSize = srcSize; + const char* const srcStart = (const char*)src; + size_t frameStartPos = 0; + + DEBUGLOG(3, "windowLog : %2u => chunkTargetSize : %u bytes ", params.cParams.windowLog, (U32)chunkTargetSize); + DEBUGLOG(2, "nbChunks : %2u (chunkSize : %u bytes) ", nbChunks, (U32)avgChunkSize); + params.fParams.contentSizeFlag = 1; + + if (nbChunks==1) { /* fallback to single-thread mode */ + ZSTD_CCtx* const cctx = mtctx->cctxPool->cctx[0]; + return ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); + } + + { unsigned u; + for (u=0; ubuffPool, dstBufferCapacity) : dstAsBuffer; + ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(mtctx->cctxPool); + + if ((cctx==NULL) || (dstBuffer.start==NULL)) { + mtctx->jobs[u].cSize = ERROR(memory_allocation); /* job result */ + mtctx->jobs[u].jobCompleted = 1; + nbChunks = u+1; + break; /* let's wait for previous jobs to complete, but don't start new ones */ + } + + mtctx->jobs[u].srcStart = srcStart + frameStartPos; + mtctx->jobs[u].srcSize = chunkSize; + mtctx->jobs[u].fullFrameSize = srcSize; + mtctx->jobs[u].params = params; + mtctx->jobs[u].dstBuff = dstBuffer; + mtctx->jobs[u].cctx = cctx; + mtctx->jobs[u].firstChunk = (u==0); + mtctx->jobs[u].lastChunk = (u==nbChunks-1); + mtctx->jobs[u].jobCompleted = 0; + mtctx->jobs[u].jobCompleted_mutex = &mtctx->jobCompleted_mutex; + mtctx->jobs[u].jobCompleted_cond = &mtctx->jobCompleted_cond; + + DEBUGLOG(3, "posting job %u (%u bytes)", u, (U32)chunkSize); + DEBUG_PRINTHEX(3, mtctx->jobs[u].srcStart, 12); + POOL_add(mtctx->factory, ZSTDMT_compressChunk, &mtctx->jobs[u]); + + frameStartPos += chunkSize; + remainingSrcSize -= chunkSize; + } } + /* note : since nbChunks <= nbThreads, all jobs should be running immediately in parallel */ + + { unsigned chunkID; + size_t error = 0, dstPos = 0; + for (chunkID=0; chunkIDjobCompleted_mutex); + while (mtctx->jobs[chunkID].jobCompleted==0) { + DEBUGLOG(4, "waiting for jobCompleted signal from chunk %u", chunkID); + pthread_cond_wait(&mtctx->jobCompleted_cond, &mtctx->jobCompleted_mutex); + } + pthread_mutex_unlock(&mtctx->jobCompleted_mutex); + DEBUGLOG(3, "ready to write chunk %u ", chunkID); + + ZSTDMT_releaseCCtx(mtctx->cctxPool, mtctx->jobs[chunkID].cctx); + mtctx->jobs[chunkID].cctx = NULL; + mtctx->jobs[chunkID].srcStart = NULL; + { size_t const cSize = mtctx->jobs[chunkID].cSize; + if (ZSTD_isError(cSize)) error = cSize; + if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall); + if (chunkID) { /* note : chunk 0 is already written directly into dst */ + if (!error) memcpy((char*)dst + dstPos, mtctx->jobs[chunkID].dstBuff.start, cSize); + ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[chunkID].dstBuff); + mtctx->jobs[chunkID].dstBuff = g_nullBuffer; + } + dstPos += cSize ; + } + } + if (!error) DEBUGLOG(3, "compressed size : %u ", (U32)dstPos); + return error ? error : dstPos; + } + +} + + +/* ====================================== */ +/* ======= Streaming API ======= */ +/* ====================================== */ + +static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* zcs) { + while (zcs->doneJobID < zcs->nextJobID) { + unsigned const jobID = zcs->doneJobID & zcs->jobIDMask; + PTHREAD_MUTEX_LOCK(&zcs->jobCompleted_mutex); + while (zcs->jobs[jobID].jobCompleted==0) { + DEBUGLOG(4, "waiting for jobCompleted signal from chunk %u", zcs->doneJobID); /* we want to block when waiting for data to flush */ + pthread_cond_wait(&zcs->jobCompleted_cond, &zcs->jobCompleted_mutex); + } + pthread_mutex_unlock(&zcs->jobCompleted_mutex); + zcs->doneJobID++; + } +} + + +static size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs, + const void* dict, size_t dictSize, unsigned updateDict, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + ZSTD_customMem const cmem = { NULL, NULL, NULL }; + DEBUGLOG(3, "Started new compression, with windowLog : %u", params.cParams.windowLog); + if (zcs->nbThreads==1) return ZSTD_initCStream_advanced(zcs->cstream, dict, dictSize, params, pledgedSrcSize); + if (zcs->allJobsCompleted == 0) { /* previous job not correctly finished */ + ZSTDMT_waitForAllJobsCompleted(zcs); + ZSTDMT_releaseAllJobResources(zcs); + zcs->allJobsCompleted = 1; + } + zcs->params = params; + if (updateDict) { + ZSTD_freeCDict(zcs->cdict); zcs->cdict = NULL; + if (dict && dictSize) { + zcs->cdict = ZSTD_createCDict_advanced(dict, dictSize, 0, params, cmem); + if (zcs->cdict == NULL) return ERROR(memory_allocation); + } } + zcs->frameContentSize = pledgedSrcSize; + zcs->targetDictSize = (zcs->overlapRLog>=9) ? 0 : (size_t)1 << (zcs->params.cParams.windowLog - zcs->overlapRLog); + DEBUGLOG(4, "overlapRLog : %u ", zcs->overlapRLog); + DEBUGLOG(3, "overlap Size : %u KB", (U32)(zcs->targetDictSize>>10)); + zcs->targetSectionSize = zcs->sectionSize ? zcs->sectionSize : (size_t)1 << (zcs->params.cParams.windowLog + 2); + zcs->targetSectionSize = MAX(ZSTDMT_SECTION_SIZE_MIN, zcs->targetSectionSize); + zcs->targetSectionSize = MAX(zcs->targetDictSize, zcs->targetSectionSize); + DEBUGLOG(3, "Section Size : %u KB", (U32)(zcs->targetSectionSize>>10)); + zcs->marginSize = zcs->targetSectionSize >> 2; + zcs->inBuffSize = zcs->targetDictSize + zcs->targetSectionSize + zcs->marginSize; + zcs->inBuff.buffer = ZSTDMT_getBuffer(zcs->buffPool, zcs->inBuffSize); + if (zcs->inBuff.buffer.start == NULL) return ERROR(memory_allocation); + zcs->inBuff.filled = 0; + zcs->dictSize = 0; + zcs->doneJobID = 0; + zcs->nextJobID = 0; + zcs->frameEnded = 0; + zcs->allJobsCompleted = 0; + if (params.fParams.checksumFlag) XXH64_reset(&zcs->xxhState, 0); + return 0; +} + +size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + return ZSTDMT_initCStream_internal(zcs, dict, dictSize, 1, params, pledgedSrcSize); +} + +/* ZSTDMT_resetCStream() : + * pledgedSrcSize is optional and can be zero == unknown */ +size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* zcs, unsigned long long pledgedSrcSize) +{ + if (zcs->nbThreads==1) return ZSTD_resetCStream(zcs->cstream, pledgedSrcSize); + return ZSTDMT_initCStream_internal(zcs, NULL, 0, 0, zcs->params, pledgedSrcSize); +} + +size_t ZSTDMT_initCStream(ZSTDMT_CCtx* zcs, int compressionLevel) { + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, 0); + return ZSTDMT_initCStream_internal(zcs, NULL, 0, 1, params, 0); +} + + +static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* zcs, size_t srcSize, unsigned endFrame) +{ + size_t const dstBufferCapacity = ZSTD_compressBound(srcSize); + buffer_t const dstBuffer = ZSTDMT_getBuffer(zcs->buffPool, dstBufferCapacity); + ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(zcs->cctxPool); + unsigned const jobID = zcs->nextJobID & zcs->jobIDMask; + + if ((cctx==NULL) || (dstBuffer.start==NULL)) { + zcs->jobs[jobID].jobCompleted = 1; + zcs->nextJobID++; + ZSTDMT_waitForAllJobsCompleted(zcs); + ZSTDMT_releaseAllJobResources(zcs); + return ERROR(memory_allocation); + } + + DEBUGLOG(4, "preparing job %u to compress %u bytes with %u preload ", zcs->nextJobID, (U32)srcSize, (U32)zcs->dictSize); + zcs->jobs[jobID].src = zcs->inBuff.buffer; + zcs->jobs[jobID].srcStart = zcs->inBuff.buffer.start; + zcs->jobs[jobID].srcSize = srcSize; + zcs->jobs[jobID].dictSize = zcs->dictSize; /* note : zcs->inBuff.filled is presumed >= srcSize + dictSize */ + zcs->jobs[jobID].params = zcs->params; + if (zcs->nextJobID) zcs->jobs[jobID].params.fParams.checksumFlag = 0; /* do not calculate checksum within sections, just keep it in header for first section */ + zcs->jobs[jobID].cdict = zcs->nextJobID==0 ? zcs->cdict : NULL; + zcs->jobs[jobID].fullFrameSize = zcs->frameContentSize; + zcs->jobs[jobID].dstBuff = dstBuffer; + zcs->jobs[jobID].cctx = cctx; + zcs->jobs[jobID].firstChunk = (zcs->nextJobID==0); + zcs->jobs[jobID].lastChunk = endFrame; + zcs->jobs[jobID].jobCompleted = 0; + zcs->jobs[jobID].dstFlushed = 0; + zcs->jobs[jobID].jobCompleted_mutex = &zcs->jobCompleted_mutex; + zcs->jobs[jobID].jobCompleted_cond = &zcs->jobCompleted_cond; + + /* get a new buffer for next input */ + if (!endFrame) { + size_t const newDictSize = MIN(srcSize + zcs->dictSize, zcs->targetDictSize); + zcs->inBuff.buffer = ZSTDMT_getBuffer(zcs->buffPool, zcs->inBuffSize); + if (zcs->inBuff.buffer.start == NULL) { /* not enough memory to allocate next input buffer */ + zcs->jobs[jobID].jobCompleted = 1; + zcs->nextJobID++; + ZSTDMT_waitForAllJobsCompleted(zcs); + ZSTDMT_releaseAllJobResources(zcs); + return ERROR(memory_allocation); + } + DEBUGLOG(5, "inBuff filled to %u", (U32)zcs->inBuff.filled); + zcs->inBuff.filled -= srcSize + zcs->dictSize - newDictSize; + DEBUGLOG(5, "new job : filled to %u, with %u dict and %u src", (U32)zcs->inBuff.filled, (U32)newDictSize, (U32)(zcs->inBuff.filled - newDictSize)); + memmove(zcs->inBuff.buffer.start, (const char*)zcs->jobs[jobID].srcStart + zcs->dictSize + srcSize - newDictSize, zcs->inBuff.filled); + DEBUGLOG(5, "new inBuff pre-filled"); + zcs->dictSize = newDictSize; + } else { + zcs->inBuff.buffer = g_nullBuffer; + zcs->inBuff.filled = 0; + zcs->dictSize = 0; + zcs->frameEnded = 1; + if (zcs->nextJobID == 0) + zcs->params.fParams.checksumFlag = 0; /* single chunk : checksum is calculated directly within worker thread */ + } + + DEBUGLOG(3, "posting job %u : %u bytes (end:%u) (note : doneJob = %u=>%u)", zcs->nextJobID, (U32)zcs->jobs[jobID].srcSize, zcs->jobs[jobID].lastChunk, zcs->doneJobID, zcs->doneJobID & zcs->jobIDMask); + POOL_add(zcs->factory, ZSTDMT_compressChunk, &zcs->jobs[jobID]); /* this call is blocking when thread worker pool is exhausted */ + zcs->nextJobID++; + return 0; +} + + +/* ZSTDMT_flushNextJob() : + * output : will be updated with amount of data flushed . + * blockToFlush : if >0, the function will block and wait if there is no data available to flush . + * @return : amount of data remaining within internal buffer, 1 if unknown but > 0, 0 if no more, or an error code */ +static size_t ZSTDMT_flushNextJob(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, unsigned blockToFlush) +{ + unsigned const wJobID = zcs->doneJobID & zcs->jobIDMask; + if (zcs->doneJobID == zcs->nextJobID) return 0; /* all flushed ! */ + PTHREAD_MUTEX_LOCK(&zcs->jobCompleted_mutex); + while (zcs->jobs[wJobID].jobCompleted==0) { + DEBUGLOG(5, "waiting for jobCompleted signal from job %u", zcs->doneJobID); + if (!blockToFlush) { pthread_mutex_unlock(&zcs->jobCompleted_mutex); return 0; } /* nothing ready to be flushed => skip */ + pthread_cond_wait(&zcs->jobCompleted_cond, &zcs->jobCompleted_mutex); /* block when nothing available to flush */ + } + pthread_mutex_unlock(&zcs->jobCompleted_mutex); + /* compression job completed : output can be flushed */ + { ZSTDMT_jobDescription job = zcs->jobs[wJobID]; + if (!job.jobScanned) { + if (ZSTD_isError(job.cSize)) { + DEBUGLOG(5, "compression error detected "); + ZSTDMT_waitForAllJobsCompleted(zcs); + ZSTDMT_releaseAllJobResources(zcs); + return job.cSize; + } + ZSTDMT_releaseCCtx(zcs->cctxPool, job.cctx); + zcs->jobs[wJobID].cctx = NULL; + DEBUGLOG(5, "zcs->params.fParams.checksumFlag : %u ", zcs->params.fParams.checksumFlag); + if (zcs->params.fParams.checksumFlag) { + XXH64_update(&zcs->xxhState, (const char*)job.srcStart + job.dictSize, job.srcSize); + if (zcs->frameEnded && (zcs->doneJobID+1 == zcs->nextJobID)) { /* write checksum at end of last section */ + U32 const checksum = (U32)XXH64_digest(&zcs->xxhState); + DEBUGLOG(4, "writing checksum : %08X \n", checksum); + MEM_writeLE32((char*)job.dstBuff.start + job.cSize, checksum); + job.cSize += 4; + zcs->jobs[wJobID].cSize += 4; + } } + ZSTDMT_releaseBuffer(zcs->buffPool, job.src); + zcs->jobs[wJobID].srcStart = NULL; + zcs->jobs[wJobID].src = g_nullBuffer; + zcs->jobs[wJobID].jobScanned = 1; + } + { size_t const toWrite = MIN(job.cSize - job.dstFlushed, output->size - output->pos); + DEBUGLOG(4, "Flushing %u bytes from job %u ", (U32)toWrite, zcs->doneJobID); + memcpy((char*)output->dst + output->pos, (const char*)job.dstBuff.start + job.dstFlushed, toWrite); + output->pos += toWrite; + job.dstFlushed += toWrite; + } + if (job.dstFlushed == job.cSize) { /* output buffer fully flushed => move to next one */ + ZSTDMT_releaseBuffer(zcs->buffPool, job.dstBuff); + zcs->jobs[wJobID].dstBuff = g_nullBuffer; + zcs->jobs[wJobID].jobCompleted = 0; + zcs->doneJobID++; + } else { + zcs->jobs[wJobID].dstFlushed = job.dstFlushed; + } + /* return value : how many bytes left in buffer ; fake it to 1 if unknown but >0 */ + if (job.cSize > job.dstFlushed) return (job.cSize - job.dstFlushed); + if (zcs->doneJobID < zcs->nextJobID) return 1; /* still some buffer to flush */ + zcs->allJobsCompleted = zcs->frameEnded; /* frame completed and entirely flushed */ + return 0; /* everything flushed */ +} } + + +size_t ZSTDMT_compressStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + size_t const newJobThreshold = zcs->dictSize + zcs->targetSectionSize + zcs->marginSize; + if (zcs->frameEnded) return ERROR(stage_wrong); /* current frame being ended. Only flush is allowed. Restart with init */ + if (zcs->nbThreads==1) return ZSTD_compressStream(zcs->cstream, output, input); + + /* fill input buffer */ + { size_t const toLoad = MIN(input->size - input->pos, zcs->inBuffSize - zcs->inBuff.filled); + memcpy((char*)zcs->inBuff.buffer.start + zcs->inBuff.filled, input->src, toLoad); + input->pos += toLoad; + zcs->inBuff.filled += toLoad; + } + + if ( (zcs->inBuff.filled >= newJobThreshold) /* filled enough : let's compress */ + && (zcs->nextJobID <= zcs->doneJobID + zcs->jobIDMask) ) { /* avoid overwriting job round buffer */ + CHECK_F( ZSTDMT_createCompressionJob(zcs, zcs->targetSectionSize, 0) ); + } + + /* check for data to flush */ + CHECK_F( ZSTDMT_flushNextJob(zcs, output, (zcs->inBuff.filled == zcs->inBuffSize)) ); /* block if it wasn't possible to create new job due to saturation */ + + /* recommended next input size : fill current input buffer */ + return zcs->inBuffSize - zcs->inBuff.filled; /* note : could be zero when input buffer is fully filled and no more availability to create new job */ +} + + +static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, unsigned endFrame) +{ + size_t const srcSize = zcs->inBuff.filled - zcs->dictSize; + + if (srcSize) DEBUGLOG(4, "flushing : %u bytes left to compress", (U32)srcSize); + if ( ((srcSize > 0) || (endFrame && !zcs->frameEnded)) + && (zcs->nextJobID <= zcs->doneJobID + zcs->jobIDMask) ) { + CHECK_F( ZSTDMT_createCompressionJob(zcs, srcSize, endFrame) ); + } + + /* check if there is any data available to flush */ + DEBUGLOG(5, "zcs->doneJobID : %u ; zcs->nextJobID : %u ", zcs->doneJobID, zcs->nextJobID); + return ZSTDMT_flushNextJob(zcs, output, 1); +} + + +size_t ZSTDMT_flushStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output) +{ + if (zcs->nbThreads==1) return ZSTD_flushStream(zcs->cstream, output); + return ZSTDMT_flushStream_internal(zcs, output, 0); +} + +size_t ZSTDMT_endStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output) +{ + if (zcs->nbThreads==1) return ZSTD_endStream(zcs->cstream, output); + return ZSTDMT_flushStream_internal(zcs, output, 1); +} diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/compress/zstdmt_compress.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/compress/zstdmt_compress.h Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,78 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + + #ifndef ZSTDMT_COMPRESS_H + #define ZSTDMT_COMPRESS_H + + #if defined (__cplusplus) + extern "C" { + #endif + + +/* Note : All prototypes defined in this file shall be considered experimental. + * There is no guarantee of API continuity (yet) on any of these prototypes */ + +/* === Dependencies === */ +#include /* size_t */ +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */ +#include "zstd.h" /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */ + + +/* === Simple one-pass functions === */ + +typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx; +ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbThreads); +ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* cctx); + +ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + + +/* === Streaming functions === */ + +ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel); +ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be zero == unknown */ + +ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + +ZSTDLIB_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ + + +/* === Advanced functions and parameters === */ + +#ifndef ZSTDMT_SECTION_SIZE_MIN +# define ZSTDMT_SECTION_SIZE_MIN (1U << 20) /* 1 MB - Minimum size of each compression job */ +#endif + +ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, const void* dict, size_t dictSize, /**< dict can be released after init, a local copy is preserved within zcs */ + ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be zero == unknown */ + +/* ZSDTMT_parameter : + * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */ +typedef enum { + ZSTDMT_p_sectionSize, /* size of input "section". Each section is compressed in parallel. 0 means default, which is dynamically determined within compression functions */ + ZSTDMT_p_overlapSectionLog /* Log of overlapped section; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window */ +} ZSDTMT_parameter; + +/* ZSTDMT_setMTCtxParameter() : + * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter. + * The function must be called typically after ZSTD_createCCtx(). + * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSDTMT_parameter parameter, unsigned value); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTDMT_COMPRESS_H */ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/decompress/zstd_decompress.c --- a/contrib/python-zstandard/zstd/decompress/zstd_decompress.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/decompress/zstd_decompress.c Mon Feb 13 09:44:16 2017 -0800 @@ -1444,7 +1444,7 @@ #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1) if (ZSTD_isLegacy(src, srcSize)) return ZSTD_decompressLegacy(dst, dstCapacity, src, srcSize, dict, dictSize); #endif - ZSTD_decompressBegin_usingDict(dctx, dict, dictSize); + CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize)); ZSTD_checkContinuity(dctx, dst); return ZSTD_decompressFrame(dctx, dst, dstCapacity, src, srcSize); } @@ -1671,9 +1671,9 @@ } if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted); - dctx->rep[0] = MEM_readLE32(dictPtr+0); if (dctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted); - dctx->rep[1] = MEM_readLE32(dictPtr+4); if (dctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted); - dctx->rep[2] = MEM_readLE32(dictPtr+8); if (dctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted); + dctx->rep[0] = MEM_readLE32(dictPtr+0); if (dctx->rep[0] == 0 || dctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted); + dctx->rep[1] = MEM_readLE32(dictPtr+4); if (dctx->rep[1] == 0 || dctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted); + dctx->rep[2] = MEM_readLE32(dictPtr+8); if (dctx->rep[2] == 0 || dctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted); dictPtr += 12; dctx->litEntropy = dctx->fseEntropy = 1; @@ -1713,39 +1713,44 @@ /* ====== ZSTD_DDict ====== */ struct ZSTD_DDict_s { - void* dict; + void* dictBuffer; + const void* dictContent; size_t dictSize; ZSTD_DCtx* refContext; }; /* typedef'd to ZSTD_DDict within "zstd.h" */ -ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, ZSTD_customMem customMem) +ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, unsigned byReference, ZSTD_customMem customMem) { if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem; if (!customMem.customAlloc || !customMem.customFree) return NULL; { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem); - void* const dictContent = ZSTD_malloc(dictSize, customMem); ZSTD_DCtx* const dctx = ZSTD_createDCtx_advanced(customMem); - if (!dictContent || !ddict || !dctx) { - ZSTD_free(dictContent, customMem); + if (!ddict || !dctx) { ZSTD_free(ddict, customMem); ZSTD_free(dctx, customMem); return NULL; } - if (dictSize) { - memcpy(dictContent, dict, dictSize); + if ((byReference) || (!dict) || (!dictSize)) { + ddict->dictBuffer = NULL; + ddict->dictContent = dict; + } else { + void* const internalBuffer = ZSTD_malloc(dictSize, customMem); + if (!internalBuffer) { ZSTD_free(dctx, customMem); ZSTD_free(ddict, customMem); return NULL; } + memcpy(internalBuffer, dict, dictSize); + ddict->dictBuffer = internalBuffer; + ddict->dictContent = internalBuffer; } - { size_t const errorCode = ZSTD_decompressBegin_usingDict(dctx, dictContent, dictSize); + { size_t const errorCode = ZSTD_decompressBegin_usingDict(dctx, ddict->dictContent, dictSize); if (ZSTD_isError(errorCode)) { - ZSTD_free(dictContent, customMem); + ZSTD_free(ddict->dictBuffer, customMem); ZSTD_free(ddict, customMem); ZSTD_free(dctx, customMem); return NULL; } } - ddict->dict = dictContent; ddict->dictSize = dictSize; ddict->refContext = dctx; return ddict; @@ -1758,15 +1763,27 @@ ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) { ZSTD_customMem const allocator = { NULL, NULL, NULL }; - return ZSTD_createDDict_advanced(dict, dictSize, allocator); + return ZSTD_createDDict_advanced(dict, dictSize, 0, allocator); } + +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * Dictionary content is simply referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives DDict, it must remain read accessible throughout the lifetime of DDict */ +ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + return ZSTD_createDDict_advanced(dictBuffer, dictSize, 1, allocator); +} + + size_t ZSTD_freeDDict(ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support free on NULL */ { ZSTD_customMem const cMem = ddict->refContext->customMem; ZSTD_freeDCtx(ddict->refContext); - ZSTD_free(ddict->dict, cMem); + ZSTD_free(ddict->dictBuffer, cMem); ZSTD_free(ddict, cMem); return 0; } @@ -1775,7 +1792,7 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support sizeof on NULL */ - return sizeof(*ddict) + sizeof(ddict->refContext) + ddict->dictSize; + return sizeof(*ddict) + ZSTD_sizeof_DCtx(ddict->refContext) + (ddict->dictBuffer ? ddict->dictSize : 0) ; } /*! ZSTD_getDictID_fromDict() : @@ -1796,7 +1813,7 @@ unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; - return ZSTD_getDictID_fromDict(ddict->dict, ddict->dictSize); + return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); } /*! ZSTD_getDictID_fromFrame() : @@ -1827,7 +1844,7 @@ const ZSTD_DDict* ddict) { #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1) - if (ZSTD_isLegacy(src, srcSize)) return ZSTD_decompressLegacy(dst, dstCapacity, src, srcSize, ddict->dict, ddict->dictSize); + if (ZSTD_isLegacy(src, srcSize)) return ZSTD_decompressLegacy(dst, dstCapacity, src, srcSize, ddict->dictContent, ddict->dictSize); #endif ZSTD_refDCtx(dctx, ddict->refContext); ZSTD_checkContinuity(dctx, dst); @@ -1919,7 +1936,7 @@ zds->stage = zdss_loadHeader; zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; ZSTD_freeDDict(zds->ddictLocal); - if (dict) { + if (dict && dictSize >= 8) { zds->ddictLocal = ZSTD_createDDict(dict, dictSize); if (zds->ddictLocal == NULL) return ERROR(memory_allocation); } else zds->ddictLocal = NULL; @@ -1956,7 +1973,7 @@ switch(paramType) { default : return ERROR(parameter_unknown); - case ZSTDdsp_maxWindowSize : zds->maxWindowSize = paramValue ? paramValue : (U32)(-1); break; + case DStream_p_maxWindowSize : zds->maxWindowSize = paramValue ? paramValue : (U32)(-1); break; } return 0; } @@ -2007,7 +2024,7 @@ #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) { U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); if (legacyVersion) { - const void* const dict = zds->ddict ? zds->ddict->dict : NULL; + const void* const dict = zds->ddict ? zds->ddict->dictContent : NULL; size_t const dictSize = zds->ddict ? zds->ddict->dictSize : 0; CHECK_F(ZSTD_initLegacyStream(&zds->legacyContext, zds->previousLegacyVersion, legacyVersion, dict, dictSize)); diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/dictBuilder/cover.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/python-zstandard/zstd/dictBuilder/cover.c Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,1021 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + */ + +/*-************************************* +* Dependencies +***************************************/ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ + +#include "mem.h" /* read */ +#include "pool.h" +#include "threading.h" +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + +/*-************************************* +* Constants +***************************************/ +#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) + +/*-************************************* +* Console display +***************************************/ +static int g_displayLevel = 2; +#define DISPLAY(...) \ + { \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + } +#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + DISPLAY(__VA_ARGS__); \ + } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ +#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__) + +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + if (displayLevel >= 4) \ + fflush(stdout); \ + } \ + } +#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; + +/*-************************************* +* Hash table +*************************************** +* A small specialized hash map for storing activeDmers. +* The map does not resize, so if it becomes full it will loop forever. +* Thus, the map must be large enough to store every value. +* The map implements linear probing and keeps its load less than 0.5. +*/ + +#define MAP_EMPTY_VALUE ((U32)-1) +typedef struct COVER_map_pair_t_s { + U32 key; + U32 value; +} COVER_map_pair_t; + +typedef struct COVER_map_s { + COVER_map_pair_t *data; + U32 sizeLog; + U32 size; + U32 sizeMask; +} COVER_map_t; + +/** + * Clear the map. + */ +static void COVER_map_clear(COVER_map_t *map) { + memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t)); +} + +/** + * Initializes a map of the given size. + * Returns 1 on success and 0 on failure. + * The map must be destroyed with COVER_map_destroy(). + * The map is only guaranteed to be large enough to hold size elements. + */ +static int COVER_map_init(COVER_map_t *map, U32 size) { + map->sizeLog = ZSTD_highbit32(size) + 2; + map->size = (U32)1 << map->sizeLog; + map->sizeMask = map->size - 1; + map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t)); + if (!map->data) { + map->sizeLog = 0; + map->size = 0; + return 0; + } + COVER_map_clear(map); + return 1; +} + +/** + * Internal hash function + */ +static const U32 prime4bytes = 2654435761U; +static U32 COVER_map_hash(COVER_map_t *map, U32 key) { + return (key * prime4bytes) >> (32 - map->sizeLog); +} + +/** + * Helper function that returns the index that a key should be placed into. + */ +static U32 COVER_map_index(COVER_map_t *map, U32 key) { + const U32 hash = COVER_map_hash(map, key); + U32 i; + for (i = hash;; i = (i + 1) & map->sizeMask) { + COVER_map_pair_t *pos = &map->data[i]; + if (pos->value == MAP_EMPTY_VALUE) { + return i; + } + if (pos->key == key) { + return i; + } + } +} + +/** + * Returns the pointer to the value for key. + * If key is not in the map, it is inserted and the value is set to 0. + * The map must not be full. + */ +static U32 *COVER_map_at(COVER_map_t *map, U32 key) { + COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)]; + if (pos->value == MAP_EMPTY_VALUE) { + pos->key = key; + pos->value = 0; + } + return &pos->value; +} + +/** + * Deletes key from the map if present. + */ +static void COVER_map_remove(COVER_map_t *map, U32 key) { + U32 i = COVER_map_index(map, key); + COVER_map_pair_t *del = &map->data[i]; + U32 shift = 1; + if (del->value == MAP_EMPTY_VALUE) { + return; + } + for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) { + COVER_map_pair_t *const pos = &map->data[i]; + /* If the position is empty we are done */ + if (pos->value == MAP_EMPTY_VALUE) { + del->value = MAP_EMPTY_VALUE; + return; + } + /* If pos can be moved to del do so */ + if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) { + del->key = pos->key; + del->value = pos->value; + del = pos; + shift = 1; + } else { + ++shift; + } + } +} + +/** + * Destroyes a map that is inited with COVER_map_init(). + */ +static void COVER_map_destroy(COVER_map_t *map) { + if (map->data) { + free(map->data); + } + map->data = NULL; + map->size = 0; +} + +/*-************************************* +* Context +***************************************/ + +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + U32 *suffix; + size_t suffixSize; + U32 *freqs; + U32 *dmerAt; + unsigned d; +} COVER_ctx_t; + +/* We need a global context for qsort... */ +static COVER_ctx_t *g_ctx = NULL; + +/*-************************************* +* Helper functions +***************************************/ + +/** + * Returns the sum of the sample sizes. + */ +static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) { + size_t sum = 0; + size_t i; + for (i = 0; i < nbSamples; ++i) { + sum += samplesSizes[i]; + } + return sum; +} + +/** + * Returns -1 if the dmer at lp is less than the dmer at rp. + * Return 0 if the dmers at lp and rp are equal. + * Returns 1 if the dmer at lp is greater than the dmer at rp. + */ +static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) { + const U32 lhs = *(const U32 *)lp; + const U32 rhs = *(const U32 *)rp; + return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d); +} + +/** + * Same as COVER_cmp() except ties are broken by pointer value + * NOTE: g_ctx must be set to call this function. A global is required because + * qsort doesn't take an opaque pointer. + */ +static int COVER_strict_cmp(const void *lp, const void *rp) { + int result = COVER_cmp(g_ctx, lp, rp); + if (result == 0) { + result = lp < rp ? -1 : 1; + } + return result; +} + +/** + * Returns the first pointer in [first, last) whose element does not compare + * less than value. If no such element exists it returns last. + */ +static const size_t *COVER_lower_bound(const size_t *first, const size_t *last, + size_t value) { + size_t count = last - first; + while (count != 0) { + size_t step = count / 2; + const size_t *ptr = first; + ptr += step; + if (*ptr < value) { + first = ++ptr; + count -= step + 1; + } else { + count = step; + } + } + return first; +} + +/** + * Generic groupBy function. + * Groups an array sorted by cmp into groups with equivalent values. + * Calls grp for each group. + */ +static void +COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx, + int (*cmp)(COVER_ctx_t *, const void *, const void *), + void (*grp)(COVER_ctx_t *, const void *, const void *)) { + const BYTE *ptr = (const BYTE *)data; + size_t num = 0; + while (num < count) { + const BYTE *grpEnd = ptr + size; + ++num; + while (num < count && cmp(ctx, ptr, grpEnd) == 0) { + grpEnd += size; + ++num; + } + grp(ctx, ptr, grpEnd); + ptr = grpEnd; + } +} + +/*-************************************* +* Cover functions +***************************************/ + +/** + * Called on each group of positions with the same dmer. + * Counts the frequency of each dmer and saves it in the suffix array. + * Fills `ctx->dmerAt`. + */ +static void COVER_group(COVER_ctx_t *ctx, const void *group, + const void *groupEnd) { + /* The group consists of all the positions with the same first d bytes. */ + const U32 *grpPtr = (const U32 *)group; + const U32 *grpEnd = (const U32 *)groupEnd; + /* The dmerId is how we will reference this dmer. + * This allows us to map the whole dmer space to a much smaller space, the + * size of the suffix array. + */ + const U32 dmerId = (U32)(grpPtr - ctx->suffix); + /* Count the number of samples this dmer shows up in */ + U32 freq = 0; + /* Details */ + const size_t *curOffsetPtr = ctx->offsets; + const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples; + /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a + * different sample than the last. + */ + size_t curSampleEnd = ctx->offsets[0]; + for (; grpPtr != grpEnd; ++grpPtr) { + /* Save the dmerId for this position so we can get back to it. */ + ctx->dmerAt[*grpPtr] = dmerId; + /* Dictionaries only help for the first reference to the dmer. + * After that zstd can reference the match from the previous reference. + * So only count each dmer once for each sample it is in. + */ + if (*grpPtr < curSampleEnd) { + continue; + } + freq += 1; + /* Binary search to find the end of the sample *grpPtr is in. + * In the common case that grpPtr + 1 == grpEnd we can skip the binary + * search because the loop is over. + */ + if (grpPtr + 1 != grpEnd) { + const size_t *sampleEndPtr = + COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr); + curSampleEnd = *sampleEndPtr; + curOffsetPtr = sampleEndPtr + 1; + } + } + /* At this point we are never going to look at this segment of the suffix + * array again. We take advantage of this fact to save memory. + * We store the frequency of the dmer in the first position of the group, + * which is dmerId. + */ + ctx->suffix[dmerId] = freq; +} + +/** + * A segment is a range in the source as well as the score of the segment. + */ +typedef struct { + U32 begin; + U32 end; + double score; +} COVER_segment_t; + +/** + * Selects the best segment in an epoch. + * Segments of are scored according to the function: + * + * Let F(d) be the frequency of dmer d. + * Let S_i be the dmer at position i of segment S which has length k. + * + * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}) + * + * Once the dmer d is in the dictionay we set F(d) = 0. + */ +static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs, + COVER_map_t *activeDmers, U32 begin, + U32 end, COVER_params_t parameters) { + /* Constants */ + const U32 k = parameters.k; + const U32 d = parameters.d; + const U32 dmersInK = k - d + 1; + /* Try each segment (activeSegment) and save the best (bestSegment) */ + COVER_segment_t bestSegment = {0, 0, 0}; + COVER_segment_t activeSegment; + /* Reset the activeDmers in the segment */ + COVER_map_clear(activeDmers); + /* The activeSegment starts at the beginning of the epoch. */ + activeSegment.begin = begin; + activeSegment.end = begin; + activeSegment.score = 0; + /* Slide the activeSegment through the whole epoch. + * Save the best segment in bestSegment. + */ + while (activeSegment.end < end) { + /* The dmerId for the dmer at the next position */ + U32 newDmer = ctx->dmerAt[activeSegment.end]; + /* The entry in activeDmers for this dmerId */ + U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer); + /* If the dmer isn't already present in the segment add its score. */ + if (*newDmerOcc == 0) { + /* The paper suggest using the L-0.5 norm, but experiments show that it + * doesn't help. + */ + activeSegment.score += freqs[newDmer]; + } + /* Add the dmer to the segment */ + activeSegment.end += 1; + *newDmerOcc += 1; + + /* If the window is now too large, drop the first position */ + if (activeSegment.end - activeSegment.begin == dmersInK + 1) { + U32 delDmer = ctx->dmerAt[activeSegment.begin]; + U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer); + activeSegment.begin += 1; + *delDmerOcc -= 1; + /* If this is the last occurence of the dmer, subtract its score */ + if (*delDmerOcc == 0) { + COVER_map_remove(activeDmers, delDmer); + activeSegment.score -= freqs[delDmer]; + } + } + + /* If this segment is the best so far save it */ + if (activeSegment.score > bestSegment.score) { + bestSegment = activeSegment; + } + } + { + /* Trim off the zero frequency head and tail from the segment. */ + U32 newBegin = bestSegment.end; + U32 newEnd = bestSegment.begin; + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + U32 freq = freqs[ctx->dmerAt[pos]]; + if (freq != 0) { + newBegin = MIN(newBegin, pos); + newEnd = pos + 1; + } + } + bestSegment.begin = newBegin; + bestSegment.end = newEnd; + } + { + /* Zero out the frequency of each dmer covered by the chosen segment. */ + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + freqs[ctx->dmerAt[pos]] = 0; + } + } + return bestSegment; +} + +/** + * Check the validity of the parameters. + * Returns non-zero if the parameters are valid and 0 otherwise. + */ +static int COVER_checkParameters(COVER_params_t parameters) { + /* k and d are required parameters */ + if (parameters.d == 0 || parameters.k == 0) { + return 0; + } + /* d <= k */ + if (parameters.d > parameters.k) { + return 0; + } + return 1; +} + +/** + * Clean up a context initialized with `COVER_ctx_init()`. + */ +static void COVER_ctx_destroy(COVER_ctx_t *ctx) { + if (!ctx) { + return; + } + if (ctx->suffix) { + free(ctx->suffix); + ctx->suffix = NULL; + } + if (ctx->freqs) { + free(ctx->freqs); + ctx->freqs = NULL; + } + if (ctx->dmerAt) { + free(ctx->dmerAt); + ctx->dmerAt = NULL; + } + if (ctx->offsets) { + free(ctx->offsets); + ctx->offsets = NULL; + } +} + +/** + * Prepare a context for dictionary building. + * The context is only dependent on the parameter `d` and can used multiple + * times. + * Returns 1 on success or zero on error. + * The context must be destroyed with `COVER_ctx_destroy()`. + */ +static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + unsigned d) { + const BYTE *const samples = (const BYTE *)samplesBuffer; + const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); + /* Checks */ + if (totalSamplesSize < d || + totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n", + (COVER_MAX_SAMPLES_SIZE >> 20)); + return 0; + } + /* Zero the context */ + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples, + (U32)totalSamplesSize); + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + /* Partial suffix array */ + ctx->suffixSize = totalSamplesSize - d + 1; + ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); + /* Maps index to the dmerID */ + ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); + /* The offsets of each file */ + ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); + if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) { + DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n"); + COVER_ctx_destroy(ctx); + return 0; + } + ctx->freqs = NULL; + ctx->d = d; + + /* Fill offsets from the samlesSizes */ + { + U32 i; + ctx->offsets[0] = 0; + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + DISPLAYLEVEL(2, "Constructing partial suffix array\n"); + { + /* suffix is a partial suffix array. + * It only sorts suffixes by their first parameters.d bytes. + * The sort is stable, so each dmer group is sorted by position in input. + */ + U32 i; + for (i = 0; i < ctx->suffixSize; ++i) { + ctx->suffix[i] = i; + } + /* qsort doesn't take an opaque pointer, so pass as a global */ + g_ctx = ctx; + qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp); + } + DISPLAYLEVEL(2, "Computing frequencies\n"); + /* For each dmer group (group of positions with the same first d bytes): + * 1. For each position we set dmerAt[position] = dmerID. The dmerID is + * (groupBeginPtr - suffix). This allows us to go from position to + * dmerID so we can look up values in freq. + * 2. We calculate how many samples the dmer occurs in and save it in + * freqs[dmerId]. + */ + COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp, + &COVER_group); + ctx->freqs = ctx->suffix; + ctx->suffix = NULL; + return 1; +} + +/** + * Given the prepared context build the dictionary. + */ +static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs, + COVER_map_t *activeDmers, void *dictBuffer, + size_t dictBufferCapacity, + COVER_params_t parameters) { + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + /* Divide the data up into epochs of equal size. + * We will select at least one segment from each epoch. + */ + const U32 epochs = (U32)(dictBufferCapacity / parameters.k); + const U32 epochSize = (U32)(ctx->suffixSize / epochs); + size_t epoch; + DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs, + epochSize); + /* Loop through the epochs until there are no more segments or the dictionary + * is full. + */ + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { + const U32 epochBegin = (U32)(epoch * epochSize); + const U32 epochEnd = epochBegin + epochSize; + size_t segmentSize; + /* Select a segment */ + COVER_segment_t segment = COVER_selectSegment( + ctx, freqs, activeDmers, epochBegin, epochEnd, parameters); + /* Trim the segment if necessary and if it is empty then we are done */ + segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); + if (segmentSize == 0) { + break; + } + /* We fill the dictionary from the back to allow the best segments to be + * referenced with the smallest offsets. + */ + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + return tail; +} + +/** + * Translate from COVER_params_t to ZDICT_params_t required for finalizing the + * dictionary. + */ +static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) { + ZDICT_params_t zdictParams; + memset(&zdictParams, 0, sizeof(zdictParams)); + zdictParams.notificationLevel = 1; + zdictParams.dictID = parameters.dictID; + zdictParams.compressionLevel = parameters.compressionLevel; + return zdictParams; +} + +/** + * Constructs a dictionary using a heuristic based on the following paper: + * + * Liao, Petri, Moffat, Wirth + * Effective Construction of Relative Lempel-Ziv Dictionaries + * Published in WWW 2016. + */ +ZDICTLIB_API size_t COVER_trainFromBuffer( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) { + BYTE *const dict = (BYTE *)dictBuffer; + COVER_ctx_t ctx; + COVER_map_t activeDmers; + /* Checks */ + if (!COVER_checkParameters(parameters)) { + DISPLAYLEVEL(1, "Cover parameters incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "Cover must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + /* Initialize global data */ + g_displayLevel = parameters.notificationLevel; + /* Initialize context and activeDmers */ + if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + parameters.d)) { + return ERROR(GENERIC); + } + if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { + DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); + COVER_ctx_destroy(&ctx); + return ERROR(GENERIC); + } + + DISPLAYLEVEL(2, "Building dictionary\n"); + { + const size_t tail = + COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer, + dictBufferCapacity, parameters); + ZDICT_params_t zdictParams = COVER_translateParams(parameters); + const size_t dictionarySize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, nbSamples, zdictParams); + if (!ZSTD_isError(dictionarySize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictionarySize); + } + COVER_ctx_destroy(&ctx); + COVER_map_destroy(&activeDmers); + return dictionarySize; + } +} + +/** + * COVER_best_t is used for two purposes: + * 1. Synchronizing threads. + * 2. Saving the best parameters and dictionary. + * + * All of the methods except COVER_best_init() are thread safe if zstd is + * compiled with multithreaded support. + */ +typedef struct COVER_best_s { + pthread_mutex_t mutex; + pthread_cond_t cond; + size_t liveJobs; + void *dict; + size_t dictSize; + COVER_params_t parameters; + size_t compressedSize; +} COVER_best_t; + +/** + * Initialize the `COVER_best_t`. + */ +static void COVER_best_init(COVER_best_t *best) { + if (!best) { + return; + } + pthread_mutex_init(&best->mutex, NULL); + pthread_cond_init(&best->cond, NULL); + best->liveJobs = 0; + best->dict = NULL; + best->dictSize = 0; + best->compressedSize = (size_t)-1; + memset(&best->parameters, 0, sizeof(best->parameters)); +} + +/** + * Wait until liveJobs == 0. + */ +static void COVER_best_wait(COVER_best_t *best) { + if (!best) { + return; + } + pthread_mutex_lock(&best->mutex); + while (best->liveJobs != 0) { + pthread_cond_wait(&best->cond, &best->mutex); + } + pthread_mutex_unlock(&best->mutex); +} + +/** + * Call COVER_best_wait() and then destroy the COVER_best_t. + */ +static void COVER_best_destroy(COVER_best_t *best) { + if (!best) { + return; + } + COVER_best_wait(best); + if (best->dict) { + free(best->dict); + } + pthread_mutex_destroy(&best->mutex); + pthread_cond_destroy(&best->cond); +} + +/** + * Called when a thread is about to be launched. + * Increments liveJobs. + */ +static void COVER_best_start(COVER_best_t *best) { + if (!best) { + return; + } + pthread_mutex_lock(&best->mutex); + ++best->liveJobs; + pthread_mutex_unlock(&best->mutex); +} + +/** + * Called when a thread finishes executing, both on error or success. + * Decrements liveJobs and signals any waiting threads if liveJobs == 0. + * If this dictionary is the best so far save it and its parameters. + */ +static void COVER_best_finish(COVER_best_t *best, size_t compressedSize, + COVER_params_t parameters, void *dict, + size_t dictSize) { + if (!best) { + return; + } + { + size_t liveJobs; + pthread_mutex_lock(&best->mutex); + --best->liveJobs; + liveJobs = best->liveJobs; + /* If the new dictionary is better */ + if (compressedSize < best->compressedSize) { + /* Allocate space if necessary */ + if (!best->dict || best->dictSize < dictSize) { + if (best->dict) { + free(best->dict); + } + best->dict = malloc(dictSize); + if (!best->dict) { + best->compressedSize = ERROR(GENERIC); + best->dictSize = 0; + return; + } + } + /* Save the dictionary, parameters, and size */ + memcpy(best->dict, dict, dictSize); + best->dictSize = dictSize; + best->parameters = parameters; + best->compressedSize = compressedSize; + } + pthread_mutex_unlock(&best->mutex); + if (liveJobs == 0) { + pthread_cond_broadcast(&best->cond); + } + } +} + +/** + * Parameters for COVER_tryParameters(). + */ +typedef struct COVER_tryParameters_data_s { + const COVER_ctx_t *ctx; + COVER_best_t *best; + size_t dictBufferCapacity; + COVER_params_t parameters; +} COVER_tryParameters_data_t; + +/** + * Tries a set of parameters and upates the COVER_best_t with the results. + * This function is thread safe if zstd is compiled with multithreaded support. + * It takes its parameters as an *OWNING* opaque pointer to support threading. + */ +static void COVER_tryParameters(void *opaque) { + /* Save parameters as local variables */ + COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque; + const COVER_ctx_t *const ctx = data->ctx; + const COVER_params_t parameters = data->parameters; + size_t dictBufferCapacity = data->dictBufferCapacity; + size_t totalCompressedSize = ERROR(GENERIC); + /* Allocate space for hash table, dict, and freqs */ + COVER_map_t activeDmers; + BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); + if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { + DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); + goto _cleanup; + } + if (!dict || !freqs) { + DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); + goto _cleanup; + } + /* Copy the frequencies because we need to modify them */ + memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32)); + /* Build the dictionary */ + { + const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, + dictBufferCapacity, parameters); + const ZDICT_params_t zdictParams = COVER_translateParams(parameters); + dictBufferCapacity = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams); + if (ZDICT_isError(dictBufferCapacity)) { + DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); + goto _cleanup; + } + } + /* Check total compressed size */ + { + /* Pointers */ + ZSTD_CCtx *cctx; + ZSTD_CDict *cdict; + void *dst; + /* Local variables */ + size_t dstCapacity; + size_t i; + /* Allocate dst with enough space to compress the maximum sized sample */ + { + size_t maxSampleSize = 0; + for (i = 0; i < ctx->nbSamples; ++i) { + maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + /* Create the cctx and cdict */ + cctx = ZSTD_createCCtx(); + cdict = + ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel); + if (!dst || !cctx || !cdict) { + goto _compressCleanup; + } + /* Compress each sample and sum their sizes (or error) */ + totalCompressedSize = 0; + for (i = 0; i < ctx->nbSamples; ++i) { + const size_t size = ZSTD_compress_usingCDict( + cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i], + ctx->samplesSizes[i], cdict); + if (ZSTD_isError(size)) { + totalCompressedSize = ERROR(GENERIC); + goto _compressCleanup; + } + totalCompressedSize += size; + } + _compressCleanup: + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + if (dst) { + free(dst); + } + } + +_cleanup: + COVER_best_finish(data->best, totalCompressedSize, parameters, dict, + dictBufferCapacity); + free(data); + COVER_map_destroy(&activeDmers); + if (dict) { + free(dict); + } + if (freqs) { + free(freqs); + } +} + +ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer, + size_t dictBufferCapacity, + const void *samplesBuffer, + const size_t *samplesSizes, + unsigned nbSamples, + COVER_params_t *parameters) { + /* constants */ + const unsigned nbThreads = parameters->nbThreads; + const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; + const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d; + const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k; + const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k; + const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps; + const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); + const unsigned kIterations = + (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); + /* Local variables */ + const int displayLevel = parameters->notificationLevel; + unsigned iteration = 1; + unsigned d; + unsigned k; + COVER_best_t best; + POOL_ctx *pool = NULL; + /* Checks */ + if (kMinK < kMaxD || kMaxK < kMinK) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "Cover must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + if (nbThreads > 1) { + pool = POOL_create(nbThreads, 1); + if (!pool) { + return ERROR(memory_allocation); + } + } + /* Initialization */ + COVER_best_init(&best); + /* Turn down global display level to clean up display at level 2 and below */ + g_displayLevel = parameters->notificationLevel - 1; + /* Loop through d first because each new value needs a new context */ + LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", + kIterations); + for (d = kMinD; d <= kMaxD; d += 2) { + /* Initialize the context for this value of d */ + COVER_ctx_t ctx; + LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); + if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); + COVER_best_destroy(&best); + return ERROR(GENERIC); + } + /* Loop through k reusing the same context */ + for (k = kMinK; k <= kMaxK; k += kStepSize) { + /* Prepare the arguments */ + COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc( + sizeof(COVER_tryParameters_data_t)); + LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k); + if (!data) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n"); + COVER_best_destroy(&best); + COVER_ctx_destroy(&ctx); + return ERROR(GENERIC); + } + data->ctx = &ctx; + data->best = &best; + data->dictBufferCapacity = dictBufferCapacity; + data->parameters = *parameters; + data->parameters.k = k; + data->parameters.d = d; + data->parameters.steps = kSteps; + /* Check the parameters */ + if (!COVER_checkParameters(data->parameters)) { + DISPLAYLEVEL(1, "Cover parameters incorrect\n"); + continue; + } + /* Call the function and pass ownership of data to it */ + COVER_best_start(&best); + if (pool) { + POOL_add(pool, &COVER_tryParameters, data); + } else { + COVER_tryParameters(data); + } + /* Print status */ + LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ", + (U32)((iteration * 100) / kIterations)); + ++iteration; + } + COVER_best_wait(&best); + COVER_ctx_destroy(&ctx); + } + LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", ""); + /* Fill the output buffer and parameters with output of the best parameters */ + { + const size_t dictSize = best.dictSize; + if (ZSTD_isError(best.compressedSize)) { + COVER_best_destroy(&best); + return best.compressedSize; + } + *parameters = best.parameters; + memcpy(dictBuffer, best.dict, dictSize); + COVER_best_destroy(&best); + POOL_free(pool); + return dictSize; + } +} diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/dictBuilder/zdict.c --- a/contrib/python-zstandard/zstd/dictBuilder/zdict.c Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/dictBuilder/zdict.c Mon Feb 13 09:44:16 2017 -0800 @@ -36,12 +36,11 @@ #include /* clock */ #include "mem.h" /* read */ -#include "error_private.h" #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */ #define HUF_STATIC_LINKING_ONLY -#include "huf.h" +#include "huf.h" /* HUF_buildCTable, HUF_writeCTable */ #include "zstd_internal.h" /* includes zstd.h */ -#include "xxhash.h" +#include "xxhash.h" /* XXH64 */ #include "divsufsort.h" #ifndef ZDICT_STATIC_LINKING_ONLY # define ZDICT_STATIC_LINKING_ONLY @@ -61,7 +60,7 @@ #define NOISELENGTH 32 #define MINRATIO 4 -static const int g_compressionLevel_default = 5; +static const int g_compressionLevel_default = 6; static const U32 g_selectivity_default = 9; static const size_t g_provision_entropySize = 200; static const size_t g_min_fast_dictContent = 192; @@ -307,13 +306,13 @@ } while (length >=MINMATCHLENGTH); /* look backward */ - length = MINMATCHLENGTH; - while ((length >= MINMATCHLENGTH) & (start > 0)) { - length = ZDICT_count(b + pos, b + suffix[start - 1]); - if (length >= LLIMIT) length = LLIMIT - 1; - lengthList[length]++; - if (length >= MINMATCHLENGTH) start--; - } + length = MINMATCHLENGTH; + while ((length >= MINMATCHLENGTH) & (start > 0)) { + length = ZDICT_count(b + pos, b + suffix[start - 1]); + if (length >= LLIMIT) length = LLIMIT - 1; + lengthList[length]++; + if (length >= MINMATCHLENGTH) start--; + } /* largest useful length */ memset(cumulLength, 0, sizeof(cumulLength)); @@ -570,7 +569,7 @@ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } } cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize); - if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; } + if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } if (cSize) { /* if == 0; block is not compressible */ const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); @@ -825,6 +824,55 @@ } + +size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, + const void* customDictContent, size_t dictContentSize, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_params_t params) +{ + size_t hSize; +#define HBUFFSIZE 256 + BYTE header[HBUFFSIZE]; + int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; + U32 const notificationLevel = params.notificationLevel; + + /* check conditions */ + if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); + if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); + + /* dictionary header */ + MEM_writeLE32(header, ZSTD_DICT_MAGIC); + { U64 const randomID = XXH64(customDictContent, dictContentSize, 0); + U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; + U32 const dictID = params.dictID ? params.dictID : compliantID; + MEM_writeLE32(header+4, dictID); + } + hSize = 8; + + /* entropy tables */ + DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ + DISPLAYLEVEL(2, "statistics ... \n"); + { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize, + compressionLevel, + samplesBuffer, samplesSizes, nbSamples, + customDictContent, dictContentSize, + notificationLevel); + if (ZDICT_isError(eSize)) return eSize; + hSize += eSize; + } + + /* copy elements in final buffer ; note : src and dst buffer can overlap */ + if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize; + { size_t const dictSize = hSize + dictContentSize; + char* dictEnd = (char*)dictBuffer + dictSize; + memmove(dictEnd - dictContentSize, customDictContent, dictContentSize); + memcpy(dictBuffer, header, hSize); + return dictSize; + } +} + + size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t params) diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/dictBuilder/zdict.h --- a/contrib/python-zstandard/zstd/dictBuilder/zdict.h Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/dictBuilder/zdict.h Mon Feb 13 09:44:16 2017 -0800 @@ -19,15 +19,18 @@ #include /* size_t */ -/*====== Export for Windows ======*/ -/*! -* ZSTD_DLL_EXPORT : -* Enable exporting of functions when building a Windows DLL -*/ -#if defined(_WIN32) && defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) -# define ZDICTLIB_API __declspec(dllexport) +/* ===== ZDICTLIB_API : control library symbols visibility ===== */ +#if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default"))) #else -# define ZDICTLIB_API +# define ZDICTLIB_VISIBILITY +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZDICTLIB_API ZDICTLIB_VISIBILITY #endif @@ -79,27 +82,114 @@ or an error code, which can be tested by ZDICT_isError(). note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using notificationLevel>0. */ -size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, +ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_params_t parameters); + +/*! COVER_params_t : + For all values 0 means default. + kMin and d are the only required parameters. +*/ +typedef struct { + unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ + unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ + unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ + + unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ + unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ + unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ + int compressionLevel; /* 0 means default; target a specific zstd compression level */ +} COVER_params_t; + + +/*! COVER_trainFromBuffer() : + Train a dictionary from an array of samples using the COVER algorithm. + Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + The resulting dictionary will be saved into `dictBuffer`. + @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + or an error code, which can be tested with ZDICT_isError(). + Note : COVER_trainFromBuffer() requires about 9 bytes of memory for each input byte. + Tips : In general, a reasonable dictionary has a size of ~ 100 KB. + It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. + In general, it's recommended to provide a few thousands samples, but this can vary a lot. + It's recommended that total size of all samples be about ~x100 times the target size of dictionary. +*/ +ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + COVER_params_t parameters); + +/*! COVER_optimizeTrainFromBuffer() : + The same requirements as above hold for all the parameters except `parameters`. + This function tries many parameter combinations and picks the best parameters. + `*parameters` is filled with the best parameters found, and the dictionary + constructed with those parameters is stored in `dictBuffer`. + + All of the parameters d, k, steps are optional. + If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. + if steps is zero it defaults to its default value. + If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048]. + + @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + or an error code, which can be tested with ZDICT_isError(). + On success `*parameters` contains the parameters selected. + Note : COVER_optimizeTrainFromBuffer() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. +*/ +ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + COVER_params_t *parameters); + +/*! ZDICT_finalizeDictionary() : + + Given a custom content as a basis for dictionary, and a set of samples, + finalize dictionary by adding headers and statistics. + + Samples must be stored concatenated in a flat buffer `samplesBuffer`, + supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. + + dictContentSize must be > ZDICT_CONTENTSIZE_MIN bytes. + maxDictSize must be >= dictContentSize, and must be > ZDICT_DICTSIZE_MIN bytes. + + @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`), + or an error code, which can be tested by ZDICT_isError(). + note : ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. + note 2 : dictBuffer and customDictContent can overlap +*/ +#define ZDICT_CONTENTSIZE_MIN 256 +#define ZDICT_DICTSIZE_MIN 512 +ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, + const void* customDictContent, size_t dictContentSize, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t parameters); -/*! ZDICT_addEntropyTablesFromBuffer() : - - Given a content-only dictionary (built using any 3rd party algorithm), - add entropy tables computed from an array of samples. - Samples must be stored concatenated in a flat buffer `samplesBuffer`, - supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. - The input dictionary content must be stored *at the end* of `dictBuffer`. - Its size is `dictContentSize`. - The resulting dictionary with added entropy tables will be *written back to `dictBuffer`*, - starting from its beginning. - @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`). -*/ +/* Deprecation warnings */ +/* It is generally possible to disable deprecation warnings from compiler, + for example with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual. + Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */ +#ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS +# define ZDICT_DEPRECATED(message) ZDICTLIB_API /* disable deprecation warnings */ +#else +# define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define ZDICT_DEPRECATED(message) ZDICTLIB_API [[deprecated(message)]] +# elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__) +# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message))) +# elif (ZDICT_GCC_VERSION >= 301) +# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler") +# define ZDICT_DEPRECATED(message) ZDICTLIB_API +# endif +#endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */ + +ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead") size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); - + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); #endif /* ZDICT_STATIC_LINKING_ONLY */ diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd/zstd.h --- a/contrib/python-zstandard/zstd/zstd.h Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd/zstd.h Mon Feb 13 09:44:16 2017 -0800 @@ -20,13 +20,16 @@ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ #if defined(__GNUC__) && (__GNUC__ >= 4) -# define ZSTDLIB_API __attribute__ ((visibility ("default"))) -#elif defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) -# define ZSTDLIB_API __declspec(dllexport) +# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +#else +# define ZSTDLIB_VISIBILITY +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) -# define ZSTDLIB_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ #else -# define ZSTDLIB_API +# define ZSTDLIB_API ZSTDLIB_VISIBILITY #endif @@ -53,7 +56,7 @@ /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 #define ZSTD_VERSION_MINOR 1 -#define ZSTD_VERSION_RELEASE 2 +#define ZSTD_VERSION_RELEASE 3 #define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE #define ZSTD_QUOTE(str) #str @@ -170,8 +173,8 @@ * When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. * ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. * ZSTD_CDict can be created once and used by multiple threads concurrently, as its usage is read-only. -* `dict` can be released after ZSTD_CDict creation. */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel); +* `dictBuffer` can be released after ZSTD_CDict creation, as its content is copied within CDict */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, int compressionLevel); /*! ZSTD_freeCDict() : * Function frees memory allocated by ZSTD_createCDict(). */ @@ -191,8 +194,8 @@ /*! ZSTD_createDDict() : * Create a digested dictionary, ready to start decompression operation without startup delay. -* `dict` can be released after creation. */ -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize); +* dictBuffer can be released after DDict creation, as its content is copied inside DDict */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); /*! ZSTD_freeDDict() : * Function frees memory allocated with ZSTD_createDDict() */ @@ -325,7 +328,7 @@ * ***************************************************************************************/ /* --- Constants ---*/ -#define ZSTD_MAGICNUMBER 0xFD2FB528 /* v0.8 */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */ #define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U #define ZSTD_WINDOWLOG_MAX_32 25 @@ -345,8 +348,9 @@ #define ZSTD_TARGETLENGTH_MAX 999 #define ZSTD_FRAMEHEADERSIZE_MAX 18 /* for static allocation */ +#define ZSTD_FRAMEHEADERSIZE_MIN 6 static const size_t ZSTD_frameHeaderSize_prefix = 5; -static const size_t ZSTD_frameHeaderSize_min = 6; +static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN; static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX; static const size_t ZSTD_skippableHeaderSize = 8; /* magic number + skippable frame length */ @@ -365,9 +369,9 @@ } ZSTD_compressionParameters; typedef struct { - unsigned contentSizeFlag; /**< 1: content size will be in frame header (if known). */ - unsigned checksumFlag; /**< 1: will generate a 22-bits checksum at end of frame, to be used for error detection by decompressor */ - unsigned noDictIDFlag; /**< 1: no dict ID will be saved into frame header (if dictionary compression) */ + unsigned contentSizeFlag; /**< 1: content size will be in frame header (when known) */ + unsigned checksumFlag; /**< 1: generate a 32-bits checksum at end of frame, for error detection */ + unsigned noDictIDFlag; /**< 1: no dictID will be saved into frame header (if dictionary compression) */ } ZSTD_frameParameters; typedef struct { @@ -397,9 +401,23 @@ * Gives the amount of memory used by a given ZSTD_CCtx */ ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); +typedef enum { + ZSTD_p_forceWindow /* Force back-references to remain < windowSize, even when referencing Dictionary content (default:0)*/ +} ZSTD_CCtxParameter; +/*! ZSTD_setCCtxParameter() : + * Set advanced parameters, selected through enum ZSTD_CCtxParameter + * @result : 0, or an error code (which can be tested with ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTD_setCCtxParameter(ZSTD_CCtx* cctx, ZSTD_CCtxParameter param, unsigned value); + +/*! ZSTD_createCDict_byReference() : + * Create a digested dictionary for compression + * Dictionary content is simply referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives CDict, it must remain read accessible throughout the lifetime of CDict */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); + /*! ZSTD_createCDict_advanced() : * Create a ZSTD_CDict using external alloc and free, and customized compression parameters */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, unsigned byReference, ZSTD_parameters params, ZSTD_customMem customMem); /*! ZSTD_sizeof_CDict() : @@ -455,6 +473,15 @@ * Gives the amount of memory used by a given ZSTD_DCtx */ ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * Dictionary content is simply referenced, and therefore stays in dictBuffer. + * It is important that dictBuffer outlives DDict, it must remain read accessible throughout the lifetime of DDict */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); + +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + unsigned byReference, ZSTD_customMem customMem); + /*! ZSTD_sizeof_DDict() : * Gives the amount of memory used by a given ZSTD_DDict */ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); @@ -463,13 +490,13 @@ * Provides the dictID stored within dictionary. * if @return == 0, the dictionary is not conformant with Zstandard specification. * It can still be loaded, but as a content-only dictionary. */ -unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); +ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); /*! ZSTD_getDictID_fromDDict() : * Provides the dictID of the dictionary loaded into `ddict`. * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ -unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); +ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); /*! ZSTD_getDictID_fromFrame() : * Provides the dictID required to decompressed the frame stored within `src`. @@ -481,7 +508,7 @@ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. * When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. */ -unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); +ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); /******************************************************************** @@ -491,7 +518,7 @@ /*===== Advanced Streaming compression functions =====*/ ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); /**< pledgedSrcSize must be correct */ -ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */ ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be zero == unknown */ ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /**< note : cdict will just be referenced, and must outlive compression session */ @@ -500,9 +527,9 @@ /*===== Advanced Streaming decompression functions =====*/ -typedef enum { ZSTDdsp_maxWindowSize } ZSTD_DStreamParameter_e; +typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e; ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); -ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */ ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue); ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); /**< note : ddict will just be referenced, and must outlive decompression session */ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); /**< re-use decompression parameters from previous init; saves dictionary loading */ @@ -542,10 +569,10 @@ In which case, it will "discard" the relevant memory section from its history. Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum. - It's possible to use a NULL,0 src content, in which case, it will write a final empty block to end the frame, - Without last block mark, frames will be considered unfinished (broken) by decoders. + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames will be considered unfinished (corrupted) by decoders. - You can then reuse `ZSTD_CCtx` (ZSTD_compressBegin()) to compress some new frame. + `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new frame. */ /*===== Buffer-less streaming compression functions =====*/ @@ -553,6 +580,7 @@ ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize); ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); diff -r f3807a135e43 -r 455677a7667f contrib/python-zstandard/zstd_cffi.py --- a/contrib/python-zstandard/zstd_cffi.py Fri Feb 10 18:20:58 2017 +0100 +++ b/contrib/python-zstandard/zstd_cffi.py Mon Feb 13 09:44:16 2017 -0800 @@ -8,145 +8,1035 @@ from __future__ import absolute_import, unicode_literals -import io +import sys from _zstd_cffi import ( ffi, lib, ) +if sys.version_info[0] == 2: + bytes_type = str + int_type = long +else: + bytes_type = bytes + int_type = int -_CSTREAM_IN_SIZE = lib.ZSTD_CStreamInSize() -_CSTREAM_OUT_SIZE = lib.ZSTD_CStreamOutSize() + +COMPRESSION_RECOMMENDED_INPUT_SIZE = lib.ZSTD_CStreamInSize() +COMPRESSION_RECOMMENDED_OUTPUT_SIZE = lib.ZSTD_CStreamOutSize() +DECOMPRESSION_RECOMMENDED_INPUT_SIZE = lib.ZSTD_DStreamInSize() +DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE = lib.ZSTD_DStreamOutSize() + +new_nonzero = ffi.new_allocator(should_clear_after_alloc=False) + + +MAX_COMPRESSION_LEVEL = lib.ZSTD_maxCLevel() +MAGIC_NUMBER = lib.ZSTD_MAGICNUMBER +FRAME_HEADER = b'\x28\xb5\x2f\xfd' +ZSTD_VERSION = (lib.ZSTD_VERSION_MAJOR, lib.ZSTD_VERSION_MINOR, lib.ZSTD_VERSION_RELEASE) + +WINDOWLOG_MIN = lib.ZSTD_WINDOWLOG_MIN +WINDOWLOG_MAX = lib.ZSTD_WINDOWLOG_MAX +CHAINLOG_MIN = lib.ZSTD_CHAINLOG_MIN +CHAINLOG_MAX = lib.ZSTD_CHAINLOG_MAX +HASHLOG_MIN = lib.ZSTD_HASHLOG_MIN +HASHLOG_MAX = lib.ZSTD_HASHLOG_MAX +HASHLOG3_MAX = lib.ZSTD_HASHLOG3_MAX +SEARCHLOG_MIN = lib.ZSTD_SEARCHLOG_MIN +SEARCHLOG_MAX = lib.ZSTD_SEARCHLOG_MAX +SEARCHLENGTH_MIN = lib.ZSTD_SEARCHLENGTH_MIN +SEARCHLENGTH_MAX = lib.ZSTD_SEARCHLENGTH_MAX +TARGETLENGTH_MIN = lib.ZSTD_TARGETLENGTH_MIN +TARGETLENGTH_MAX = lib.ZSTD_TARGETLENGTH_MAX + +STRATEGY_FAST = lib.ZSTD_fast +STRATEGY_DFAST = lib.ZSTD_dfast +STRATEGY_GREEDY = lib.ZSTD_greedy +STRATEGY_LAZY = lib.ZSTD_lazy +STRATEGY_LAZY2 = lib.ZSTD_lazy2 +STRATEGY_BTLAZY2 = lib.ZSTD_btlazy2 +STRATEGY_BTOPT = lib.ZSTD_btopt + +COMPRESSOBJ_FLUSH_FINISH = 0 +COMPRESSOBJ_FLUSH_BLOCK = 1 + + +class ZstdError(Exception): + pass -class _ZstdCompressionWriter(object): - def __init__(self, cstream, writer): - self._cstream = cstream +class CompressionParameters(object): + def __init__(self, window_log, chain_log, hash_log, search_log, + search_length, target_length, strategy): + if window_log < WINDOWLOG_MIN or window_log > WINDOWLOG_MAX: + raise ValueError('invalid window log value') + + if chain_log < CHAINLOG_MIN or chain_log > CHAINLOG_MAX: + raise ValueError('invalid chain log value') + + if hash_log < HASHLOG_MIN or hash_log > HASHLOG_MAX: + raise ValueError('invalid hash log value') + + if search_log < SEARCHLOG_MIN or search_log > SEARCHLOG_MAX: + raise ValueError('invalid search log value') + + if search_length < SEARCHLENGTH_MIN or search_length > SEARCHLENGTH_MAX: + raise ValueError('invalid search length value') + + if target_length < TARGETLENGTH_MIN or target_length > TARGETLENGTH_MAX: + raise ValueError('invalid target length value') + + if strategy < STRATEGY_FAST or strategy > STRATEGY_BTOPT: + raise ValueError('invalid strategy value') + + self.window_log = window_log + self.chain_log = chain_log + self.hash_log = hash_log + self.search_log = search_log + self.search_length = search_length + self.target_length = target_length + self.strategy = strategy + + def as_compression_parameters(self): + p = ffi.new('ZSTD_compressionParameters *')[0] + p.windowLog = self.window_log + p.chainLog = self.chain_log + p.hashLog = self.hash_log + p.searchLog = self.search_log + p.searchLength = self.search_length + p.targetLength = self.target_length + p.strategy = self.strategy + + return p + +def get_compression_parameters(level, source_size=0, dict_size=0): + params = lib.ZSTD_getCParams(level, source_size, dict_size) + return CompressionParameters(window_log=params.windowLog, + chain_log=params.chainLog, + hash_log=params.hashLog, + search_log=params.searchLog, + search_length=params.searchLength, + target_length=params.targetLength, + strategy=params.strategy) + + +def estimate_compression_context_size(params): + if not isinstance(params, CompressionParameters): + raise ValueError('argument must be a CompressionParameters') + + cparams = params.as_compression_parameters() + return lib.ZSTD_estimateCCtxSize(cparams) + + +def estimate_decompression_context_size(): + return lib.ZSTD_estimateDCtxSize() + + +class ZstdCompressionWriter(object): + def __init__(self, compressor, writer, source_size, write_size): + self._compressor = compressor self._writer = writer + self._source_size = source_size + self._write_size = write_size + self._entered = False def __enter__(self): + if self._entered: + raise ZstdError('cannot __enter__ multiple times') + + self._cstream = self._compressor._get_cstream(self._source_size) + self._entered = True return self def __exit__(self, exc_type, exc_value, exc_tb): + self._entered = False + if not exc_type and not exc_value and not exc_tb: out_buffer = ffi.new('ZSTD_outBuffer *') - out_buffer.dst = ffi.new('char[]', _CSTREAM_OUT_SIZE) - out_buffer.size = _CSTREAM_OUT_SIZE + dst_buffer = ffi.new('char[]', self._write_size) + out_buffer.dst = dst_buffer + out_buffer.size = self._write_size out_buffer.pos = 0 while True: - res = lib.ZSTD_endStream(self._cstream, out_buffer) - if lib.ZSTD_isError(res): - raise Exception('error ending compression stream: %s' % lib.ZSTD_getErrorName) + zresult = lib.ZSTD_endStream(self._cstream, out_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('error ending compression stream: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) if out_buffer.pos: - self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) + self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) out_buffer.pos = 0 - if res == 0: + if zresult == 0: break + self._cstream = None + self._compressor = None + return False + def memory_size(self): + if not self._entered: + raise ZstdError('cannot determine size of an inactive compressor; ' + 'call when a context manager is active') + + return lib.ZSTD_sizeof_CStream(self._cstream) + def write(self, data): + if not self._entered: + raise ZstdError('write() must be called from an active context ' + 'manager') + + total_write = 0 + + data_buffer = ffi.from_buffer(data) + + in_buffer = ffi.new('ZSTD_inBuffer *') + in_buffer.src = data_buffer + in_buffer.size = len(data_buffer) + in_buffer.pos = 0 + out_buffer = ffi.new('ZSTD_outBuffer *') - out_buffer.dst = ffi.new('char[]', _CSTREAM_OUT_SIZE) - out_buffer.size = _CSTREAM_OUT_SIZE + dst_buffer = ffi.new('char[]', self._write_size) + out_buffer.dst = dst_buffer + out_buffer.size = self._write_size + out_buffer.pos = 0 + + while in_buffer.pos < in_buffer.size: + zresult = lib.ZSTD_compressStream(self._cstream, out_buffer, in_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd compress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if out_buffer.pos: + self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) + total_write += out_buffer.pos + out_buffer.pos = 0 + + return total_write + + def flush(self): + if not self._entered: + raise ZstdError('flush must be called from an active context manager') + + total_write = 0 + + out_buffer = ffi.new('ZSTD_outBuffer *') + dst_buffer = ffi.new('char[]', self._write_size) + out_buffer.dst = dst_buffer + out_buffer.size = self._write_size out_buffer.pos = 0 - # TODO can we reuse existing memory? - in_buffer = ffi.new('ZSTD_inBuffer *') - in_buffer.src = ffi.new('char[]', data) - in_buffer.size = len(data) - in_buffer.pos = 0 - while in_buffer.pos < in_buffer.size: - res = lib.ZSTD_compressStream(self._cstream, out_buffer, in_buffer) - if lib.ZSTD_isError(res): - raise Exception('zstd compress error: %s' % lib.ZSTD_getErrorName(res)) + while True: + zresult = lib.ZSTD_flushStream(self._cstream, out_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd compress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if not out_buffer.pos: + break + + self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) + total_write += out_buffer.pos + out_buffer.pos = 0 + + return total_write + + +class ZstdCompressionObj(object): + def compress(self, data): + if self._finished: + raise ZstdError('cannot call compress() after compressor finished') + + data_buffer = ffi.from_buffer(data) + source = ffi.new('ZSTD_inBuffer *') + source.src = data_buffer + source.size = len(data_buffer) + source.pos = 0 + + chunks = [] + + while source.pos < len(data): + zresult = lib.ZSTD_compressStream(self._cstream, self._out, source) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd compress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if self._out.pos: + chunks.append(ffi.buffer(self._out.dst, self._out.pos)[:]) + self._out.pos = 0 + + return b''.join(chunks) - if out_buffer.pos: - self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) - out_buffer.pos = 0 + def flush(self, flush_mode=COMPRESSOBJ_FLUSH_FINISH): + if flush_mode not in (COMPRESSOBJ_FLUSH_FINISH, COMPRESSOBJ_FLUSH_BLOCK): + raise ValueError('flush mode not recognized') + + if self._finished: + raise ZstdError('compressor object already finished') + + assert self._out.pos == 0 + + if flush_mode == COMPRESSOBJ_FLUSH_BLOCK: + zresult = lib.ZSTD_flushStream(self._cstream, self._out) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd compress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + # Output buffer is guaranteed to hold full block. + assert zresult == 0 + + if self._out.pos: + result = ffi.buffer(self._out.dst, self._out.pos)[:] + self._out.pos = 0 + return result + else: + return b'' + + assert flush_mode == COMPRESSOBJ_FLUSH_FINISH + self._finished = True + + chunks = [] + + while True: + zresult = lib.ZSTD_endStream(self._cstream, self._out) + if lib.ZSTD_isError(zresult): + raise ZstdError('error ending compression stream: %s' % + ffi.string(lib.ZSTD_getErroName(zresult))) + + if self._out.pos: + chunks.append(ffi.buffer(self._out.dst, self._out.pos)[:]) + self._out.pos = 0 + + if not zresult: + break + + # GC compression stream immediately. + self._cstream = None + + return b''.join(chunks) class ZstdCompressor(object): - def __init__(self, level=3, dict_data=None, compression_params=None): - if dict_data: - raise Exception('dict_data not yet supported') - if compression_params: - raise Exception('compression_params not yet supported') + def __init__(self, level=3, dict_data=None, compression_params=None, + write_checksum=False, write_content_size=False, + write_dict_id=True): + if level < 1: + raise ValueError('level must be greater than 0') + elif level > lib.ZSTD_maxCLevel(): + raise ValueError('level must be less than %d' % lib.ZSTD_maxCLevel()) self._compression_level = level + self._dict_data = dict_data + self._cparams = compression_params + self._fparams = ffi.new('ZSTD_frameParameters *')[0] + self._fparams.checksumFlag = write_checksum + self._fparams.contentSizeFlag = write_content_size + self._fparams.noDictIDFlag = not write_dict_id - def compress(self, data): - # Just use the stream API for now. - output = io.BytesIO() - with self.write_to(output) as compressor: - compressor.write(data) - return output.getvalue() + cctx = lib.ZSTD_createCCtx() + if cctx == ffi.NULL: + raise MemoryError() + + self._cctx = ffi.gc(cctx, lib.ZSTD_freeCCtx) + + def compress(self, data, allow_empty=False): + if len(data) == 0 and self._fparams.contentSizeFlag and not allow_empty: + raise ValueError('cannot write empty inputs when writing content sizes') + + # TODO use a CDict for performance. + dict_data = ffi.NULL + dict_size = 0 + + if self._dict_data: + dict_data = self._dict_data.as_bytes() + dict_size = len(self._dict_data) + + params = ffi.new('ZSTD_parameters *')[0] + if self._cparams: + params.cParams = self._cparams.as_compression_parameters() + else: + params.cParams = lib.ZSTD_getCParams(self._compression_level, len(data), + dict_size) + params.fParams = self._fparams + + dest_size = lib.ZSTD_compressBound(len(data)) + out = new_nonzero('char[]', dest_size) - def copy_stream(self, ifh, ofh): - cstream = self._get_cstream() + zresult = lib.ZSTD_compress_advanced(self._cctx, + ffi.addressof(out), dest_size, + data, len(data), + dict_data, dict_size, + params) + + if lib.ZSTD_isError(zresult): + raise ZstdError('cannot compress: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + return ffi.buffer(out, zresult)[:] + + def compressobj(self, size=0): + cstream = self._get_cstream(size) + cobj = ZstdCompressionObj() + cobj._cstream = cstream + cobj._out = ffi.new('ZSTD_outBuffer *') + cobj._dst_buffer = ffi.new('char[]', COMPRESSION_RECOMMENDED_OUTPUT_SIZE) + cobj._out.dst = cobj._dst_buffer + cobj._out.size = COMPRESSION_RECOMMENDED_OUTPUT_SIZE + cobj._out.pos = 0 + cobj._compressor = self + cobj._finished = False + + return cobj + + def copy_stream(self, ifh, ofh, size=0, + read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE, + write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE): + + if not hasattr(ifh, 'read'): + raise ValueError('first argument must have a read() method') + if not hasattr(ofh, 'write'): + raise ValueError('second argument must have a write() method') + + cstream = self._get_cstream(size) in_buffer = ffi.new('ZSTD_inBuffer *') out_buffer = ffi.new('ZSTD_outBuffer *') - out_buffer.dst = ffi.new('char[]', _CSTREAM_OUT_SIZE) - out_buffer.size = _CSTREAM_OUT_SIZE + dst_buffer = ffi.new('char[]', write_size) + out_buffer.dst = dst_buffer + out_buffer.size = write_size out_buffer.pos = 0 total_read, total_write = 0, 0 while True: - data = ifh.read(_CSTREAM_IN_SIZE) + data = ifh.read(read_size) if not data: break - total_read += len(data) - - in_buffer.src = ffi.new('char[]', data) - in_buffer.size = len(data) + data_buffer = ffi.from_buffer(data) + total_read += len(data_buffer) + in_buffer.src = data_buffer + in_buffer.size = len(data_buffer) in_buffer.pos = 0 while in_buffer.pos < in_buffer.size: - res = lib.ZSTD_compressStream(cstream, out_buffer, in_buffer) - if lib.ZSTD_isError(res): - raise Exception('zstd compress error: %s' % - lib.ZSTD_getErrorName(res)) + zresult = lib.ZSTD_compressStream(cstream, out_buffer, in_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd compress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) if out_buffer.pos: ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) - total_write = out_buffer.pos + total_write += out_buffer.pos out_buffer.pos = 0 # We've finished reading. Flush the compressor. while True: - res = lib.ZSTD_endStream(cstream, out_buffer) - if lib.ZSTD_isError(res): - raise Exception('error ending compression stream: %s' % - lib.ZSTD_getErrorName(res)) + zresult = lib.ZSTD_endStream(cstream, out_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('error ending compression stream: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) if out_buffer.pos: ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) total_write += out_buffer.pos out_buffer.pos = 0 - if res == 0: + if zresult == 0: break return total_read, total_write - def write_to(self, writer): - return _ZstdCompressionWriter(self._get_cstream(), writer) + def write_to(self, writer, size=0, + write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE): + + if not hasattr(writer, 'write'): + raise ValueError('must pass an object with a write() method') + + return ZstdCompressionWriter(self, writer, size, write_size) + + def read_from(self, reader, size=0, + read_size=COMPRESSION_RECOMMENDED_INPUT_SIZE, + write_size=COMPRESSION_RECOMMENDED_OUTPUT_SIZE): + if hasattr(reader, 'read'): + have_read = True + elif hasattr(reader, '__getitem__'): + have_read = False + buffer_offset = 0 + size = len(reader) + else: + raise ValueError('must pass an object with a read() method or ' + 'conforms to buffer protocol') + + cstream = self._get_cstream(size) + + in_buffer = ffi.new('ZSTD_inBuffer *') + out_buffer = ffi.new('ZSTD_outBuffer *') + + in_buffer.src = ffi.NULL + in_buffer.size = 0 + in_buffer.pos = 0 + + dst_buffer = ffi.new('char[]', write_size) + out_buffer.dst = dst_buffer + out_buffer.size = write_size + out_buffer.pos = 0 + + while True: + # We should never have output data sitting around after a previous + # iteration. + assert out_buffer.pos == 0 + + # Collect input data. + if have_read: + read_result = reader.read(read_size) + else: + remaining = len(reader) - buffer_offset + slice_size = min(remaining, read_size) + read_result = reader[buffer_offset:buffer_offset + slice_size] + buffer_offset += slice_size - def _get_cstream(self): + # No new input data. Break out of the read loop. + if not read_result: + break + + # Feed all read data into the compressor and emit output until + # exhausted. + read_buffer = ffi.from_buffer(read_result) + in_buffer.src = read_buffer + in_buffer.size = len(read_buffer) + in_buffer.pos = 0 + + while in_buffer.pos < in_buffer.size: + zresult = lib.ZSTD_compressStream(cstream, out_buffer, in_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd compress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if out_buffer.pos: + data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:] + out_buffer.pos = 0 + yield data + + assert out_buffer.pos == 0 + + # And repeat the loop to collect more data. + continue + + # If we get here, input is exhausted. End the stream and emit what + # remains. + while True: + assert out_buffer.pos == 0 + zresult = lib.ZSTD_endStream(cstream, out_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('error ending compression stream: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if out_buffer.pos: + data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:] + out_buffer.pos = 0 + yield data + + if zresult == 0: + break + + def _get_cstream(self, size): cstream = lib.ZSTD_createCStream() + if cstream == ffi.NULL: + raise MemoryError() + cstream = ffi.gc(cstream, lib.ZSTD_freeCStream) - res = lib.ZSTD_initCStream(cstream, self._compression_level) - if lib.ZSTD_isError(res): + dict_data = ffi.NULL + dict_size = 0 + if self._dict_data: + dict_data = self._dict_data.as_bytes() + dict_size = len(self._dict_data) + + zparams = ffi.new('ZSTD_parameters *')[0] + if self._cparams: + zparams.cParams = self._cparams.as_compression_parameters() + else: + zparams.cParams = lib.ZSTD_getCParams(self._compression_level, + size, dict_size) + zparams.fParams = self._fparams + + zresult = lib.ZSTD_initCStream_advanced(cstream, dict_data, dict_size, + zparams, size) + if lib.ZSTD_isError(zresult): raise Exception('cannot init CStream: %s' % - lib.ZSTD_getErrorName(res)) + ffi.string(lib.ZSTD_getErrorName(zresult))) return cstream + + +class FrameParameters(object): + def __init__(self, fparams): + self.content_size = fparams.frameContentSize + self.window_size = fparams.windowSize + self.dict_id = fparams.dictID + self.has_checksum = bool(fparams.checksumFlag) + + +def get_frame_parameters(data): + if not isinstance(data, bytes_type): + raise TypeError('argument must be bytes') + + params = ffi.new('ZSTD_frameParams *') + + zresult = lib.ZSTD_getFrameParams(params, data, len(data)) + if lib.ZSTD_isError(zresult): + raise ZstdError('cannot get frame parameters: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if zresult: + raise ZstdError('not enough data for frame parameters; need %d bytes' % + zresult) + + return FrameParameters(params[0]) + + +class ZstdCompressionDict(object): + def __init__(self, data): + assert isinstance(data, bytes_type) + self._data = data + + def __len__(self): + return len(self._data) + + def dict_id(self): + return int_type(lib.ZDICT_getDictID(self._data, len(self._data))) + + def as_bytes(self): + return self._data + + +def train_dictionary(dict_size, samples, parameters=None): + if not isinstance(samples, list): + raise TypeError('samples must be a list') + + total_size = sum(map(len, samples)) + + samples_buffer = new_nonzero('char[]', total_size) + sample_sizes = new_nonzero('size_t[]', len(samples)) + + offset = 0 + for i, sample in enumerate(samples): + if not isinstance(sample, bytes_type): + raise ValueError('samples must be bytes') + + l = len(sample) + ffi.memmove(samples_buffer + offset, sample, l) + offset += l + sample_sizes[i] = l + + dict_data = new_nonzero('char[]', dict_size) + + zresult = lib.ZDICT_trainFromBuffer(ffi.addressof(dict_data), dict_size, + ffi.addressof(samples_buffer), + ffi.addressof(sample_sizes, 0), + len(samples)) + if lib.ZDICT_isError(zresult): + raise ZstdError('Cannot train dict: %s' % + ffi.string(lib.ZDICT_getErrorName(zresult))) + + return ZstdCompressionDict(ffi.buffer(dict_data, zresult)[:]) + + +class ZstdDecompressionObj(object): + def __init__(self, decompressor): + self._decompressor = decompressor + self._dstream = self._decompressor._get_dstream() + self._finished = False + + def decompress(self, data): + if self._finished: + raise ZstdError('cannot use a decompressobj multiple times') + + in_buffer = ffi.new('ZSTD_inBuffer *') + out_buffer = ffi.new('ZSTD_outBuffer *') + + data_buffer = ffi.from_buffer(data) + in_buffer.src = data_buffer + in_buffer.size = len(data_buffer) + in_buffer.pos = 0 + + dst_buffer = ffi.new('char[]', DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE) + out_buffer.dst = dst_buffer + out_buffer.size = len(dst_buffer) + out_buffer.pos = 0 + + chunks = [] + + while in_buffer.pos < in_buffer.size: + zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd decompressor error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if zresult == 0: + self._finished = True + self._dstream = None + self._decompressor = None + + if out_buffer.pos: + chunks.append(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) + out_buffer.pos = 0 + + return b''.join(chunks) + + +class ZstdDecompressionWriter(object): + def __init__(self, decompressor, writer, write_size): + self._decompressor = decompressor + self._writer = writer + self._write_size = write_size + self._dstream = None + self._entered = False + + def __enter__(self): + if self._entered: + raise ZstdError('cannot __enter__ multiple times') + + self._dstream = self._decompressor._get_dstream() + self._entered = True + + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + self._entered = False + self._dstream = None + + def memory_size(self): + if not self._dstream: + raise ZstdError('cannot determine size of inactive decompressor ' + 'call when context manager is active') + + return lib.ZSTD_sizeof_DStream(self._dstream) + + def write(self, data): + if not self._entered: + raise ZstdError('write must be called from an active context manager') + + total_write = 0 + + in_buffer = ffi.new('ZSTD_inBuffer *') + out_buffer = ffi.new('ZSTD_outBuffer *') + + data_buffer = ffi.from_buffer(data) + in_buffer.src = data_buffer + in_buffer.size = len(data_buffer) + in_buffer.pos = 0 + + dst_buffer = ffi.new('char[]', self._write_size) + out_buffer.dst = dst_buffer + out_buffer.size = len(dst_buffer) + out_buffer.pos = 0 + + while in_buffer.pos < in_buffer.size: + zresult = lib.ZSTD_decompressStream(self._dstream, out_buffer, in_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd decompress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if out_buffer.pos: + self._writer.write(ffi.buffer(out_buffer.dst, out_buffer.pos)[:]) + total_write += out_buffer.pos + out_buffer.pos = 0 + + return total_write + + +class ZstdDecompressor(object): + def __init__(self, dict_data=None): + self._dict_data = dict_data + + dctx = lib.ZSTD_createDCtx() + if dctx == ffi.NULL: + raise MemoryError() + + self._refdctx = ffi.gc(dctx, lib.ZSTD_freeDCtx) + + @property + def _ddict(self): + if self._dict_data: + dict_data = self._dict_data.as_bytes() + dict_size = len(self._dict_data) + + ddict = lib.ZSTD_createDDict(dict_data, dict_size) + if ddict == ffi.NULL: + raise ZstdError('could not create decompression dict') + else: + ddict = None + + self.__dict__['_ddict'] = ddict + return ddict + + def decompress(self, data, max_output_size=0): + data_buffer = ffi.from_buffer(data) + + orig_dctx = new_nonzero('char[]', lib.ZSTD_sizeof_DCtx(self._refdctx)) + dctx = ffi.cast('ZSTD_DCtx *', orig_dctx) + lib.ZSTD_copyDCtx(dctx, self._refdctx) + + ddict = self._ddict + + output_size = lib.ZSTD_getDecompressedSize(data_buffer, len(data_buffer)) + if output_size: + result_buffer = ffi.new('char[]', output_size) + result_size = output_size + else: + if not max_output_size: + raise ZstdError('input data invalid or missing content size ' + 'in frame header') + + result_buffer = ffi.new('char[]', max_output_size) + result_size = max_output_size + + if ddict: + zresult = lib.ZSTD_decompress_usingDDict(dctx, + result_buffer, result_size, + data_buffer, len(data_buffer), + ddict) + else: + zresult = lib.ZSTD_decompressDCtx(dctx, + result_buffer, result_size, + data_buffer, len(data_buffer)) + if lib.ZSTD_isError(zresult): + raise ZstdError('decompression error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + elif output_size and zresult != output_size: + raise ZstdError('decompression error: decompressed %d bytes; expected %d' % + (zresult, output_size)) + + return ffi.buffer(result_buffer, zresult)[:] + + def decompressobj(self): + return ZstdDecompressionObj(self) + + def read_from(self, reader, read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE, + write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE, + skip_bytes=0): + if skip_bytes >= read_size: + raise ValueError('skip_bytes must be smaller than read_size') + + if hasattr(reader, 'read'): + have_read = True + elif hasattr(reader, '__getitem__'): + have_read = False + buffer_offset = 0 + size = len(reader) + else: + raise ValueError('must pass an object with a read() method or ' + 'conforms to buffer protocol') + + if skip_bytes: + if have_read: + reader.read(skip_bytes) + else: + if skip_bytes > size: + raise ValueError('skip_bytes larger than first input chunk') + + buffer_offset = skip_bytes + + dstream = self._get_dstream() + + in_buffer = ffi.new('ZSTD_inBuffer *') + out_buffer = ffi.new('ZSTD_outBuffer *') + + dst_buffer = ffi.new('char[]', write_size) + out_buffer.dst = dst_buffer + out_buffer.size = len(dst_buffer) + out_buffer.pos = 0 + + while True: + assert out_buffer.pos == 0 + + if have_read: + read_result = reader.read(read_size) + else: + remaining = size - buffer_offset + slice_size = min(remaining, read_size) + read_result = reader[buffer_offset:buffer_offset + slice_size] + buffer_offset += slice_size + + # No new input. Break out of read loop. + if not read_result: + break + + # Feed all read data into decompressor and emit output until + # exhausted. + read_buffer = ffi.from_buffer(read_result) + in_buffer.src = read_buffer + in_buffer.size = len(read_buffer) + in_buffer.pos = 0 + + while in_buffer.pos < in_buffer.size: + assert out_buffer.pos == 0 + + zresult = lib.ZSTD_decompressStream(dstream, out_buffer, in_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd decompress error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if out_buffer.pos: + data = ffi.buffer(out_buffer.dst, out_buffer.pos)[:] + out_buffer.pos = 0 + yield data + + if zresult == 0: + return + + # Repeat loop to collect more input data. + continue + + # If we get here, input is exhausted. + + def write_to(self, writer, write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE): + if not hasattr(writer, 'write'): + raise ValueError('must pass an object with a write() method') + + return ZstdDecompressionWriter(self, writer, write_size) + + def copy_stream(self, ifh, ofh, + read_size=DECOMPRESSION_RECOMMENDED_INPUT_SIZE, + write_size=DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE): + if not hasattr(ifh, 'read'): + raise ValueError('first argument must have a read() method') + if not hasattr(ofh, 'write'): + raise ValueError('second argument must have a write() method') + + dstream = self._get_dstream() + + in_buffer = ffi.new('ZSTD_inBuffer *') + out_buffer = ffi.new('ZSTD_outBuffer *') + + dst_buffer = ffi.new('char[]', write_size) + out_buffer.dst = dst_buffer + out_buffer.size = write_size + out_buffer.pos = 0 + + total_read, total_write = 0, 0 + + # Read all available input. + while True: + data = ifh.read(read_size) + if not data: + break + + data_buffer = ffi.from_buffer(data) + total_read += len(data_buffer) + in_buffer.src = data_buffer + in_buffer.size = len(data_buffer) + in_buffer.pos = 0 + + # Flush all read data to output. + while in_buffer.pos < in_buffer.size: + zresult = lib.ZSTD_decompressStream(dstream, out_buffer, in_buffer) + if lib.ZSTD_isError(zresult): + raise ZstdError('zstd decompressor error: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + if out_buffer.pos: + ofh.write(ffi.buffer(out_buffer.dst, out_buffer.pos)) + total_write += out_buffer.pos + out_buffer.pos = 0 + + # Continue loop to keep reading. + + return total_read, total_write + + def decompress_content_dict_chain(self, frames): + if not isinstance(frames, list): + raise TypeError('argument must be a list') + + if not frames: + raise ValueError('empty input chain') + + # First chunk should not be using a dictionary. We handle it specially. + chunk = frames[0] + if not isinstance(chunk, bytes_type): + raise ValueError('chunk 0 must be bytes') + + # All chunks should be zstd frames and should have content size set. + chunk_buffer = ffi.from_buffer(chunk) + params = ffi.new('ZSTD_frameParams *') + zresult = lib.ZSTD_getFrameParams(params, chunk_buffer, len(chunk_buffer)) + if lib.ZSTD_isError(zresult): + raise ValueError('chunk 0 is not a valid zstd frame') + elif zresult: + raise ValueError('chunk 0 is too small to contain a zstd frame') + + if not params.frameContentSize: + raise ValueError('chunk 0 missing content size in frame') + + dctx = lib.ZSTD_createDCtx() + if dctx == ffi.NULL: + raise MemoryError() + + dctx = ffi.gc(dctx, lib.ZSTD_freeDCtx) + + last_buffer = ffi.new('char[]', params.frameContentSize) + + zresult = lib.ZSTD_decompressDCtx(dctx, last_buffer, len(last_buffer), + chunk_buffer, len(chunk_buffer)) + if lib.ZSTD_isError(zresult): + raise ZstdError('could not decompress chunk 0: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + # Special case of chain length of 1 + if len(frames) == 1: + return ffi.buffer(last_buffer, len(last_buffer))[:] + + i = 1 + while i < len(frames): + chunk = frames[i] + if not isinstance(chunk, bytes_type): + raise ValueError('chunk %d must be bytes' % i) + + chunk_buffer = ffi.from_buffer(chunk) + zresult = lib.ZSTD_getFrameParams(params, chunk_buffer, len(chunk_buffer)) + if lib.ZSTD_isError(zresult): + raise ValueError('chunk %d is not a valid zstd frame' % i) + elif zresult: + raise ValueError('chunk %d is too small to contain a zstd frame' % i) + + if not params.frameContentSize: + raise ValueError('chunk %d missing content size in frame' % i) + + dest_buffer = ffi.new('char[]', params.frameContentSize) + + zresult = lib.ZSTD_decompress_usingDict(dctx, dest_buffer, len(dest_buffer), + chunk_buffer, len(chunk_buffer), + last_buffer, len(last_buffer)) + if lib.ZSTD_isError(zresult): + raise ZstdError('could not decompress chunk %d' % i) + + last_buffer = dest_buffer + i += 1 + + return ffi.buffer(last_buffer, len(last_buffer))[:] + + def _get_dstream(self): + dstream = lib.ZSTD_createDStream() + if dstream == ffi.NULL: + raise MemoryError() + + dstream = ffi.gc(dstream, lib.ZSTD_freeDStream) + + if self._dict_data: + zresult = lib.ZSTD_initDStream_usingDict(dstream, + self._dict_data.as_bytes(), + len(self._dict_data)) + else: + zresult = lib.ZSTD_initDStream(dstream) + + if lib.ZSTD_isError(zresult): + raise ZstdError('could not initialize DStream: %s' % + ffi.string(lib.ZSTD_getErrorName(zresult))) + + return dstream diff -r f3807a135e43 -r 455677a7667f hgext/mq.py --- a/hgext/mq.py Fri Feb 10 18:20:58 2017 +0100 +++ b/hgext/mq.py Mon Feb 13 09:44:16 2017 -0800 @@ -14,7 +14,7 @@ Known patches are represented as patch files in the .hg/patches directory. Applied patches are both patch files and changesets. -Common tasks (use :hg:`help command` for more details):: +Common tasks (use :hg:`help COMMAND` for more details):: create new patch qnew import existing patch qimport diff -r f3807a135e43 -r 455677a7667f hgext/pager.py --- a/hgext/pager.py Fri Feb 10 18:20:58 2017 +0100 +++ b/hgext/pager.py Mon Feb 13 09:44:16 2017 -0800 @@ -72,6 +72,7 @@ commands, dispatch, encoding, + error, extensions, util, ) @@ -87,14 +88,10 @@ close_fds=util.closefds, stdin=subprocess.PIPE, stdout=util.stdout, stderr=util.stderr) - # back up original file objects and descriptors - olduifout = ui.fout - oldstdout = util.stdout + # back up original file descriptors stdoutfd = os.dup(util.stdout.fileno()) stderrfd = os.dup(util.stderr.fileno()) - # create new line-buffered stdout so that output can show up immediately - ui.fout = util.stdout = newstdout = os.fdopen(util.stdout.fileno(), 'wb', 1) os.dup2(pager.stdin.fileno(), util.stdout.fileno()) if ui._isatty(util.stderr): os.dup2(pager.stdin.fileno(), util.stderr.fileno()) @@ -103,17 +100,15 @@ def killpager(): if util.safehasattr(signal, "SIGINT"): signal.signal(signal.SIGINT, signal.SIG_IGN) - pager.stdin.close() - ui.fout = olduifout - util.stdout = oldstdout - # close new stdout while it's associated with pager; otherwise stdout - # fd would be closed when newstdout is deleted - newstdout.close() - # restore original fds: stdout is open again + # restore original fds, closing pager.stdin copies in the process os.dup2(stdoutfd, util.stdout.fileno()) os.dup2(stderrfd, util.stderr.fileno()) + pager.stdin.close() pager.wait() +def catchterm(*args): + raise error.SignalInterrupt + def uisetup(ui): class pagerui(ui.__class__): def _runpager(self, pagercmd): @@ -154,7 +149,7 @@ ui.setconfig('ui', 'formatted', ui.formatted(), 'pager') ui.setconfig('ui', 'interactive', False, 'pager') if util.safehasattr(signal, "SIGPIPE"): - signal.signal(signal.SIGPIPE, signal.SIG_DFL) + signal.signal(signal.SIGPIPE, catchterm) ui._runpager(p) return orig(ui, options, cmd, cmdfunc) diff -r f3807a135e43 -r 455677a7667f hgext/rebase.py --- a/hgext/rebase.py Fri Feb 10 18:20:58 2017 +0100 +++ b/hgext/rebase.py Mon Feb 13 09:44:16 2017 -0800 @@ -1367,7 +1367,7 @@ """store the currently rebased set on the repo object This is used by another function to prevent rebased revision to because - hidden (see issue4505)""" + hidden (see issue4504)""" repo = repo.unfiltered() revs = set(revs) repo._rebaseset = revs @@ -1383,7 +1383,7 @@ del repo._rebaseset def _rebasedvisible(orig, repo): - """ensure rebased revs stay visible (see issue4505)""" + """ensure rebased revs stay visible (see issue4504)""" blockers = orig(repo) blockers.update(getattr(repo, '_rebaseset', ())) return blockers diff -r f3807a135e43 -r 455677a7667f hgext/zeroconf/__init__.py --- a/hgext/zeroconf/__init__.py Fri Feb 10 18:20:58 2017 +0100 +++ b/hgext/zeroconf/__init__.py Mon Feb 13 09:44:16 2017 -0800 @@ -64,7 +64,9 @@ # Generic method, sometimes gives useless results try: dumbip = socket.gethostbyaddr(socket.gethostname())[2][0] - if not dumbip.startswith('127.') and ':' not in dumbip: + if ':' in dumbip: + dumbip = '127.0.0.1' + if not dumbip.startswith('127.'): return dumbip except (socket.gaierror, socket.herror): dumbip = '127.0.0.1' diff -r f3807a135e43 -r 455677a7667f mercurial/bundle2.py --- a/mercurial/bundle2.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/bundle2.py Mon Feb 13 09:44:16 2017 -0800 @@ -320,9 +320,6 @@ It iterates over each part then searches for and uses the proper handling code to process the part. Parts are processed in order. - This is very early version of this function that will be strongly reworked - before final usage. - Unknown Mandatory part will abort the process. It is temporarily possible to provide a prebuilt bundleoperation to the @@ -865,6 +862,11 @@ self._generated = None self.mandatory = mandatory + def __repr__(self): + cls = "%s.%s" % (self.__class__.__module__, self.__class__.__name__) + return ('<%s object at %x; id: %s; type: %s; mandatory: %s>' + % (cls, id(self), self.id, self.type, self.mandatory)) + def copy(self): """return a copy of the part diff -r f3807a135e43 -r 455677a7667f mercurial/cmdutil.py --- a/mercurial/cmdutil.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/cmdutil.py Mon Feb 13 09:44:16 2017 -0800 @@ -26,14 +26,12 @@ changelog, copies, crecord as crecordmod, - dirstateguard as dirstateguardmod, encoding, error, formatter, graphmod, lock as lockmod, match as matchmod, - mergeutil, obsolete, patch, pathutil, @@ -3366,11 +3364,6 @@ return cmd -def checkunresolved(ms): - ms._repo.ui.deprecwarn('checkunresolved moved from cmdutil to mergeutil', - '4.1') - return mergeutil.checkunresolved(ms) - # a list of (ui, repo, otherpeer, opts, missing) functions called by # commands.outgoing. "missing" is "missing" of the result of # "findcommonoutgoing()" @@ -3477,10 +3470,3 @@ if after[1]: hint = after[0] raise error.Abort(_('no %s in progress') % task, hint=hint) - -class dirstateguard(dirstateguardmod.dirstateguard): - def __init__(self, repo, name): - dirstateguardmod.dirstateguard.__init__(self, repo, name) - repo.ui.deprecwarn( - 'dirstateguard has moved from cmdutil to dirstateguard', - '4.1') diff -r f3807a135e43 -r 455677a7667f mercurial/commands.py --- a/mercurial/commands.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/commands.py Mon Feb 13 09:44:16 2017 -0800 @@ -59,6 +59,7 @@ revset, scmutil, server, + smartset, sshserver, sslutil, streamclone, @@ -1902,7 +1903,7 @@ fm.write('pythonver', _("checking Python version (%s)\n"), ("%d.%d.%d" % sys.version_info[:3])) fm.write('pythonlib', _("checking Python lib (%s)...\n"), - os.path.dirname(os.__file__)) + os.path.dirname(pycompat.fsencode(os.__file__))) security = set(sslutil.supportedprotocols) if sslutil.hassni: @@ -2804,8 +2805,8 @@ arevs = revset.makematcher(treebystage['analyzed'])(repo) brevs = revset.makematcher(treebystage['optimized'])(repo) if ui.verbose: - ui.note(("* analyzed set:\n"), revset.prettyformatset(arevs), "\n") - ui.note(("* optimized set:\n"), revset.prettyformatset(brevs), "\n") + ui.note(("* analyzed set:\n"), smartset.prettyformat(arevs), "\n") + ui.note(("* optimized set:\n"), smartset.prettyformat(brevs), "\n") arevs = list(arevs) brevs = list(brevs) if arevs == brevs: @@ -2828,7 +2829,7 @@ func = revset.makematcher(tree) revs = func(repo) if ui.verbose: - ui.note(("* set:\n"), revset.prettyformatset(revs), "\n") + ui.note(("* set:\n"), smartset.prettyformat(revs), "\n") for c in revs: ui.write("%s\n" % c) @@ -3916,6 +3917,7 @@ if ui.verbose: keep.append('verbose') + fullname = name section = None subtopic = None if name and '.' in name: @@ -3938,7 +3940,7 @@ # to look for, or we could have simply failed to found "foo.bar" # because bar isn't a section of foo if section and not (formatted and name): - raise error.Abort(_("help section not found")) + raise error.Abort(_("help section not found: %s") % fullname) if 'verbose' in pruned: keep.append('omitted') @@ -4127,8 +4129,9 @@ Import a list of patches and commit them individually (unless --no-commit is specified). - To read a patch from standard input, use "-" as the patch name. If - a URL is specified, the patch will be downloaded from there. + To read a patch from standard input (stdin), use "-" as the patch + name. If a URL is specified, the patch will be downloaded from + there. Import first applies changes to the working directory (unless --bypass is specified), import will abort if there are outstanding @@ -4198,6 +4201,10 @@ hg import incoming-patches.mbox + - import patches from stdin:: + + hg import - + - attempt to exactly restore an exported changeset (not always possible):: diff -r f3807a135e43 -r 455677a7667f mercurial/commandserver.py --- a/mercurial/commandserver.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/commandserver.py Mon Feb 13 09:44:16 2017 -0800 @@ -447,6 +447,7 @@ self._sock = None self._oldsigchldhandler = None self._workerpids = set() # updated by signal handler; do not iterate + self._socketunlinked = None def init(self): self._sock = socket.socket(socket.AF_UNIX) @@ -455,11 +456,17 @@ o = signal.signal(signal.SIGCHLD, self._sigchldhandler) self._oldsigchldhandler = o self._servicehandler.printbanner(self.address) + self._socketunlinked = False + + def _unlinksocket(self): + if not self._socketunlinked: + self._servicehandler.unlinksocket(self.address) + self._socketunlinked = True def _cleanup(self): signal.signal(signal.SIGCHLD, self._oldsigchldhandler) self._sock.close() - self._servicehandler.unlinksocket(self.address) + self._unlinksocket() # don't kill child processes as they have active clients, just wait self._reapworkers(0) @@ -470,11 +477,23 @@ self._cleanup() def _mainloop(self): + exiting = False h = self._servicehandler - while not h.shouldexit(): + while True: + if not exiting and h.shouldexit(): + # clients can no longer connect() to the domain socket, so + # we stop queuing new requests. + # for requests that are queued (connect()-ed, but haven't been + # accept()-ed), handle them before exit. otherwise, clients + # waiting for recv() will receive ECONNRESET. + self._unlinksocket() + exiting = True try: ready = select.select([self._sock], [], [], h.pollinterval)[0] if not ready: + # only exit if we completed all queued requests + if exiting: + break continue conn, _addr = self._sock.accept() except (select.error, socket.error) as inst: diff -r f3807a135e43 -r 455677a7667f mercurial/destutil.py --- a/mercurial/destutil.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/destutil.py Mon Feb 13 09:44:16 2017 -0800 @@ -37,10 +37,6 @@ hint = _("commit and merge, or update --clean to" " discard changes") raise error.UpdateAbort(msg, hint=hint) - elif not check: # destination is not a descendant. - msg = _("not a linear update") - hint = _("merge or update --check to force update") - raise error.UpdateAbort(msg, hint=hint) def _destupdateobs(repo, clean, check): """decide of an update destination from obsolescence markers""" diff -r f3807a135e43 -r 455677a7667f mercurial/exchange.py --- a/mercurial/exchange.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/exchange.py Mon Feb 13 09:44:16 2017 -0800 @@ -1724,9 +1724,15 @@ if url.startswith('remote:http:') or url.startswith('remote:https:'): captureoutput = True try: + # note: outside bundle1, 'heads' is expected to be empty and this + # 'check_heads' call wil be a no-op check_heads(repo, heads, 'uploading changes') # push can proceed - if util.safehasattr(cg, 'params'): + if not util.safehasattr(cg, 'params'): + # legacy case: bundle1 (changegroup 01) + lockandtr[1] = repo.lock() + r = cg.apply(repo, source, url) + else: r = None try: def gettransaction(): @@ -1765,9 +1771,6 @@ mandatory=False) parts.append(part) raise - else: - lockandtr[1] = repo.lock() - r = cg.apply(repo, source, url) finally: lockmod.release(lockandtr[2], lockandtr[1], lockandtr[0]) if recordout is not None: diff -r f3807a135e43 -r 455677a7667f mercurial/filemerge.py --- a/mercurial/filemerge.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/filemerge.py Mon Feb 13 09:44:16 2017 -0800 @@ -489,6 +489,9 @@ args = util.interpolate(r'\$', replace, args, lambda s: util.shellquote(util.localpath(s))) cmd = toolpath + ' ' + args + if _toolbool(ui, tool, "gui"): + repo.ui.status(_('running merge tool %s for file %s\n') % + (tool, fcd.path())) repo.ui.debug('launching merge tool: %s\n' % cmd) r = ui.system(cmd, cwd=repo.root, environ=env) repo.ui.debug('merge tool returned: %s\n' % r) diff -r f3807a135e43 -r 455677a7667f mercurial/localrepo.py --- a/mercurial/localrepo.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/localrepo.py Mon Feb 13 09:44:16 2017 -0800 @@ -1852,6 +1852,11 @@ listsubrepos) def heads(self, start=None): + if start is None: + cl = self.changelog + headrevs = reversed(cl.headrevs()) + return [cl.node(rev) for rev in headrevs] + heads = self.changelog.heads(start) # sort the output in rev descending order return sorted(heads, key=self.changelog.rev, reverse=True) diff -r f3807a135e43 -r 455677a7667f mercurial/merge.py --- a/mercurial/merge.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/merge.py Mon Feb 13 09:44:16 2017 -0800 @@ -1448,7 +1448,7 @@ """ Perform a merge between the working directory and the given node - node = the node to update to, or None if unspecified + node = the node to update to branchmerge = whether to merge between branches force = whether to force branch merging or file overwriting matcher = a matcher to filter file lists (dirstate not updated) @@ -1491,7 +1491,9 @@ Return the same tuple as applyupdates(). """ - onode = node + # This functon used to find the default destination if node was None, but + # that's now in destutil.py. + assert node is not None # If we're doing a partial update, we need to skip updating # the dirstate, so make a note of any partial-ness to the # update here. @@ -1550,37 +1552,30 @@ if pas not in ([p1], [p2]): # nonlinear dirty = wc.dirty(missing=True) - if dirty or onode is None: + if dirty: # Branching is a bit strange to ensure we do the minimal - # amount of call to obsolete.background. + # amount of call to obsolete.foreground. foreground = obsolete.foreground(repo, [p1.node()]) # note: the variable contains a random identifier if repo[node].node() in foreground: - pas = [p1] # allow updating to successors - elif dirty: + pass # allow updating to successors + else: msg = _("uncommitted changes") - if onode is None: - hint = _("commit and merge, or update --clean to" - " discard changes") - else: - hint = _("commit or update --clean to discard" - " changes") - raise error.Abort(msg, hint=hint) - else: # node is none - msg = _("not a linear update") - hint = _("merge or update --check to force update") + hint = _("commit or update --clean to discard changes") raise error.Abort(msg, hint=hint) else: # Allow jumping branches if clean and specific rev given - pas = [p1] + pass + + if overwrite: + pas = [wc] + elif not branchmerge: + pas = [p1] # deprecated config: merge.followcopies followcopies = repo.ui.configbool('merge', 'followcopies', True) if overwrite: - pas = [wc] followcopies = False - elif pas == [p2]: # backwards - pas = [p1] elif not pas[0]: followcopies = False if not branchmerge and not wc.dirty(missing=True): diff -r f3807a135e43 -r 455677a7667f mercurial/revset.py --- a/mercurial/revset.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/revset.py Mon Feb 13 09:44:16 2017 -0800 @@ -26,9 +26,15 @@ pycompat, registrar, repoview, + smartset, util, ) +baseset = smartset.baseset +generatorset = smartset.generatorset +spanset = smartset.spanset +fullreposet = smartset.fullreposet + def _revancestors(repo, revs, followfirst): """Like revlog.ancestors(), but supports followfirst.""" if followfirst: @@ -2940,967 +2946,6 @@ funcs.add(tree[1][1]) return funcs -def _formatsetrepr(r): - """Format an optional printable representation of a set - - ======== ================================= - type(r) example - ======== ================================= - tuple ('', other) - str '' - callable lambda: '' % sorted(b) - object other - ======== ================================= - """ - if r is None: - return '' - elif isinstance(r, tuple): - return r[0] % r[1:] - elif isinstance(r, str): - return r - elif callable(r): - return r() - else: - return repr(r) - -class abstractsmartset(object): - - def __nonzero__(self): - """True if the smartset is not empty""" - raise NotImplementedError() - - def __contains__(self, rev): - """provide fast membership testing""" - raise NotImplementedError() - - def __iter__(self): - """iterate the set in the order it is supposed to be iterated""" - raise NotImplementedError() - - # Attributes containing a function to perform a fast iteration in a given - # direction. A smartset can have none, one, or both defined. - # - # Default value is None instead of a function returning None to avoid - # initializing an iterator just for testing if a fast method exists. - fastasc = None - fastdesc = None - - def isascending(self): - """True if the set will iterate in ascending order""" - raise NotImplementedError() - - def isdescending(self): - """True if the set will iterate in descending order""" - raise NotImplementedError() - - def istopo(self): - """True if the set will iterate in topographical order""" - raise NotImplementedError() - - def min(self): - """return the minimum element in the set""" - if self.fastasc is None: - v = min(self) - else: - for v in self.fastasc(): - break - else: - raise ValueError('arg is an empty sequence') - self.min = lambda: v - return v - - def max(self): - """return the maximum element in the set""" - if self.fastdesc is None: - return max(self) - else: - for v in self.fastdesc(): - break - else: - raise ValueError('arg is an empty sequence') - self.max = lambda: v - return v - - def first(self): - """return the first element in the set (user iteration perspective) - - Return None if the set is empty""" - raise NotImplementedError() - - def last(self): - """return the last element in the set (user iteration perspective) - - Return None if the set is empty""" - raise NotImplementedError() - - def __len__(self): - """return the length of the smartsets - - This can be expensive on smartset that could be lazy otherwise.""" - raise NotImplementedError() - - def reverse(self): - """reverse the expected iteration order""" - raise NotImplementedError() - - def sort(self, reverse=True): - """get the set to iterate in an ascending or descending order""" - raise NotImplementedError() - - def __and__(self, other): - """Returns a new object with the intersection of the two collections. - - This is part of the mandatory API for smartset.""" - if isinstance(other, fullreposet): - return self - return self.filter(other.__contains__, condrepr=other, cache=False) - - def __add__(self, other): - """Returns a new object with the union of the two collections. - - This is part of the mandatory API for smartset.""" - return addset(self, other) - - def __sub__(self, other): - """Returns a new object with the substraction of the two collections. - - This is part of the mandatory API for smartset.""" - c = other.__contains__ - return self.filter(lambda r: not c(r), condrepr=('', other), - cache=False) - - def filter(self, condition, condrepr=None, cache=True): - """Returns this smartset filtered by condition as a new smartset. - - `condition` is a callable which takes a revision number and returns a - boolean. Optional `condrepr` provides a printable representation of - the given `condition`. - - This is part of the mandatory API for smartset.""" - # builtin cannot be cached. but do not needs to - if cache and util.safehasattr(condition, 'func_code'): - condition = util.cachefunc(condition) - return filteredset(self, condition, condrepr) - -class baseset(abstractsmartset): - """Basic data structure that represents a revset and contains the basic - operation that it should be able to perform. - - Every method in this class should be implemented by any smartset class. - """ - def __init__(self, data=(), datarepr=None, istopo=False): - """ - datarepr: a tuple of (format, obj, ...), a function or an object that - provides a printable representation of the given data. - """ - self._ascending = None - self._istopo = istopo - if not isinstance(data, list): - if isinstance(data, set): - self._set = data - # set has no order we pick one for stability purpose - self._ascending = True - data = list(data) - self._list = data - self._datarepr = datarepr - - @util.propertycache - def _set(self): - return set(self._list) - - @util.propertycache - def _asclist(self): - asclist = self._list[:] - asclist.sort() - return asclist - - def __iter__(self): - if self._ascending is None: - return iter(self._list) - elif self._ascending: - return iter(self._asclist) - else: - return reversed(self._asclist) - - def fastasc(self): - return iter(self._asclist) - - def fastdesc(self): - return reversed(self._asclist) - - @util.propertycache - def __contains__(self): - return self._set.__contains__ - - def __nonzero__(self): - return bool(self._list) - - def sort(self, reverse=False): - self._ascending = not bool(reverse) - self._istopo = False - - def reverse(self): - if self._ascending is None: - self._list.reverse() - else: - self._ascending = not self._ascending - self._istopo = False - - def __len__(self): - return len(self._list) - - def isascending(self): - """Returns True if the collection is ascending order, False if not. - - This is part of the mandatory API for smartset.""" - if len(self) <= 1: - return True - return self._ascending is not None and self._ascending - - def isdescending(self): - """Returns True if the collection is descending order, False if not. - - This is part of the mandatory API for smartset.""" - if len(self) <= 1: - return True - return self._ascending is not None and not self._ascending - - def istopo(self): - """Is the collection is in topographical order or not. - - This is part of the mandatory API for smartset.""" - if len(self) <= 1: - return True - return self._istopo - - def first(self): - if self: - if self._ascending is None: - return self._list[0] - elif self._ascending: - return self._asclist[0] - else: - return self._asclist[-1] - return None - - def last(self): - if self: - if self._ascending is None: - return self._list[-1] - elif self._ascending: - return self._asclist[-1] - else: - return self._asclist[0] - return None - - def __repr__(self): - d = {None: '', False: '-', True: '+'}[self._ascending] - s = _formatsetrepr(self._datarepr) - if not s: - l = self._list - # if _list has been built from a set, it might have a different - # order from one python implementation to another. - # We fallback to the sorted version for a stable output. - if self._ascending is not None: - l = self._asclist - s = repr(l) - return '<%s%s %s>' % (type(self).__name__, d, s) - -class filteredset(abstractsmartset): - """Duck type for baseset class which iterates lazily over the revisions in - the subset and contains a function which tests for membership in the - revset - """ - def __init__(self, subset, condition=lambda x: True, condrepr=None): - """ - condition: a function that decide whether a revision in the subset - belongs to the revset or not. - condrepr: a tuple of (format, obj, ...), a function or an object that - provides a printable representation of the given condition. - """ - self._subset = subset - self._condition = condition - self._condrepr = condrepr - - def __contains__(self, x): - return x in self._subset and self._condition(x) - - def __iter__(self): - return self._iterfilter(self._subset) - - def _iterfilter(self, it): - cond = self._condition - for x in it: - if cond(x): - yield x - - @property - def fastasc(self): - it = self._subset.fastasc - if it is None: - return None - return lambda: self._iterfilter(it()) - - @property - def fastdesc(self): - it = self._subset.fastdesc - if it is None: - return None - return lambda: self._iterfilter(it()) - - def __nonzero__(self): - fast = None - candidates = [self.fastasc if self.isascending() else None, - self.fastdesc if self.isdescending() else None, - self.fastasc, - self.fastdesc] - for candidate in candidates: - if candidate is not None: - fast = candidate - break - - if fast is not None: - it = fast() - else: - it = self - - for r in it: - return True - return False - - def __len__(self): - # Basic implementation to be changed in future patches. - # until this gets improved, we use generator expression - # here, since list comprehensions are free to call __len__ again - # causing infinite recursion - l = baseset(r for r in self) - return len(l) - - def sort(self, reverse=False): - self._subset.sort(reverse=reverse) - - def reverse(self): - self._subset.reverse() - - def isascending(self): - return self._subset.isascending() - - def isdescending(self): - return self._subset.isdescending() - - def istopo(self): - return self._subset.istopo() - - def first(self): - for x in self: - return x - return None - - def last(self): - it = None - if self.isascending(): - it = self.fastdesc - elif self.isdescending(): - it = self.fastasc - if it is not None: - for x in it(): - return x - return None #empty case - else: - x = None - for x in self: - pass - return x - - def __repr__(self): - xs = [repr(self._subset)] - s = _formatsetrepr(self._condrepr) - if s: - xs.append(s) - return '<%s %s>' % (type(self).__name__, ', '.join(xs)) - -def _iterordered(ascending, iter1, iter2): - """produce an ordered iteration from two iterators with the same order - - The ascending is used to indicated the iteration direction. - """ - choice = max - if ascending: - choice = min - - val1 = None - val2 = None - try: - # Consume both iterators in an ordered way until one is empty - while True: - if val1 is None: - val1 = next(iter1) - if val2 is None: - val2 = next(iter2) - n = choice(val1, val2) - yield n - if val1 == n: - val1 = None - if val2 == n: - val2 = None - except StopIteration: - # Flush any remaining values and consume the other one - it = iter2 - if val1 is not None: - yield val1 - it = iter1 - elif val2 is not None: - # might have been equality and both are empty - yield val2 - for val in it: - yield val - -class addset(abstractsmartset): - """Represent the addition of two sets - - Wrapper structure for lazily adding two structures without losing much - performance on the __contains__ method - - If the ascending attribute is set, that means the two structures are - ordered in either an ascending or descending way. Therefore, we can add - them maintaining the order by iterating over both at the same time - - >>> xs = baseset([0, 3, 2]) - >>> ys = baseset([5, 2, 4]) - - >>> rs = addset(xs, ys) - >>> bool(rs), 0 in rs, 1 in rs, 5 in rs, rs.first(), rs.last() - (True, True, False, True, 0, 4) - >>> rs = addset(xs, baseset([])) - >>> bool(rs), 0 in rs, 1 in rs, rs.first(), rs.last() - (True, True, False, 0, 2) - >>> rs = addset(baseset([]), baseset([])) - >>> bool(rs), 0 in rs, rs.first(), rs.last() - (False, False, None, None) - - iterate unsorted: - >>> rs = addset(xs, ys) - >>> # (use generator because pypy could call len()) - >>> list(x for x in rs) # without _genlist - [0, 3, 2, 5, 4] - >>> assert not rs._genlist - >>> len(rs) - 5 - >>> [x for x in rs] # with _genlist - [0, 3, 2, 5, 4] - >>> assert rs._genlist - - iterate ascending: - >>> rs = addset(xs, ys, ascending=True) - >>> # (use generator because pypy could call len()) - >>> list(x for x in rs), list(x for x in rs.fastasc()) # without _asclist - ([0, 2, 3, 4, 5], [0, 2, 3, 4, 5]) - >>> assert not rs._asclist - >>> len(rs) - 5 - >>> [x for x in rs], [x for x in rs.fastasc()] - ([0, 2, 3, 4, 5], [0, 2, 3, 4, 5]) - >>> assert rs._asclist - - iterate descending: - >>> rs = addset(xs, ys, ascending=False) - >>> # (use generator because pypy could call len()) - >>> list(x for x in rs), list(x for x in rs.fastdesc()) # without _asclist - ([5, 4, 3, 2, 0], [5, 4, 3, 2, 0]) - >>> assert not rs._asclist - >>> len(rs) - 5 - >>> [x for x in rs], [x for x in rs.fastdesc()] - ([5, 4, 3, 2, 0], [5, 4, 3, 2, 0]) - >>> assert rs._asclist - - iterate ascending without fastasc: - >>> rs = addset(xs, generatorset(ys), ascending=True) - >>> assert rs.fastasc is None - >>> [x for x in rs] - [0, 2, 3, 4, 5] - - iterate descending without fastdesc: - >>> rs = addset(generatorset(xs), ys, ascending=False) - >>> assert rs.fastdesc is None - >>> [x for x in rs] - [5, 4, 3, 2, 0] - """ - def __init__(self, revs1, revs2, ascending=None): - self._r1 = revs1 - self._r2 = revs2 - self._iter = None - self._ascending = ascending - self._genlist = None - self._asclist = None - - def __len__(self): - return len(self._list) - - def __nonzero__(self): - return bool(self._r1) or bool(self._r2) - - @util.propertycache - def _list(self): - if not self._genlist: - self._genlist = baseset(iter(self)) - return self._genlist - - def __iter__(self): - """Iterate over both collections without repeating elements - - If the ascending attribute is not set, iterate over the first one and - then over the second one checking for membership on the first one so we - dont yield any duplicates. - - If the ascending attribute is set, iterate over both collections at the - same time, yielding only one value at a time in the given order. - """ - if self._ascending is None: - if self._genlist: - return iter(self._genlist) - def arbitraryordergen(): - for r in self._r1: - yield r - inr1 = self._r1.__contains__ - for r in self._r2: - if not inr1(r): - yield r - return arbitraryordergen() - # try to use our own fast iterator if it exists - self._trysetasclist() - if self._ascending: - attr = 'fastasc' - else: - attr = 'fastdesc' - it = getattr(self, attr) - if it is not None: - return it() - # maybe half of the component supports fast - # get iterator for _r1 - iter1 = getattr(self._r1, attr) - if iter1 is None: - # let's avoid side effect (not sure it matters) - iter1 = iter(sorted(self._r1, reverse=not self._ascending)) - else: - iter1 = iter1() - # get iterator for _r2 - iter2 = getattr(self._r2, attr) - if iter2 is None: - # let's avoid side effect (not sure it matters) - iter2 = iter(sorted(self._r2, reverse=not self._ascending)) - else: - iter2 = iter2() - return _iterordered(self._ascending, iter1, iter2) - - def _trysetasclist(self): - """populate the _asclist attribute if possible and necessary""" - if self._genlist is not None and self._asclist is None: - self._asclist = sorted(self._genlist) - - @property - def fastasc(self): - self._trysetasclist() - if self._asclist is not None: - return self._asclist.__iter__ - iter1 = self._r1.fastasc - iter2 = self._r2.fastasc - if None in (iter1, iter2): - return None - return lambda: _iterordered(True, iter1(), iter2()) - - @property - def fastdesc(self): - self._trysetasclist() - if self._asclist is not None: - return self._asclist.__reversed__ - iter1 = self._r1.fastdesc - iter2 = self._r2.fastdesc - if None in (iter1, iter2): - return None - return lambda: _iterordered(False, iter1(), iter2()) - - def __contains__(self, x): - return x in self._r1 or x in self._r2 - - def sort(self, reverse=False): - """Sort the added set - - For this we use the cached list with all the generated values and if we - know they are ascending or descending we can sort them in a smart way. - """ - self._ascending = not reverse - - def isascending(self): - return self._ascending is not None and self._ascending - - def isdescending(self): - return self._ascending is not None and not self._ascending - - def istopo(self): - # not worth the trouble asserting if the two sets combined are still - # in topographical order. Use the sort() predicate to explicitly sort - # again instead. - return False - - def reverse(self): - if self._ascending is None: - self._list.reverse() - else: - self._ascending = not self._ascending - - def first(self): - for x in self: - return x - return None - - def last(self): - self.reverse() - val = self.first() - self.reverse() - return val - - def __repr__(self): - d = {None: '', False: '-', True: '+'}[self._ascending] - return '<%s%s %r, %r>' % (type(self).__name__, d, self._r1, self._r2) - -class generatorset(abstractsmartset): - """Wrap a generator for lazy iteration - - Wrapper structure for generators that provides lazy membership and can - be iterated more than once. - When asked for membership it generates values until either it finds the - requested one or has gone through all the elements in the generator - """ - def __init__(self, gen, iterasc=None): - """ - gen: a generator producing the values for the generatorset. - """ - self._gen = gen - self._asclist = None - self._cache = {} - self._genlist = [] - self._finished = False - self._ascending = True - if iterasc is not None: - if iterasc: - self.fastasc = self._iterator - self.__contains__ = self._asccontains - else: - self.fastdesc = self._iterator - self.__contains__ = self._desccontains - - def __nonzero__(self): - # Do not use 'for r in self' because it will enforce the iteration - # order (default ascending), possibly unrolling a whole descending - # iterator. - if self._genlist: - return True - for r in self._consumegen(): - return True - return False - - def __contains__(self, x): - if x in self._cache: - return self._cache[x] - - # Use new values only, as existing values would be cached. - for l in self._consumegen(): - if l == x: - return True - - self._cache[x] = False - return False - - def _asccontains(self, x): - """version of contains optimised for ascending generator""" - if x in self._cache: - return self._cache[x] - - # Use new values only, as existing values would be cached. - for l in self._consumegen(): - if l == x: - return True - if l > x: - break - - self._cache[x] = False - return False - - def _desccontains(self, x): - """version of contains optimised for descending generator""" - if x in self._cache: - return self._cache[x] - - # Use new values only, as existing values would be cached. - for l in self._consumegen(): - if l == x: - return True - if l < x: - break - - self._cache[x] = False - return False - - def __iter__(self): - if self._ascending: - it = self.fastasc - else: - it = self.fastdesc - if it is not None: - return it() - # we need to consume the iterator - for x in self._consumegen(): - pass - # recall the same code - return iter(self) - - def _iterator(self): - if self._finished: - return iter(self._genlist) - - # We have to use this complex iteration strategy to allow multiple - # iterations at the same time. We need to be able to catch revision - # removed from _consumegen and added to genlist in another instance. - # - # Getting rid of it would provide an about 15% speed up on this - # iteration. - genlist = self._genlist - nextrev = self._consumegen().next - _len = len # cache global lookup - def gen(): - i = 0 - while True: - if i < _len(genlist): - yield genlist[i] - else: - yield nextrev() - i += 1 - return gen() - - def _consumegen(self): - cache = self._cache - genlist = self._genlist.append - for item in self._gen: - cache[item] = True - genlist(item) - yield item - if not self._finished: - self._finished = True - asc = self._genlist[:] - asc.sort() - self._asclist = asc - self.fastasc = asc.__iter__ - self.fastdesc = asc.__reversed__ - - def __len__(self): - for x in self._consumegen(): - pass - return len(self._genlist) - - def sort(self, reverse=False): - self._ascending = not reverse - - def reverse(self): - self._ascending = not self._ascending - - def isascending(self): - return self._ascending - - def isdescending(self): - return not self._ascending - - def istopo(self): - # not worth the trouble asserting if the two sets combined are still - # in topographical order. Use the sort() predicate to explicitly sort - # again instead. - return False - - def first(self): - if self._ascending: - it = self.fastasc - else: - it = self.fastdesc - if it is None: - # we need to consume all and try again - for x in self._consumegen(): - pass - return self.first() - return next(it(), None) - - def last(self): - if self._ascending: - it = self.fastdesc - else: - it = self.fastasc - if it is None: - # we need to consume all and try again - for x in self._consumegen(): - pass - return self.first() - return next(it(), None) - - def __repr__(self): - d = {False: '-', True: '+'}[self._ascending] - return '<%s%s>' % (type(self).__name__, d) - -class spanset(abstractsmartset): - """Duck type for baseset class which represents a range of revisions and - can work lazily and without having all the range in memory - - Note that spanset(x, y) behave almost like xrange(x, y) except for two - notable points: - - when x < y it will be automatically descending, - - revision filtered with this repoview will be skipped. - - """ - def __init__(self, repo, start=0, end=None): - """ - start: first revision included the set - (default to 0) - end: first revision excluded (last+1) - (default to len(repo) - - Spanset will be descending if `end` < `start`. - """ - if end is None: - end = len(repo) - self._ascending = start <= end - if not self._ascending: - start, end = end + 1, start +1 - self._start = start - self._end = end - self._hiddenrevs = repo.changelog.filteredrevs - - def sort(self, reverse=False): - self._ascending = not reverse - - def reverse(self): - self._ascending = not self._ascending - - def istopo(self): - # not worth the trouble asserting if the two sets combined are still - # in topographical order. Use the sort() predicate to explicitly sort - # again instead. - return False - - def _iterfilter(self, iterrange): - s = self._hiddenrevs - for r in iterrange: - if r not in s: - yield r - - def __iter__(self): - if self._ascending: - return self.fastasc() - else: - return self.fastdesc() - - def fastasc(self): - iterrange = xrange(self._start, self._end) - if self._hiddenrevs: - return self._iterfilter(iterrange) - return iter(iterrange) - - def fastdesc(self): - iterrange = xrange(self._end - 1, self._start - 1, -1) - if self._hiddenrevs: - return self._iterfilter(iterrange) - return iter(iterrange) - - def __contains__(self, rev): - hidden = self._hiddenrevs - return ((self._start <= rev < self._end) - and not (hidden and rev in hidden)) - - def __nonzero__(self): - for r in self: - return True - return False - - def __len__(self): - if not self._hiddenrevs: - return abs(self._end - self._start) - else: - count = 0 - start = self._start - end = self._end - for rev in self._hiddenrevs: - if (end < rev <= start) or (start <= rev < end): - count += 1 - return abs(self._end - self._start) - count - - def isascending(self): - return self._ascending - - def isdescending(self): - return not self._ascending - - def first(self): - if self._ascending: - it = self.fastasc - else: - it = self.fastdesc - for x in it(): - return x - return None - - def last(self): - if self._ascending: - it = self.fastdesc - else: - it = self.fastasc - for x in it(): - return x - return None - - def __repr__(self): - d = {False: '-', True: '+'}[self._ascending] - return '<%s%s %d:%d>' % (type(self).__name__, d, - self._start, self._end - 1) - -class fullreposet(spanset): - """a set containing all revisions in the repo - - This class exists to host special optimization and magic to handle virtual - revisions such as "null". - """ - - def __init__(self, repo): - super(fullreposet, self).__init__(repo) - - def __and__(self, other): - """As self contains the whole repo, all of the other set should also be - in self. Therefore `self & other = other`. - - This boldly assumes the other contains valid revs only. - """ - # other not a smartset, make is so - if not util.safehasattr(other, 'isascending'): - # filter out hidden revision - # (this boldly assumes all smartset are pure) - # - # `other` was used with "&", let's assume this is a set like - # object. - other = baseset(other - self._hiddenrevs) - - other.sort(reverse=self.isdescending()) - return other - -def prettyformatset(revs): - lines = [] - rs = repr(revs) - p = 0 - while p < len(rs): - q = rs.find('<', p + 1) - if q < 0: - q = len(rs) - l = rs.count('<', 0, p) - rs.count('>', 0, p) - assert l >= 0 - lines.append((l, rs[p:q].rstrip())) - p = q - return '\n'.join(' ' * l + s for l, s in lines) - def loadpredicate(ui, extname, registrarobj): """Load revset predicates from specified registrarobj """ diff -r f3807a135e43 -r 455677a7667f mercurial/smartset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/smartset.py Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,973 @@ +# smartset.py - data structure for revision set +# +# Copyright 2010 Matt Mackall +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +from . import ( + util, +) + +def _formatsetrepr(r): + """Format an optional printable representation of a set + + ======== ================================= + type(r) example + ======== ================================= + tuple ('', other) + str '' + callable lambda: '' % sorted(b) + object other + ======== ================================= + """ + if r is None: + return '' + elif isinstance(r, tuple): + return r[0] % r[1:] + elif isinstance(r, str): + return r + elif callable(r): + return r() + else: + return repr(r) + +class abstractsmartset(object): + + def __nonzero__(self): + """True if the smartset is not empty""" + raise NotImplementedError() + + def __contains__(self, rev): + """provide fast membership testing""" + raise NotImplementedError() + + def __iter__(self): + """iterate the set in the order it is supposed to be iterated""" + raise NotImplementedError() + + # Attributes containing a function to perform a fast iteration in a given + # direction. A smartset can have none, one, or both defined. + # + # Default value is None instead of a function returning None to avoid + # initializing an iterator just for testing if a fast method exists. + fastasc = None + fastdesc = None + + def isascending(self): + """True if the set will iterate in ascending order""" + raise NotImplementedError() + + def isdescending(self): + """True if the set will iterate in descending order""" + raise NotImplementedError() + + def istopo(self): + """True if the set will iterate in topographical order""" + raise NotImplementedError() + + def min(self): + """return the minimum element in the set""" + if self.fastasc is None: + v = min(self) + else: + for v in self.fastasc(): + break + else: + raise ValueError('arg is an empty sequence') + self.min = lambda: v + return v + + def max(self): + """return the maximum element in the set""" + if self.fastdesc is None: + return max(self) + else: + for v in self.fastdesc(): + break + else: + raise ValueError('arg is an empty sequence') + self.max = lambda: v + return v + + def first(self): + """return the first element in the set (user iteration perspective) + + Return None if the set is empty""" + raise NotImplementedError() + + def last(self): + """return the last element in the set (user iteration perspective) + + Return None if the set is empty""" + raise NotImplementedError() + + def __len__(self): + """return the length of the smartsets + + This can be expensive on smartset that could be lazy otherwise.""" + raise NotImplementedError() + + def reverse(self): + """reverse the expected iteration order""" + raise NotImplementedError() + + def sort(self, reverse=True): + """get the set to iterate in an ascending or descending order""" + raise NotImplementedError() + + def __and__(self, other): + """Returns a new object with the intersection of the two collections. + + This is part of the mandatory API for smartset.""" + if isinstance(other, fullreposet): + return self + return self.filter(other.__contains__, condrepr=other, cache=False) + + def __add__(self, other): + """Returns a new object with the union of the two collections. + + This is part of the mandatory API for smartset.""" + return addset(self, other) + + def __sub__(self, other): + """Returns a new object with the substraction of the two collections. + + This is part of the mandatory API for smartset.""" + c = other.__contains__ + return self.filter(lambda r: not c(r), condrepr=('', other), + cache=False) + + def filter(self, condition, condrepr=None, cache=True): + """Returns this smartset filtered by condition as a new smartset. + + `condition` is a callable which takes a revision number and returns a + boolean. Optional `condrepr` provides a printable representation of + the given `condition`. + + This is part of the mandatory API for smartset.""" + # builtin cannot be cached. but do not needs to + if cache and util.safehasattr(condition, 'func_code'): + condition = util.cachefunc(condition) + return filteredset(self, condition, condrepr) + +class baseset(abstractsmartset): + """Basic data structure that represents a revset and contains the basic + operation that it should be able to perform. + + Every method in this class should be implemented by any smartset class. + """ + def __init__(self, data=(), datarepr=None, istopo=False): + """ + datarepr: a tuple of (format, obj, ...), a function or an object that + provides a printable representation of the given data. + """ + self._ascending = None + self._istopo = istopo + if not isinstance(data, list): + if isinstance(data, set): + self._set = data + # set has no order we pick one for stability purpose + self._ascending = True + data = list(data) + self._list = data + self._datarepr = datarepr + + @util.propertycache + def _set(self): + return set(self._list) + + @util.propertycache + def _asclist(self): + asclist = self._list[:] + asclist.sort() + return asclist + + def __iter__(self): + if self._ascending is None: + return iter(self._list) + elif self._ascending: + return iter(self._asclist) + else: + return reversed(self._asclist) + + def fastasc(self): + return iter(self._asclist) + + def fastdesc(self): + return reversed(self._asclist) + + @util.propertycache + def __contains__(self): + return self._set.__contains__ + + def __nonzero__(self): + return bool(self._list) + + def sort(self, reverse=False): + self._ascending = not bool(reverse) + self._istopo = False + + def reverse(self): + if self._ascending is None: + self._list.reverse() + else: + self._ascending = not self._ascending + self._istopo = False + + def __len__(self): + return len(self._list) + + def isascending(self): + """Returns True if the collection is ascending order, False if not. + + This is part of the mandatory API for smartset.""" + if len(self) <= 1: + return True + return self._ascending is not None and self._ascending + + def isdescending(self): + """Returns True if the collection is descending order, False if not. + + This is part of the mandatory API for smartset.""" + if len(self) <= 1: + return True + return self._ascending is not None and not self._ascending + + def istopo(self): + """Is the collection is in topographical order or not. + + This is part of the mandatory API for smartset.""" + if len(self) <= 1: + return True + return self._istopo + + def first(self): + if self: + if self._ascending is None: + return self._list[0] + elif self._ascending: + return self._asclist[0] + else: + return self._asclist[-1] + return None + + def last(self): + if self: + if self._ascending is None: + return self._list[-1] + elif self._ascending: + return self._asclist[-1] + else: + return self._asclist[0] + return None + + def __repr__(self): + d = {None: '', False: '-', True: '+'}[self._ascending] + s = _formatsetrepr(self._datarepr) + if not s: + l = self._list + # if _list has been built from a set, it might have a different + # order from one python implementation to another. + # We fallback to the sorted version for a stable output. + if self._ascending is not None: + l = self._asclist + s = repr(l) + return '<%s%s %s>' % (type(self).__name__, d, s) + +class filteredset(abstractsmartset): + """Duck type for baseset class which iterates lazily over the revisions in + the subset and contains a function which tests for membership in the + revset + """ + def __init__(self, subset, condition=lambda x: True, condrepr=None): + """ + condition: a function that decide whether a revision in the subset + belongs to the revset or not. + condrepr: a tuple of (format, obj, ...), a function or an object that + provides a printable representation of the given condition. + """ + self._subset = subset + self._condition = condition + self._condrepr = condrepr + + def __contains__(self, x): + return x in self._subset and self._condition(x) + + def __iter__(self): + return self._iterfilter(self._subset) + + def _iterfilter(self, it): + cond = self._condition + for x in it: + if cond(x): + yield x + + @property + def fastasc(self): + it = self._subset.fastasc + if it is None: + return None + return lambda: self._iterfilter(it()) + + @property + def fastdesc(self): + it = self._subset.fastdesc + if it is None: + return None + return lambda: self._iterfilter(it()) + + def __nonzero__(self): + fast = None + candidates = [self.fastasc if self.isascending() else None, + self.fastdesc if self.isdescending() else None, + self.fastasc, + self.fastdesc] + for candidate in candidates: + if candidate is not None: + fast = candidate + break + + if fast is not None: + it = fast() + else: + it = self + + for r in it: + return True + return False + + def __len__(self): + # Basic implementation to be changed in future patches. + # until this gets improved, we use generator expression + # here, since list comprehensions are free to call __len__ again + # causing infinite recursion + l = baseset(r for r in self) + return len(l) + + def sort(self, reverse=False): + self._subset.sort(reverse=reverse) + + def reverse(self): + self._subset.reverse() + + def isascending(self): + return self._subset.isascending() + + def isdescending(self): + return self._subset.isdescending() + + def istopo(self): + return self._subset.istopo() + + def first(self): + for x in self: + return x + return None + + def last(self): + it = None + if self.isascending(): + it = self.fastdesc + elif self.isdescending(): + it = self.fastasc + if it is not None: + for x in it(): + return x + return None #empty case + else: + x = None + for x in self: + pass + return x + + def __repr__(self): + xs = [repr(self._subset)] + s = _formatsetrepr(self._condrepr) + if s: + xs.append(s) + return '<%s %s>' % (type(self).__name__, ', '.join(xs)) + +def _iterordered(ascending, iter1, iter2): + """produce an ordered iteration from two iterators with the same order + + The ascending is used to indicated the iteration direction. + """ + choice = max + if ascending: + choice = min + + val1 = None + val2 = None + try: + # Consume both iterators in an ordered way until one is empty + while True: + if val1 is None: + val1 = next(iter1) + if val2 is None: + val2 = next(iter2) + n = choice(val1, val2) + yield n + if val1 == n: + val1 = None + if val2 == n: + val2 = None + except StopIteration: + # Flush any remaining values and consume the other one + it = iter2 + if val1 is not None: + yield val1 + it = iter1 + elif val2 is not None: + # might have been equality and both are empty + yield val2 + for val in it: + yield val + +class addset(abstractsmartset): + """Represent the addition of two sets + + Wrapper structure for lazily adding two structures without losing much + performance on the __contains__ method + + If the ascending attribute is set, that means the two structures are + ordered in either an ascending or descending way. Therefore, we can add + them maintaining the order by iterating over both at the same time + + >>> xs = baseset([0, 3, 2]) + >>> ys = baseset([5, 2, 4]) + + >>> rs = addset(xs, ys) + >>> bool(rs), 0 in rs, 1 in rs, 5 in rs, rs.first(), rs.last() + (True, True, False, True, 0, 4) + >>> rs = addset(xs, baseset([])) + >>> bool(rs), 0 in rs, 1 in rs, rs.first(), rs.last() + (True, True, False, 0, 2) + >>> rs = addset(baseset([]), baseset([])) + >>> bool(rs), 0 in rs, rs.first(), rs.last() + (False, False, None, None) + + iterate unsorted: + >>> rs = addset(xs, ys) + >>> # (use generator because pypy could call len()) + >>> list(x for x in rs) # without _genlist + [0, 3, 2, 5, 4] + >>> assert not rs._genlist + >>> len(rs) + 5 + >>> [x for x in rs] # with _genlist + [0, 3, 2, 5, 4] + >>> assert rs._genlist + + iterate ascending: + >>> rs = addset(xs, ys, ascending=True) + >>> # (use generator because pypy could call len()) + >>> list(x for x in rs), list(x for x in rs.fastasc()) # without _asclist + ([0, 2, 3, 4, 5], [0, 2, 3, 4, 5]) + >>> assert not rs._asclist + >>> len(rs) + 5 + >>> [x for x in rs], [x for x in rs.fastasc()] + ([0, 2, 3, 4, 5], [0, 2, 3, 4, 5]) + >>> assert rs._asclist + + iterate descending: + >>> rs = addset(xs, ys, ascending=False) + >>> # (use generator because pypy could call len()) + >>> list(x for x in rs), list(x for x in rs.fastdesc()) # without _asclist + ([5, 4, 3, 2, 0], [5, 4, 3, 2, 0]) + >>> assert not rs._asclist + >>> len(rs) + 5 + >>> [x for x in rs], [x for x in rs.fastdesc()] + ([5, 4, 3, 2, 0], [5, 4, 3, 2, 0]) + >>> assert rs._asclist + + iterate ascending without fastasc: + >>> rs = addset(xs, generatorset(ys), ascending=True) + >>> assert rs.fastasc is None + >>> [x for x in rs] + [0, 2, 3, 4, 5] + + iterate descending without fastdesc: + >>> rs = addset(generatorset(xs), ys, ascending=False) + >>> assert rs.fastdesc is None + >>> [x for x in rs] + [5, 4, 3, 2, 0] + """ + def __init__(self, revs1, revs2, ascending=None): + self._r1 = revs1 + self._r2 = revs2 + self._iter = None + self._ascending = ascending + self._genlist = None + self._asclist = None + + def __len__(self): + return len(self._list) + + def __nonzero__(self): + return bool(self._r1) or bool(self._r2) + + @util.propertycache + def _list(self): + if not self._genlist: + self._genlist = baseset(iter(self)) + return self._genlist + + def __iter__(self): + """Iterate over both collections without repeating elements + + If the ascending attribute is not set, iterate over the first one and + then over the second one checking for membership on the first one so we + dont yield any duplicates. + + If the ascending attribute is set, iterate over both collections at the + same time, yielding only one value at a time in the given order. + """ + if self._ascending is None: + if self._genlist: + return iter(self._genlist) + def arbitraryordergen(): + for r in self._r1: + yield r + inr1 = self._r1.__contains__ + for r in self._r2: + if not inr1(r): + yield r + return arbitraryordergen() + # try to use our own fast iterator if it exists + self._trysetasclist() + if self._ascending: + attr = 'fastasc' + else: + attr = 'fastdesc' + it = getattr(self, attr) + if it is not None: + return it() + # maybe half of the component supports fast + # get iterator for _r1 + iter1 = getattr(self._r1, attr) + if iter1 is None: + # let's avoid side effect (not sure it matters) + iter1 = iter(sorted(self._r1, reverse=not self._ascending)) + else: + iter1 = iter1() + # get iterator for _r2 + iter2 = getattr(self._r2, attr) + if iter2 is None: + # let's avoid side effect (not sure it matters) + iter2 = iter(sorted(self._r2, reverse=not self._ascending)) + else: + iter2 = iter2() + return _iterordered(self._ascending, iter1, iter2) + + def _trysetasclist(self): + """populate the _asclist attribute if possible and necessary""" + if self._genlist is not None and self._asclist is None: + self._asclist = sorted(self._genlist) + + @property + def fastasc(self): + self._trysetasclist() + if self._asclist is not None: + return self._asclist.__iter__ + iter1 = self._r1.fastasc + iter2 = self._r2.fastasc + if None in (iter1, iter2): + return None + return lambda: _iterordered(True, iter1(), iter2()) + + @property + def fastdesc(self): + self._trysetasclist() + if self._asclist is not None: + return self._asclist.__reversed__ + iter1 = self._r1.fastdesc + iter2 = self._r2.fastdesc + if None in (iter1, iter2): + return None + return lambda: _iterordered(False, iter1(), iter2()) + + def __contains__(self, x): + return x in self._r1 or x in self._r2 + + def sort(self, reverse=False): + """Sort the added set + + For this we use the cached list with all the generated values and if we + know they are ascending or descending we can sort them in a smart way. + """ + self._ascending = not reverse + + def isascending(self): + return self._ascending is not None and self._ascending + + def isdescending(self): + return self._ascending is not None and not self._ascending + + def istopo(self): + # not worth the trouble asserting if the two sets combined are still + # in topographical order. Use the sort() predicate to explicitly sort + # again instead. + return False + + def reverse(self): + if self._ascending is None: + self._list.reverse() + else: + self._ascending = not self._ascending + + def first(self): + for x in self: + return x + return None + + def last(self): + self.reverse() + val = self.first() + self.reverse() + return val + + def __repr__(self): + d = {None: '', False: '-', True: '+'}[self._ascending] + return '<%s%s %r, %r>' % (type(self).__name__, d, self._r1, self._r2) + +class generatorset(abstractsmartset): + """Wrap a generator for lazy iteration + + Wrapper structure for generators that provides lazy membership and can + be iterated more than once. + When asked for membership it generates values until either it finds the + requested one or has gone through all the elements in the generator + """ + def __init__(self, gen, iterasc=None): + """ + gen: a generator producing the values for the generatorset. + """ + self._gen = gen + self._asclist = None + self._cache = {} + self._genlist = [] + self._finished = False + self._ascending = True + if iterasc is not None: + if iterasc: + self.fastasc = self._iterator + self.__contains__ = self._asccontains + else: + self.fastdesc = self._iterator + self.__contains__ = self._desccontains + + def __nonzero__(self): + # Do not use 'for r in self' because it will enforce the iteration + # order (default ascending), possibly unrolling a whole descending + # iterator. + if self._genlist: + return True + for r in self._consumegen(): + return True + return False + + def __contains__(self, x): + if x in self._cache: + return self._cache[x] + + # Use new values only, as existing values would be cached. + for l in self._consumegen(): + if l == x: + return True + + self._cache[x] = False + return False + + def _asccontains(self, x): + """version of contains optimised for ascending generator""" + if x in self._cache: + return self._cache[x] + + # Use new values only, as existing values would be cached. + for l in self._consumegen(): + if l == x: + return True + if l > x: + break + + self._cache[x] = False + return False + + def _desccontains(self, x): + """version of contains optimised for descending generator""" + if x in self._cache: + return self._cache[x] + + # Use new values only, as existing values would be cached. + for l in self._consumegen(): + if l == x: + return True + if l < x: + break + + self._cache[x] = False + return False + + def __iter__(self): + if self._ascending: + it = self.fastasc + else: + it = self.fastdesc + if it is not None: + return it() + # we need to consume the iterator + for x in self._consumegen(): + pass + # recall the same code + return iter(self) + + def _iterator(self): + if self._finished: + return iter(self._genlist) + + # We have to use this complex iteration strategy to allow multiple + # iterations at the same time. We need to be able to catch revision + # removed from _consumegen and added to genlist in another instance. + # + # Getting rid of it would provide an about 15% speed up on this + # iteration. + genlist = self._genlist + nextrev = self._consumegen().next + _len = len # cache global lookup + def gen(): + i = 0 + while True: + if i < _len(genlist): + yield genlist[i] + else: + yield nextrev() + i += 1 + return gen() + + def _consumegen(self): + cache = self._cache + genlist = self._genlist.append + for item in self._gen: + cache[item] = True + genlist(item) + yield item + if not self._finished: + self._finished = True + asc = self._genlist[:] + asc.sort() + self._asclist = asc + self.fastasc = asc.__iter__ + self.fastdesc = asc.__reversed__ + + def __len__(self): + for x in self._consumegen(): + pass + return len(self._genlist) + + def sort(self, reverse=False): + self._ascending = not reverse + + def reverse(self): + self._ascending = not self._ascending + + def isascending(self): + return self._ascending + + def isdescending(self): + return not self._ascending + + def istopo(self): + # not worth the trouble asserting if the two sets combined are still + # in topographical order. Use the sort() predicate to explicitly sort + # again instead. + return False + + def first(self): + if self._ascending: + it = self.fastasc + else: + it = self.fastdesc + if it is None: + # we need to consume all and try again + for x in self._consumegen(): + pass + return self.first() + return next(it(), None) + + def last(self): + if self._ascending: + it = self.fastdesc + else: + it = self.fastasc + if it is None: + # we need to consume all and try again + for x in self._consumegen(): + pass + return self.first() + return next(it(), None) + + def __repr__(self): + d = {False: '-', True: '+'}[self._ascending] + return '<%s%s>' % (type(self).__name__, d) + +class spanset(abstractsmartset): + """Duck type for baseset class which represents a range of revisions and + can work lazily and without having all the range in memory + + Note that spanset(x, y) behave almost like xrange(x, y) except for two + notable points: + - when x < y it will be automatically descending, + - revision filtered with this repoview will be skipped. + + """ + def __init__(self, repo, start=0, end=None): + """ + start: first revision included the set + (default to 0) + end: first revision excluded (last+1) + (default to len(repo) + + Spanset will be descending if `end` < `start`. + """ + if end is None: + end = len(repo) + self._ascending = start <= end + if not self._ascending: + start, end = end + 1, start +1 + self._start = start + self._end = end + self._hiddenrevs = repo.changelog.filteredrevs + + def sort(self, reverse=False): + self._ascending = not reverse + + def reverse(self): + self._ascending = not self._ascending + + def istopo(self): + # not worth the trouble asserting if the two sets combined are still + # in topographical order. Use the sort() predicate to explicitly sort + # again instead. + return False + + def _iterfilter(self, iterrange): + s = self._hiddenrevs + for r in iterrange: + if r not in s: + yield r + + def __iter__(self): + if self._ascending: + return self.fastasc() + else: + return self.fastdesc() + + def fastasc(self): + iterrange = xrange(self._start, self._end) + if self._hiddenrevs: + return self._iterfilter(iterrange) + return iter(iterrange) + + def fastdesc(self): + iterrange = xrange(self._end - 1, self._start - 1, -1) + if self._hiddenrevs: + return self._iterfilter(iterrange) + return iter(iterrange) + + def __contains__(self, rev): + hidden = self._hiddenrevs + return ((self._start <= rev < self._end) + and not (hidden and rev in hidden)) + + def __nonzero__(self): + for r in self: + return True + return False + + def __len__(self): + if not self._hiddenrevs: + return abs(self._end - self._start) + else: + count = 0 + start = self._start + end = self._end + for rev in self._hiddenrevs: + if (end < rev <= start) or (start <= rev < end): + count += 1 + return abs(self._end - self._start) - count + + def isascending(self): + return self._ascending + + def isdescending(self): + return not self._ascending + + def first(self): + if self._ascending: + it = self.fastasc + else: + it = self.fastdesc + for x in it(): + return x + return None + + def last(self): + if self._ascending: + it = self.fastdesc + else: + it = self.fastasc + for x in it(): + return x + return None + + def __repr__(self): + d = {False: '-', True: '+'}[self._ascending] + return '<%s%s %d:%d>' % (type(self).__name__, d, + self._start, self._end - 1) + +class fullreposet(spanset): + """a set containing all revisions in the repo + + This class exists to host special optimization and magic to handle virtual + revisions such as "null". + """ + + def __init__(self, repo): + super(fullreposet, self).__init__(repo) + + def __and__(self, other): + """As self contains the whole repo, all of the other set should also be + in self. Therefore `self & other = other`. + + This boldly assumes the other contains valid revs only. + """ + # other not a smartset, make is so + if not util.safehasattr(other, 'isascending'): + # filter out hidden revision + # (this boldly assumes all smartset are pure) + # + # `other` was used with "&", let's assume this is a set like + # object. + other = baseset(other - self._hiddenrevs) + + other.sort(reverse=self.isdescending()) + return other + +def prettyformat(revs): + lines = [] + rs = repr(revs) + p = 0 + while p < len(rs): + q = rs.find('<', p + 1) + if q < 0: + q = len(rs) + l = rs.count('<', 0, p) - rs.count('>', 0, p) + assert l >= 0 + lines.append((l, rs[p:q].rstrip())) + p = q + return '\n'.join(' ' * l + s for l, s in lines) diff -r f3807a135e43 -r 455677a7667f mercurial/store.py --- a/mercurial/store.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/store.py Mon Feb 13 09:44:16 2017 -0800 @@ -101,7 +101,7 @@ e = '_' if pycompat.ispy3: xchr = lambda x: bytes([x]) - asciistr = bytes(xrange(127)) + asciistr = [bytes(a) for a in range(127)] else: xchr = chr asciistr = map(chr, xrange(127)) diff -r f3807a135e43 -r 455677a7667f mercurial/util.py --- a/mercurial/util.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/util.py Mon Feb 13 09:44:16 2017 -0800 @@ -63,9 +63,21 @@ urlreq = pycompat.urlreq xmlrpclib = pycompat.xmlrpclib +def isatty(fp): + try: + return fp.isatty() + except AttributeError: + return False + +# glibc determines buffering on first write to stdout - if we replace a TTY +# destined stdout with a pipe destined stdout (e.g. pager), we want line +# buffering +if isatty(stdout): + stdout = os.fdopen(stdout.fileno(), 'wb', 1) + if pycompat.osname == 'nt': from . import windows as platform - stdout = platform.winstdout(pycompat.stdout) + stdout = platform.winstdout(stdout) else: from . import posix as platform @@ -2750,12 +2762,6 @@ u.user = u.passwd = None return str(u) -def isatty(fp): - try: - return fp.isatty() - except AttributeError: - return False - timecount = unitcountfn( (1, 1e3, _('%.0f s')), (100, 1, _('%.1f s')), diff -r f3807a135e43 -r 455677a7667f mercurial/verify.py --- a/mercurial/verify.py Fri Feb 10 18:20:58 2017 +0100 +++ b/mercurial/verify.py Mon Feb 13 09:44:16 2017 -0800 @@ -18,6 +18,7 @@ from . import ( error, revlog, + scmutil, util, ) @@ -32,21 +33,13 @@ f = f.replace('//', '/') return f -def _validpath(repo, path): - """Returns False if a path should NOT be treated as part of a repo. - - For all in-core cases, this returns True, as we have no way for a - path to be mentioned in the history but not actually be - relevant. For narrow clones, this is important because many - filelogs will be missing, and changelog entries may mention - modified files that are outside the narrow scope. - """ - return True - class verifier(object): - def __init__(self, repo): + # The match argument is always None in hg core, but e.g. the narrowhg + # extension will pass in a matcher here. + def __init__(self, repo, match=None): self.repo = repo.unfiltered() self.ui = repo.ui + self.match = match or scmutil.matchall(repo) self.badrevs = set() self.errors = 0 self.warnings = 0 @@ -170,6 +163,7 @@ def _verifychangelog(self): ui = self.ui repo = self.repo + match = self.match cl = repo.changelog ui.status(_("checking changesets\n")) @@ -189,7 +183,7 @@ mflinkrevs.setdefault(changes[0], []).append(i) self.refersmf = True for f in changes[3]: - if _validpath(repo, f): + if match(f): filelinkrevs.setdefault(_normpath(f), []).append(i) except Exception as inst: self.refersmf = True @@ -201,6 +195,7 @@ progress=None): repo = self.repo ui = self.ui + match = self.match mfl = self.repo.manifestlog mf = mfl._revlog.dirlog(dir) @@ -243,12 +238,14 @@ elif f == "/dev/null": # ignore this in very old repos continue fullpath = dir + _normpath(f) - if not _validpath(repo, fullpath): - continue if fl == 't': + if not match.visitdir(fullpath): + continue subdirnodes.setdefault(fullpath + '/', {}).setdefault( fn, []).append(lr) else: + if not match(fullpath): + continue filenodes.setdefault(fullpath, {}).setdefault(fn, lr) except Exception as inst: self.exc(lr, _("reading delta %s") % short(n), inst, label) diff -r f3807a135e43 -r 455677a7667f tests/run-tests.py --- a/tests/run-tests.py Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/run-tests.py Mon Feb 13 09:44:16 2017 -0800 @@ -114,15 +114,20 @@ def checkportisavailable(port): """return true if a port seems free to bind on localhost""" - try: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.bind(('localhost', port)) - s.close() - return True - except socket.error as exc: - if not exc.errno == errno.EADDRINUSE: - raise - return False + families = [getattr(socket, i, None) + for i in ('AF_INET', 'AF_INET6') + if getattr(socket, i, None) is not None] + for family in families: + try: + s = socket.socket(family, socket.SOCK_STREAM) + s.bind(('localhost', port)) + s.close() + return True + except socket.error as exc: + if exc.errno not in (errno.EADDRINUSE, errno.EADDRNOTAVAIL, + errno.EPROTONOSUPPORT): + raise + return False closefds = os.name == 'posix' def Popen4(cmd, wd, timeout, env=None): diff -r f3807a135e43 -r 455677a7667f tests/test-check-help.t --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-check-help.t Mon Feb 13 09:44:16 2017 -0800 @@ -0,0 +1,25 @@ +#require test-repo + + $ . "$TESTDIR/helpers-testrepo.sh" + + $ cat <<'EOF' > scanhelptopics.py + > from __future__ import absolute_import, print_function + > import re + > import sys + > topics = set() + > topicre = re.compile(r':hg:`help ([a-z0-9\-.]+)`') + > for fname in sys.argv: + > with open(fname) as f: + > topics.update(m.group(1) for m in topicre.finditer(f.read())) + > for s in sorted(topics): + > print(s) + > EOF + + $ cd "$TESTDIR"/.. + +Check if ":hg:`help TOPIC`" is valid: +(use "xargs -n1 -t" to see which help commands are executed) + + $ hg files 'glob:{hgext,mercurial}/**/*.py' \ + > | xargs python "$TESTTMP/scanhelptopics.py" \ + > | xargs -n1 hg help > /dev/null diff -r f3807a135e43 -r 455677a7667f tests/test-check-py3-compat.t --- a/tests/test-check-py3-compat.t Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/test-check-py3-compat.t Mon Feb 13 09:44:16 2017 -0800 @@ -7,7 +7,6 @@ contrib/python-zstandard/setup.py not using absolute_import contrib/python-zstandard/setup_zstd.py not using absolute_import contrib/python-zstandard/tests/common.py not using absolute_import - contrib/python-zstandard/tests/test_cffi.py not using absolute_import contrib/python-zstandard/tests/test_compressor.py not using absolute_import contrib/python-zstandard/tests/test_data_structures.py not using absolute_import contrib/python-zstandard/tests/test_decompressor.py not using absolute_import diff -r f3807a135e43 -r 455677a7667f tests/test-check-pyflakes.t --- a/tests/test-check-pyflakes.t Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/test-check-pyflakes.t Mon Feb 13 09:44:16 2017 -0800 @@ -7,9 +7,8 @@ (skipping binary file random-seed) $ hg locate 'set:**.py or grep("^#!.*python")' -X hgext/fsmonitor/pywatchman \ - > -X mercurial/pycompat.py \ + > -X mercurial/pycompat.py -X contrib/python-zstandard \ > 2>/dev/null \ > | xargs pyflakes 2>/dev/null | "$TESTDIR/filterpyflakes.py" - contrib/python-zstandard/tests/test_data_structures.py:107: local variable 'size' is assigned to but never used tests/filterpyflakes.py:39: undefined name 'undefinedname' diff -r f3807a135e43 -r 455677a7667f tests/test-doctest.py --- a/tests/test-doctest.py Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/test-doctest.py Mon Feb 13 09:44:16 2017 -0800 @@ -29,6 +29,7 @@ testmod('mercurial.pathutil') testmod('mercurial.parser') testmod('mercurial.revset') +testmod('mercurial.smartset') testmod('mercurial.store') testmod('mercurial.subrepo') testmod('mercurial.templatefilters') diff -r f3807a135e43 -r 455677a7667f tests/test-help.t --- a/tests/test-help.t Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/test-help.t Mon Feb 13 09:44:16 2017 -0800 @@ -1547,11 +1547,11 @@ "default:pushurl" should be used instead. $ hg help glossary.mcguffin - abort: help section not found + abort: help section not found: glossary.mcguffin [255] $ hg help glossary.mc.guffin - abort: help section not found + abort: help section not found: glossary.mc.guffin [255] $ hg help template.files diff -r f3807a135e43 -r 455677a7667f tests/test-mq.t --- a/tests/test-mq.t Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/test-mq.t Mon Feb 13 09:44:16 2017 -0800 @@ -25,7 +25,7 @@ Known patches are represented as patch files in the .hg/patches directory. Applied patches are both patch files and changesets. - Common tasks (use 'hg help command' for more details): + Common tasks (use 'hg help COMMAND' for more details): create new patch qnew import existing patch qimport diff -r f3807a135e43 -r 455677a7667f tests/test-revset.t --- a/tests/test-revset.t Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/test-revset.t Mon Feb 13 09:44:16 2017 -0800 @@ -40,6 +40,7 @@ > cmdutil, > node as nodemod, > revset, + > smartset, > ) > cmdtable = {} > command = cmdutil.command(cmdtable) @@ -59,7 +60,7 @@ > func = revset.match(ui, expr, repo) > revs = func(repo) > if ui.verbose: - > ui.note("* set:\n", revset.prettyformatset(revs), "\n") + > ui.note("* set:\n", smartset.prettyformat(revs), "\n") > for c in revs: > ui.write("%s\n" % c) > EOF diff -r f3807a135e43 -r 455677a7667f tests/test-update-branches.t --- a/tests/test-update-branches.t Fri Feb 10 18:20:58 2017 +0100 +++ b/tests/test-update-branches.t Mon Feb 13 09:44:16 2017 -0800 @@ -177,6 +177,28 @@ $ cd .. +Test updating to null revision + + $ hg init null-repo + $ cd null-repo + $ echo a > a + $ hg add a + $ hg ci -m a + $ hg up -qC 0 + $ echo b > b + $ hg add b + $ hg up null + 0 files updated, 0 files merged, 1 files removed, 0 files unresolved + $ hg st + A b + $ hg up -q 0 + $ hg st + A b + $ hg up -qC null + $ hg st + ? b + $ cd .. + Test updating with closed head ---------------------------------------------------------------------