Mercurial > hg
diff contrib/python-zstandard/c-ext/compressiondict.c @ 37495:b1fb341d8a61
zstandard: vendor python-zstandard 0.9.0
This was just released. It features a number of goodies. More info at
https://gregoryszorc.com/blog/2018/04/09/release-of-python-zstandard-0.9/.
The clang-format ignore list was updated to reflect the new source
of files.
The project contains a vendored copy of zstandard 1.3.4. The old
version was 1.1.3. One of the changes between those versions is that
zstandard is now dual licensed BSD + GPLv2 and the patent rights grant
has been removed. Good riddance.
The API should be backwards compatible. So no changes in core
should be needed. However, there were a number of changes in the
library that we'll want to adapt to. Those will be addressed in
subsequent commits.
Differential Revision: https://phab.mercurial-scm.org/D3198
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Mon, 09 Apr 2018 10:13:29 -0700 |
parents | e0dc40530c5a |
children | 675775c33ab6 |
line wrap: on
line diff
--- a/contrib/python-zstandard/c-ext/compressiondict.c Sun Apr 08 01:08:43 2018 +0200 +++ b/contrib/python-zstandard/c-ext/compressiondict.c Mon Apr 09 10:13:29 2018 -0700 @@ -14,125 +14,11 @@ static char* kwlist[] = { "dict_size", "samples", - "selectivity", - "level", - "notifications", - "dict_id", - NULL - }; - size_t capacity; - PyObject* samples; - Py_ssize_t samplesLen; - unsigned selectivity = 0; - int level = 0; - unsigned notifications = 0; - unsigned dictID = 0; - ZDICT_params_t zparams; - Py_ssize_t sampleIndex; - Py_ssize_t sampleSize; - PyObject* sampleItem; - size_t zresult; - void* sampleBuffer = NULL; - void* sampleOffset; - size_t samplesSize = 0; - size_t* sampleSizes = NULL; - void* dict = NULL; - ZstdCompressionDict* result = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary", - kwlist, - &capacity, - &PyList_Type, &samples, - &selectivity, &level, &notifications, &dictID)) { - return NULL; - } - - memset(&zparams, 0, sizeof(zparams)); - - zparams.selectivityLevel = selectivity; - zparams.compressionLevel = level; - zparams.notificationLevel = notifications; - zparams.dictID = dictID; - - /* Figure out the size of the raw samples */ - samplesLen = PyList_Size(samples); - for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { - sampleItem = PyList_GetItem(samples, sampleIndex); - if (!PyBytes_Check(sampleItem)) { - PyErr_SetString(PyExc_ValueError, "samples must be bytes"); - return NULL; - } - samplesSize += PyBytes_GET_SIZE(sampleItem); - } - - /* Now that we know the total size of the raw simples, we can allocate - a buffer for the raw data */ - sampleBuffer = PyMem_Malloc(samplesSize); - if (!sampleBuffer) { - PyErr_NoMemory(); - goto finally; - } - sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t)); - if (!sampleSizes) { - PyErr_NoMemory(); - goto finally; - } - - sampleOffset = sampleBuffer; - /* Now iterate again and assemble the samples in the buffer */ - for (sampleIndex = 0; sampleIndex < 
samplesLen; sampleIndex++) { - sampleItem = PyList_GetItem(samples, sampleIndex); - sampleSize = PyBytes_GET_SIZE(sampleItem); - sampleSizes[sampleIndex] = sampleSize; - memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize); - sampleOffset = (char*)sampleOffset + sampleSize; - } - - dict = PyMem_Malloc(capacity); - if (!dict) { - PyErr_NoMemory(); - goto finally; - } - - /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */ - Py_BEGIN_ALLOW_THREADS - zresult = ZDICT_trainFromBuffer_advanced(dict, capacity, - sampleBuffer, sampleSizes, (unsigned int)samplesLen, - zparams); - Py_END_ALLOW_THREADS - if (ZDICT_isError(zresult)) { - PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult)); - PyMem_Free(dict); - goto finally; - } - - result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType); - if (!result) { - goto finally; - } - - result->dictData = dict; - result->dictSize = zresult; - result->d = 0; - result->k = 0; - -finally: - PyMem_Free(sampleBuffer); - PyMem_Free(sampleSizes); - - return result; -} - -ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { - static char* kwlist[] = { - "dict_size", - "samples", "k", "d", "notifications", "dict_id", "level", - "optimize", "steps", "threads", NULL @@ -145,10 +31,9 @@ unsigned notifications = 0; unsigned dictID = 0; int level = 0; - PyObject* optimize = NULL; unsigned steps = 0; int threads = 0; - COVER_params_t params; + ZDICT_cover_params_t params; Py_ssize_t samplesLen; Py_ssize_t i; size_t samplesSize = 0; @@ -160,9 +45,9 @@ size_t zresult; ZstdCompressionDict* result = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary", kwlist, &capacity, &PyList_Type, &samples, - &k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) { + &k, &d, &notifications, &dictID, &level, &steps, 
&threads)) { return NULL; } @@ -175,9 +60,9 @@ params.d = d; params.steps = steps; params.nbThreads = threads; - params.notificationLevel = notifications; - params.dictID = dictID; - params.compressionLevel = level; + params.zParams.notificationLevel = notifications; + params.zParams.dictID = dictID; + params.zParams.compressionLevel = level; /* Figure out total size of input samples. */ samplesLen = PyList_Size(samples); @@ -219,12 +104,21 @@ } Py_BEGIN_ALLOW_THREADS - if (optimize && PyObject_IsTrue(optimize)) { - zresult = COVER_optimizeTrainFromBuffer(dict, capacity, + /* No parameters uses the default function, which will use default params + and call ZDICT_optimizeTrainFromBuffer_cover under the hood. */ + if (!params.k && !params.d && !params.zParams.compressionLevel + && !params.zParams.notificationLevel && !params.zParams.dictID) { + zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer, + sampleSizes, (unsigned)samplesLen); + } + /* Use optimize mode if user controlled steps or threads explicitly. */ + else if (params.steps || params.nbThreads) { + zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity, sampleBuffer, sampleSizes, (unsigned)samplesLen, &params); } + /* Non-optimize mode with explicit control. 
*/ else { - zresult = COVER_trainFromBuffer(dict, capacity, + zresult = ZDICT_trainFromBuffer_cover(dict, capacity, sampleBuffer, sampleSizes, (unsigned)samplesLen, params); } Py_END_ALLOW_THREADS @@ -243,8 +137,11 @@ result->dictData = dict; result->dictSize = zresult; + result->dictType = ZSTD_dct_fullDict; result->d = params.d; result->k = params.k; + result->cdict = NULL; + result->ddict = NULL; finally: PyMem_Free(sampleBuffer); @@ -253,43 +150,99 @@ return result; } +int ensure_ddict(ZstdCompressionDict* dict) { + if (dict->ddict) { + return 0; + } + + Py_BEGIN_ALLOW_THREADS + dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize, + ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem); + Py_END_ALLOW_THREADS + if (!dict->ddict) { + PyErr_SetString(ZstdError, "could not create decompression dict"); + return 1; + } + + return 0; +} + PyDoc_STRVAR(ZstdCompressionDict__doc__, "ZstdCompressionDict(data) - Represents a computed compression dictionary\n" "\n" "This type holds the results of a computed Zstandard compression dictionary.\n" -"Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n" -"obtained from another source into the constructor.\n" +"Instances are obtained by calling ``train_dictionary()`` or by passing\n" +"bytes obtained from another source into the constructor.\n" ); -static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) { - const char* source; - Py_ssize_t sourceSize; +static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "data", + "dict_type", + NULL + }; + + int result = -1; + Py_buffer source; + unsigned dictType = ZSTD_dct_auto; self->dictData = NULL; self->dictSize = 0; + self->cdict = NULL; + self->ddict = NULL; #if PY_MAJOR_VERSION >= 3 - if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict", #else - if (!PyArg_ParseTuple(args, 
"s#:ZstdCompressionDict", + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict", #endif - &source, &sourceSize)) { + kwlist, &source, &dictType)) { return -1; } - self->dictData = PyMem_Malloc(sourceSize); + if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { + PyErr_SetString(PyExc_ValueError, + "data buffer should be contiguous and have at most one dimension"); + goto finally; + } + + if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent + && dictType != ZSTD_dct_fullDict) { + PyErr_Format(PyExc_ValueError, + "invalid dictionary load mode: %d; must use DICT_TYPE_* constants", + dictType); + goto finally; + } + + self->dictType = dictType; + + self->dictData = PyMem_Malloc(source.len); if (!self->dictData) { PyErr_NoMemory(); - return -1; + goto finally; } - memcpy(self->dictData, source, sourceSize); - self->dictSize = sourceSize; + memcpy(self->dictData, source.buf, source.len); + self->dictSize = source.len; + + result = 0; - return 0; +finally: + PyBuffer_Release(&source); + return result; +} + +static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { + if (self->cdict) { + ZSTD_freeCDict(self->cdict); + self->cdict = NULL; } -static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { + if (self->ddict) { + ZSTD_freeDDict(self->ddict); + self->ddict = NULL; + } + if (self->dictData) { PyMem_Free(self->dictData); self->dictData = NULL; @@ -298,6 +251,74 @@ PyObject_Del(self); } +PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__, +"Precompute a dictionary so it can be used by multiple compressors.\n" +); + +static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { + static char* kwlist[] = { + "level", + "compression_params", + NULL + }; + + int level = 0; + ZstdCompressionParametersObject* compressionParams = NULL; + ZSTD_compressionParameters cParams; + size_t zresult; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, 
"|iO!:precompute_compress", kwlist, + &level, &ZstdCompressionParametersType, &compressionParams)) { + return NULL; + } + + if (level && compressionParams) { + PyErr_SetString(PyExc_ValueError, + "must only specify one of level or compression_params"); + return NULL; + } + + if (!level && !compressionParams) { + PyErr_SetString(PyExc_ValueError, + "must specify one of level or compression_params"); + return NULL; + } + + if (self->cdict) { + zresult = ZSTD_freeCDict(self->cdict); + self->cdict = NULL; + if (ZSTD_isError(zresult)) { + PyErr_Format(ZstdError, "unable to free CDict: %s", + ZSTD_getErrorName(zresult)); + return NULL; + } + } + + if (level) { + cParams = ZSTD_getCParams(level, 0, self->dictSize); + } + else { + cParams.chainLog = compressionParams->chainLog; + cParams.hashLog = compressionParams->hashLog; + cParams.searchLength = compressionParams->minMatch; + cParams.searchLog = compressionParams->searchLog; + cParams.strategy = compressionParams->compressionStrategy; + cParams.targetLength = compressionParams->targetLength; + cParams.windowLog = compressionParams->windowLog; + } + + assert(!self->cdict); + self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize, + ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem); + + if (!self->cdict) { + PyErr_SetString(ZstdError, "unable to precompute dictionary"); + return NULL; + } + + Py_RETURN_NONE; +} + static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); @@ -313,6 +334,8 @@ PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, + { "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress, + METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ }, { NULL, NULL } };