--- a/contrib/python-zstandard/c-ext/decompressor.c Thu Feb 09 21:44:32 2017 -0500
+++ b/contrib/python-zstandard/c-ext/decompressor.c Tue Feb 07 23:24:47 2017 -0800
@@ -59,23 +59,19 @@
ZstdCompressionDict* dict = NULL;
- self->refdctx = NULL;
+ self->dctx = NULL;
self->dict = NULL;
self->ddict = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!:ZstdDecompressor", kwlist,
&ZstdCompressionDictType, &dict)) {
return -1;
}
- /* Instead of creating a ZSTD_DCtx for every decompression operation,
- we create an instance at object creation time and recycle it via
- ZSTD_copyDCTx() on each use. This means each use is a malloc+memcpy
- instead of a malloc+init. */
/* TODO lazily initialize the reference ZSTD_DCtx on first use since
not instances of ZstdDecompressor will use a ZSTD_DCtx. */
- self->refdctx = ZSTD_createDCtx();
- if (!self->refdctx) {
+ self->dctx = ZSTD_createDCtx();
+ if (!self->dctx) {
PyErr_NoMemory();
goto except;
}
@@ -88,17 +84,17 @@
return 0;
except:
- if (self->refdctx) {
- ZSTD_freeDCtx(self->refdctx);
- self->refdctx = NULL;
+ if (self->dctx) {
+ ZSTD_freeDCtx(self->dctx);
+ self->dctx = NULL;
}
return -1;
}
static void Decompressor_dealloc(ZstdDecompressor* self) {
- if (self->refdctx) {
- ZSTD_freeDCtx(self->refdctx);
+ if (self->dctx) {
+ ZSTD_freeDCtx(self->dctx);
}
Py_XDECREF(self->dict);
@@ -150,8 +146,8 @@
PyObject* totalReadPy;
PyObject* totalWritePy;
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk", kwlist, &source,
- &dest, &inSize, &outSize)) {
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk:copy_stream", kwlist,
+ &source, &dest, &inSize, &outSize)) {
return NULL;
}
@@ -243,7 +239,7 @@
Py_DecRef(totalReadPy);
Py_DecRef(totalWritePy);
- finally:
+finally:
if (output.dst) {
PyMem_Free(output.dst);
}
@@ -291,28 +287,19 @@
unsigned long long decompressedSize;
size_t destCapacity;
PyObject* result = NULL;
- ZSTD_DCtx* dctx = NULL;
void* dictData = NULL;
size_t dictSize = 0;
size_t zresult;
#if PY_MAJOR_VERSION >= 3
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n:decompress",
#else
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n:decompress",
#endif
- &source, &sourceSize, &maxOutputSize)) {
+ kwlist, &source, &sourceSize, &maxOutputSize)) {
return NULL;
}
- dctx = PyMem_Malloc(ZSTD_sizeof_DCtx(self->refdctx));
- if (!dctx) {
- PyErr_NoMemory();
- return NULL;
- }
-
- ZSTD_copyDCtx(dctx, self->refdctx);
-
if (self->dict) {
dictData = self->dict->dictData;
dictSize = self->dict->dictSize;
@@ -320,12 +307,12 @@
if (dictData && !self->ddict) {
Py_BEGIN_ALLOW_THREADS
- self->ddict = ZSTD_createDDict(dictData, dictSize);
+ self->ddict = ZSTD_createDDict_byReference(dictData, dictSize);
Py_END_ALLOW_THREADS
if (!self->ddict) {
PyErr_SetString(ZstdError, "could not create decompression dict");
- goto except;
+ return NULL;
}
}
@@ -335,7 +322,7 @@
if (0 == maxOutputSize) {
PyErr_SetString(ZstdError, "input data invalid or missing content size "
"in frame header");
- goto except;
+ return NULL;
}
else {
result = PyBytes_FromStringAndSize(NULL, maxOutputSize);
@@ -348,45 +335,39 @@
}
if (!result) {
- goto except;
+ return NULL;
}
Py_BEGIN_ALLOW_THREADS
if (self->ddict) {
- zresult = ZSTD_decompress_usingDDict(dctx, PyBytes_AsString(result), destCapacity,
+ zresult = ZSTD_decompress_usingDDict(self->dctx,
+ PyBytes_AsString(result), destCapacity,
source, sourceSize, self->ddict);
}
else {
- zresult = ZSTD_decompressDCtx(dctx, PyBytes_AsString(result), destCapacity, source, sourceSize);
+ zresult = ZSTD_decompressDCtx(self->dctx,
+ PyBytes_AsString(result), destCapacity, source, sourceSize);
}
Py_END_ALLOW_THREADS
if (ZSTD_isError(zresult)) {
PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult));
- goto except;
+ Py_DecRef(result);
+ return NULL;
}
else if (decompressedSize && zresult != decompressedSize) {
PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu",
zresult, decompressedSize);
- goto except;
+ Py_DecRef(result);
+ return NULL;
}
else if (zresult < destCapacity) {
if (_PyBytes_Resize(&result, zresult)) {
- goto except;
+ Py_DecRef(result);
+ return NULL;
}
}
- goto finally;
-
-except:
- Py_DecRef(result);
- result = NULL;
-
-finally:
- if (dctx) {
- PyMem_FREE(dctx);
- }
-
return result;
}
@@ -455,8 +436,8 @@
ZstdDecompressorIterator* result;
size_t skipBytes = 0;
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk", kwlist, &reader,
- &inSize, &outSize, &skipBytes)) {
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_from", kwlist,
+ &reader, &inSize, &outSize, &skipBytes)) {
return NULL;
}
@@ -534,19 +515,14 @@
goto finally;
except:
- if (result->reader) {
- Py_DECREF(result->reader);
- result->reader = NULL;
- }
+ Py_CLEAR(result->reader);
if (result->buffer) {
PyBuffer_Release(result->buffer);
- Py_DECREF(result->buffer);
- result->buffer = NULL;
+ Py_CLEAR(result->buffer);
}
- Py_DECREF(result);
- result = NULL;
+ Py_CLEAR(result);
finally:
@@ -577,7 +553,8 @@
size_t outSize = ZSTD_DStreamOutSize();
ZstdDecompressionWriter* result;
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k", kwlist, &writer, &outSize)) {
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:write_to", kwlist,
+ &writer, &outSize)) {
return NULL;
}
@@ -605,6 +582,200 @@
return result;
}
+PyDoc_STRVAR(Decompressor_decompress_content_dict_chain__doc__,
+"Decompress a series of chunks using the content dictionary chaining technique\n"
+);
+
+static PyObject* Decompressor_decompress_content_dict_chain(PyObject* self, PyObject* args, PyObject* kwargs) {
+	static char* kwlist[] = {
+		"frames",
+		NULL
+	};
+
+	PyObject* chunks;
+	Py_ssize_t chunksLen;
+	Py_ssize_t chunkIndex;
+	char parity = 0;
+	PyObject* chunk;
+	char* chunkData;
+	Py_ssize_t chunkSize;
+	ZSTD_DCtx* dctx = NULL;
+	size_t zresult;
+	ZSTD_frameParams frameParams;
+	void* buffer1 = NULL;
+	size_t buffer1Size = 0;
+	size_t buffer1ContentSize = 0;
+	void* buffer2 = NULL;
+	size_t buffer2Size = 0;
+	size_t buffer2ContentSize = 0;
+	void* destBuffer = NULL;
+	PyObject* result = NULL;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:decompress_content_dict_chain",
+		kwlist, &PyList_Type, &chunks)) {
+		return NULL;
+	}
+
+	chunksLen = PyList_Size(chunks);
+	if (!chunksLen) {
+		PyErr_SetString(PyExc_ValueError, "empty input chain");
+		return NULL;
+	}
+
+	/* The first chunk should not be using a dictionary. We handle it specially. */
+	chunk = PyList_GetItem(chunks, 0);
+	if (!PyBytes_Check(chunk)) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 must be bytes");
+		return NULL;
+	}
+
+	/* We require that all chunks be zstd frames and that they have content size set. */
+	PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
+	zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
+	if (ZSTD_isError(zresult)) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 is not a valid zstd frame");
+		return NULL;
+	}
+	else if (zresult) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 is too small to contain a zstd frame");
+		return NULL;
+	}
+
+	if (0 == frameParams.frameContentSize) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 missing content size in frame");
+		return NULL;
+	}
+
+	dctx = ZSTD_createDCtx();
+	if (!dctx) {
+		PyErr_NoMemory();
+		goto finally;
+	}
+
+	buffer1Size = frameParams.frameContentSize;
+	buffer1 = PyMem_Malloc(buffer1Size);
+	if (!buffer1) {
+		PyErr_NoMemory();
+		goto finally;
+	}
+	Py_BEGIN_ALLOW_THREADS
+	zresult = ZSTD_decompressDCtx(dctx, buffer1, buffer1Size, chunkData, chunkSize);
+	Py_END_ALLOW_THREADS
+	if (ZSTD_isError(zresult)) {
+		PyErr_Format(ZstdError, "could not decompress chunk 0: %s", ZSTD_getErrorName(zresult));
+		goto finally;
+	}
+
+	buffer1ContentSize = zresult;
+
+	/* Special case of a simple chain. Use actual decompressed size, not capacity. */
+	if (1 == chunksLen) {
+		result = PyBytes_FromStringAndSize(buffer1, buffer1ContentSize);
+		goto finally;
+	}
+
+	/* This should ideally look at next chunk. But this is slightly simpler. */
+	buffer2Size = frameParams.frameContentSize;
+	buffer2 = PyMem_Malloc(buffer2Size);
+	if (!buffer2) {
+		PyErr_NoMemory();
+		goto finally;
+	}
+	/* For each subsequent chunk, use the previous fulltext as a content dictionary.
+	   Our strategy is to have 2 buffers. One holds the previous fulltext (to be
+	   used as a content dictionary) and the other holds the new fulltext. The
+	   buffers grow when needed but never decrease in size. This limits the
+	   memory allocator overhead.
+	*/
+	for (chunkIndex = 1; chunkIndex < chunksLen; chunkIndex++) {
+		chunk = PyList_GetItem(chunks, chunkIndex);
+		if (!PyBytes_Check(chunk)) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd must be bytes", chunkIndex);
+			goto finally;
+		}
+
+		PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
+		zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
+		if (ZSTD_isError(zresult)) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd is not a valid zstd frame", chunkIndex);
+			goto finally;
+		}
+		else if (zresult) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd is too small to contain a zstd frame", chunkIndex);
+			goto finally;
+		}
+
+		if (0 == frameParams.frameContentSize) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd missing content size in frame", chunkIndex);
+			goto finally;
+		}
+
+		parity = chunkIndex % 2;
+
+		/* This could definitely be abstracted to reduce code duplication. */
+		if (parity) {
+			/* Resize destination buffer to hold larger content. */
+			if (buffer2Size < frameParams.frameContentSize) {
+				buffer2Size = frameParams.frameContentSize;
+				destBuffer = PyMem_Realloc(buffer2, buffer2Size);
+				if (!destBuffer) {
+					PyErr_NoMemory();
+					goto finally;
+				}
+				buffer2 = destBuffer;
+			}
+			Py_BEGIN_ALLOW_THREADS
+			zresult = ZSTD_decompress_usingDict(dctx, buffer2, buffer2Size,
+				chunkData, chunkSize, buffer1, buffer1ContentSize);
+			Py_END_ALLOW_THREADS
+			if (ZSTD_isError(zresult)) {
+				PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
+					chunkIndex, ZSTD_getErrorName(zresult));
+				goto finally;
+			}
+			buffer2ContentSize = zresult;
+		}
+		else {
+			if (buffer1Size < frameParams.frameContentSize) {
+				buffer1Size = frameParams.frameContentSize;
+				destBuffer = PyMem_Realloc(buffer1, buffer1Size);
+				if (!destBuffer) {
+					PyErr_NoMemory();
+					goto finally;
+				}
+				buffer1 = destBuffer;
+			}
+			Py_BEGIN_ALLOW_THREADS
+			zresult = ZSTD_decompress_usingDict(dctx, buffer1, buffer1Size,
+				chunkData, chunkSize, buffer2, buffer2ContentSize);
+			Py_END_ALLOW_THREADS
+			if (ZSTD_isError(zresult)) {
+				PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
+					chunkIndex, ZSTD_getErrorName(zresult));
+				goto finally;
+			}
+			buffer1ContentSize = zresult;
+		}
+	}
+
+	result = PyBytes_FromStringAndSize(parity ? buffer2 : buffer1,
+		parity ? buffer2ContentSize : buffer1ContentSize);
+
+finally:
+	if (buffer2) {
+		PyMem_Free(buffer2);
+	}
+	if (buffer1) {
+		PyMem_Free(buffer1);
+	}
+
+	if (dctx) {
+		ZSTD_freeDCtx(dctx);
+	}
+
+	return result;
+}
+
static PyMethodDef Decompressor_methods[] = {
{ "copy_stream", (PyCFunction)Decompressor_copy_stream, METH_VARARGS | METH_KEYWORDS,
Decompressor_copy_stream__doc__ },
@@ -616,6 +787,8 @@
Decompressor_read_from__doc__ },
{ "write_to", (PyCFunction)Decompressor_write_to, METH_VARARGS | METH_KEYWORDS,
Decompressor_write_to__doc__ },
+ { "decompress_content_dict_chain", (PyCFunction)Decompressor_decompress_content_dict_chain,
+ METH_VARARGS | METH_KEYWORDS, Decompressor_decompress_content_dict_chain__doc__ },
{ NULL, NULL }
};