hg-stable: comparison contrib/python-zstandard/c-ext/decompressor.c

equal deleted inserted replaced

-:5b60464efbde
+:c32454d69b85
 		NULL
 	};
 	ZstdCompressionDict* dict = NULL;
-	self->refdctx = NULL;
+	self->dctx = NULL;
 	self->dict = NULL;
 	self->ddict = NULL;
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!", kwlist,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O!:ZstdDecompressor", kwlist,
 		&ZstdCompressionDictType, &dict)) {
 		return -1;
 	}
-	/* Instead of creating a ZSTD_DCtx for every decompression operation,
-	   we create an instance at object creation time and recycle it via
-	   ZSTD_copyDCTx() on each use. This means each use is a malloc+memcpy
-	   instead of a malloc+init. */
 	/* TODO lazily initialize the reference ZSTD_DCtx on first use since
 	   not instances of ZstdDecompressor will use a ZSTD_DCtx. */
-	self->refdctx = ZSTD_createDCtx();
+	self->dctx = ZSTD_createDCtx();
-	if (!self->refdctx) {
+	if (!self->dctx) {
 		PyErr_NoMemory();
 		goto except;
 	}
 	if (dict) {
 	}
 	return 0;
 except:
-	if (self->refdctx) {
+	if (self->dctx) {
-		ZSTD_freeDCtx(self->refdctx);
+		ZSTD_freeDCtx(self->dctx);
-		self->refdctx = NULL;
+		self->dctx = NULL;
 	}
 	return -1;
 }
 static void Decompressor_dealloc(ZstdDecompressor* self) {
-	if (self->refdctx) {
+	if (self->dctx) {
-		ZSTD_freeDCtx(self->refdctx);
+		ZSTD_freeDCtx(self->dctx);
 	}
 	Py_XDECREF(self->dict);
 	if (self->ddict) {
 	size_t zresult = 0;
 	PyObject* writeResult;
 	PyObject* totalReadPy;
 	PyObject* totalWritePy;
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk", kwlist, &source,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|kk:copy_stream", kwlist,
-		&dest, &inSize, &outSize)) {
+		&source, &dest, &inSize, &outSize)) {
 		return NULL;
 	}
 	if (!PyObject_HasAttrString(source, "read")) {
 		PyErr_SetString(PyExc_ValueError, "first argument must have a read() method");
 	totalWritePy = PyLong_FromSsize_t(totalWrite);
 	res = PyTuple_Pack(2, totalReadPy, totalWritePy);
 	Py_DecRef(totalReadPy);
 	Py_DecRef(totalWritePy);
-	finally:
+finally:
 	if (output.dst) {
 		PyMem_Free(output.dst);
 	}
 	if (dstream) {
 	Py_ssize_t sourceSize;
 	Py_ssize_t maxOutputSize = 0;
 	unsigned long long decompressedSize;
 	size_t destCapacity;
 	PyObject* result = NULL;
-	ZSTD_DCtx* dctx = NULL;
 	void* dictData = NULL;
 	size_t dictSize = 0;
 	size_t zresult;
 #if PY_MAJOR_VERSION >= 3
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n", kwlist,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y#|n:decompress",
 #else
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n", kwlist,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|n:decompress",
 #endif
-		&source, &sourceSize, &maxOutputSize)) {
+		kwlist, &source, &sourceSize, &maxOutputSize)) {
 		return NULL;
 	}
-	dctx = PyMem_Malloc(ZSTD_sizeof_DCtx(self->refdctx));
-	if (!dctx) {
-		PyErr_NoMemory();
-		return NULL;
-	}
-	ZSTD_copyDCtx(dctx, self->refdctx);
 	if (self->dict) {
 		dictData = self->dict->dictData;
 		dictSize = self->dict->dictSize;
 	}
 	if (dictData && !self->ddict) {
 		Py_BEGIN_ALLOW_THREADS
-		self->ddict = ZSTD_createDDict(dictData, dictSize);
+		self->ddict = ZSTD_createDDict_byReference(dictData, dictSize);
 		Py_END_ALLOW_THREADS
 		if (!self->ddict) {
 			PyErr_SetString(ZstdError, "could not create decompression dict");
-			goto except;
+			return NULL;
 		}
 	}
 	decompressedSize = ZSTD_getDecompressedSize(source, sourceSize);
 	/* 0 returned if content size not in the zstd frame header */
 	if (0 == decompressedSize) {
 		if (0 == maxOutputSize) {
 			PyErr_SetString(ZstdError, "input data invalid or missing content size "
 				"in frame header");
-			goto except;
+			return NULL;
 		}
 		else {
 			result = PyBytes_FromStringAndSize(NULL, maxOutputSize);
 			destCapacity = maxOutputSize;
 		}
 		result = PyBytes_FromStringAndSize(NULL, decompressedSize);
 		destCapacity = decompressedSize;
 	}
 	if (!result) {
-		goto except;
+		return NULL;
 	}
 	Py_BEGIN_ALLOW_THREADS
 	if (self->ddict) {
-		zresult = ZSTD_decompress_usingDDict(dctx, PyBytes_AsString(result), destCapacity,
+		zresult = ZSTD_decompress_usingDDict(self->dctx,
+			PyBytes_AsString(result), destCapacity,
 			source, sourceSize, self->ddict);
 	}
 	else {
-		zresult = ZSTD_decompressDCtx(dctx, PyBytes_AsString(result), destCapacity, source, sourceSize);
+		zresult = ZSTD_decompressDCtx(self->dctx,
+			PyBytes_AsString(result), destCapacity, source, sourceSize);
 	}
 	Py_END_ALLOW_THREADS
 	if (ZSTD_isError(zresult)) {
 		PyErr_Format(ZstdError, "decompression error: %s", ZSTD_getErrorName(zresult));
-		goto except;
+		Py_DecRef(result);
+		return NULL;
 	}
 	else if (decompressedSize && zresult != decompressedSize) {
 		PyErr_Format(ZstdError, "decompression error: decompressed %zu bytes; expected %llu",
 			zresult, decompressedSize);
-		goto except;
+		Py_DecRef(result);
+		return NULL;
 	}
 	else if (zresult < destCapacity) {
 		if (_PyBytes_Resize(&result, zresult)) {
-			goto except;
+			Py_DecRef(result);
-		}
+			return NULL;
-	}
+		}
-	goto finally;
-except:
-	Py_DecRef(result);
-	result = NULL;
-finally:
-	if (dctx) {
-		PyMem_FREE(dctx);
 	}
 	return result;
 }
 	size_t inSize = ZSTD_DStreamInSize();
 	size_t outSize = ZSTD_DStreamOutSize();
 	ZstdDecompressorIterator* result;
 	size_t skipBytes = 0;
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk", kwlist, &reader,
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|kkk:read_from", kwlist,
-		&inSize, &outSize, &skipBytes)) {
+		&reader, &inSize, &outSize, &skipBytes)) {
 		return NULL;
 	}
 	if (skipBytes >= inSize) {
 		PyErr_SetString(PyExc_ValueError,
 	result->finishedOutput = 0;
 	goto finally;
 except:
-	if (result->reader) {
+	Py_CLEAR(result->reader);
-		Py_DECREF(result->reader);
-		result->reader = NULL;
-	}
 	if (result->buffer) {
 		PyBuffer_Release(result->buffer);
-		Py_DECREF(result->buffer);
+		Py_CLEAR(result->buffer);
-		result->buffer = NULL;
+	}
-	}
+	Py_CLEAR(result);
-	Py_DECREF(result);
-	result = NULL;
 finally:
 	return result;
 }
 	PyObject* writer;
 	size_t outSize = ZSTD_DStreamOutSize();
 	ZstdDecompressionWriter* result;
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k", kwlist, &writer, &outSize)) {
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|k:write_to", kwlist,
+		&writer, &outSize)) {
 		return NULL;
 	}
 	if (!PyObject_HasAttrString(writer, "write")) {
 		PyErr_SetString(PyExc_ValueError, "must pass an object with a write() method");
 	result->outSize = outSize;
 	result->entered = 0;
 	result->dstream = NULL;
+	return result;
+}
+PyDoc_STRVAR(Decompressor_decompress_content_dict_chain__doc__,
+"Decompress a series of chunks using the content dictionary chaining technique\n"
+);
+static PyObject* Decompressor_decompress_content_dict_chain(PyObject* self, PyObject* args, PyObject* kwargs) {
+	static char* kwlist[] = {
+		"frames",
+		NULL
+	};
+	PyObject* chunks;
+	Py_ssize_t chunksLen;
+	Py_ssize_t chunkIndex;
+	char parity = 0;
+	PyObject* chunk;
+	char* chunkData;
+	Py_ssize_t chunkSize;
+	ZSTD_DCtx* dctx = NULL;
+	size_t zresult;
+	ZSTD_frameParams frameParams;
+	void* buffer1 = NULL;
+	size_t buffer1Size = 0;
+	size_t buffer1ContentSize = 0;
+	void* buffer2 = NULL;
+	size_t buffer2Size = 0;
+	size_t buffer2ContentSize = 0;
+	void* destBuffer = NULL;
+	PyObject* result = NULL;
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!:decompress_content_dict_chain",
+		kwlist, &PyList_Type, &chunks)) {
+		return NULL;
+	}
+	chunksLen = PyList_Size(chunks);
+	if (!chunksLen) {
+		PyErr_SetString(PyExc_ValueError, "empty input chain");
+		return NULL;
+	}
+	/* The first chunk should not be using a dictionary. We handle it specially. */
+	chunk = PyList_GetItem(chunks, 0);
+	if (!PyBytes_Check(chunk)) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 must be bytes");
+		return NULL;
+	}
+	/* We require that all chunks be zstd frames and that they have content size set. */
+	PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
+	zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
+	if (ZSTD_isError(zresult)) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 is not a valid zstd frame");
+		return NULL;
+	}
+	else if (zresult) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 is too small to contain a zstd frame");
+		return NULL;
+	}
+	if (0 == frameParams.frameContentSize) {
+		PyErr_SetString(PyExc_ValueError, "chunk 0 missing content size in frame");
+		return NULL;
+	}
+	dctx = ZSTD_createDCtx();
+	if (!dctx) {
+		PyErr_NoMemory();
+		goto finally;
+	}
+	buffer1Size = frameParams.frameContentSize;
+	buffer1 = PyMem_Malloc(buffer1Size);
+	if (!buffer1) {
+		goto finally;
+	}
+	Py_BEGIN_ALLOW_THREADS
+	zresult = ZSTD_decompressDCtx(dctx, buffer1, buffer1Size, chunkData, chunkSize);
+	Py_END_ALLOW_THREADS
+	if (ZSTD_isError(zresult)) {
+		PyErr_Format(ZstdError, "could not decompress chunk 0: %s", ZSTD_getErrorName(zresult));
+		goto finally;
+	}
+	buffer1ContentSize = zresult;
+	/* Special case of a simple chain. */
+	if (1 == chunksLen) {
+		result = PyBytes_FromStringAndSize(buffer1, buffer1Size);
+		goto finally;
+	}
+	/* This should ideally look at next chunk. But this is slightly simpler. */
+	buffer2Size = frameParams.frameContentSize;
+	buffer2 = PyMem_Malloc(buffer2Size);
+	if (!buffer2) {
+		goto finally;
+	}
+	/* For each subsequent chunk, use the previous fulltext as a content dictionary.
+	   Our strategy is to have 2 buffers. One holds the previous fulltext (to be
+	   used as a content dictionary) and the other holds the new fulltext. The
+	   buffers grow when needed but never decrease in size. This limits the
+	   memory allocator overhead.
+	*/
+	for (chunkIndex = 1; chunkIndex < chunksLen; chunkIndex++) {
+		chunk = PyList_GetItem(chunks, chunkIndex);
+		if (!PyBytes_Check(chunk)) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd must be bytes", chunkIndex);
+			goto finally;
+		}
+		PyBytes_AsStringAndSize(chunk, &chunkData, &chunkSize);
+		zresult = ZSTD_getFrameParams(&frameParams, (void*)chunkData, chunkSize);
+		if (ZSTD_isError(zresult)) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd is not a valid zstd frame", chunkIndex);
+			goto finally;
+		}
+		else if (zresult) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd is too small to contain a zstd frame", chunkIndex);
+			goto finally;
+		}
+		if (0 == frameParams.frameContentSize) {
+			PyErr_Format(PyExc_ValueError, "chunk %zd missing content size in frame", chunkIndex);
+			goto finally;
+		}
+		parity = chunkIndex % 2;
+		/* This could definitely be abstracted to reduce code duplication. */
+		if (parity) {
+			/* Resize destination buffer to hold larger content. */
+			if (buffer2Size < frameParams.frameContentSize) {
+				buffer2Size = frameParams.frameContentSize;
+				destBuffer = PyMem_Realloc(buffer2, buffer2Size);
+				if (!destBuffer) {
+					goto finally;
+				}
+				buffer2 = destBuffer;
+			}
+			Py_BEGIN_ALLOW_THREADS
+			zresult = ZSTD_decompress_usingDict(dctx, buffer2, buffer2Size,
+				chunkData, chunkSize, buffer1, buffer1ContentSize);
+			Py_END_ALLOW_THREADS
+			if (ZSTD_isError(zresult)) {
+				PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
+					chunkIndex, ZSTD_getErrorName(zresult));
+				goto finally;
+			}
+			buffer2ContentSize = zresult;
+		}
+		else {
+			if (buffer1Size < frameParams.frameContentSize) {
+				buffer1Size = frameParams.frameContentSize;
+				destBuffer = PyMem_Realloc(buffer1, buffer1Size);
+				if (!destBuffer) {
+					goto finally;
+				}
+				buffer1 = destBuffer;
+			}
+			Py_BEGIN_ALLOW_THREADS
+			zresult = ZSTD_decompress_usingDict(dctx, buffer1, buffer1Size,
+				chunkData, chunkSize, buffer2, buffer2ContentSize);
+			Py_END_ALLOW_THREADS
+			if (ZSTD_isError(zresult)) {
+				PyErr_Format(ZstdError, "could not decompress chunk %zd: %s",
+					chunkIndex, ZSTD_getErrorName(zresult));
+				goto finally;
+			}
+			buffer1ContentSize = zresult;
+		}
+	}
+	result = PyBytes_FromStringAndSize(parity ? buffer2 : buffer1,
+		parity ? buffer2ContentSize : buffer1ContentSize);
+finally:
+	if (buffer2) {
+		PyMem_Free(buffer2);
+	}
+	if (buffer1) {
+		PyMem_Free(buffer1);
+	}
+	if (dctx) {
+		ZSTD_freeDCtx(dctx);
+	}
 	return result;
 }
 static PyMethodDef Decompressor_methods[] = {
 	Decompressor_decompressobj__doc__ },
 	{ "read_from", (PyCFunction)Decompressor_read_from, METH_VARARGS | METH_KEYWORDS,
 	Decompressor_read_from__doc__ },
 	{ "write_to", (PyCFunction)Decompressor_write_to, METH_VARARGS | METH_KEYWORDS,
 	Decompressor_write_to__doc__ },
+	{ "decompress_content_dict_chain", (PyCFunction)Decompressor_decompress_content_dict_chain,
+	  METH_VARARGS | METH_KEYWORDS, Decompressor_decompress_content_dict_chain__doc__ },
 	{ NULL, NULL }
 };
 PyTypeObject ZstdDecompressorType = {
 	PyVarObject_HEAD_INIT(NULL, 0)

changeset 30924	c32454d69b85
parent 30830	08fa3a76a080
child 31799	e0dc40530c5a