contrib/python-zstandard/c-ext/compressionreader.c
author Matt Harbison <matt_harbison@yahoo.com>
Mon, 09 Aug 2021 00:49:50 -0400
branchstable
changeset 47827 9261326dd032
parent 46374 e92ca942ddca
permissions -rw-r--r--
ci: run --pyoxidized tests on Windows They still have numerous failure, but at least we can start fixing them now. Differential Revision: https://phab.mercurial-scm.org/D11278

/**
* Copyright (c) 2017-present, Gregory Szorc
* All rights reserved.
*
* This software may be modified and distributed under the terms
* of the BSD license. See the LICENSE file for details.
*/

#include "python-zstandard.h"

extern PyObject* ZstdError;

static void set_unsupported_operation(void) {
	PyObject* iomod;
	PyObject* exc;

	iomod = PyImport_ImportModule("io");
	if (NULL == iomod) {
		return;
	}

	exc = PyObject_GetAttrString(iomod, "UnsupportedOperation");
	if (NULL == exc) {
		Py_DECREF(iomod);
		return;
	}

	PyErr_SetNone(exc);
	Py_DECREF(exc);
	Py_DECREF(iomod);
}

static void reader_dealloc(ZstdCompressionReader* self) {
	Py_XDECREF(self->compressor);
	Py_XDECREF(self->reader);

	if (self->buffer.buf) {
		PyBuffer_Release(&self->buffer);
		memset(&self->buffer, 0, sizeof(self->buffer));
	}

	PyObject_Del(self);
}

static ZstdCompressionReader* reader_enter(ZstdCompressionReader* self) {
	if (self->entered) {
		PyErr_SetString(PyExc_ValueError, "cannot __enter__ multiple times");
		return NULL;
	}

	self->entered = 1;

	Py_INCREF(self);
	return self;
}

static PyObject* reader_exit(ZstdCompressionReader* self, PyObject* args) {
	PyObject* exc_type;
	PyObject* exc_value;
	PyObject* exc_tb;

	if (!PyArg_ParseTuple(args, "OOO:__exit__", &exc_type, &exc_value, &exc_tb)) {
		return NULL;
	}

	self->entered = 0;
	self->closed = 1;

	/* Release resources associated with source. */
	Py_CLEAR(self->reader);
	if (self->buffer.buf) {
		PyBuffer_Release(&self->buffer);
		memset(&self->buffer, 0, sizeof(self->buffer));
	}

    Py_CLEAR(self->compressor);

	Py_RETURN_FALSE;
}

static PyObject* reader_readable(ZstdCompressionReader* self) {
	Py_RETURN_TRUE;
}

static PyObject* reader_writable(ZstdCompressionReader* self) {
	Py_RETURN_FALSE;
}

static PyObject* reader_seekable(ZstdCompressionReader* self) {
	Py_RETURN_FALSE;
}

static PyObject* reader_readline(PyObject* self, PyObject* args) {
	set_unsupported_operation();
	return NULL;
}

static PyObject* reader_readlines(PyObject* self, PyObject* args) {
	set_unsupported_operation();
	return NULL;
}

static PyObject* reader_write(PyObject* self, PyObject* args) {
	PyErr_SetString(PyExc_OSError, "stream is not writable");
	return NULL;
}

static PyObject* reader_writelines(PyObject* self, PyObject* args) {
	PyErr_SetString(PyExc_OSError, "stream is not writable");
	return NULL;
}

static PyObject* reader_isatty(PyObject* self) {
	Py_RETURN_FALSE;
}

static PyObject* reader_flush(PyObject* self) {
	Py_RETURN_NONE;
}

static PyObject* reader_close(ZstdCompressionReader* self) {
	self->closed = 1;
	Py_RETURN_NONE;
}

static PyObject* reader_tell(ZstdCompressionReader* self) {
	/* TODO should this raise OSError since stream isn't seekable? */
	return PyLong_FromUnsignedLongLong(self->bytesCompressed);
}

int read_compressor_input(ZstdCompressionReader* self) {
	if (self->finishedInput) {
		return 0;
	}

	if (self->input.pos != self->input.size) {
		return 0;
	}

	if (self->reader) {
		Py_buffer buffer;

		assert(self->readResult == NULL);

		self->readResult = PyObject_CallMethod(self->reader, "read",
		    "k", self->readSize);

		if (NULL == self->readResult) {
			return -1;
		}

		memset(&buffer, 0, sizeof(buffer));

		if (0 != PyObject_GetBuffer(self->readResult, &buffer, PyBUF_CONTIG_RO)) {
			return -1;
		}

		/* EOF */
		if (0 == buffer.len) {
			self->finishedInput = 1;
			Py_CLEAR(self->readResult);
		}
		else {
			self->input.src = buffer.buf;
			self->input.size = buffer.len;
			self->input.pos = 0;
		}

		PyBuffer_Release(&buffer);
	}
	else {
		assert(self->buffer.buf);

		self->input.src = self->buffer.buf;
		self->input.size = self->buffer.len;
		self->input.pos = 0;
	}

	return 1;
}

int compress_input(ZstdCompressionReader* self, ZSTD_outBuffer* output) {
	size_t oldPos;
	size_t zresult;

	/* If we have data left over, consume it. */
	if (self->input.pos < self->input.size) {
		oldPos = output->pos;

		Py_BEGIN_ALLOW_THREADS
		zresult = ZSTD_compressStream2(self->compressor->cctx,
		    output, &self->input, ZSTD_e_continue);
		Py_END_ALLOW_THREADS

		self->bytesCompressed += output->pos - oldPos;

		/* Input exhausted. Clear out state tracking. */
		if (self->input.pos == self->input.size) {
			memset(&self->input, 0, sizeof(self->input));
			Py_CLEAR(self->readResult);

			if (self->buffer.buf) {
				self->finishedInput = 1;
			}
		}

		if (ZSTD_isError(zresult)) {
			PyErr_Format(ZstdError, "zstd compress error: %s", ZSTD_getErrorName(zresult));
			return -1;
		}
	}

    if (output->pos && output->pos == output->size) {
        return 1;
    }
    else {
        return 0;
    }
}

static PyObject* reader_read(ZstdCompressionReader* self, PyObject* args, PyObject* kwargs) {
	static char* kwlist[] = {
		"size",
		NULL
	};

	Py_ssize_t size = -1;
	PyObject* result = NULL;
	char* resultBuffer;
	Py_ssize_t resultSize;
	size_t zresult;
	size_t oldPos;
	int readResult, compressResult;

	if (self->closed) {
		PyErr_SetString(PyExc_ValueError, "stream is closed");
		return NULL;
	}

	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", kwlist, &size)) {
		return NULL;
	}

	if (size < -1) {
		PyErr_SetString(PyExc_ValueError, "cannot read negative amounts less than -1");
		return NULL;
	}

	if (size == -1) {
		return PyObject_CallMethod((PyObject*)self, "readall", NULL);
	}

	if (self->finishedOutput || size == 0) {
		return PyBytes_FromStringAndSize("", 0);
	}

	result = PyBytes_FromStringAndSize(NULL, size);
	if (NULL == result) {
		return NULL;
	}

	PyBytes_AsStringAndSize(result, &resultBuffer, &resultSize);

	self->output.dst = resultBuffer;
	self->output.size = resultSize;
	self->output.pos = 0;

readinput:

    compressResult = compress_input(self, &self->output);

	if (-1 == compressResult) {
		Py_XDECREF(result);
		return NULL;
	}
	else if (0 == compressResult) {
		/* There is room in the output. We fall through to below, which will
		 * either get more input for us or will attempt to end the stream.
		 */
	}
	else if (1 == compressResult) {
		memset(&self->output, 0, sizeof(self->output));
		return result;
	}
	else {
		assert(0);
	}

	readResult = read_compressor_input(self);

	if (-1 == readResult) {
		return NULL;
	}
	else if (0 == readResult) { }
	else if (1 == readResult) { }
	else {
		assert(0);
	}

	if (self->input.size) {
		goto readinput;
	}

	/* Else EOF */
	oldPos = self->output.pos;

	zresult = ZSTD_compressStream2(self->compressor->cctx, &self->output,
		&self->input, ZSTD_e_end);

	self->bytesCompressed += self->output.pos - oldPos;

	if (ZSTD_isError(zresult)) {
		PyErr_Format(ZstdError, "error ending compression stream: %s",
			ZSTD_getErrorName(zresult));
		Py_XDECREF(result);
		return NULL;
	}

	assert(self->output.pos);

	if (0 == zresult) {
		self->finishedOutput = 1;
	}

	if (safe_pybytes_resize(&result, self->output.pos)) {
		Py_XDECREF(result);
		return NULL;
	}

	memset(&self->output, 0, sizeof(self->output));

	return result;
}

static PyObject* reader_read1(ZstdCompressionReader* self, PyObject* args, PyObject* kwargs) {
	static char* kwlist[] = {
		"size",
		NULL
	};

	Py_ssize_t size = -1;
	PyObject* result = NULL;
	char* resultBuffer;
	Py_ssize_t resultSize;
	ZSTD_outBuffer output;
	int compressResult;
	size_t oldPos;
	size_t zresult;

	if (self->closed) {
		PyErr_SetString(PyExc_ValueError, "stream is closed");
		return NULL;
	}

	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n:read1", kwlist, &size)) {
		return NULL;
	}

	if (size < -1) {
		PyErr_SetString(PyExc_ValueError, "cannot read negative amounts less than -1");
		return NULL;
	}

	if (self->finishedOutput || size == 0) {
		return PyBytes_FromStringAndSize("", 0);
	}

	if (size == -1) {
		size = ZSTD_CStreamOutSize();
	}

	result = PyBytes_FromStringAndSize(NULL, size);
	if (NULL == result) {
		return NULL;
	}

	PyBytes_AsStringAndSize(result, &resultBuffer, &resultSize);

	output.dst = resultBuffer;
	output.size = resultSize;
	output.pos = 0;

	/* read1() is supposed to use at most 1 read() from the underlying stream.
	   However, we can't satisfy this requirement with compression because
	   not every input will generate output. We /could/ flush the compressor,
	   but this may not be desirable. We allow multiple read() from the
	   underlying stream. But unlike read(), we return as soon as output data
	   is available.
	*/

	compressResult = compress_input(self, &output);

	if (-1 == compressResult) {
		Py_XDECREF(result);
		return NULL;
	}
	else if (0 == compressResult || 1 == compressResult) { }
	else {
		assert(0);
	}

	if (output.pos) {
		goto finally;
	}

	while (!self->finishedInput) {
		int readResult = read_compressor_input(self);

		if (-1 == readResult) {
			Py_XDECREF(result);
			return NULL;
		}
		else if (0 == readResult || 1 == readResult) { }
		else {
			assert(0);
		}

		compressResult = compress_input(self, &output);

		if (-1 == compressResult) {
			Py_XDECREF(result);
			return NULL;
		}
		else if (0 == compressResult || 1 == compressResult) { }
		else {
			assert(0);
		}

		if (output.pos) {
			goto finally;
		}
	}

	/* EOF */
	oldPos = output.pos;

	zresult = ZSTD_compressStream2(self->compressor->cctx, &output, &self->input,
        ZSTD_e_end);

	self->bytesCompressed += output.pos - oldPos;

	if (ZSTD_isError(zresult)) {
		PyErr_Format(ZstdError, "error ending compression stream: %s",
		    ZSTD_getErrorName(zresult));
		Py_XDECREF(result);
		return NULL;
	}

	if (zresult == 0) {
		self->finishedOutput = 1;
	}

finally:
	if (result) {
		if (safe_pybytes_resize(&result, output.pos)) {
			Py_XDECREF(result);
			return NULL;
		}
	}

	return result;
}

static PyObject* reader_readall(PyObject* self) {
	PyObject* chunks = NULL;
	PyObject* empty = NULL;
	PyObject* result = NULL;

	/* Our strategy is to collect chunks into a list then join all the
	 * chunks at the end. We could potentially use e.g. an io.BytesIO. But
	 * this feels simple enough to implement and avoids potentially expensive
	 * reallocations of large buffers.
	 */
	chunks = PyList_New(0);
	if (NULL == chunks) {
		return NULL;
	}

	while (1) {
		PyObject* chunk = PyObject_CallMethod(self, "read", "i", 1048576);
		if (NULL == chunk) {
			Py_DECREF(chunks);
			return NULL;
		}

		if (!PyBytes_Size(chunk)) {
			Py_DECREF(chunk);
			break;
		}

		if (PyList_Append(chunks, chunk)) {
			Py_DECREF(chunk);
			Py_DECREF(chunks);
			return NULL;
		}

		Py_DECREF(chunk);
	}

	empty = PyBytes_FromStringAndSize("", 0);
	if (NULL == empty) {
		Py_DECREF(chunks);
		return NULL;
	}

	result = PyObject_CallMethod(empty, "join", "O", chunks);

	Py_DECREF(empty);
	Py_DECREF(chunks);

	return result;
}

static PyObject* reader_readinto(ZstdCompressionReader* self, PyObject* args) {
	Py_buffer dest;
	ZSTD_outBuffer output;
	int readResult, compressResult;
	PyObject* result = NULL;
	size_t zresult;
	size_t oldPos;

	if (self->closed) {
		PyErr_SetString(PyExc_ValueError, "stream is closed");
		return NULL;
	}

	if (self->finishedOutput) {
		return PyLong_FromLong(0);
	}

	if (!PyArg_ParseTuple(args, "w*:readinto", &dest)) {
		return NULL;
	}

	if (!PyBuffer_IsContiguous(&dest, 'C') || dest.ndim > 1) {
		PyErr_SetString(PyExc_ValueError,
		    "destination buffer should be contiguous and have at most one dimension");
		goto finally;
	}

	output.dst = dest.buf;
	output.size = dest.len;
	output.pos = 0;

	compressResult = compress_input(self, &output);

	if (-1 == compressResult) {
		goto finally;
	}
	else if (0 == compressResult) {	}
	else if (1 == compressResult) {
		result = PyLong_FromSize_t(output.pos);
		goto finally;
	}
	else {
		assert(0);
	}

	while (!self->finishedInput) {
		readResult = read_compressor_input(self);

		if (-1 == readResult) {
			goto finally;
		}
		else if (0 == readResult || 1 == readResult) {}
		else {
			assert(0);
		}

		compressResult = compress_input(self, &output);

		if (-1 == compressResult) {
			goto finally;
		}
		else if (0 == compressResult) { }
		else if (1 == compressResult) {
			result = PyLong_FromSize_t(output.pos);
			goto finally;
		}
		else {
			assert(0);
		}
	}

	/* EOF */
	oldPos = output.pos;

	zresult = ZSTD_compressStream2(self->compressor->cctx, &output, &self->input,
	    ZSTD_e_end);

	self->bytesCompressed += self->output.pos - oldPos;

	if (ZSTD_isError(zresult)) {
		PyErr_Format(ZstdError, "error ending compression stream: %s",
		    ZSTD_getErrorName(zresult));
		goto finally;
	}

	assert(output.pos);

	if (0 == zresult) {
		self->finishedOutput = 1;
	}

	result = PyLong_FromSize_t(output.pos);

finally:
	PyBuffer_Release(&dest);

	return result;
}

static PyObject* reader_readinto1(ZstdCompressionReader* self, PyObject* args) {
	Py_buffer dest;
	PyObject* result = NULL;
	ZSTD_outBuffer output;
	int compressResult;
	size_t oldPos;
	size_t zresult;

	if (self->closed) {
		PyErr_SetString(PyExc_ValueError, "stream is closed");
		return NULL;
	}

	if (self->finishedOutput) {
		return PyLong_FromLong(0);
	}

	if (!PyArg_ParseTuple(args, "w*:readinto1", &dest)) {
		return NULL;
	}

	if (!PyBuffer_IsContiguous(&dest, 'C') || dest.ndim > 1) {
		PyErr_SetString(PyExc_ValueError,
		    "destination buffer should be contiguous and have at most one dimension");
		goto finally;
	}

	output.dst = dest.buf;
	output.size = dest.len;
	output.pos = 0;

	compressResult = compress_input(self, &output);

	if (-1 == compressResult) {
		goto finally;
	}
	else if (0 == compressResult || 1 == compressResult) { }
	else {
		assert(0);
	}

	if (output.pos) {
		result = PyLong_FromSize_t(output.pos);
		goto finally;
	}

	while (!self->finishedInput) {
		int readResult = read_compressor_input(self);

		if (-1 == readResult) {
			goto finally;
		}
		else if (0 == readResult || 1 == readResult) { }
		else {
			assert(0);
		}

		compressResult = compress_input(self, &output);

		if (-1 == compressResult) {
			goto finally;
		}
		else if (0 == compressResult) { }
		else if (1 == compressResult) {
			result = PyLong_FromSize_t(output.pos);
			goto finally;
		}
		else {
			assert(0);
		}

		/* If we produced output and we're not done with input, emit
		 * that output now, as we've hit restrictions of read1().
		 */
		if (output.pos && !self->finishedInput) {
			result = PyLong_FromSize_t(output.pos);
			goto finally;
		}

		/* Otherwise we either have no output or we've exhausted the
		 * input. Either we try to get more input or we fall through
		 * to EOF below */
	}

	/* EOF */
	oldPos = output.pos;

	zresult = ZSTD_compressStream2(self->compressor->cctx, &output, &self->input,
	    ZSTD_e_end);

	self->bytesCompressed += self->output.pos - oldPos;

	if (ZSTD_isError(zresult)) {
		PyErr_Format(ZstdError, "error ending compression stream: %s",
		    ZSTD_getErrorName(zresult));
		goto finally;
	}

	assert(output.pos);

	if (0 == zresult) {
		self->finishedOutput = 1;
	}

	result = PyLong_FromSize_t(output.pos);

finally:
	PyBuffer_Release(&dest);

	return result;
}

static PyObject* reader_iter(PyObject* self) {
	set_unsupported_operation();
	return NULL;
}

static PyObject* reader_iternext(PyObject* self) {
	set_unsupported_operation();
	return NULL;
}

static PyMethodDef reader_methods[] = {
	{ "__enter__", (PyCFunction)reader_enter, METH_NOARGS,
	PyDoc_STR("Enter a compression context") },
	{ "__exit__", (PyCFunction)reader_exit, METH_VARARGS,
	PyDoc_STR("Exit a compression context") },
	{ "close", (PyCFunction)reader_close, METH_NOARGS,
	PyDoc_STR("Close the stream so it cannot perform any more operations") },
	{ "flush", (PyCFunction)reader_flush, METH_NOARGS, PyDoc_STR("no-ops") },
	{ "isatty", (PyCFunction)reader_isatty, METH_NOARGS, PyDoc_STR("Returns False") },
	{ "readable", (PyCFunction)reader_readable, METH_NOARGS,
	PyDoc_STR("Returns True") },
	{ "read", (PyCFunction)reader_read, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("read compressed data") },
	{ "read1", (PyCFunction)reader_read1, METH_VARARGS | METH_KEYWORDS, NULL },
	{ "readall", (PyCFunction)reader_readall, METH_NOARGS, PyDoc_STR("Not implemented") },
	{ "readinto", (PyCFunction)reader_readinto, METH_VARARGS, NULL },
	{ "readinto1", (PyCFunction)reader_readinto1, METH_VARARGS, NULL },
	{ "readline", (PyCFunction)reader_readline, METH_VARARGS, PyDoc_STR("Not implemented") },
	{ "readlines", (PyCFunction)reader_readlines, METH_VARARGS, PyDoc_STR("Not implemented") },
	{ "seekable", (PyCFunction)reader_seekable, METH_NOARGS,
	PyDoc_STR("Returns False") },
	{ "tell", (PyCFunction)reader_tell, METH_NOARGS,
	PyDoc_STR("Returns current number of bytes compressed") },
	{ "writable", (PyCFunction)reader_writable, METH_NOARGS,
	PyDoc_STR("Returns False") },
	{ "write", reader_write, METH_VARARGS, PyDoc_STR("Raises OSError") },
	{ "writelines", reader_writelines, METH_VARARGS, PyDoc_STR("Not implemented") },
	{ NULL, NULL }
};

static PyMemberDef reader_members[] = {
	{ "closed", T_BOOL, offsetof(ZstdCompressionReader, closed),
	  READONLY, "whether stream is closed" },
	{ NULL }
};

PyTypeObject ZstdCompressionReaderType = {
	PyVarObject_HEAD_INIT(NULL, 0)
	"zstd.ZstdCompressionReader", /* tp_name */
	sizeof(ZstdCompressionReader), /* tp_basicsize */
	0, /* tp_itemsize */
	(destructor)reader_dealloc, /* tp_dealloc */
	0, /* tp_print */
	0, /* tp_getattr */
	0, /* tp_setattr */
	0, /* tp_compare */
	0, /* tp_repr */
	0, /* tp_as_number */
	0, /* tp_as_sequence */
	0, /* tp_as_mapping */
	0, /* tp_hash */
	0, /* tp_call */
	0, /* tp_str */
	0, /* tp_getattro */
	0, /* tp_setattro */
	0, /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT, /* tp_flags */
	0, /* tp_doc */
	0, /* tp_traverse */
	0, /* tp_clear */
	0, /* tp_richcompare */
	0, /* tp_weaklistoffset */
	reader_iter, /* tp_iter */
	reader_iternext, /* tp_iternext */
	reader_methods, /* tp_methods */
	reader_members, /* tp_members */
	0, /* tp_getset */
	0, /* tp_base */
	0, /* tp_dict */
	0, /* tp_descr_get */
	0, /* tp_descr_set */
	0, /* tp_dictoffset */
	0, /* tp_init */
	0, /* tp_alloc */
	PyType_GenericNew, /* tp_new */
};

void compressionreader_module_init(PyObject* mod) {
	/* TODO make reader a sub-class of io.RawIOBase */

	Py_SET_TYPE(&ZstdCompressionReaderType, &PyType_Type);
	if (PyType_Ready(&ZstdCompressionReaderType) < 0) {
		return;
	}
}