comparison contrib/python-zstandard/c-ext/compressiondict.c @ 37495:b1fb341d8a61

zstandard: vendor python-zstandard 0.9.0 This was just released. It features a number of goodies. More info at https://gregoryszorc.com/blog/2018/04/09/release-of-python-zstandard-0.9/. The clang-format ignore list was updated to reflect the new source of files. The project contains a vendored copy of zstandard 1.3.4. The old version was 1.1.3. One of the changes between those versions is that zstandard is now dual licensed BSD + GPLv2 and the patent rights grant has been removed. Good riddance. The API should be backwards compatible. So no changes in core should be needed. However, there were a number of changes in the library that we'll want to adapt to. Those will be addressed in subsequent commits. Differential Revision: https://phab.mercurial-scm.org/D3198
author Gregory Szorc <gregory.szorc@gmail.com>
date Mon, 09 Apr 2018 10:13:29 -0700
parents e0dc40530c5a
children 675775c33ab6
comparison
equal deleted inserted replaced
37494:1ce7a55b09d1 37495:b1fb341d8a61
9 #include "python-zstandard.h" 9 #include "python-zstandard.h"
10 10
11 extern PyObject* ZstdError; 11 extern PyObject* ZstdError;
12 12
13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { 13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
14 static char* kwlist[] = {
15 "dict_size",
16 "samples",
17 "selectivity",
18 "level",
19 "notifications",
20 "dict_id",
21 NULL
22 };
23 size_t capacity;
24 PyObject* samples;
25 Py_ssize_t samplesLen;
26 unsigned selectivity = 0;
27 int level = 0;
28 unsigned notifications = 0;
29 unsigned dictID = 0;
30 ZDICT_params_t zparams;
31 Py_ssize_t sampleIndex;
32 Py_ssize_t sampleSize;
33 PyObject* sampleItem;
34 size_t zresult;
35 void* sampleBuffer = NULL;
36 void* sampleOffset;
37 size_t samplesSize = 0;
38 size_t* sampleSizes = NULL;
39 void* dict = NULL;
40 ZstdCompressionDict* result = NULL;
41
42 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary",
43 kwlist,
44 &capacity,
45 &PyList_Type, &samples,
46 &selectivity, &level, &notifications, &dictID)) {
47 return NULL;
48 }
49
50 memset(&zparams, 0, sizeof(zparams));
51
52 zparams.selectivityLevel = selectivity;
53 zparams.compressionLevel = level;
54 zparams.notificationLevel = notifications;
55 zparams.dictID = dictID;
56
57 /* Figure out the size of the raw samples */
58 samplesLen = PyList_Size(samples);
59 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
60 sampleItem = PyList_GetItem(samples, sampleIndex);
61 if (!PyBytes_Check(sampleItem)) {
62 PyErr_SetString(PyExc_ValueError, "samples must be bytes");
63 return NULL;
64 }
65 samplesSize += PyBytes_GET_SIZE(sampleItem);
66 }
67
68 /* Now that we know the total size of the raw simples, we can allocate
69 a buffer for the raw data */
70 sampleBuffer = PyMem_Malloc(samplesSize);
71 if (!sampleBuffer) {
72 PyErr_NoMemory();
73 goto finally;
74 }
75 sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t));
76 if (!sampleSizes) {
77 PyErr_NoMemory();
78 goto finally;
79 }
80
81 sampleOffset = sampleBuffer;
82 /* Now iterate again and assemble the samples in the buffer */
83 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) {
84 sampleItem = PyList_GetItem(samples, sampleIndex);
85 sampleSize = PyBytes_GET_SIZE(sampleItem);
86 sampleSizes[sampleIndex] = sampleSize;
87 memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize);
88 sampleOffset = (char*)sampleOffset + sampleSize;
89 }
90
91 dict = PyMem_Malloc(capacity);
92 if (!dict) {
93 PyErr_NoMemory();
94 goto finally;
95 }
96
97 /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */
98 Py_BEGIN_ALLOW_THREADS
99 zresult = ZDICT_trainFromBuffer_advanced(dict, capacity,
100 sampleBuffer, sampleSizes, (unsigned int)samplesLen,
101 zparams);
102 Py_END_ALLOW_THREADS
103 if (ZDICT_isError(zresult)) {
104 PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult));
105 PyMem_Free(dict);
106 goto finally;
107 }
108
109 result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType);
110 if (!result) {
111 goto finally;
112 }
113
114 result->dictData = dict;
115 result->dictSize = zresult;
116 result->d = 0;
117 result->k = 0;
118
119 finally:
120 PyMem_Free(sampleBuffer);
121 PyMem_Free(sampleSizes);
122
123 return result;
124 }
125
126 ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) {
127 static char* kwlist[] = { 14 static char* kwlist[] = {
128 "dict_size", 15 "dict_size",
129 "samples", 16 "samples",
130 "k", 17 "k",
131 "d", 18 "d",
132 "notifications", 19 "notifications",
133 "dict_id", 20 "dict_id",
134 "level", 21 "level",
135 "optimize",
136 "steps", 22 "steps",
137 "threads", 23 "threads",
138 NULL 24 NULL
139 }; 25 };
140 26
143 unsigned k = 0; 29 unsigned k = 0;
144 unsigned d = 0; 30 unsigned d = 0;
145 unsigned notifications = 0; 31 unsigned notifications = 0;
146 unsigned dictID = 0; 32 unsigned dictID = 0;
147 int level = 0; 33 int level = 0;
148 PyObject* optimize = NULL;
149 unsigned steps = 0; 34 unsigned steps = 0;
150 int threads = 0; 35 int threads = 0;
151 COVER_params_t params; 36 ZDICT_cover_params_t params;
152 Py_ssize_t samplesLen; 37 Py_ssize_t samplesLen;
153 Py_ssize_t i; 38 Py_ssize_t i;
154 size_t samplesSize = 0; 39 size_t samplesSize = 0;
155 void* sampleBuffer = NULL; 40 void* sampleBuffer = NULL;
156 size_t* sampleSizes = NULL; 41 size_t* sampleSizes = NULL;
158 Py_ssize_t sampleSize; 43 Py_ssize_t sampleSize;
159 void* dict = NULL; 44 void* dict = NULL;
160 size_t zresult; 45 size_t zresult;
161 ZstdCompressionDict* result = NULL; 46 ZstdCompressionDict* result = NULL;
162 47
163 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary", 48 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary",
164 kwlist, &capacity, &PyList_Type, &samples, 49 kwlist, &capacity, &PyList_Type, &samples,
165 &k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) { 50 &k, &d, &notifications, &dictID, &level, &steps, &threads)) {
166 return NULL; 51 return NULL;
167 } 52 }
168 53
169 if (threads < 0) { 54 if (threads < 0) {
170 threads = cpu_count(); 55 threads = cpu_count();
173 memset(&params, 0, sizeof(params)); 58 memset(&params, 0, sizeof(params));
174 params.k = k; 59 params.k = k;
175 params.d = d; 60 params.d = d;
176 params.steps = steps; 61 params.steps = steps;
177 params.nbThreads = threads; 62 params.nbThreads = threads;
178 params.notificationLevel = notifications; 63 params.zParams.notificationLevel = notifications;
179 params.dictID = dictID; 64 params.zParams.dictID = dictID;
180 params.compressionLevel = level; 65 params.zParams.compressionLevel = level;
181 66
182 /* Figure out total size of input samples. */ 67 /* Figure out total size of input samples. */
183 samplesLen = PyList_Size(samples); 68 samplesLen = PyList_Size(samples);
184 for (i = 0; i < samplesLen; i++) { 69 for (i = 0; i < samplesLen; i++) {
185 PyObject* sampleItem = PyList_GET_ITEM(samples, i); 70 PyObject* sampleItem = PyList_GET_ITEM(samples, i);
217 PyErr_NoMemory(); 102 PyErr_NoMemory();
218 goto finally; 103 goto finally;
219 } 104 }
220 105
221 Py_BEGIN_ALLOW_THREADS 106 Py_BEGIN_ALLOW_THREADS
222 if (optimize && PyObject_IsTrue(optimize)) { 107 /* No parameters uses the default function, which will use default params
223 zresult = COVER_optimizeTrainFromBuffer(dict, capacity, 108 and call ZDICT_optimizeTrainFromBuffer_cover under the hood. */
109 if (!params.k && !params.d && !params.zParams.compressionLevel
110 && !params.zParams.notificationLevel && !params.zParams.dictID) {
111 zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer,
112 sampleSizes, (unsigned)samplesLen);
113 }
114 /* Use optimize mode if user controlled steps or threads explicitly. */
115 else if (params.steps || params.nbThreads) {
116 zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity,
224 sampleBuffer, sampleSizes, (unsigned)samplesLen, &params); 117 sampleBuffer, sampleSizes, (unsigned)samplesLen, &params);
225 } 118 }
119 /* Non-optimize mode with explicit control. */
226 else { 120 else {
227 zresult = COVER_trainFromBuffer(dict, capacity, 121 zresult = ZDICT_trainFromBuffer_cover(dict, capacity,
228 sampleBuffer, sampleSizes, (unsigned)samplesLen, params); 122 sampleBuffer, sampleSizes, (unsigned)samplesLen, params);
229 } 123 }
230 Py_END_ALLOW_THREADS 124 Py_END_ALLOW_THREADS
231 125
232 if (ZDICT_isError(zresult)) { 126 if (ZDICT_isError(zresult)) {
241 goto finally; 135 goto finally;
242 } 136 }
243 137
244 result->dictData = dict; 138 result->dictData = dict;
245 result->dictSize = zresult; 139 result->dictSize = zresult;
140 result->dictType = ZSTD_dct_fullDict;
246 result->d = params.d; 141 result->d = params.d;
247 result->k = params.k; 142 result->k = params.k;
143 result->cdict = NULL;
144 result->ddict = NULL;
248 145
249 finally: 146 finally:
250 PyMem_Free(sampleBuffer); 147 PyMem_Free(sampleBuffer);
251 PyMem_Free(sampleSizes); 148 PyMem_Free(sampleSizes);
252 149
253 return result; 150 return result;
151 }
152
153 int ensure_ddict(ZstdCompressionDict* dict) {
154 if (dict->ddict) {
155 return 0;
156 }
157
158 Py_BEGIN_ALLOW_THREADS
159 dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize,
160 ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem);
161 Py_END_ALLOW_THREADS
162 if (!dict->ddict) {
163 PyErr_SetString(ZstdError, "could not create decompression dict");
164 return 1;
165 }
166
167 return 0;
254 } 168 }
255 169
256 PyDoc_STRVAR(ZstdCompressionDict__doc__, 170 PyDoc_STRVAR(ZstdCompressionDict__doc__,
257 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n" 171 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n"
258 "\n" 172 "\n"
259 "This type holds the results of a computed Zstandard compression dictionary.\n" 173 "This type holds the results of a computed Zstandard compression dictionary.\n"
260 "Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n" 174 "Instances are obtained by calling ``train_dictionary()`` or by passing\n"
261 "obtained from another source into the constructor.\n" 175 "bytes obtained from another source into the constructor.\n"
262 ); 176 );
263 177
264 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) { 178 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) {
265 const char* source; 179 static char* kwlist[] = {
266 Py_ssize_t sourceSize; 180 "data",
181 "dict_type",
182 NULL
183 };
184
185 int result = -1;
186 Py_buffer source;
187 unsigned dictType = ZSTD_dct_auto;
267 188
268 self->dictData = NULL; 189 self->dictData = NULL;
269 self->dictSize = 0; 190 self->dictSize = 0;
191 self->cdict = NULL;
192 self->ddict = NULL;
270 193
271 #if PY_MAJOR_VERSION >= 3 194 #if PY_MAJOR_VERSION >= 3
272 if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict", 195 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict",
273 #else 196 #else
274 if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict", 197 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict",
275 #endif 198 #endif
276 &source, &sourceSize)) { 199 kwlist, &source, &dictType)) {
277 return -1; 200 return -1;
278 } 201 }
279 202
280 self->dictData = PyMem_Malloc(sourceSize); 203 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) {
204 PyErr_SetString(PyExc_ValueError,
205 "data buffer should be contiguous and have at most one dimension");
206 goto finally;
207 }
208
209 if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent
210 && dictType != ZSTD_dct_fullDict) {
211 PyErr_Format(PyExc_ValueError,
212 "invalid dictionary load mode: %d; must use DICT_TYPE_* constants",
213 dictType);
214 goto finally;
215 }
216
217 self->dictType = dictType;
218
219 self->dictData = PyMem_Malloc(source.len);
281 if (!self->dictData) { 220 if (!self->dictData) {
282 PyErr_NoMemory(); 221 PyErr_NoMemory();
283 return -1; 222 goto finally;
284 } 223 }
285 224
286 memcpy(self->dictData, source, sourceSize); 225 memcpy(self->dictData, source.buf, source.len);
287 self->dictSize = sourceSize; 226 self->dictSize = source.len;
288 227
289 return 0; 228 result = 0;
290 } 229
230 finally:
231 PyBuffer_Release(&source);
232 return result;
233 }
291 234
292 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { 235 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) {
236 if (self->cdict) {
237 ZSTD_freeCDict(self->cdict);
238 self->cdict = NULL;
239 }
240
241 if (self->ddict) {
242 ZSTD_freeDDict(self->ddict);
243 self->ddict = NULL;
244 }
245
293 if (self->dictData) { 246 if (self->dictData) {
294 PyMem_Free(self->dictData); 247 PyMem_Free(self->dictData);
295 self->dictData = NULL; 248 self->dictData = NULL;
296 } 249 }
297 250
298 PyObject_Del(self); 251 PyObject_Del(self);
252 }
253
254 PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__,
255 "Precompute a dictionary so it can be used by multiple compressors.\n"
256 );
257
258 static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) {
259 static char* kwlist[] = {
260 "level",
261 "compression_params",
262 NULL
263 };
264
265 int level = 0;
266 ZstdCompressionParametersObject* compressionParams = NULL;
267 ZSTD_compressionParameters cParams;
268 size_t zresult;
269
270 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist,
271 &level, &ZstdCompressionParametersType, &compressionParams)) {
272 return NULL;
273 }
274
275 if (level && compressionParams) {
276 PyErr_SetString(PyExc_ValueError,
277 "must only specify one of level or compression_params");
278 return NULL;
279 }
280
281 if (!level && !compressionParams) {
282 PyErr_SetString(PyExc_ValueError,
283 "must specify one of level or compression_params");
284 return NULL;
285 }
286
287 if (self->cdict) {
288 zresult = ZSTD_freeCDict(self->cdict);
289 self->cdict = NULL;
290 if (ZSTD_isError(zresult)) {
291 PyErr_Format(ZstdError, "unable to free CDict: %s",
292 ZSTD_getErrorName(zresult));
293 return NULL;
294 }
295 }
296
297 if (level) {
298 cParams = ZSTD_getCParams(level, 0, self->dictSize);
299 }
300 else {
301 cParams.chainLog = compressionParams->chainLog;
302 cParams.hashLog = compressionParams->hashLog;
303 cParams.searchLength = compressionParams->minMatch;
304 cParams.searchLog = compressionParams->searchLog;
305 cParams.strategy = compressionParams->compressionStrategy;
306 cParams.targetLength = compressionParams->targetLength;
307 cParams.windowLog = compressionParams->windowLog;
308 }
309
310 assert(!self->cdict);
311 self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize,
312 ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem);
313
314 if (!self->cdict) {
315 PyErr_SetString(ZstdError, "unable to precompute dictionary");
316 return NULL;
317 }
318
319 Py_RETURN_NONE;
299 } 320 }
300 321
301 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { 322 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) {
302 unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); 323 unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize);
303 324
311 static PyMethodDef ZstdCompressionDict_methods[] = { 332 static PyMethodDef ZstdCompressionDict_methods[] = {
312 { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, 333 { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS,
313 PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, 334 PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") },
314 { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, 335 { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS,
315 PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, 336 PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") },
337 { "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress,
338 METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ },
316 { NULL, NULL } 339 { NULL, NULL }
317 }; 340 };
318 341
319 static PyMemberDef ZstdCompressionDict_members[] = { 342 static PyMemberDef ZstdCompressionDict_members[] = {
320 { "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY, 343 { "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY,