Mercurial > hg
comparison contrib/python-zstandard/c-ext/compressiondict.c @ 37495:b1fb341d8a61
zstandard: vendor python-zstandard 0.9.0
This was just released. It features a number of goodies. More info at
https://gregoryszorc.com/blog/2018/04/09/release-of-python-zstandard-0.9/.
The clang-format ignore list was updated to reflect the new source
of files.
The project contains a vendored copy of zstandard 1.3.4. The old
version was 1.1.3. One of the changes between those versions is that
zstandard is now dual licensed BSD + GPLv2 and the patent rights grant
has been removed. Good riddance.
The API should be backwards compatible. So no changes in core
should be needed. However, there were a number of changes in the
library that we'll want to adapt to. Those will be addressed in
subsequent commits.
Differential Revision: https://phab.mercurial-scm.org/D3198
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Mon, 09 Apr 2018 10:13:29 -0700 |
parents | e0dc40530c5a |
children | 675775c33ab6 |
comparison
equal
deleted
inserted
replaced
37494:1ce7a55b09d1 | 37495:b1fb341d8a61 |
---|---|
9 #include "python-zstandard.h" | 9 #include "python-zstandard.h" |
10 | 10 |
11 extern PyObject* ZstdError; | 11 extern PyObject* ZstdError; |
12 | 12 |
13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { | 13 ZstdCompressionDict* train_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { |
14 static char* kwlist[] = { | |
15 "dict_size", | |
16 "samples", | |
17 "selectivity", | |
18 "level", | |
19 "notifications", | |
20 "dict_id", | |
21 NULL | |
22 }; | |
23 size_t capacity; | |
24 PyObject* samples; | |
25 Py_ssize_t samplesLen; | |
26 unsigned selectivity = 0; | |
27 int level = 0; | |
28 unsigned notifications = 0; | |
29 unsigned dictID = 0; | |
30 ZDICT_params_t zparams; | |
31 Py_ssize_t sampleIndex; | |
32 Py_ssize_t sampleSize; | |
33 PyObject* sampleItem; | |
34 size_t zresult; | |
35 void* sampleBuffer = NULL; | |
36 void* sampleOffset; | |
37 size_t samplesSize = 0; | |
38 size_t* sampleSizes = NULL; | |
39 void* dict = NULL; | |
40 ZstdCompressionDict* result = NULL; | |
41 | |
42 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IiII:train_dictionary", | |
43 kwlist, | |
44 &capacity, | |
45 &PyList_Type, &samples, | |
46 &selectivity, &level, &notifications, &dictID)) { | |
47 return NULL; | |
48 } | |
49 | |
50 memset(&zparams, 0, sizeof(zparams)); | |
51 | |
52 zparams.selectivityLevel = selectivity; | |
53 zparams.compressionLevel = level; | |
54 zparams.notificationLevel = notifications; | |
55 zparams.dictID = dictID; | |
56 | |
57 /* Figure out the size of the raw samples */ | |
58 samplesLen = PyList_Size(samples); | |
59 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { | |
60 sampleItem = PyList_GetItem(samples, sampleIndex); | |
61 if (!PyBytes_Check(sampleItem)) { | |
62 PyErr_SetString(PyExc_ValueError, "samples must be bytes"); | |
63 return NULL; | |
64 } | |
65 samplesSize += PyBytes_GET_SIZE(sampleItem); | |
66 } | |
67 | |
68 /* Now that we know the total size of the raw simples, we can allocate | |
69 a buffer for the raw data */ | |
70 sampleBuffer = PyMem_Malloc(samplesSize); | |
71 if (!sampleBuffer) { | |
72 PyErr_NoMemory(); | |
73 goto finally; | |
74 } | |
75 sampleSizes = PyMem_Malloc(samplesLen * sizeof(size_t)); | |
76 if (!sampleSizes) { | |
77 PyErr_NoMemory(); | |
78 goto finally; | |
79 } | |
80 | |
81 sampleOffset = sampleBuffer; | |
82 /* Now iterate again and assemble the samples in the buffer */ | |
83 for (sampleIndex = 0; sampleIndex < samplesLen; sampleIndex++) { | |
84 sampleItem = PyList_GetItem(samples, sampleIndex); | |
85 sampleSize = PyBytes_GET_SIZE(sampleItem); | |
86 sampleSizes[sampleIndex] = sampleSize; | |
87 memcpy(sampleOffset, PyBytes_AS_STRING(sampleItem), sampleSize); | |
88 sampleOffset = (char*)sampleOffset + sampleSize; | |
89 } | |
90 | |
91 dict = PyMem_Malloc(capacity); | |
92 if (!dict) { | |
93 PyErr_NoMemory(); | |
94 goto finally; | |
95 } | |
96 | |
97 /* TODO consider using dup2() to redirect zstd's stderr writing to a buffer */ | |
98 Py_BEGIN_ALLOW_THREADS | |
99 zresult = ZDICT_trainFromBuffer_advanced(dict, capacity, | |
100 sampleBuffer, sampleSizes, (unsigned int)samplesLen, | |
101 zparams); | |
102 Py_END_ALLOW_THREADS | |
103 if (ZDICT_isError(zresult)) { | |
104 PyErr_Format(ZstdError, "Cannot train dict: %s", ZDICT_getErrorName(zresult)); | |
105 PyMem_Free(dict); | |
106 goto finally; | |
107 } | |
108 | |
109 result = PyObject_New(ZstdCompressionDict, &ZstdCompressionDictType); | |
110 if (!result) { | |
111 goto finally; | |
112 } | |
113 | |
114 result->dictData = dict; | |
115 result->dictSize = zresult; | |
116 result->d = 0; | |
117 result->k = 0; | |
118 | |
119 finally: | |
120 PyMem_Free(sampleBuffer); | |
121 PyMem_Free(sampleSizes); | |
122 | |
123 return result; | |
124 } | |
125 | |
126 ZstdCompressionDict* train_cover_dictionary(PyObject* self, PyObject* args, PyObject* kwargs) { | |
127 static char* kwlist[] = { | 14 static char* kwlist[] = { |
128 "dict_size", | 15 "dict_size", |
129 "samples", | 16 "samples", |
130 "k", | 17 "k", |
131 "d", | 18 "d", |
132 "notifications", | 19 "notifications", |
133 "dict_id", | 20 "dict_id", |
134 "level", | 21 "level", |
135 "optimize", | |
136 "steps", | 22 "steps", |
137 "threads", | 23 "threads", |
138 NULL | 24 NULL |
139 }; | 25 }; |
140 | 26 |
143 unsigned k = 0; | 29 unsigned k = 0; |
144 unsigned d = 0; | 30 unsigned d = 0; |
145 unsigned notifications = 0; | 31 unsigned notifications = 0; |
146 unsigned dictID = 0; | 32 unsigned dictID = 0; |
147 int level = 0; | 33 int level = 0; |
148 PyObject* optimize = NULL; | |
149 unsigned steps = 0; | 34 unsigned steps = 0; |
150 int threads = 0; | 35 int threads = 0; |
151 COVER_params_t params; | 36 ZDICT_cover_params_t params; |
152 Py_ssize_t samplesLen; | 37 Py_ssize_t samplesLen; |
153 Py_ssize_t i; | 38 Py_ssize_t i; |
154 size_t samplesSize = 0; | 39 size_t samplesSize = 0; |
155 void* sampleBuffer = NULL; | 40 void* sampleBuffer = NULL; |
156 size_t* sampleSizes = NULL; | 41 size_t* sampleSizes = NULL; |
158 Py_ssize_t sampleSize; | 43 Py_ssize_t sampleSize; |
159 void* dict = NULL; | 44 void* dict = NULL; |
160 size_t zresult; | 45 size_t zresult; |
161 ZstdCompressionDict* result = NULL; | 46 ZstdCompressionDict* result = NULL; |
162 | 47 |
163 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiOIi:train_cover_dictionary", | 48 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nO!|IIIIiIi:train_dictionary", |
164 kwlist, &capacity, &PyList_Type, &samples, | 49 kwlist, &capacity, &PyList_Type, &samples, |
165 &k, &d, &notifications, &dictID, &level, &optimize, &steps, &threads)) { | 50 &k, &d, &notifications, &dictID, &level, &steps, &threads)) { |
166 return NULL; | 51 return NULL; |
167 } | 52 } |
168 | 53 |
169 if (threads < 0) { | 54 if (threads < 0) { |
170 threads = cpu_count(); | 55 threads = cpu_count(); |
173 memset(&params, 0, sizeof(params)); | 58 memset(&params, 0, sizeof(params)); |
174 params.k = k; | 59 params.k = k; |
175 params.d = d; | 60 params.d = d; |
176 params.steps = steps; | 61 params.steps = steps; |
177 params.nbThreads = threads; | 62 params.nbThreads = threads; |
178 params.notificationLevel = notifications; | 63 params.zParams.notificationLevel = notifications; |
179 params.dictID = dictID; | 64 params.zParams.dictID = dictID; |
180 params.compressionLevel = level; | 65 params.zParams.compressionLevel = level; |
181 | 66 |
182 /* Figure out total size of input samples. */ | 67 /* Figure out total size of input samples. */ |
183 samplesLen = PyList_Size(samples); | 68 samplesLen = PyList_Size(samples); |
184 for (i = 0; i < samplesLen; i++) { | 69 for (i = 0; i < samplesLen; i++) { |
185 PyObject* sampleItem = PyList_GET_ITEM(samples, i); | 70 PyObject* sampleItem = PyList_GET_ITEM(samples, i); |
217 PyErr_NoMemory(); | 102 PyErr_NoMemory(); |
218 goto finally; | 103 goto finally; |
219 } | 104 } |
220 | 105 |
221 Py_BEGIN_ALLOW_THREADS | 106 Py_BEGIN_ALLOW_THREADS |
222 if (optimize && PyObject_IsTrue(optimize)) { | 107 /* No parameters uses the default function, which will use default params |
223 zresult = COVER_optimizeTrainFromBuffer(dict, capacity, | 108 and call ZDICT_optimizeTrainFromBuffer_cover under the hood. */ |
109 if (!params.k && !params.d && !params.zParams.compressionLevel | |
110 && !params.zParams.notificationLevel && !params.zParams.dictID) { | |
111 zresult = ZDICT_trainFromBuffer(dict, capacity, sampleBuffer, | |
112 sampleSizes, (unsigned)samplesLen); | |
113 } | |
114 /* Use optimize mode if user controlled steps or threads explicitly. */ | |
115 else if (params.steps || params.nbThreads) { | |
116 zresult = ZDICT_optimizeTrainFromBuffer_cover(dict, capacity, | |
224 sampleBuffer, sampleSizes, (unsigned)samplesLen, &params); | 117 sampleBuffer, sampleSizes, (unsigned)samplesLen, &params); |
225 } | 118 } |
119 /* Non-optimize mode with explicit control. */ | |
226 else { | 120 else { |
227 zresult = COVER_trainFromBuffer(dict, capacity, | 121 zresult = ZDICT_trainFromBuffer_cover(dict, capacity, |
228 sampleBuffer, sampleSizes, (unsigned)samplesLen, params); | 122 sampleBuffer, sampleSizes, (unsigned)samplesLen, params); |
229 } | 123 } |
230 Py_END_ALLOW_THREADS | 124 Py_END_ALLOW_THREADS |
231 | 125 |
232 if (ZDICT_isError(zresult)) { | 126 if (ZDICT_isError(zresult)) { |
241 goto finally; | 135 goto finally; |
242 } | 136 } |
243 | 137 |
244 result->dictData = dict; | 138 result->dictData = dict; |
245 result->dictSize = zresult; | 139 result->dictSize = zresult; |
140 result->dictType = ZSTD_dct_fullDict; | |
246 result->d = params.d; | 141 result->d = params.d; |
247 result->k = params.k; | 142 result->k = params.k; |
143 result->cdict = NULL; | |
144 result->ddict = NULL; | |
248 | 145 |
249 finally: | 146 finally: |
250 PyMem_Free(sampleBuffer); | 147 PyMem_Free(sampleBuffer); |
251 PyMem_Free(sampleSizes); | 148 PyMem_Free(sampleSizes); |
252 | 149 |
253 return result; | 150 return result; |
151 } | |
152 | |
153 int ensure_ddict(ZstdCompressionDict* dict) { | |
154 if (dict->ddict) { | |
155 return 0; | |
156 } | |
157 | |
158 Py_BEGIN_ALLOW_THREADS | |
159 dict->ddict = ZSTD_createDDict_advanced(dict->dictData, dict->dictSize, | |
160 ZSTD_dlm_byRef, dict->dictType, ZSTD_defaultCMem); | |
161 Py_END_ALLOW_THREADS | |
162 if (!dict->ddict) { | |
163 PyErr_SetString(ZstdError, "could not create decompression dict"); | |
164 return 1; | |
165 } | |
166 | |
167 return 0; | |
254 } | 168 } |
255 | 169 |
256 PyDoc_STRVAR(ZstdCompressionDict__doc__, | 170 PyDoc_STRVAR(ZstdCompressionDict__doc__, |
257 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n" | 171 "ZstdCompressionDict(data) - Represents a computed compression dictionary\n" |
258 "\n" | 172 "\n" |
259 "This type holds the results of a computed Zstandard compression dictionary.\n" | 173 "This type holds the results of a computed Zstandard compression dictionary.\n" |
260 "Instances are obtained by calling ``train_dictionary()`` or by passing bytes\n" | 174 "Instances are obtained by calling ``train_dictionary()`` or by passing\n" |
261 "obtained from another source into the constructor.\n" | 175 "bytes obtained from another source into the constructor.\n" |
262 ); | 176 ); |
263 | 177 |
264 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args) { | 178 static int ZstdCompressionDict_init(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { |
265 const char* source; | 179 static char* kwlist[] = { |
266 Py_ssize_t sourceSize; | 180 "data", |
181 "dict_type", | |
182 NULL | |
183 }; | |
184 | |
185 int result = -1; | |
186 Py_buffer source; | |
187 unsigned dictType = ZSTD_dct_auto; | |
267 | 188 |
268 self->dictData = NULL; | 189 self->dictData = NULL; |
269 self->dictSize = 0; | 190 self->dictSize = 0; |
191 self->cdict = NULL; | |
192 self->ddict = NULL; | |
270 | 193 |
271 #if PY_MAJOR_VERSION >= 3 | 194 #if PY_MAJOR_VERSION >= 3 |
272 if (!PyArg_ParseTuple(args, "y#:ZstdCompressionDict", | 195 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|I:ZstdCompressionDict", |
273 #else | 196 #else |
274 if (!PyArg_ParseTuple(args, "s#:ZstdCompressionDict", | 197 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|I:ZstdCompressionDict", |
275 #endif | 198 #endif |
276 &source, &sourceSize)) { | 199 kwlist, &source, &dictType)) { |
277 return -1; | 200 return -1; |
278 } | 201 } |
279 | 202 |
280 self->dictData = PyMem_Malloc(sourceSize); | 203 if (!PyBuffer_IsContiguous(&source, 'C') || source.ndim > 1) { |
204 PyErr_SetString(PyExc_ValueError, | |
205 "data buffer should be contiguous and have at most one dimension"); | |
206 goto finally; | |
207 } | |
208 | |
209 if (dictType != ZSTD_dct_auto && dictType != ZSTD_dct_rawContent | |
210 && dictType != ZSTD_dct_fullDict) { | |
211 PyErr_Format(PyExc_ValueError, | |
212 "invalid dictionary load mode: %d; must use DICT_TYPE_* constants", | |
213 dictType); | |
214 goto finally; | |
215 } | |
216 | |
217 self->dictType = dictType; | |
218 | |
219 self->dictData = PyMem_Malloc(source.len); | |
281 if (!self->dictData) { | 220 if (!self->dictData) { |
282 PyErr_NoMemory(); | 221 PyErr_NoMemory(); |
283 return -1; | 222 goto finally; |
284 } | 223 } |
285 | 224 |
286 memcpy(self->dictData, source, sourceSize); | 225 memcpy(self->dictData, source.buf, source.len); |
287 self->dictSize = sourceSize; | 226 self->dictSize = source.len; |
288 | 227 |
289 return 0; | 228 result = 0; |
290 } | 229 |
230 finally: | |
231 PyBuffer_Release(&source); | |
232 return result; | |
233 } | |
291 | 234 |
292 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { | 235 static void ZstdCompressionDict_dealloc(ZstdCompressionDict* self) { |
236 if (self->cdict) { | |
237 ZSTD_freeCDict(self->cdict); | |
238 self->cdict = NULL; | |
239 } | |
240 | |
241 if (self->ddict) { | |
242 ZSTD_freeDDict(self->ddict); | |
243 self->ddict = NULL; | |
244 } | |
245 | |
293 if (self->dictData) { | 246 if (self->dictData) { |
294 PyMem_Free(self->dictData); | 247 PyMem_Free(self->dictData); |
295 self->dictData = NULL; | 248 self->dictData = NULL; |
296 } | 249 } |
297 | 250 |
298 PyObject_Del(self); | 251 PyObject_Del(self); |
252 } | |
253 | |
254 PyDoc_STRVAR(ZstdCompressionDict_precompute_compress__doc__, | |
255 "Precompute a dictionary so it can be used by multiple compressors.\n" | |
256 ); | |
257 | |
258 static PyObject* ZstdCompressionDict_precompute_compress(ZstdCompressionDict* self, PyObject* args, PyObject* kwargs) { | |
259 static char* kwlist[] = { | |
260 "level", | |
261 "compression_params", | |
262 NULL | |
263 }; | |
264 | |
265 int level = 0; | |
266 ZstdCompressionParametersObject* compressionParams = NULL; | |
267 ZSTD_compressionParameters cParams; | |
268 size_t zresult; | |
269 | |
270 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO!:precompute_compress", kwlist, | |
271 &level, &ZstdCompressionParametersType, &compressionParams)) { | |
272 return NULL; | |
273 } | |
274 | |
275 if (level && compressionParams) { | |
276 PyErr_SetString(PyExc_ValueError, | |
277 "must only specify one of level or compression_params"); | |
278 return NULL; | |
279 } | |
280 | |
281 if (!level && !compressionParams) { | |
282 PyErr_SetString(PyExc_ValueError, | |
283 "must specify one of level or compression_params"); | |
284 return NULL; | |
285 } | |
286 | |
287 if (self->cdict) { | |
288 zresult = ZSTD_freeCDict(self->cdict); | |
289 self->cdict = NULL; | |
290 if (ZSTD_isError(zresult)) { | |
291 PyErr_Format(ZstdError, "unable to free CDict: %s", | |
292 ZSTD_getErrorName(zresult)); | |
293 return NULL; | |
294 } | |
295 } | |
296 | |
297 if (level) { | |
298 cParams = ZSTD_getCParams(level, 0, self->dictSize); | |
299 } | |
300 else { | |
301 cParams.chainLog = compressionParams->chainLog; | |
302 cParams.hashLog = compressionParams->hashLog; | |
303 cParams.searchLength = compressionParams->minMatch; | |
304 cParams.searchLog = compressionParams->searchLog; | |
305 cParams.strategy = compressionParams->compressionStrategy; | |
306 cParams.targetLength = compressionParams->targetLength; | |
307 cParams.windowLog = compressionParams->windowLog; | |
308 } | |
309 | |
310 assert(!self->cdict); | |
311 self->cdict = ZSTD_createCDict_advanced(self->dictData, self->dictSize, | |
312 ZSTD_dlm_byRef, self->dictType, cParams, ZSTD_defaultCMem); | |
313 | |
314 if (!self->cdict) { | |
315 PyErr_SetString(ZstdError, "unable to precompute dictionary"); | |
316 return NULL; | |
317 } | |
318 | |
319 Py_RETURN_NONE; | |
299 } | 320 } |
300 | 321 |
301 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { | 322 static PyObject* ZstdCompressionDict_dict_id(ZstdCompressionDict* self) { |
302 unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); | 323 unsigned dictID = ZDICT_getDictID(self->dictData, self->dictSize); |
303 | 324 |
311 static PyMethodDef ZstdCompressionDict_methods[] = { | 332 static PyMethodDef ZstdCompressionDict_methods[] = { |
312 { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, | 333 { "dict_id", (PyCFunction)ZstdCompressionDict_dict_id, METH_NOARGS, |
313 PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, | 334 PyDoc_STR("dict_id() -- obtain the numeric dictionary ID") }, |
314 { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, | 335 { "as_bytes", (PyCFunction)ZstdCompressionDict_as_bytes, METH_NOARGS, |
315 PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, | 336 PyDoc_STR("as_bytes() -- obtain the raw bytes constituting the dictionary data") }, |
337 { "precompute_compress", (PyCFunction)ZstdCompressionDict_precompute_compress, | |
338 METH_VARARGS | METH_KEYWORDS, ZstdCompressionDict_precompute_compress__doc__ }, | |
316 { NULL, NULL } | 339 { NULL, NULL } |
317 }; | 340 }; |
318 | 341 |
319 static PyMemberDef ZstdCompressionDict_members[] = { | 342 static PyMemberDef ZstdCompressionDict_members[] = { |
320 { "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY, | 343 { "k", T_UINT, offsetof(ZstdCompressionDict, k), READONLY, |