comparison contrib/python-zstandard/tests/test_compressor.py @ 30435:b86a448a2965

zstd: vendor python-zstandard 0.5.0 As the commit message for the previous changeset says, we wish for zstd to be a 1st class citizen in Mercurial. To make that happen, we need to enable Python to talk to the zstd C API. And that requires bindings. This commit vendors a copy of existing Python bindings. Why do we need to vendor? As the commit message of the previous commit says, relying on systems in the wild to have the bindings or zstd present is a losing proposition. By distributing the zstd and bindings with Mercurial, we significantly increase our chances that zstd will work. Since zstd will deliver a better end-user experience by achieving better performance, this benefits our users. Another reason is that the Python bindings still aren't stable and the API is somewhat fluid. While Mercurial could be coded to target multiple versions of the Python bindings, it is safer to bundle an explicit, known working version. The added Python bindings are mostly a fully-featured interface to the zstd C API. They allow one-shot operations, streaming, reading and writing from objects implements the file object protocol, dictionary compression, control over low-level compression parameters, and more. The Python bindings work on Python 2.6, 2.7, and 3.3+ and have been tested on Linux and Windows. There are CFFI bindings, but they are lacking compared to the C extension. Upstream work will be needed before we can support zstd with PyPy. But it will be possible. The files added in this commit come from Git commit e637c1b214d5f869cf8116c550dcae23ec13b677 from https://github.com/indygreg/python-zstandard and are added without modifications. Some files from the upstream repository have been omitted, namely files related to continuous integration. In the spirit of full disclosure, I'm the maintainer of the "python-zstandard" project and have authored 100% of the code added in this commit. Unfortunately, the Python bindings have not been formally code reviewed by anyone. While I've tested much of the code thoroughly (I even have tests that fuzz APIs), there's a good chance there are bugs, memory leaks, not well thought out APIs, etc. If someone wants to review the code and send feedback to the GitHub project, it would be greatly appreciated. Despite my involvement with both projects, my opinions of code style differ from Mercurial's. The code in this commit introduces numerous code style violations in Mercurial's linters. So, the code is excluded from most lints. However, some violations I agree with. These have been added to the known violations ignore list for now.
author Gregory Szorc <gregory.szorc@gmail.com>
date Thu, 10 Nov 2016 22:15:58 -0800
parents
children b54a2984cdd4
comparison
equal deleted inserted replaced
30434:2e484bdea8c4 30435:b86a448a2965
1 import hashlib
2 import io
3 import struct
4 import sys
5
6 try:
7 import unittest2 as unittest
8 except ImportError:
9 import unittest
10
11 import zstd
12
13 from .common import OpCountingBytesIO
14
15
16 if sys.version_info[0] >= 3:
17 next = lambda it: it.__next__()
18 else:
19 next = lambda it: it.next()
20
21
22 class TestCompressor(unittest.TestCase):
23 def test_level_bounds(self):
24 with self.assertRaises(ValueError):
25 zstd.ZstdCompressor(level=0)
26
27 with self.assertRaises(ValueError):
28 zstd.ZstdCompressor(level=23)
29
30
31 class TestCompressor_compress(unittest.TestCase):
32 def test_compress_empty(self):
33 cctx = zstd.ZstdCompressor(level=1)
34 cctx.compress(b'')
35
36 cctx = zstd.ZstdCompressor(level=22)
37 cctx.compress(b'')
38
39 def test_compress_empty(self):
40 cctx = zstd.ZstdCompressor(level=1)
41 self.assertEqual(cctx.compress(b''),
42 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
43
44 def test_compress_large(self):
45 chunks = []
46 for i in range(255):
47 chunks.append(struct.Struct('>B').pack(i) * 16384)
48
49 cctx = zstd.ZstdCompressor(level=3)
50 result = cctx.compress(b''.join(chunks))
51 self.assertEqual(len(result), 999)
52 self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd')
53
54 def test_write_checksum(self):
55 cctx = zstd.ZstdCompressor(level=1)
56 no_checksum = cctx.compress(b'foobar')
57 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
58 with_checksum = cctx.compress(b'foobar')
59
60 self.assertEqual(len(with_checksum), len(no_checksum) + 4)
61
62 def test_write_content_size(self):
63 cctx = zstd.ZstdCompressor(level=1)
64 no_size = cctx.compress(b'foobar' * 256)
65 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
66 with_size = cctx.compress(b'foobar' * 256)
67
68 self.assertEqual(len(with_size), len(no_size) + 1)
69
70 def test_no_dict_id(self):
71 samples = []
72 for i in range(128):
73 samples.append(b'foo' * 64)
74 samples.append(b'bar' * 64)
75 samples.append(b'foobar' * 64)
76
77 d = zstd.train_dictionary(1024, samples)
78
79 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
80 with_dict_id = cctx.compress(b'foobarfoobar')
81
82 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
83 no_dict_id = cctx.compress(b'foobarfoobar')
84
85 self.assertEqual(len(with_dict_id), len(no_dict_id) + 4)
86
87 def test_compress_dict_multiple(self):
88 samples = []
89 for i in range(128):
90 samples.append(b'foo' * 64)
91 samples.append(b'bar' * 64)
92 samples.append(b'foobar' * 64)
93
94 d = zstd.train_dictionary(8192, samples)
95
96 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
97
98 for i in range(32):
99 cctx.compress(b'foo bar foobar foo bar foobar')
100
101
102 class TestCompressor_compressobj(unittest.TestCase):
103 def test_compressobj_empty(self):
104 cctx = zstd.ZstdCompressor(level=1)
105 cobj = cctx.compressobj()
106 self.assertEqual(cobj.compress(b''), b'')
107 self.assertEqual(cobj.flush(),
108 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
109
110 def test_compressobj_large(self):
111 chunks = []
112 for i in range(255):
113 chunks.append(struct.Struct('>B').pack(i) * 16384)
114
115 cctx = zstd.ZstdCompressor(level=3)
116 cobj = cctx.compressobj()
117
118 result = cobj.compress(b''.join(chunks)) + cobj.flush()
119 self.assertEqual(len(result), 999)
120 self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd')
121
122 def test_write_checksum(self):
123 cctx = zstd.ZstdCompressor(level=1)
124 cobj = cctx.compressobj()
125 no_checksum = cobj.compress(b'foobar') + cobj.flush()
126 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
127 cobj = cctx.compressobj()
128 with_checksum = cobj.compress(b'foobar') + cobj.flush()
129
130 self.assertEqual(len(with_checksum), len(no_checksum) + 4)
131
132 def test_write_content_size(self):
133 cctx = zstd.ZstdCompressor(level=1)
134 cobj = cctx.compressobj(size=len(b'foobar' * 256))
135 no_size = cobj.compress(b'foobar' * 256) + cobj.flush()
136 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
137 cobj = cctx.compressobj(size=len(b'foobar' * 256))
138 with_size = cobj.compress(b'foobar' * 256) + cobj.flush()
139
140 self.assertEqual(len(with_size), len(no_size) + 1)
141
142 def test_compress_after_flush(self):
143 cctx = zstd.ZstdCompressor()
144 cobj = cctx.compressobj()
145
146 cobj.compress(b'foo')
147 cobj.flush()
148
149 with self.assertRaisesRegexp(zstd.ZstdError, 'cannot call compress\(\) after flush'):
150 cobj.compress(b'foo')
151
152 with self.assertRaisesRegexp(zstd.ZstdError, 'flush\(\) already called'):
153 cobj.flush()
154
155
156 class TestCompressor_copy_stream(unittest.TestCase):
157 def test_no_read(self):
158 source = object()
159 dest = io.BytesIO()
160
161 cctx = zstd.ZstdCompressor()
162 with self.assertRaises(ValueError):
163 cctx.copy_stream(source, dest)
164
165 def test_no_write(self):
166 source = io.BytesIO()
167 dest = object()
168
169 cctx = zstd.ZstdCompressor()
170 with self.assertRaises(ValueError):
171 cctx.copy_stream(source, dest)
172
173 def test_empty(self):
174 source = io.BytesIO()
175 dest = io.BytesIO()
176
177 cctx = zstd.ZstdCompressor(level=1)
178 r, w = cctx.copy_stream(source, dest)
179 self.assertEqual(int(r), 0)
180 self.assertEqual(w, 9)
181
182 self.assertEqual(dest.getvalue(),
183 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
184
185 def test_large_data(self):
186 source = io.BytesIO()
187 for i in range(255):
188 source.write(struct.Struct('>B').pack(i) * 16384)
189 source.seek(0)
190
191 dest = io.BytesIO()
192 cctx = zstd.ZstdCompressor()
193 r, w = cctx.copy_stream(source, dest)
194
195 self.assertEqual(r, 255 * 16384)
196 self.assertEqual(w, 999)
197
198 def test_write_checksum(self):
199 source = io.BytesIO(b'foobar')
200 no_checksum = io.BytesIO()
201
202 cctx = zstd.ZstdCompressor(level=1)
203 cctx.copy_stream(source, no_checksum)
204
205 source.seek(0)
206 with_checksum = io.BytesIO()
207 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
208 cctx.copy_stream(source, with_checksum)
209
210 self.assertEqual(len(with_checksum.getvalue()),
211 len(no_checksum.getvalue()) + 4)
212
213 def test_write_content_size(self):
214 source = io.BytesIO(b'foobar' * 256)
215 no_size = io.BytesIO()
216
217 cctx = zstd.ZstdCompressor(level=1)
218 cctx.copy_stream(source, no_size)
219
220 source.seek(0)
221 with_size = io.BytesIO()
222 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
223 cctx.copy_stream(source, with_size)
224
225 # Source content size is unknown, so no content size written.
226 self.assertEqual(len(with_size.getvalue()),
227 len(no_size.getvalue()))
228
229 source.seek(0)
230 with_size = io.BytesIO()
231 cctx.copy_stream(source, with_size, size=len(source.getvalue()))
232
233 # We specified source size, so content size header is present.
234 self.assertEqual(len(with_size.getvalue()),
235 len(no_size.getvalue()) + 1)
236
237 def test_read_write_size(self):
238 source = OpCountingBytesIO(b'foobarfoobar')
239 dest = OpCountingBytesIO()
240 cctx = zstd.ZstdCompressor()
241 r, w = cctx.copy_stream(source, dest, read_size=1, write_size=1)
242
243 self.assertEqual(r, len(source.getvalue()))
244 self.assertEqual(w, 21)
245 self.assertEqual(source._read_count, len(source.getvalue()) + 1)
246 self.assertEqual(dest._write_count, len(dest.getvalue()))
247
248
249 def compress(data, level):
250 buffer = io.BytesIO()
251 cctx = zstd.ZstdCompressor(level=level)
252 with cctx.write_to(buffer) as compressor:
253 compressor.write(data)
254 return buffer.getvalue()
255
256
257 class TestCompressor_write_to(unittest.TestCase):
258 def test_empty(self):
259 self.assertEqual(compress(b'', 1),
260 b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
261
262 def test_multiple_compress(self):
263 buffer = io.BytesIO()
264 cctx = zstd.ZstdCompressor(level=5)
265 with cctx.write_to(buffer) as compressor:
266 compressor.write(b'foo')
267 compressor.write(b'bar')
268 compressor.write(b'x' * 8192)
269
270 result = buffer.getvalue()
271 self.assertEqual(result,
272 b'\x28\xb5\x2f\xfd\x00\x50\x75\x00\x00\x38\x66\x6f'
273 b'\x6f\x62\x61\x72\x78\x01\x00\xfc\xdf\x03\x23')
274
275 def test_dictionary(self):
276 samples = []
277 for i in range(128):
278 samples.append(b'foo' * 64)
279 samples.append(b'bar' * 64)
280 samples.append(b'foobar' * 64)
281
282 d = zstd.train_dictionary(8192, samples)
283
284 buffer = io.BytesIO()
285 cctx = zstd.ZstdCompressor(level=9, dict_data=d)
286 with cctx.write_to(buffer) as compressor:
287 compressor.write(b'foo')
288 compressor.write(b'bar')
289 compressor.write(b'foo' * 16384)
290
291 compressed = buffer.getvalue()
292 h = hashlib.sha1(compressed).hexdigest()
293 self.assertEqual(h, '1c5bcd25181bcd8c1a73ea8773323e0056129f92')
294
295 def test_compression_params(self):
296 params = zstd.CompressionParameters(20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST)
297
298 buffer = io.BytesIO()
299 cctx = zstd.ZstdCompressor(compression_params=params)
300 with cctx.write_to(buffer) as compressor:
301 compressor.write(b'foo')
302 compressor.write(b'bar')
303 compressor.write(b'foobar' * 16384)
304
305 compressed = buffer.getvalue()
306 h = hashlib.sha1(compressed).hexdigest()
307 self.assertEqual(h, '1ae31f270ed7de14235221a604b31ecd517ebd99')
308
309 def test_write_checksum(self):
310 no_checksum = io.BytesIO()
311 cctx = zstd.ZstdCompressor(level=1)
312 with cctx.write_to(no_checksum) as compressor:
313 compressor.write(b'foobar')
314
315 with_checksum = io.BytesIO()
316 cctx = zstd.ZstdCompressor(level=1, write_checksum=True)
317 with cctx.write_to(with_checksum) as compressor:
318 compressor.write(b'foobar')
319
320 self.assertEqual(len(with_checksum.getvalue()),
321 len(no_checksum.getvalue()) + 4)
322
323 def test_write_content_size(self):
324 no_size = io.BytesIO()
325 cctx = zstd.ZstdCompressor(level=1)
326 with cctx.write_to(no_size) as compressor:
327 compressor.write(b'foobar' * 256)
328
329 with_size = io.BytesIO()
330 cctx = zstd.ZstdCompressor(level=1, write_content_size=True)
331 with cctx.write_to(with_size) as compressor:
332 compressor.write(b'foobar' * 256)
333
334 # Source size is not known in streaming mode, so header not
335 # written.
336 self.assertEqual(len(with_size.getvalue()),
337 len(no_size.getvalue()))
338
339 # Declaring size will write the header.
340 with_size = io.BytesIO()
341 with cctx.write_to(with_size, size=len(b'foobar' * 256)) as compressor:
342 compressor.write(b'foobar' * 256)
343
344 self.assertEqual(len(with_size.getvalue()),
345 len(no_size.getvalue()) + 1)
346
347 def test_no_dict_id(self):
348 samples = []
349 for i in range(128):
350 samples.append(b'foo' * 64)
351 samples.append(b'bar' * 64)
352 samples.append(b'foobar' * 64)
353
354 d = zstd.train_dictionary(1024, samples)
355
356 with_dict_id = io.BytesIO()
357 cctx = zstd.ZstdCompressor(level=1, dict_data=d)
358 with cctx.write_to(with_dict_id) as compressor:
359 compressor.write(b'foobarfoobar')
360
361 cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
362 no_dict_id = io.BytesIO()
363 with cctx.write_to(no_dict_id) as compressor:
364 compressor.write(b'foobarfoobar')
365
366 self.assertEqual(len(with_dict_id.getvalue()),
367 len(no_dict_id.getvalue()) + 4)
368
369 def test_memory_size(self):
370 cctx = zstd.ZstdCompressor(level=3)
371 buffer = io.BytesIO()
372 with cctx.write_to(buffer) as compressor:
373 size = compressor.memory_size()
374
375 self.assertGreater(size, 100000)
376
377 def test_write_size(self):
378 cctx = zstd.ZstdCompressor(level=3)
379 dest = OpCountingBytesIO()
380 with cctx.write_to(dest, write_size=1) as compressor:
381 compressor.write(b'foo')
382 compressor.write(b'bar')
383 compressor.write(b'foobar')
384
385 self.assertEqual(len(dest.getvalue()), dest._write_count)
386
387
388 class TestCompressor_read_from(unittest.TestCase):
389 def test_type_validation(self):
390 cctx = zstd.ZstdCompressor()
391
392 # Object with read() works.
393 cctx.read_from(io.BytesIO())
394
395 # Buffer protocol works.
396 cctx.read_from(b'foobar')
397
398 with self.assertRaisesRegexp(ValueError, 'must pass an object with a read'):
399 cctx.read_from(True)
400
401 def test_read_empty(self):
402 cctx = zstd.ZstdCompressor(level=1)
403
404 source = io.BytesIO()
405 it = cctx.read_from(source)
406 chunks = list(it)
407 self.assertEqual(len(chunks), 1)
408 compressed = b''.join(chunks)
409 self.assertEqual(compressed, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')
410
411 # And again with the buffer protocol.
412 it = cctx.read_from(b'')
413 chunks = list(it)
414 self.assertEqual(len(chunks), 1)
415 compressed2 = b''.join(chunks)
416 self.assertEqual(compressed2, compressed)
417
418 def test_read_large(self):
419 cctx = zstd.ZstdCompressor(level=1)
420
421 source = io.BytesIO()
422 source.write(b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE)
423 source.write(b'o')
424 source.seek(0)
425
426 # Creating an iterator should not perform any compression until
427 # first read.
428 it = cctx.read_from(source, size=len(source.getvalue()))
429 self.assertEqual(source.tell(), 0)
430
431 # We should have exactly 2 output chunks.
432 chunks = []
433 chunk = next(it)
434 self.assertIsNotNone(chunk)
435 self.assertEqual(source.tell(), zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE)
436 chunks.append(chunk)
437 chunk = next(it)
438 self.assertIsNotNone(chunk)
439 chunks.append(chunk)
440
441 self.assertEqual(source.tell(), len(source.getvalue()))
442
443 with self.assertRaises(StopIteration):
444 next(it)
445
446 # And again for good measure.
447 with self.assertRaises(StopIteration):
448 next(it)
449
450 # We should get the same output as the one-shot compression mechanism.
451 self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue()))
452
453 # Now check the buffer protocol.
454 it = cctx.read_from(source.getvalue())
455 chunks = list(it)
456 self.assertEqual(len(chunks), 2)
457 self.assertEqual(b''.join(chunks), cctx.compress(source.getvalue()))
458
459 def test_read_write_size(self):
460 source = OpCountingBytesIO(b'foobarfoobar')
461 cctx = zstd.ZstdCompressor(level=3)
462 for chunk in cctx.read_from(source, read_size=1, write_size=1):
463 self.assertEqual(len(chunk), 1)
464
465 self.assertEqual(source._read_count, len(source.getvalue()) + 1)