contrib/python-zstandard/tests/test_train_dictionary.py
author Simon Sapin <simon.sapin@octobus.net>
Mon, 13 Sep 2021 18:48:48 +0200
changeset 47965 f9e6f2bb721d
parent 44147 5e84a96d865b
permissions -rw-r--r--
rhg: Don’t compare ambiguous files one byte at a time

Even though the use of `BufReader` reduces the number of syscalls to read the file from disk, `.bytes()` yields a separate `Result` for every byte. Creating those results and dispatching on them is most likely costly.

Instead, this commit opts for simplicity by reading the entire file into memory and comparing a single pair of byte strings. Note that memory already needs to contain the entire previous contents of the file, as read from the filelog. So with an extremely large file this doubles memory use but does not make it grow by orders of magnitude.

At first I wrote code that still avoids reading the entire file into memory and compares one buffer at a time with `BufReader`. Find this code below for posterity. However, its correctness is subtle. I ended up preferring the simplicity of the obviously-correct single comparison.

```rust
let mut reader = BufReader::new(fobj);
let mut expected = &contents_in_p1[..];
loop {
    let buf = reader.fill_buf().when_reading_file(&fs_path)?;
    if buf.is_empty() {
        // Found EOF
        return Ok(expected.is_empty());
    } else if let Some(rest) = expected.drop_prefix(buf) {
        // What we read so far matches the expected content, continue reading
        let buf_len = buf.len();
        reader.consume(buf_len);
        expected = rest
    } else {
        // Found different content
        return Ok(false);
    }
}
```

Differential Revision: https://phab.mercurial-scm.org/D11412
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
     1
import struct
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
     2
import sys
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
     3
import unittest
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
     4
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
     5
import zstandard as zstd
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
     6
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
     7
from .common import (
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
     8
    generate_samples,
30895
c32454d69b85 zstd: vendor python-zstandard 0.7.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30435
diff changeset
     9
    make_cffi,
42937
69de49c4e39c zstandard: vendor python-zstandard 0.12
Gregory Szorc <gregory.szorc@gmail.com>
parents: 40121
diff changeset
    10
    random_input_data,
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    11
    TestCase,
30895
c32454d69b85 zstd: vendor python-zstandard 0.7.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30435
diff changeset
    12
)
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    13
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    14
if sys.version_info[0] >= 3:
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    15
    int_type = int
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    16
else:
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    17
    int_type = long
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    18
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    19
30895
c32454d69b85 zstd: vendor python-zstandard 0.7.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30435
diff changeset
    20
@make_cffi
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    21
class TestTrainDictionary(TestCase):
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    22
    def test_no_args(self):
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    23
        with self.assertRaises(TypeError):
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    24
            zstd.train_dictionary()
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    25
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    26
    def test_bad_args(self):
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    27
        with self.assertRaises(TypeError):
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    28
            zstd.train_dictionary(8192, u"foo")
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    29
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    30
        with self.assertRaises(ValueError):
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    31
            zstd.train_dictionary(8192, [u"foo"])
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    32
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    33
    def test_no_params(self):
42937
69de49c4e39c zstandard: vendor python-zstandard 0.12
Gregory Szorc <gregory.szorc@gmail.com>
parents: 40121
diff changeset
    34
        d = zstd.train_dictionary(8192, random_input_data())
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    35
        self.assertIsInstance(d.dict_id(), int_type)
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    36
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    37
        # The dictionary ID may differ across platforms, so derive the expected header from it.
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    38
        expected = b"\x37\xa4\x30\xec" + struct.pack("<I", d.dict_id())
30435
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    39
b86a448a2965 zstd: vendor python-zstandard 0.5.0
Gregory Szorc <gregory.szorc@gmail.com>
parents:
diff changeset
    40
        data = d.as_bytes()
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    41
        self.assertEqual(data[0:8], expected)
31796
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    42
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    43
    def test_basic(self):
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    44
        d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
31796
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    45
        self.assertIsInstance(d.dict_id(), int_type)
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    46
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    47
        data = d.as_bytes()
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    48
        self.assertEqual(data[0:4], b"\x37\xa4\x30\xec")
31796
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    49
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    50
        self.assertEqual(d.k, 64)
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    51
        self.assertEqual(d.d, 16)
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    52
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    53
    def test_set_dict_id(self):
44147
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    54
        d = zstd.train_dictionary(
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    55
            8192, generate_samples(), k=64, d=16, dict_id=42
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    56
        )
31796
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    57
        self.assertEqual(d.dict_id(), 42)
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    58
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    59
    def test_optimize(self):
44147
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    60
        d = zstd.train_dictionary(
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    61
            8192, generate_samples(), threads=-1, steps=1, d=16
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    62
        )
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    63
40121
73fef626dae3 zstandard: vendor python-zstandard 0.10.1
Gregory Szorc <gregory.szorc@gmail.com>
parents: 37495
diff changeset
    64
        # The resulting k varies by platform, so accept either value.
73fef626dae3 zstandard: vendor python-zstandard 0.10.1
Gregory Szorc <gregory.szorc@gmail.com>
parents: 37495
diff changeset
    65
        self.assertIn(d.k, (50, 2000))
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    66
        self.assertEqual(d.d, 16)
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    67
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    68
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    69
@make_cffi
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    70
class TestCompressionDict(TestCase):
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    71
    def test_bad_mode(self):
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    72
        with self.assertRaisesRegex(ValueError, "invalid dictionary load mode"):
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    73
            zstd.ZstdCompressionDict(b"foo", dict_type=42)
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    74
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    75
    def test_bad_precompute_compress(self):
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    76
        d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
31796
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    77
44147
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    78
        with self.assertRaisesRegex(
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    79
            ValueError, "must specify one of level or "
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    80
        ):
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    81
            d.precompute_compress()
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    82
44147
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    83
        with self.assertRaisesRegex(
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    84
            ValueError, "must only specify one of level or "
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    85
        ):
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    86
            d.precompute_compress(
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    87
                level=3, compression_params=zstd.CompressionParameters()
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    88
            )
31796
e0dc40530c5a zstd: vendor python-zstandard 0.8.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 30895
diff changeset
    89
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    90
    def test_precompute_compress_rawcontent(self):
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    91
        d = zstd.ZstdCompressionDict(
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    92
            b"dictcontent" * 64, dict_type=zstd.DICT_TYPE_RAWCONTENT
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    93
        )
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    94
        d.precompute_compress(level=1)
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
    95
43994
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    96
        d = zstd.ZstdCompressionDict(
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    97
            b"dictcontent" * 64, dict_type=zstd.DICT_TYPE_FULLDICT
de7838053207 zstandard: vendor python-zstandard 0.13.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 42937
diff changeset
    98
        )
44147
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
    99
        with self.assertRaisesRegex(
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
   100
            zstd.ZstdError, "unable to precompute dictionary"
5e84a96d865b python-zstandard: blacken at 80 characters
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43994
diff changeset
   101
        ):
37495
b1fb341d8a61 zstandard: vendor python-zstandard 0.9.0
Gregory Szorc <gregory.szorc@gmail.com>
parents: 31796
diff changeset
   102
            d.precompute_compress(level=1)