mercurial/utils/cborutil.py
changeset 37711 65a23cc8e75b
child 37898 2ae6a3134362
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/utils/cborutil.py	Sat Apr 14 16:36:15 2018 -0700
@@ -0,0 +1,258 @@
+# cborutil.py - CBOR extensions
+#
+# Copyright 2018 Gregory Szorc <gregory.szorc@gmail.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import struct
+
+from ..thirdparty.cbor.cbor2 import (
+    decoder as decodermod,
+)
+
+# Very short very of RFC 7049...
+#
+# Each item begins with a byte. The 3 high bits of that byte denote the
+# "major type." The lower 5 bits denote the "subtype." Each major type
+# has its own encoding mechanism.
+#
+# Most types have lengths. However, bytestring, string, array, and map
+# can be indefinite length. These are denotes by a subtype with value 31.
+# Sub-components of those types then come afterwards and are terminated
+# by a "break" byte.
+
+MAJOR_TYPE_UINT = 0
+MAJOR_TYPE_NEGINT = 1
+MAJOR_TYPE_BYTESTRING = 2
+MAJOR_TYPE_STRING = 3
+MAJOR_TYPE_ARRAY = 4
+MAJOR_TYPE_MAP = 5
+MAJOR_TYPE_SEMANTIC = 6
+MAJOR_TYPE_SPECIAL = 7
+
+SUBTYPE_MASK = 0b00011111
+
+SUBTYPE_HALF_FLOAT = 25
+SUBTYPE_SINGLE_FLOAT = 26
+SUBTYPE_DOUBLE_FLOAT = 27
+SUBTYPE_INDEFINITE = 31
+
+# Indefinite types begin with their major type ORd with information value 31.
+BEGIN_INDEFINITE_BYTESTRING = struct.pack(
+    r'>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE)
+BEGIN_INDEFINITE_ARRAY = struct.pack(
+    r'>B', MAJOR_TYPE_ARRAY << 5 | SUBTYPE_INDEFINITE)
+BEGIN_INDEFINITE_MAP = struct.pack(
+    r'>B', MAJOR_TYPE_MAP << 5 | SUBTYPE_INDEFINITE)
+
+ENCODED_LENGTH_1 = struct.Struct(r'>B')
+ENCODED_LENGTH_2 = struct.Struct(r'>BB')
+ENCODED_LENGTH_3 = struct.Struct(r'>BH')
+ENCODED_LENGTH_4 = struct.Struct(r'>BL')
+ENCODED_LENGTH_5 = struct.Struct(r'>BQ')
+
+# The break ends an indefinite length item.
+BREAK = b'\xff'
+BREAK_INT = 255
+
+def encodelength(majortype, length):
+    """Obtain a value encoding the major type and its length."""
+    if length < 24:
+        return ENCODED_LENGTH_1.pack(majortype << 5 | length)
+    elif length < 256:
+        return ENCODED_LENGTH_2.pack(majortype << 5 | 24, length)
+    elif length < 65536:
+        return ENCODED_LENGTH_3.pack(majortype << 5 | 25, length)
+    elif length < 4294967296:
+        return ENCODED_LENGTH_4.pack(majortype << 5 | 26, length)
+    else:
+        return ENCODED_LENGTH_5.pack(majortype << 5 | 27, length)
+
+def streamencodebytestring(v):
+    yield encodelength(MAJOR_TYPE_BYTESTRING, len(v))
+    yield v
+
+def streamencodebytestringfromiter(it):
+    """Convert an iterator of chunks to an indefinite bytestring.
+
+    Given an input that is iterable and each element in the iterator is
+    representable as bytes, emit an indefinite length bytestring.
+    """
+    yield BEGIN_INDEFINITE_BYTESTRING
+
+    for chunk in it:
+        yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk))
+        yield chunk
+
+    yield BREAK
+
+def streamencodeindefinitebytestring(source, chunksize=65536):
+    """Given a large source buffer, emit as an indefinite length bytestring.
+
+    This is a generator of chunks constituting the encoded CBOR data.
+    """
+    yield BEGIN_INDEFINITE_BYTESTRING
+
+    i = 0
+    l = len(source)
+
+    while True:
+        chunk = source[i:i + chunksize]
+        i += len(chunk)
+
+        yield encodelength(MAJOR_TYPE_BYTESTRING, len(chunk))
+        yield chunk
+
+        if i >= l:
+            break
+
+    yield BREAK
+
+def streamencodeint(v):
+    if v >= 18446744073709551616 or v < -18446744073709551616:
+        raise ValueError('big integers not supported')
+
+    if v >= 0:
+        yield encodelength(MAJOR_TYPE_UINT, v)
+    else:
+        yield encodelength(MAJOR_TYPE_NEGINT, abs(v) - 1)
+
+def streamencodearray(l):
+    """Encode a known size iterable to an array."""
+
+    yield encodelength(MAJOR_TYPE_ARRAY, len(l))
+
+    for i in l:
+        for chunk in streamencode(i):
+            yield chunk
+
+def streamencodearrayfromiter(it):
+    """Encode an iterator of items to an indefinite length array."""
+
+    yield BEGIN_INDEFINITE_ARRAY
+
+    for i in it:
+        for chunk in streamencode(i):
+            yield chunk
+
+    yield BREAK
+
+def streamencodeset(s):
+    # https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml defines
+    # semantic tag 258 for finite sets.
+    yield encodelength(MAJOR_TYPE_SEMANTIC, 258)
+
+    for chunk in streamencodearray(sorted(s)):
+        yield chunk
+
+def streamencodemap(d):
+    """Encode dictionary to a generator.
+
+    Does not supporting indefinite length dictionaries.
+    """
+    yield encodelength(MAJOR_TYPE_MAP, len(d))
+
+    for key, value in sorted(d.iteritems()):
+        for chunk in streamencode(key):
+            yield chunk
+        for chunk in streamencode(value):
+            yield chunk
+
+def streamencodemapfromiter(it):
+    """Given an iterable of (key, value), encode to an indefinite length map."""
+    yield BEGIN_INDEFINITE_MAP
+
+    for key, value in it:
+        for chunk in streamencode(key):
+            yield chunk
+        for chunk in streamencode(value):
+            yield chunk
+
+    yield BREAK
+
+def streamencodebool(b):
+    # major type 7, simple value 20 and 21.
+    yield b'\xf5' if b else b'\xf4'
+
+def streamencodenone(v):
+    # major type 7, simple value 22.
+    yield b'\xf6'
+
+STREAM_ENCODERS = {
+    bytes: streamencodebytestring,
+    int: streamencodeint,
+    list: streamencodearray,
+    tuple: streamencodearray,
+    dict: streamencodemap,
+    set: streamencodeset,
+    bool: streamencodebool,
+    type(None): streamencodenone,
+}
+
+def streamencode(v):
+    """Encode a value in a streaming manner.
+
+    Given an input object, encode it to CBOR recursively.
+
+    Returns a generator of CBOR encoded bytes. There is no guarantee
+    that each emitted chunk fully decodes to a value or sub-value.
+
+    Encoding is deterministic - unordered collections are sorted.
+    """
+    fn = STREAM_ENCODERS.get(v.__class__)
+
+    if not fn:
+        raise ValueError('do not know how to encode %s' % type(v))
+
+    return fn(v)
+
+def readindefinitebytestringtoiter(fh, expectheader=True):
+    """Read an indefinite bytestring to a generator.
+
+    Receives an object with a ``read(X)`` method to read N bytes.
+
+    If ``expectheader`` is True, it is expected that the first byte read
+    will represent an indefinite length bytestring. Otherwise, we
+    expect the first byte to be part of the first bytestring chunk.
+    """
+    read = fh.read
+    decodeuint = decodermod.decode_uint
+    byteasinteger = decodermod.byte_as_integer
+
+    if expectheader:
+        initial = decodermod.byte_as_integer(read(1))
+
+        majortype = initial >> 5
+        subtype = initial & SUBTYPE_MASK
+
+        if majortype != MAJOR_TYPE_BYTESTRING:
+            raise decodermod.CBORDecodeError(
+                'expected major type %d; got %d' % (MAJOR_TYPE_BYTESTRING,
+                                                    majortype))
+
+        if subtype != SUBTYPE_INDEFINITE:
+            raise decodermod.CBORDecodeError(
+                'expected indefinite subtype; got %d' % subtype)
+
+    # The indefinite bytestring is composed of chunks of normal bytestrings.
+    # Read chunks until we hit a BREAK byte.
+
+    while True:
+        # We need to sniff for the BREAK byte.
+        initial = byteasinteger(read(1))
+
+        if initial == BREAK_INT:
+            break
+
+        length = decodeuint(fh, initial & SUBTYPE_MASK)
+        chunk = read(length)
+
+        if len(chunk) != length:
+            raise decodermod.CBORDecodeError(
+                'failed to read bytestring chunk: got %d bytes; expected %d' % (
+                    len(chunk), length))
+
+        yield chunk