Mercurial: comparison mercurial/utils/cborutil.py

equal deleted inserted replaced

-:fcc6bd11444b
+:aeb551a3bb8a
 # GNU General Public License version 2 or any later version.
 from __future__ import absolute_import
 import struct
+import sys
 from ..thirdparty.cbor.cbor2 import (
 decoder as decodermod,
 )
 MAJOR_TYPE_SEMANTIC = 6
 MAJOR_TYPE_SPECIAL = 7
 SUBTYPE_MASK = 0b00011111
+SUBTYPE_FALSE = 20
+SUBTYPE_TRUE = 21
+SUBTYPE_NULL = 22
 SUBTYPE_HALF_FLOAT = 25
 SUBTYPE_SINGLE_FLOAT = 26
 SUBTYPE_DOUBLE_FLOAT = 27
 SUBTYPE_INDEFINITE = 31
+SEMANTIC_TAG_FINITE_SET = 258
 # Indefinite types begin with their major type ORd with information value 31.
 BEGIN_INDEFINITE_BYTESTRING = struct.pack(
 r'>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE)
 BEGIN_INDEFINITE_ARRAY = struct.pack(
 return type(v).__name__, v
 def streamencodeset(s):
 # https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml defines
 # semantic tag 258 for finite sets.
-yield encodelength(MAJOR_TYPE_SEMANTIC, 258)
+yield encodelength(MAJOR_TYPE_SEMANTIC, SEMANTIC_TAG_FINITE_SET)
 for chunk in streamencodearray(sorted(s, key=_mixedtypesortkey)):
 yield chunk
 def streamencodemap(d):
 raise decodermod.CBORDecodeError(
 'failed to read bytestring chunk: got %d bytes; expected %d' % (
 len(chunk), length))
 yield chunk
+class CBORDecodeError(Exception):
+"""Represents an error decoding CBOR."""
+if sys.version_info.major >= 3:
+def _elementtointeger(b, i):
+return b[i]
+else:
+def _elementtointeger(b, i):
+return ord(b[i])
+STRUCT_BIG_UBYTE = struct.Struct(r'>B')
+STRUCT_BIG_USHORT = struct.Struct('>H')
+STRUCT_BIG_ULONG = struct.Struct('>L')
+STRUCT_BIG_ULONGLONG = struct.Struct('>Q')
+SPECIAL_NONE = 0
+SPECIAL_START_INDEFINITE_BYTESTRING = 1
+SPECIAL_START_ARRAY = 2
+SPECIAL_START_MAP = 3
+SPECIAL_START_SET = 4
+SPECIAL_INDEFINITE_BREAK = 5
+def decodeitem(b, offset=0):
+"""Decode a new CBOR value from a buffer at offset.
+This function attempts to decode up to one complete CBOR value
+from ``b`` starting at offset ``offset``.
+The beginning of a collection (such as an array, map, set, or
+indefinite length bytestring) counts as a single value. For these
+special cases, a state flag will indicate that a special value was seen.
+When called, the function either returns a decoded value or gives
+a hint as to how many more bytes are needed to do so. By calling
+the function repeatedly given a stream of bytes, the caller can
+build up the original values.
+Returns a tuple with the following elements:
+* Bool indicating whether a complete value was decoded.
+* A decoded value if first value is True otherwise None
+* Integer number of bytes. If positive, the number of bytes
+read. If negative, the number of bytes we need to read to
+decode this value or the next chunk in this value.
+* One of the ``SPECIAL_*`` constants indicating special treatment
+for this value. ``SPECIAL_NONE`` means this is a fully decoded
+simple value (such as an integer or bool).
+"""
+initial = _elementtointeger(b, offset)
+offset += 1
+majortype = initial >> 5
+subtype = initial & SUBTYPE_MASK
+if majortype == MAJOR_TYPE_UINT:
+complete, value, readcount = decodeuint(subtype, b, offset)
+if complete:
+return True, value, readcount + 1, SPECIAL_NONE
+else:
+return False, None, readcount, SPECIAL_NONE
+elif majortype == MAJOR_TYPE_NEGINT:
+# Negative integers are the same as UINT except inverted minus 1.
+complete, value, readcount = decodeuint(subtype, b, offset)
+if complete:
+return True, -value - 1, readcount + 1, SPECIAL_NONE
+else:
+return False, None, readcount, SPECIAL_NONE
+elif majortype == MAJOR_TYPE_BYTESTRING:
+# Beginning of bytestrings are treated as uints in order to
+# decode their length, which may be indefinite.
+complete, size, readcount = decodeuint(subtype, b, offset,
+allowindefinite=True)
+# We don't know the size of the bytestring. It must be a definitive
+# length since the indefinite subtype would be encoded in the initial
+# byte.
+if not complete:
+return False, None, readcount, SPECIAL_NONE
+# We know the length of the bytestring.
+if size is not None:
+# And the data is available in the buffer.
+if offset + readcount + size <= len(b):
+value = b[offset + readcount:offset + readcount + size]
+return True, value, readcount + size + 1, SPECIAL_NONE
+# And we need more data in order to return the bytestring.
+else:
+wanted = len(b) - offset - readcount - size
+return False, None, wanted, SPECIAL_NONE
+# It is an indefinite length bytestring.
+else:
+return True, None, 1, SPECIAL_START_INDEFINITE_BYTESTRING
+elif majortype == MAJOR_TYPE_STRING:
+raise CBORDecodeError('string major type not supported')
+elif majortype == MAJOR_TYPE_ARRAY:
+# Beginning of arrays are treated as uints in order to decode their
+# length. We don't allow indefinite length arrays.
+complete, size, readcount = decodeuint(subtype, b, offset)
+if complete:
+return True, size, readcount + 1, SPECIAL_START_ARRAY
+else:
+return False, None, readcount, SPECIAL_NONE
+elif majortype == MAJOR_TYPE_MAP:
+# Beginning of maps are treated as uints in order to decode their
+# number of elements. We don't allow indefinite length arrays.
+complete, size, readcount = decodeuint(subtype, b, offset)
+if complete:
+return True, size, readcount + 1, SPECIAL_START_MAP
+else:
+return False, None, readcount, SPECIAL_NONE
+elif majortype == MAJOR_TYPE_SEMANTIC:
+# Semantic tag value is read the same as a uint.
+complete, tagvalue, readcount = decodeuint(subtype, b, offset)
+if not complete:
+return False, None, readcount, SPECIAL_NONE
+# This behavior here is a little wonky. The main type being "decorated"
+# by this semantic tag follows. A more robust parser would probably emit
+# a special flag indicating this as a semantic tag and let the caller
+# deal with the types that follow. But since we don't support many
+# semantic tags, it is easier to deal with the special cases here and
+# hide complexity from the caller. If we add support for more semantic
+# tags, we should probably move semantic tag handling into the caller.
+if tagvalue == SEMANTIC_TAG_FINITE_SET:
+if offset + readcount >= len(b):
+return False, None, -1, SPECIAL_NONE
+complete, size, readcount2, special = decodeitem(b,
+offset + readcount)
+if not complete:
+return False, None, readcount2, SPECIAL_NONE
+if special != SPECIAL_START_ARRAY:
+raise CBORDecodeError('expected array after finite set '
+'semantic tag')
+return True, size, readcount + readcount2 + 1, SPECIAL_START_SET
+else:
+raise CBORDecodeError('semantic tag %d not allowed' % tagvalue)
+elif majortype == MAJOR_TYPE_SPECIAL:
+# Only specific values for the information field are allowed.
+if subtype == SUBTYPE_FALSE:
+return True, False, 1, SPECIAL_NONE
+elif subtype == SUBTYPE_TRUE:
+return True, True, 1, SPECIAL_NONE
+elif subtype == SUBTYPE_NULL:
+return True, None, 1, SPECIAL_NONE
+elif subtype == SUBTYPE_INDEFINITE:
+return True, None, 1, SPECIAL_INDEFINITE_BREAK
+# If value is 24, subtype is in next byte.
+else:
+raise CBORDecodeError('special type %d not allowed' % subtype)
+else:
+assert False
+def decodeuint(subtype, b, offset=0, allowindefinite=False):
+"""Decode an unsigned integer.
+``subtype`` is the lower 5 bits from the initial byte CBOR item
+"header." ``b`` is a buffer containing bytes. ``offset`` points to
+the index of the first byte after the byte that ``subtype`` was
+derived from.
+``allowindefinite`` allows the special indefinite length value
+indicator.
+Returns a 3-tuple of (successful, value, count).
+The first element is a bool indicating if decoding completed. The 2nd
+is the decoded integer value or None if not fully decoded or the subtype
+is 31 and ``allowindefinite`` is True. The 3rd value is the count of bytes.
+If positive, it is the number of additional bytes decoded. If negative,
+it is the number of additional bytes needed to decode this value.
+"""
+# Small values are inline.
+if subtype < 24:
+return True, subtype, 0
+# Indefinite length specifier.
+elif subtype == 31:
+if allowindefinite:
+return True, None, 0
+else:
+raise CBORDecodeError('indefinite length uint not allowed here')
+elif subtype >= 28:
+raise CBORDecodeError('unsupported subtype on integer type: %d' %
+subtype)
+if subtype == 24:
+s = STRUCT_BIG_UBYTE
+elif subtype == 25:
+s = STRUCT_BIG_USHORT
+elif subtype == 26:
+s = STRUCT_BIG_ULONG
+elif subtype == 27:
+s = STRUCT_BIG_ULONGLONG
+else:
+raise CBORDecodeError('bounds condition checking violation')
+if len(b) - offset >= s.size:
+return True, s.unpack_from(b, offset)[0], s.size
+else:
+return False, None, len(b) - offset - s.size
+class bytestringchunk(bytes):
+"""Represents a chunk/segment in an indefinite length bytestring.
+This behaves like a ``bytes`` but in addition has the ``isfirst``
+and ``islast`` attributes indicating whether this chunk is the first
+or last in an indefinite length bytestring.
+"""
+def __new__(cls, v, first=False, last=False):
+self = bytes.__new__(cls, v)
+self.isfirst = first
+self.islast = last
+return self
+class sansiodecoder(object):
+"""A CBOR decoder that doesn't perform its own I/O.
+To use, construct an instance and feed it segments containing
+CBOR-encoded bytes via ``decode()``. The return value from ``decode()``
+indicates whether a fully-decoded value is available, how many bytes
+were consumed, and offers a hint as to how many bytes should be fed
+in next time to decode the next value.
+The decoder assumes it will decode N discrete CBOR values, not just
+a single value. i.e. if the bytestream contains uints packed one after
+the other, the decoder will decode them all, rather than just the initial
+one.
+When ``decode()`` indicates a value is available, call ``getavailable()``
+to return all fully decoded values.
+``decode()`` can partially decode input. It is up to the caller to keep
+track of what data was consumed and to pass unconsumed data in on the
+next invocation.
+The decoder decodes atomically at the *item* level. See ``decodeitem()``.
+If an *item* cannot be fully decoded, the decoder won't record it as
+partially consumed. Instead, the caller will be instructed to pass in
+the initial bytes of this item on the next invocation. This does result
+in some redundant parsing. But the overhead should be minimal.
+This decoder only supports a subset of CBOR as required by Mercurial.
+It lacks support for:
+* Indefinite length arrays
+* Indefinite length maps
+* Use of indefinite length bytestrings as keys or values within
+arrays, maps, or sets.
+* Nested arrays, maps, or sets within sets
+* Any semantic tag that isn't a mathematical finite set
+* Floating point numbers
+* Undefined special value
+CBOR types are decoded to Python types as follows:
+uint -> int
+negint -> int
+bytestring -> bytes
+map -> dict
+array -> list
+True -> bool
+False -> bool
+null -> None
+indefinite length bytestring chunk -> [bytestringchunk]
+The only non-obvious mapping here is an indefinite length bytestring
+to the ``bytestringchunk`` type. This is to facilitate streaming
+indefinite length bytestrings out of the decoder and to differentiate
+a regular bytestring from an indefinite length bytestring.
+"""
+_STATE_NONE = 0
+_STATE_WANT_MAP_KEY = 1
+_STATE_WANT_MAP_VALUE = 2
+_STATE_WANT_ARRAY_VALUE = 3
+_STATE_WANT_SET_VALUE = 4
+_STATE_WANT_BYTESTRING_CHUNK_FIRST = 5
+_STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT = 6
+def __init__(self):
+# TODO add support for limiting size of bytestrings
+# TODO add support for limiting number of keys / values in collections
+# TODO add support for limiting size of buffered partial values
+self.decodedbytecount = 0
+self._state = self._STATE_NONE
+# Stack of active nested collections. Each entry is a dict describing
+# the collection.
+self._collectionstack = []
+# Fully decoded key to use for the current map.
+self._currentmapkey = None
+# Fully decoded values available for retrieval.
+self._decodedvalues = []
+@property
+def inprogress(self):
+"""Whether the decoder has partially decoded a value."""
+return self._state != self._STATE_NONE
+def decode(self, b, offset=0):
+"""Attempt to decode bytes from an input buffer.
+``b`` is a collection of bytes and ``offset`` is the byte
+offset within that buffer from which to begin reading data.
+``b`` must support ``len()`` and accessing bytes slices via
+``__slice__``. Typically ``bytes`` instances are used.
+Returns a tuple with the following fields:
+* Bool indicating whether values are available for retrieval.
+* Integer indicating the number of bytes that were fully consumed,
+starting from ``offset``.
+* Integer indicating the number of bytes that are desired for the
+next call in order to decode an item.
+"""
+if not b:
+return bool(self._decodedvalues), 0, 0
+initialoffset = offset
+# We could easily split the body of this loop into a function. But
+# Python performance is sensitive to function calls and collections
+# are composed of many items. So leaving as a while loop could help
+# with performance. One thing that may not help is the use of
+# if..elif versus a lookup/dispatch table. There may be value
+# in switching that.
+while offset < len(b):
+# Attempt to decode an item. This could be a whole value or a
+# special value indicating an event, such as start or end of a
+# collection or indefinite length type.
+complete, value, readcount, special = decodeitem(b, offset)
+if readcount > 0:
+self.decodedbytecount += readcount
+if not complete:
+assert readcount < 0
+return (
+bool(self._decodedvalues),
+offset - initialoffset,
+-readcount,
+)
+offset += readcount
+# No nested state. We either have a full value or beginning of a
+# complex value to deal with.
+if self._state == self._STATE_NONE:
+# A normal value.
+if special == SPECIAL_NONE:
+self._decodedvalues.append(value)
+elif special == SPECIAL_START_ARRAY:
+self._collectionstack.append({
+'remaining': value,
+'v': [],
+})
+self._state = self._STATE_WANT_ARRAY_VALUE
+elif special == SPECIAL_START_MAP:
+self._collectionstack.append({
+'remaining': value,
+'v': {},
+})
+self._state = self._STATE_WANT_MAP_KEY
+elif special == SPECIAL_START_SET:
+self._collectionstack.append({
+'remaining': value,
+'v': set(),
+})
+self._state = self._STATE_WANT_SET_VALUE
+elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
+self._state = self._STATE_WANT_BYTESTRING_CHUNK_FIRST
+else:
+raise CBORDecodeError('unhandled special state: %d' %
+special)
+# This value becomes an element of the current array.
+elif self._state == self._STATE_WANT_ARRAY_VALUE:
+# Simple values get appended.
+if special == SPECIAL_NONE:
+c = self._collectionstack[-1]
+c['v'].append(value)
+c['remaining'] -= 1
+# self._state doesn't need changed.
+# An array nested within an array.
+elif special == SPECIAL_START_ARRAY:
+lastc = self._collectionstack[-1]
+newvalue = []
+lastc['v'].append(newvalue)
+lastc['remaining'] -= 1
+self._collectionstack.append({
+'remaining': value,
+'v': newvalue,
+})
+# self._state doesn't need changed.
+# A map nested within an array.
+elif special == SPECIAL_START_MAP:
+lastc = self._collectionstack[-1]
+newvalue = {}
+lastc['v'].append(newvalue)
+lastc['remaining'] -= 1
+self._collectionstack.append({
+'remaining': value,
+'v': newvalue
+})
+self._state = self._STATE_WANT_MAP_KEY
+elif special == SPECIAL_START_SET:
+lastc = self._collectionstack[-1]
+newvalue = set()
+lastc['v'].append(newvalue)
+lastc['remaining'] -= 1
+self._collectionstack.append({
+'remaining': value,
+'v': newvalue,
+})
+self._state = self._STATE_WANT_SET_VALUE
+elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
+raise CBORDecodeError('indefinite length bytestrings '
+'not allowed as array values')
+else:
+raise CBORDecodeError('unhandled special item when '
+'expecting array value: %d' % special)
+# This value becomes the key of the current map instance.
+elif self._state == self._STATE_WANT_MAP_KEY:
+if special == SPECIAL_NONE:
+self._currentmapkey = value
+self._state = self._STATE_WANT_MAP_VALUE
+elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
+raise CBORDecodeError('indefinite length bytestrings '
+'not allowed as map keys')
+elif special in (SPECIAL_START_ARRAY, SPECIAL_START_MAP,
+SPECIAL_START_SET):
+raise CBORDecodeError('collections not supported as map '
+'keys')
+# We do not allow special values to be used as map keys.
+else:
+raise CBORDecodeError('unhandled special item when '
+'expecting map key: %d' % special)
+# This value becomes the value of the current map key.
+elif self._state == self._STATE_WANT_MAP_VALUE:
+# Simple values simply get inserted into the map.
+if special == SPECIAL_NONE:
+lastc = self._collectionstack[-1]
+lastc['v'][self._currentmapkey] = value
+lastc['remaining'] -= 1
+self._state = self._STATE_WANT_MAP_KEY
+# A new array is used as the map value.
+elif special == SPECIAL_START_ARRAY:
+lastc = self._collectionstack[-1]
+newvalue = []
+lastc['v'][self._currentmapkey] = newvalue
+lastc['remaining'] -= 1
+self._collectionstack.append({
+'remaining': value,
+'v': newvalue,
+})
+self._state = self._STATE_WANT_ARRAY_VALUE
+# A new map is used as the map value.
+elif special == SPECIAL_START_MAP:
+lastc = self._collectionstack[-1]
+newvalue = {}
+lastc['v'][self._currentmapkey] = newvalue
+lastc['remaining'] -= 1
+self._collectionstack.append({
+'remaining': value,
+'v': newvalue,
+})
+self._state = self._STATE_WANT_MAP_KEY
+# A new set is used as the map value.
+elif special == SPECIAL_START_SET:
+lastc = self._collectionstack[-1]
+newvalue = set()
+lastc['v'][self._currentmapkey] = newvalue
+lastc['remaining'] -= 1
+self._collectionstack.append({
+'remaining': value,
+'v': newvalue,
+})
+self._state = self._STATE_WANT_SET_VALUE
+elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
+raise CBORDecodeError('indefinite length bytestrings not '
+'allowed as map values')
+else:
+raise CBORDecodeError('unhandled special item when '
+'expecting map value: %d' % special)
+self._currentmapkey = None
+# This value is added to the current set.
+elif self._state == self._STATE_WANT_SET_VALUE:
+if special == SPECIAL_NONE:
+lastc = self._collectionstack[-1]
+lastc['v'].add(value)
+lastc['remaining'] -= 1
+elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
+raise CBORDecodeError('indefinite length bytestrings not '
+'allowed as set values')
+elif special in (SPECIAL_START_ARRAY,
+SPECIAL_START_MAP,
+SPECIAL_START_SET):
+raise CBORDecodeError('collections not allowed as set '
+'values')
+# We don't allow non-trivial types to exist as set values.
+else:
+raise CBORDecodeError('unhandled special item when '
+'expecting set value: %d' % special)
+# This value represents the first chunk in an indefinite length
+# bytestring.
+elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_FIRST:
+# We received a full chunk.
+if special == SPECIAL_NONE:
+self._decodedvalues.append(bytestringchunk(value,
+first=True))
+self._state = self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT
+# The end of stream marker. This means it is an empty
+# indefinite length bytestring.
+elif special == SPECIAL_INDEFINITE_BREAK:
+# We /could/ convert this to a b''. But we want to preserve
+# the nature of the underlying data so consumers expecting
+# an indefinite length bytestring get one.
+self._decodedvalues.append(bytestringchunk(b'',
+first=True,
+last=True))
+# Since indefinite length bytestrings can't be used in
+# collections, we must be at the root level.
+assert not self._collectionstack
+self._state = self._STATE_NONE
+else:
+raise CBORDecodeError('unexpected special value when '
+'expecting bytestring chunk: %d' %
+special)
+# This value represents the non-initial chunk in an indefinite
+# length bytestring.
+elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT:
+# We received a full chunk.
+if special == SPECIAL_NONE:
+self._decodedvalues.append(bytestringchunk(value))
+# The end of stream marker.
+elif special == SPECIAL_INDEFINITE_BREAK:
+self._decodedvalues.append(bytestringchunk(b'', last=True))
+# Since indefinite length bytestrings can't be used in
+# collections, we must be at the root level.
+assert not self._collectionstack
+self._state = self._STATE_NONE
+else:
+raise CBORDecodeError('unexpected special value when '
+'expecting bytestring chunk: %d' %
+special)
+else:
+raise CBORDecodeError('unhandled decoder state: %d' %
+self._state)
+# We could have just added the final value in a collection. End
+# all complete collections at the top of the stack.
+while True:
+# Bail if we're not waiting on a new collection item.
+if self._state not in (self._STATE_WANT_ARRAY_VALUE,
+self._STATE_WANT_MAP_KEY,
+self._STATE_WANT_SET_VALUE):
+break
+# Or we are expecting more items for this collection.
+lastc = self._collectionstack[-1]
+if lastc['remaining']:
+break
+# The collection at the top of the stack is complete.
+# Discard it, as it isn't needed for future items.
+self._collectionstack.pop()
+# If this is a nested collection, we don't emit it, since it
+# will be emitted by its parent collection. But we do need to
+# update state to reflect what the new top-most collection
+# on the stack is.
+if self._collectionstack:
+self._state = {
+list: self._STATE_WANT_ARRAY_VALUE,
+dict: self._STATE_WANT_MAP_KEY,
+set: self._STATE_WANT_SET_VALUE,
+}[type(self._collectionstack[-1]['v'])]
+# If this is the root collection, emit it.
+else:
+self._decodedvalues.append(lastc['v'])
+self._state = self._STATE_NONE
+return (
+bool(self._decodedvalues),
+offset - initialoffset,
+0,
+)
+def getavailable(self):
+"""Returns an iterator over fully decoded values.
+Once values are retrieved, they won't be available on the next call.
+"""
+l = list(self._decodedvalues)
+self._decodedvalues = []
+return l
+def decodeall(b):
+"""Decode all CBOR items present in an iterable of bytes.
+In addition to regular decode errors, raises CBORDecodeError if the
+entirety of the passed buffer does not fully decode to complete CBOR
+values. This includes failure to decode any value, incomplete collection
+types, incomplete indefinite length items, and extra data at the end of
+the buffer.
+"""
+if not b:
+return []
+decoder = sansiodecoder()
+havevalues, readcount, wantbytes = decoder.decode(b)
+if readcount != len(b):
+raise CBORDecodeError('input data not fully consumed')
+if decoder.inprogress:
+raise CBORDecodeError('input data not complete')
+return decoder.getavailable()

changeset 39411	aeb551a3bb8a
parent 37898	2ae6a3134362
child 39412	a40d3da89b7d