comparison mercurial/utils/cborutil.py @ 39411:aeb551a3bb8a

cborutil: implement sans I/O decoder

The vendored CBOR package decodes by calling read(n) on an object. There are a number of disadvantages to this:

* Uses blocking I/O. If sufficient data is not available, the decoder will hang until it is.
* No support for partial reads. If the read(n) returns less data than requested, the decoder raises an error.
* Requires the use of a file-like object. If the original data is in, say, a buffer, we need to "cast" it to e.g. a BytesIO to appease the decoder.

In addition, the vendored CBOR decoder doesn't provide flexibility that we desire. Specifically:

* It buffers indefinite length bytestrings instead of streaming them.
* It doesn't allow limiting the set of types that can be decoded. This property is useful when implementing a "hardened" decoder that is less susceptible to abusive input.
* It doesn't provide sufficient "hook points" and introspection to institute checks around behavior. These are useful for implementing a "hardened" decoder.

This all adds up to a reasonable set of justifications for writing our own decoder. So, this commit implements our own CBOR decoder.

At the heart of the decoder is a function that decodes a single "item" from a buffer. This item can be a complete simple value or a special value, such as "start of array." Using this function, we can build a decoder that effectively iterates over the stream of decoded items and builds up higher-level values, such as arrays, maps, sets, and indefinite length bytestrings. And we can do this without performing I/O in the decoder itself.

The core of the sans I/O decoder will probably not be used directly. Instead, it is expected that we'll build utility functions for invoking the decoder given specific input types. This will allow extreme flexibility in how data is delivered to the decoder.

I'm pretty happy with the state of the decoder modulo the TODO items to track wanted features to help with a "hardened" decoder.

The one thing I could be convinced to change is the handling of semantic tags. Since we only support a single semantic tag (sets), I thought it would be easier to handle them inline in decodeitem(). This is simpler now. But if we add support for other semantic tags, it will likely be easier to move semantic tag handling outside of decodeitem(). But, properly supporting semantic tags opens up a whole can of worms, as many semantic tags imply new types. I'm optimistic we won't need these in Mercurial. But who knows.

I'm also pretty happy with the test coverage. Writing comprehensive tests for partial decoding did flush out a handful of bugs. One general improvement to testing would be fuzz testing for partial decoding. I may implement that later. I also anticipate switching the wire protocol code to this new decoder will flush out any lingering bugs.

Differential Revision: https://phab.mercurial-scm.org/D4414
author Gregory Szorc <gregory.szorc@gmail.com>
date Tue, 28 Aug 2018 15:02:48 -0700
parents 2ae6a3134362
children a40d3da89b7d
comparison
equal deleted inserted replaced
39410:fcc6bd11444b 39411:aeb551a3bb8a
6 # GNU General Public License version 2 or any later version. 6 # GNU General Public License version 2 or any later version.
7 7
8 from __future__ import absolute_import 8 from __future__ import absolute_import
9 9
10 import struct 10 import struct
11 import sys
11 12
12 from ..thirdparty.cbor.cbor2 import ( 13 from ..thirdparty.cbor.cbor2 import (
13 decoder as decodermod, 14 decoder as decodermod,
14 ) 15 )
15 16
33 MAJOR_TYPE_SEMANTIC = 6 34 MAJOR_TYPE_SEMANTIC = 6
34 MAJOR_TYPE_SPECIAL = 7 35 MAJOR_TYPE_SPECIAL = 7
35 36
36 SUBTYPE_MASK = 0b00011111 37 SUBTYPE_MASK = 0b00011111
37 38
39 SUBTYPE_FALSE = 20
40 SUBTYPE_TRUE = 21
41 SUBTYPE_NULL = 22
38 SUBTYPE_HALF_FLOAT = 25 42 SUBTYPE_HALF_FLOAT = 25
39 SUBTYPE_SINGLE_FLOAT = 26 43 SUBTYPE_SINGLE_FLOAT = 26
40 SUBTYPE_DOUBLE_FLOAT = 27 44 SUBTYPE_DOUBLE_FLOAT = 27
41 SUBTYPE_INDEFINITE = 31 45 SUBTYPE_INDEFINITE = 31
46
47 SEMANTIC_TAG_FINITE_SET = 258
42 48
43 # Indefinite types begin with their major type ORd with information value 31. 49 # Indefinite types begin with their major type ORd with information value 31.
44 BEGIN_INDEFINITE_BYTESTRING = struct.pack( 50 BEGIN_INDEFINITE_BYTESTRING = struct.pack(
45 r'>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE) 51 r'>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE)
46 BEGIN_INDEFINITE_ARRAY = struct.pack( 52 BEGIN_INDEFINITE_ARRAY = struct.pack(
144 return type(v).__name__, v 150 return type(v).__name__, v
145 151
def streamencodeset(s):
    """Yield the CBOR encoding of set ``s`` as a series of chunks.

    The set is emitted as the finite set semantic tag followed by an
    array holding the members, ordered via ``_mixedtypesortkey``.
    """
    # https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml defines
    # semantic tag 258 for finite sets.
    yield encodelength(MAJOR_TYPE_SEMANTIC, SEMANTIC_TAG_FINITE_SET)

    for chunk in streamencodearray(sorted(s, key=_mixedtypesortkey)):
        yield chunk
153 159
154 def streamencodemap(d): 160 def streamencodemap(d):
258 raise decodermod.CBORDecodeError( 264 raise decodermod.CBORDecodeError(
259 'failed to read bytestring chunk: got %d bytes; expected %d' % ( 265 'failed to read bytestring chunk: got %d bytes; expected %d' % (
260 len(chunk), length)) 266 len(chunk), length))
261 267
262 yield chunk 268 yield chunk
269
class CBORDecodeError(Exception):
    """Represents an error decoding CBOR.

    Raised by the decoding functions in this module when the input is
    malformed or uses a CBOR feature the decoder does not support.
    """
272
273 if sys.version_info.major >= 3:
274 def _elementtointeger(b, i):
275 return b[i]
276 else:
277 def _elementtointeger(b, i):
278 return ord(b[i])
279
# Pre-compiled big-endian unsigned integer formats of 1, 2, 4 and 8 bytes.
STRUCT_BIG_UBYTE = struct.Struct(r'>B')
STRUCT_BIG_USHORT = struct.Struct(r'>H')
STRUCT_BIG_ULONG = struct.Struct(r'>L')
STRUCT_BIG_ULONGLONG = struct.Struct(r'>Q')

# Flags returned as the final element of the ``decodeitem()`` result,
# telling the caller how to interpret the decoded item.
SPECIAL_NONE = 0
SPECIAL_START_INDEFINITE_BYTESTRING = 1
SPECIAL_START_ARRAY = 2
SPECIAL_START_MAP = 3
SPECIAL_START_SET = 4
SPECIAL_INDEFINITE_BREAK = 5
291
def decodeitem(b, offset=0):
    """Decode a new CBOR value from a buffer at offset.

    This function attempts to decode up to one complete CBOR value
    from ``b`` starting at offset ``offset``.

    The beginning of a collection (such as an array, map, set, or
    indefinite length bytestring) counts as a single value. For these
    special cases, a state flag will indicate that a special value was seen.

    When called, the function either returns a decoded value or gives
    a hint as to how many more bytes are needed to do so. By calling
    the function repeatedly given a stream of bytes, the caller can
    build up the original values.

    Returns a tuple with the following elements:

    * Bool indicating whether a complete value was decoded.
    * A decoded value if first value is True otherwise None. For the
      start of an array, map, or set this is the number of elements
      that follow.
    * Integer number of bytes. If positive, the number of bytes
      read. If negative, the number of bytes we need to read to
      decode this value or the next chunk in this value.
    * One of the ``SPECIAL_*`` constants indicating special treatment
      for this value. ``SPECIAL_NONE`` means this is a fully decoded
      simple value (such as an integer or bool).

    Raises ``CBORDecodeError`` if the item uses an unsupported feature:
    the string major type, an indefinite length array or map, a semantic
    tag other than the finite set tag, or an unsupported special subtype.
    """

    # The initial byte itself is not in the buffer yet. Report how many
    # bytes are missing instead of failing with an IndexError.
    if offset >= len(b):
        return False, None, len(b) - offset - 1, SPECIAL_NONE

    initial = _elementtointeger(b, offset)
    offset += 1

    # The upper 3 bits hold the major type, the lower 5 the subtype
    # ("additional information").
    majortype = initial >> 5
    subtype = initial & SUBTYPE_MASK

    if majortype == MAJOR_TYPE_UINT:
        complete, value, readcount = decodeuint(subtype, b, offset)

        if complete:
            # + 1 accounts for the initial byte.
            return True, value, readcount + 1, SPECIAL_NONE
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_NEGINT:
        # Negative integers are the same as UINT except inverted minus 1.
        complete, value, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, -value - 1, readcount + 1, SPECIAL_NONE
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_BYTESTRING:
        # Beginning of bytestrings are treated as uints in order to
        # decode their length, which may be indefinite.
        complete, size, readcount = decodeuint(subtype, b, offset,
                                               allowindefinite=True)

        # We don't know the size of the bytestring. It must be a definite
        # length since the indefinite subtype would be encoded in the initial
        # byte.
        if not complete:
            return False, None, readcount, SPECIAL_NONE

        # We know the length of the bytestring.
        if size is not None:
            start = offset + readcount
            end = start + size

            # And the data is available in the buffer.
            if end <= len(b):
                return True, b[start:end], readcount + size + 1, SPECIAL_NONE

            # And we need more data in order to return the bytestring.
            # The result is negative: the count of bytes still missing.
            else:
                return False, None, len(b) - end, SPECIAL_NONE

        # It is an indefinite length bytestring.
        else:
            return True, None, 1, SPECIAL_START_INDEFINITE_BYTESTRING

    elif majortype == MAJOR_TYPE_STRING:
        raise CBORDecodeError('string major type not supported')

    elif majortype == MAJOR_TYPE_ARRAY:
        # Beginning of arrays are treated as uints in order to decode their
        # length. We don't allow indefinite length arrays.
        complete, size, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, size, readcount + 1, SPECIAL_START_ARRAY
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_MAP:
        # Beginning of maps are treated as uints in order to decode their
        # number of elements. We don't allow indefinite length maps.
        complete, size, readcount = decodeuint(subtype, b, offset)

        if complete:
            return True, size, readcount + 1, SPECIAL_START_MAP
        else:
            return False, None, readcount, SPECIAL_NONE

    elif majortype == MAJOR_TYPE_SEMANTIC:
        # Semantic tag value is read the same as a uint.
        complete, tagvalue, readcount = decodeuint(subtype, b, offset)

        if not complete:
            return False, None, readcount, SPECIAL_NONE

        # This behavior here is a little wonky. The main type being "decorated"
        # by this semantic tag follows. A more robust parser would probably emit
        # a special flag indicating this as a semantic tag and let the caller
        # deal with the types that follow. But since we don't support many
        # semantic tags, it is easier to deal with the special cases here and
        # hide complexity from the caller. If we add support for more semantic
        # tags, we should probably move semantic tag handling into the caller.
        if tagvalue == SEMANTIC_TAG_FINITE_SET:
            # The tagged item has not arrived yet.
            if offset + readcount >= len(b):
                return False, None, -1, SPECIAL_NONE

            complete, size, readcount2, special = decodeitem(
                b, offset + readcount)

            if not complete:
                return False, None, readcount2, SPECIAL_NONE

            if special != SPECIAL_START_ARRAY:
                raise CBORDecodeError('expected array after finite set '
                                      'semantic tag')

            return True, size, readcount + readcount2 + 1, SPECIAL_START_SET

        else:
            raise CBORDecodeError('semantic tag %d not allowed' % tagvalue)

    elif majortype == MAJOR_TYPE_SPECIAL:
        # Only specific values for the information field are allowed.
        if subtype == SUBTYPE_FALSE:
            return True, False, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_TRUE:
            return True, True, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_NULL:
            return True, None, 1, SPECIAL_NONE
        elif subtype == SUBTYPE_INDEFINITE:
            return True, None, 1, SPECIAL_INDEFINITE_BREAK
        # If value is 24, subtype is in next byte.
        else:
            raise CBORDecodeError('special type %d not allowed' % subtype)
    else:
        # Not reachable for a byte value in 0..255. Raise rather than
        # assert so the failure is not stripped when running with -O.
        raise CBORDecodeError('unknown major type: %d' % majortype)
441
def decodeuint(subtype, b, offset=0, allowindefinite=False):
    """Decode an unsigned integer.

    ``subtype`` is the lower 5 bits from the initial byte CBOR item
    "header." ``b`` is a buffer containing bytes. ``offset`` points to
    the index of the first byte after the byte that ``subtype`` was
    derived from.

    ``allowindefinite`` allows the special indefinite length value
    indicator.

    Returns a 3-tuple of (successful, value, count).

    The first element is a bool indicating if decoding completed. The 2nd
    is the decoded integer value or None if not fully decoded or the subtype
    is 31 and ``allowindefinite`` is True. The 3rd value is the count of bytes.
    If positive, it is the number of additional bytes decoded. If negative,
    it is the number of additional bytes needed to decode this value.

    Raises ``CBORDecodeError`` if the subtype is 28-30, or is 31 while
    ``allowindefinite`` is False.
    """

    # Small values are inline.
    if subtype < 24:
        return True, subtype, 0

    # Indefinite length specifier.
    if subtype == 31:
        if not allowindefinite:
            raise CBORDecodeError('indefinite length uint not allowed here')

        return True, None, 0

    if subtype >= 28:
        raise CBORDecodeError('unsupported subtype on integer type: %d' %
                              subtype)

    # Subtypes 24-27 say the value follows in the next 1, 2, 4 or 8 bytes.
    if subtype == 24:
        s = STRUCT_BIG_UBYTE
    elif subtype == 25:
        s = STRUCT_BIG_USHORT
    elif subtype == 26:
        s = STRUCT_BIG_ULONG
    elif subtype == 27:
        s = STRUCT_BIG_ULONGLONG
    else:
        raise CBORDecodeError('bounds condition checking violation')

    missing = s.size - (len(b) - offset)

    # Not enough data buffered: report the shortfall as a negative count.
    if missing > 0:
        return False, None, -missing

    return True, s.unpack_from(b, offset)[0], s.size
490
class bytestringchunk(bytes):
    """Represents a chunk/segment in an indefinite length bytestring.

    This behaves like a ``bytes`` but in addition has the ``isfirst``
    and ``islast`` attributes indicating whether this chunk is the first
    or last in an indefinite length bytestring.
    """

    def __new__(cls, v, first=False, last=False):
        # bytes is immutable, so the payload must be supplied in __new__;
        # the flags are then attached to the freshly created instance.
        self = bytes.__new__(cls, v)
        self.isfirst = first
        self.islast = last

        return self
505
class sansiodecoder(object):
    """A CBOR decoder that doesn't perform its own I/O.

    To use, construct an instance and feed it segments containing
    CBOR-encoded bytes via ``decode()``. The return value from ``decode()``
    indicates whether a fully-decoded value is available, how many bytes
    were consumed, and offers a hint as to how many bytes should be fed
    in next time to decode the next value.

    The decoder assumes it will decode N discrete CBOR values, not just
    a single value. i.e. if the bytestream contains uints packed one after
    the other, the decoder will decode them all, rather than just the initial
    one.

    When ``decode()`` indicates a value is available, call ``getavailable()``
    to return all fully decoded values.

    ``decode()`` can partially decode input. It is up to the caller to keep
    track of what data was consumed and to pass unconsumed data in on the
    next invocation.

    The decoder decodes atomically at the *item* level. See ``decodeitem()``.
    If an *item* cannot be fully decoded, the decoder won't record it as
    partially consumed. Instead, the caller will be instructed to pass in
    the initial bytes of this item on the next invocation. This does result
    in some redundant parsing. But the overhead should be minimal.

    This decoder only supports a subset of CBOR as required by Mercurial.
    It lacks support for:

    * Indefinite length arrays
    * Indefinite length maps
    * Use of indefinite length bytestrings as keys or values within
      arrays, maps, or sets.
    * Nested arrays, maps, or sets within sets
    * Any semantic tag that isn't a mathematical finite set
    * Floating point numbers
    * Undefined special value

    CBOR types are decoded to Python types as follows:

    uint -> int
    negint -> int
    bytestring -> bytes
    map -> dict
    array -> list
    True -> bool
    False -> bool
    null -> None
    indefinite length bytestring chunk -> [bytestringchunk]

    The only non-obvious mapping here is an indefinite length bytestring
    to the ``bytestringchunk`` type. This is to facilitate streaming
    indefinite length bytestrings out of the decoder and to differentiate
    a regular bytestring from an indefinite length bytestring.
    """

    _STATE_NONE = 0
    _STATE_WANT_MAP_KEY = 1
    _STATE_WANT_MAP_VALUE = 2
    _STATE_WANT_ARRAY_VALUE = 3
    _STATE_WANT_SET_VALUE = 4
    _STATE_WANT_BYTESTRING_CHUNK_FIRST = 5
    _STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT = 6

    # Integer types a decoded simple value can have. ``bool`` is a subclass
    # of ``int``. ``type(1 << 64)`` is ``long`` on Python 2 and ``int`` on
    # Python 3.
    _INTEGERTYPES = (int, type(1 << 64))

    def __init__(self):
        # TODO add support for limiting size of bytestrings
        # TODO add support for limiting number of keys / values in collections
        # TODO add support for limiting size of buffered partial values

        # Running total of bytes belonging to fully decoded items.
        self.decodedbytecount = 0

        self._state = self._STATE_NONE

        # Stack of active nested collections. Each entry is a dict describing
        # the collection: ``remaining`` is the number of elements still
        # expected and ``v`` is the container being populated.
        self._collectionstack = []

        # Fully decoded key to use for the current map.
        self._currentmapkey = None

        # Fully decoded values available for retrieval.
        self._decodedvalues = []

    @property
    def inprogress(self):
        """Whether the decoder has partially decoded a value."""
        return self._state != self._STATE_NONE

    def _begincollection(self, special, size):
        """Push a new, empty collection onto the stack and return it.

        ``special`` is one of ``SPECIAL_START_ARRAY``, ``SPECIAL_START_MAP``
        or ``SPECIAL_START_SET`` and ``size`` is the number of elements the
        collection will hold. The decoder state is switched to expect the
        first element of the new collection.

        This runs once per collection rather than once per item, so the
        extra function call is not on the per-item path.
        """
        if special == SPECIAL_START_ARRAY:
            newvalue = []
            state = self._STATE_WANT_ARRAY_VALUE
        elif special == SPECIAL_START_MAP:
            newvalue = {}
            state = self._STATE_WANT_MAP_KEY
        else:
            newvalue = set()
            state = self._STATE_WANT_SET_VALUE

        self._collectionstack.append({
            'remaining': size,
            'v': newvalue,
        })
        self._state = state

        return newvalue

    def _checkchunk(self, value):
        """Ensure ``value`` can be a chunk of an indefinite length bytestring.

        ``decodeitem()`` reports integers, bools and null with the same
        ``SPECIAL_NONE`` flag as definite length bytestrings. Only the
        latter may appear inside an indefinite length bytestring. Passing
        an integer on to ``bytes`` would produce that many zero bytes on
        Python 3, so reject such items explicitly.
        """
        if value is None or isinstance(value, self._INTEGERTYPES):
            raise CBORDecodeError('non-bytestring value seen inside '
                                  'indefinite length bytestring')

    def decode(self, b, offset=0):
        """Attempt to decode bytes from an input buffer.

        ``b`` is a collection of bytes and ``offset`` is the byte
        offset within that buffer from which to begin reading data.

        ``b`` must support ``len()`` and accessing bytes slices via
        ``__getitem__``. Typically ``bytes`` instances are used.

        Returns a tuple with the following fields:

        * Bool indicating whether values are available for retrieval.
        * Integer indicating the number of bytes that were fully consumed,
          starting from ``offset``.
        * Integer indicating the number of bytes that are desired for the
          next call in order to decode an item.

        Raises ``CBORDecodeError`` if the input is malformed or uses an
        unsupported construct.
        """
        if not b:
            return bool(self._decodedvalues), 0, 0

        initialoffset = offset

        collectionstarts = (
            SPECIAL_START_ARRAY,
            SPECIAL_START_MAP,
            SPECIAL_START_SET,
        )

        # We could easily split the body of this loop into a function. But
        # Python performance is sensitive to function calls and collections
        # are composed of many items. So leaving as a while loop could help
        # with performance. One thing that may not help is the use of
        # if..elif versus a lookup/dispatch table. There may be value
        # in switching that.
        while offset < len(b):
            # Attempt to decode an item. This could be a whole value or a
            # special value indicating an event, such as start or end of a
            # collection or indefinite length type.
            complete, value, readcount, special = decodeitem(b, offset)

            if readcount > 0:
                self.decodedbytecount += readcount

            if not complete:
                assert readcount < 0
                return (
                    bool(self._decodedvalues),
                    offset - initialoffset,
                    -readcount,
                )

            offset += readcount

            # No nested state. We either have a full value or beginning of a
            # complex value to deal with.
            if self._state == self._STATE_NONE:
                # A normal value.
                if special == SPECIAL_NONE:
                    self._decodedvalues.append(value)

                elif special in collectionstarts:
                    self._begincollection(special, value)

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    self._state = self._STATE_WANT_BYTESTRING_CHUNK_FIRST

                else:
                    raise CBORDecodeError('unhandled special state: %d' %
                                          special)

            # This value becomes an element of the current array.
            elif self._state == self._STATE_WANT_ARRAY_VALUE:
                # Simple values get appended.
                if special == SPECIAL_NONE:
                    c = self._collectionstack[-1]
                    c['v'].append(value)
                    c['remaining'] -= 1

                # A collection nested within the array. Grab the parent
                # before the new collection is pushed onto the stack.
                elif special in collectionstarts:
                    lastc = self._collectionstack[-1]
                    newvalue = self._begincollection(special, value)

                    lastc['v'].append(newvalue)
                    lastc['remaining'] -= 1

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError('indefinite length bytestrings '
                                          'not allowed as array values')

                else:
                    raise CBORDecodeError('unhandled special item when '
                                          'expecting array value: %d' % special)

            # This value becomes the key of the current map instance.
            elif self._state == self._STATE_WANT_MAP_KEY:
                if special == SPECIAL_NONE:
                    self._currentmapkey = value
                    self._state = self._STATE_WANT_MAP_VALUE

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError('indefinite length bytestrings '
                                          'not allowed as map keys')

                elif special in collectionstarts:
                    raise CBORDecodeError('collections not supported as map '
                                          'keys')

                # We do not allow special values to be used as map keys.
                else:
                    raise CBORDecodeError('unhandled special item when '
                                          'expecting map key: %d' % special)

            # This value becomes the value of the current map key.
            elif self._state == self._STATE_WANT_MAP_VALUE:
                # Simple values simply get inserted into the map.
                if special == SPECIAL_NONE:
                    lastc = self._collectionstack[-1]
                    lastc['v'][self._currentmapkey] = value
                    lastc['remaining'] -= 1

                    self._state = self._STATE_WANT_MAP_KEY

                # A new collection is used as the map value. Grab the parent
                # before the new collection is pushed onto the stack.
                elif special in collectionstarts:
                    lastc = self._collectionstack[-1]
                    newvalue = self._begincollection(special, value)

                    lastc['v'][self._currentmapkey] = newvalue
                    lastc['remaining'] -= 1

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError('indefinite length bytestrings not '
                                          'allowed as map values')

                else:
                    raise CBORDecodeError('unhandled special item when '
                                          'expecting map value: %d' % special)

                self._currentmapkey = None

            # This value is added to the current set.
            elif self._state == self._STATE_WANT_SET_VALUE:
                if special == SPECIAL_NONE:
                    lastc = self._collectionstack[-1]
                    lastc['v'].add(value)
                    lastc['remaining'] -= 1

                elif special == SPECIAL_START_INDEFINITE_BYTESTRING:
                    raise CBORDecodeError('indefinite length bytestrings not '
                                          'allowed as set values')

                elif special in collectionstarts:
                    raise CBORDecodeError('collections not allowed as set '
                                          'values')

                # We don't allow non-trivial types to exist as set values.
                else:
                    raise CBORDecodeError('unhandled special item when '
                                          'expecting set value: %d' % special)

            # This value represents the first chunk in an indefinite length
            # bytestring.
            elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_FIRST:
                # We received a full chunk.
                if special == SPECIAL_NONE:
                    self._checkchunk(value)
                    self._decodedvalues.append(bytestringchunk(value,
                                                               first=True))

                    self._state = self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT

                # The end of stream marker. This means it is an empty
                # indefinite length bytestring.
                elif special == SPECIAL_INDEFINITE_BREAK:
                    # We /could/ convert this to a b''. But we want to preserve
                    # the nature of the underlying data so consumers expecting
                    # an indefinite length bytestring get one.
                    self._decodedvalues.append(bytestringchunk(b'',
                                                               first=True,
                                                               last=True))

                    # Since indefinite length bytestrings can't be used in
                    # collections, we must be at the root level.
                    assert not self._collectionstack
                    self._state = self._STATE_NONE

                else:
                    raise CBORDecodeError('unexpected special value when '
                                          'expecting bytestring chunk: %d' %
                                          special)

            # This value represents the non-initial chunk in an indefinite
            # length bytestring.
            elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT:
                # We received a full chunk.
                if special == SPECIAL_NONE:
                    self._checkchunk(value)
                    self._decodedvalues.append(bytestringchunk(value))

                # The end of stream marker.
                elif special == SPECIAL_INDEFINITE_BREAK:
                    self._decodedvalues.append(bytestringchunk(b'', last=True))

                    # Since indefinite length bytestrings can't be used in
                    # collections, we must be at the root level.
                    assert not self._collectionstack
                    self._state = self._STATE_NONE

                else:
                    raise CBORDecodeError('unexpected special value when '
                                          'expecting bytestring chunk: %d' %
                                          special)

            else:
                raise CBORDecodeError('unhandled decoder state: %d' %
                                      self._state)

            # We could have just added the final value in a collection. End
            # all complete collections at the top of the stack.
            while True:
                # Bail if we're not waiting on a new collection item.
                if self._state not in (self._STATE_WANT_ARRAY_VALUE,
                                       self._STATE_WANT_MAP_KEY,
                                       self._STATE_WANT_SET_VALUE):
                    break

                # Or we are expecting more items for this collection.
                lastc = self._collectionstack[-1]

                if lastc['remaining']:
                    break

                # The collection at the top of the stack is complete.

                # Discard it, as it isn't needed for future items.
                self._collectionstack.pop()

                # If this is a nested collection, we don't emit it, since it
                # will be emitted by its parent collection. But we do need to
                # update state to reflect what the new top-most collection
                # on the stack is.
                if self._collectionstack:
                    self._state = {
                        list: self._STATE_WANT_ARRAY_VALUE,
                        dict: self._STATE_WANT_MAP_KEY,
                        set: self._STATE_WANT_SET_VALUE,
                    }[type(self._collectionstack[-1]['v'])]

                # If this is the root collection, emit it.
                else:
                    self._decodedvalues.append(lastc['v'])
                    self._state = self._STATE_NONE

        return (
            bool(self._decodedvalues),
            offset - initialoffset,
            0,
        )

    def getavailable(self):
        """Returns a list of the fully decoded values.

        Once values are retrieved, they won't be available on the next call.
        """
        # Hand the accumulated list to the caller and start a fresh one.
        values, self._decodedvalues = self._decodedvalues, []
        return values
952
def decodeall(b):
    """Decode all CBOR items present in a buffer of bytes.

    Returns a list of the decoded values, which is empty if ``b`` is
    empty.

    In addition to regular decode errors, raises CBORDecodeError if the
    entirety of the passed buffer does not fully decode to complete CBOR
    values. This includes failure to decode any value, incomplete collection
    types, incomplete indefinite length items, and extra data at the end of
    the buffer.
    """
    if not b:
        return []

    decoder = sansiodecoder()

    # Only the consumed byte count is needed; completeness of the final
    # value is checked through ``inprogress`` below.
    readcount = decoder.decode(b)[1]

    if readcount != len(b):
        raise CBORDecodeError('input data not fully consumed')

    if decoder.inprogress:
        raise CBORDecodeError('input data not complete')

    return decoder.getavailable()