changeset 26875:cf47bdb2183c

encoding: add getutf8char helper. This allows us to find character boundaries in byte strings when trying to do custom encodings.
author Matt Mackall <mpm@selenic.com>
date Thu, 05 Nov 2015 16:48:46 -0600
parents 853154f27525
children b8381832ce2b
files mercurial/encoding.py
diffstat 1 files changed, 19 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/encoding.py	Sat Nov 07 16:03:09 2015 -0500
+++ b/mercurial/encoding.py	Thu Nov 05 16:48:46 2015 -0600
@@ -414,6 +414,25 @@
 
     return ''.join(_jsonmap[c] for c in toutf8b(s))
 
+_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
+
+def getutf8char(s, pos):
+    '''get the next full utf-8 character in the given string, starting at pos
+
+    Raises a UnicodeError if the given location does not start a valid
+    utf-8 character.
+    '''
+
+    # find how many bytes to attempt decoding from first nibble
+    l = _utf8len[ord(s[pos]) >> 4]
+    if not l: # ascii
+        return s[pos]
+
+    c = s[pos:pos + l]
+    # validate with attempted decode
+    c.decode("utf-8")
+    return c
+
 def toutf8b(s):
     '''convert a local, possibly-binary string into UTF-8b