Mercurial > hg
changeset 26875:cf47bdb2183c
encoding: add getutf8char helper
This allows us to find character boundaries in byte strings when
trying to do custom encodings.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Thu, 05 Nov 2015 16:48:46 -0600 |
parents | 853154f27525 |
children | b8381832ce2b |
files | mercurial/encoding.py |
diffstat | 1 files changed, 19 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# --- a/mercurial/encoding.py Sat Nov 07 16:03:09 2015 -0500
# +++ b/mercurial/encoding.py Thu Nov 05 16:48:46 2015 -0600
# @@ -414,6 +414,25 @@
# (unchanged context before the addition)
#     return ''.join(_jsonmap[c] for c in toutf8b(s))

# Number of bytes to try decoding, indexed by the high nibble of the first
# byte: 0 marks a plain ASCII byte (0x00-0x7f); 1 is used for 0x80-0xbf so
# that the single byte is handed to the decoder, which rejects it; 2, 3 and 4
# are the sequence lengths attempted for 0xc0-0xdf, 0xe0-0xef and 0xf0-0xff.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    s is a byte string and pos an index into it; a negative pos counts from
    the end of s, as with ordinary indexing. The return value is the slice
    of s holding the character that starts at pos (one byte for ASCII).

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character, including when the string ends before the sequence is
    complete. Raises an IndexError if pos is outside the string.
    '''

    # Normalise pos so that the slices below never wrap around the end of
    # the string (s[-2:0] would silently be empty), and keep the IndexError
    # that plain indexing gives for an out-of-range position.
    size = len(s)
    if pos < 0:
        pos += size
    if not 0 <= pos < size:
        raise IndexError("string index out of range")

    # A one-byte slice, rather than s[pos], is a byte string on both
    # Python 2 and Python 3, so ord() accepts it and it can be returned.
    first = s[pos:pos + 1]

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(first) >> 4]
    if not l: # ascii
        return first

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c

# (unchanged context after the addition; truncated in this view)
# def toutf8b(s):
#     '''convert a local, possibly-binary string into UTF-8b