mercurial/encoding.py
changeset 26879 a24b98f4e03c
parent 26878 d7e83f106459
child 26963 de5ae97ce9f4
equal deleted inserted replaced
26878:d7e83f106459 26879:a24b98f4e03c
   461     arbitrary bytes into an internal Unicode format that can be
   461     arbitrary bytes into an internal Unicode format that can be
   462     re-encoded back into the original. Here we are exposing the
   462     re-encoded back into the original. Here we are exposing the
   463     internal surrogate encoding as a UTF-8 string.)
   463     internal surrogate encoding as a UTF-8 string.)
   464     '''
   464     '''
   465 
   465 
   466     if isinstance(s, localstr):
   466     if "\xed" not in s:
   467         return s._utf8
   467         if isinstance(s, localstr):
   468 
   468             return s._utf8
   469     try:
   469         try:
   470         s.decode('utf-8')
   470             s.decode('utf-8')
   471         return s
   471             return s
   472     except UnicodeDecodeError:
   472         except UnicodeDecodeError:
   473         pass
   473             pass
   474 
   474 
   475     r = ""
   475     r = ""
   476     pos = 0
   476     pos = 0
   477     l = len(s)
   477     l = len(s)
   478     while pos < l:
   478     while pos < l:
   479         try:
   479         try:
   480             c = getutf8char(s, pos)
   480             c = getutf8char(s, pos)
   481             pos += len(c)
   481             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
       
   482                 # have to re-escape existing U+DCxx characters
       
   483                 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
       
   484                 pos += 1
       
   485             else:
       
   486                 pos += len(c)
   482         except UnicodeDecodeError:
   487         except UnicodeDecodeError:
   483             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
   488             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
   484             pos += 1
   489             pos += 1
   485         r += c
   490         r += c
   486     return r
   491     return r