mercurial/encoding.py
branch stable
changeset 38739 7acec9408e1c
parent 38615 443029011990
parent 37947 3ea3c96ada54
child 38783 e7aa113b14f7
equal deleted inserted replaced
38615:443029011990 38739:7acec9408e1c
    95         s = bytes.__new__(cls, l)
    95         s = bytes.__new__(cls, l)
    96         s._utf8 = u
    96         s._utf8 = u
    97         return s
    97         return s
    98     def __hash__(self):
    98     def __hash__(self):
    99         return hash(self._utf8) # avoid collisions in local string space
    99         return hash(self._utf8) # avoid collisions in local string space
       
   100 
       
   101 class safelocalstr(bytes):
       
   102     """Tagged string denoting it was previously an internal UTF-8 string,
       
   103     and can be converted back to UTF-8 losslessly
       
   104 
       
   105     >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
       
   106     >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
       
   107     >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
       
   108     >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
       
   109     """
   100 
   110 
   101 def tolocal(s):
   111 def tolocal(s):
   102     """
   112     """
   103     Convert a string from internal UTF-8 to local encoding
   113     Convert a string from internal UTF-8 to local encoding
   104 
   114 
   143                 # fast path
   153                 # fast path
   144                 return s
   154                 return s
   145             r = u.encode(_sysstr(encoding), u"replace")
   155             r = u.encode(_sysstr(encoding), u"replace")
   146             if u == r.decode(_sysstr(encoding)):
   156             if u == r.decode(_sysstr(encoding)):
   147                 # r is a safe, non-lossy encoding of s
   157                 # r is a safe, non-lossy encoding of s
   148                 return r
   158                 return safelocalstr(r)
   149             return localstr(s, r)
   159             return localstr(s, r)
   150         except UnicodeDecodeError:
   160         except UnicodeDecodeError:
   151             # we should only get here if we're looking at an ancient changeset
   161             # we should only get here if we're looking at an ancient changeset
   152             try:
   162             try:
   153                 u = s.decode(_sysstr(fallbackencoding))
   163                 u = s.decode(_sysstr(fallbackencoding))
   154                 r = u.encode(_sysstr(encoding), u"replace")
   164                 r = u.encode(_sysstr(encoding), u"replace")
   155                 if u == r.decode(_sysstr(encoding)):
   165                 if u == r.decode(_sysstr(encoding)):
   156                     # r is a safe, non-lossy encoding of s
   166                     # r is a safe, non-lossy encoding of s
   157                     return r
   167                     return safelocalstr(r)
   158                 return localstr(u.encode('UTF-8'), r)
   168                 return localstr(u.encode('UTF-8'), r)
   159             except UnicodeDecodeError:
   169             except UnicodeDecodeError:
   160                 u = s.decode("utf-8", "replace") # last ditch
   170                 u = s.decode("utf-8", "replace") # last ditch
   161                 # can't round-trip
   171                 # can't round-trip
   162                 return u.encode(_sysstr(encoding), u"replace")
   172                 return u.encode(_sysstr(encoding), u"replace")
   405     '''returns a string suitable for JSON
   415     '''returns a string suitable for JSON
   406 
   416 
   407     JSON is problematic for us because it doesn't support non-Unicode
   417     JSON is problematic for us because it doesn't support non-Unicode
   408     bytes. To deal with this, we take the following approach:
   418     bytes. To deal with this, we take the following approach:
   409 
   419 
   410     - localstr objects are converted back to UTF-8
   420     - localstr/safelocalstr objects are converted back to UTF-8
   411     - valid UTF-8/ASCII strings are passed as-is
   421     - valid UTF-8/ASCII strings are passed as-is
   412     - other strings are converted to UTF-8b surrogate encoding
   422     - other strings are converted to UTF-8b surrogate encoding
   413     - apply JSON-specified string escaping
   423     - apply JSON-specified string escaping
   414 
   424 
   415     (escapes are doubled in these tests)
   425     (escapes are doubled in these tests)
   498     - filenames and file contents in arbitrary other encodings can
   508     - filenames and file contents in arbitrary other encodings can
   499       be round-tripped or recovered by clueful clients
   509       be round-tripped or recovered by clueful clients
   500     - local strings that have a cached known UTF-8 encoding (aka
   510     - local strings that have a cached known UTF-8 encoding (aka
   501       localstr) get sent as UTF-8 so Unicode-oriented clients get the
   511       localstr) get sent as UTF-8 so Unicode-oriented clients get the
   502       Unicode data they want
   512       Unicode data they want
       
   513     - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
   503     - because we must preserve UTF-8 bytestring in places such as
   514     - because we must preserve UTF-8 bytestring in places such as
   504       filenames, metadata can't be roundtripped without help
   515       filenames, metadata can't be roundtripped without help
   505 
   516 
   506     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
   517     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
   507     arbitrary bytes into an internal Unicode format that can be
   518     arbitrary bytes into an internal Unicode format that can be
   508     re-encoded back into the original. Here we are exposing the
   519     re-encoded back into the original. Here we are exposing the
   509     internal surrogate encoding as a UTF-8 string.)
   520     internal surrogate encoding as a UTF-8 string.)
   510     '''
   521     '''
   511 
   522 
   512     if not isinstance(s, localstr) and isasciistr(s):
   523     if isinstance(s, localstr):
       
   524         # assume that the original UTF-8 sequence would never contain
       
   525         # invalid characters in U+DCxx range
       
   526         return s._utf8
       
   527     elif isinstance(s, safelocalstr):
       
   528         # already verified that s is non-lossy in legacy encoding, which
       
   529         # shouldn't contain characters in U+DCxx range
       
   530         return fromlocal(s)
       
   531     elif isasciistr(s):
   513         return s
   532         return s
   514     if "\xed" not in s:
   533     if "\xed" not in s:
   515         if isinstance(s, localstr):
       
   516             return s._utf8
       
   517         try:
   534         try:
   518             s.decode('utf-8', _utf8strict)
   535             s.decode('utf-8', _utf8strict)
   519             return s
   536             return s
   520         except UnicodeDecodeError:
   537         except UnicodeDecodeError:
   521             pass
   538             pass