95 s = bytes.__new__(cls, l) |
95 s = bytes.__new__(cls, l) |
96 s._utf8 = u |
96 s._utf8 = u |
97 return s |
97 return s |
98 def __hash__(self): |
98 def __hash__(self): |
99 return hash(self._utf8) # avoid collisions in local string space |
99 return hash(self._utf8) # avoid collisions in local string space |
|
100 |
|
101 class safelocalstr(bytes): |
|
102 """Tagged string denoting it was previously an internal UTF-8 string, |
|
103 and can be converted back to UTF-8 losslessly |
|
104 |
|
105 >>> assert safelocalstr(b'\\xc3') == b'\\xc3' |
|
106 >>> assert b'\\xc3' == safelocalstr(b'\\xc3') |
|
107 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} |
|
108 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} |
|
109 """ |
100 |
110 |
101 def tolocal(s): |
111 def tolocal(s): |
102 """ |
112 """ |
103 Convert a string from internal UTF-8 to local encoding |
113 Convert a string from internal UTF-8 to local encoding |
104 |
114 |
143 # fast path |
153 # fast path |
144 return s |
154 return s |
145 r = u.encode(_sysstr(encoding), u"replace") |
155 r = u.encode(_sysstr(encoding), u"replace") |
146 if u == r.decode(_sysstr(encoding)): |
156 if u == r.decode(_sysstr(encoding)): |
147 # r is a safe, non-lossy encoding of s |
157 # r is a safe, non-lossy encoding of s |
148 return r |
158 return safelocalstr(r) |
149 return localstr(s, r) |
159 return localstr(s, r) |
150 except UnicodeDecodeError: |
160 except UnicodeDecodeError: |
151 # we should only get here if we're looking at an ancient changeset |
161 # we should only get here if we're looking at an ancient changeset |
152 try: |
162 try: |
153 u = s.decode(_sysstr(fallbackencoding)) |
163 u = s.decode(_sysstr(fallbackencoding)) |
154 r = u.encode(_sysstr(encoding), u"replace") |
164 r = u.encode(_sysstr(encoding), u"replace") |
155 if u == r.decode(_sysstr(encoding)): |
165 if u == r.decode(_sysstr(encoding)): |
156 # r is a safe, non-lossy encoding of s |
166 # r is a safe, non-lossy encoding of s |
157 return r |
167 return safelocalstr(r) |
158 return localstr(u.encode('UTF-8'), r) |
168 return localstr(u.encode('UTF-8'), r) |
159 except UnicodeDecodeError: |
169 except UnicodeDecodeError: |
160 u = s.decode("utf-8", "replace") # last ditch |
170 u = s.decode("utf-8", "replace") # last ditch |
161 # can't round-trip |
171 # can't round-trip |
162 return u.encode(_sysstr(encoding), u"replace") |
172 return u.encode(_sysstr(encoding), u"replace") |
405 '''returns a string suitable for JSON |
415 '''returns a string suitable for JSON |
406 |
416 |
407 JSON is problematic for us because it doesn't support non-Unicode |
417 JSON is problematic for us because it doesn't support non-Unicode |
408 bytes. To deal with this, we take the following approach: |
418 bytes. To deal with this, we take the following approach: |
409 |
419 |
410 - localstr objects are converted back to UTF-8 |
420 - localstr/safelocalstr objects are converted back to UTF-8 |
411 - valid UTF-8/ASCII strings are passed as-is |
421 - valid UTF-8/ASCII strings are passed as-is |
412 - other strings are converted to UTF-8b surrogate encoding |
422 - other strings are converted to UTF-8b surrogate encoding |
413 - apply JSON-specified string escaping |
423 - apply JSON-specified string escaping |
414 |
424 |
415 (escapes are doubled in these tests) |
425 (escapes are doubled in these tests) |
498 - filenames and file contents in arbitrary other encodings can |
508 - filenames and file contents in arbitrary other encodings can |
499 be round-tripped or recovered by clueful clients |
509 be round-tripped or recovered by clueful clients |
500 - local strings that have a cached known UTF-8 encoding (aka |
510 - local strings that have a cached known UTF-8 encoding (aka |
501 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
511 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
502 Unicode data they want |
512 Unicode data they want |
|
513 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well |
503 - because we must preserve UTF-8 bytestrings in places such as |
514 - because we must preserve UTF-8 bytestrings in places such as |
504 filenames, metadata can't be round-tripped without help |
515 filenames, metadata can't be round-tripped without help |
505 |
516 |
506 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
517 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
507 arbitrary bytes into an internal Unicode format that can be |
518 arbitrary bytes into an internal Unicode format that can be |
508 re-encoded back into the original. Here we are exposing the |
519 re-encoded back into the original. Here we are exposing the |
509 internal surrogate encoding as a UTF-8 string.) |
520 internal surrogate encoding as a UTF-8 string.) |
510 ''' |
521 ''' |
511 |
522 |
512 if not isinstance(s, localstr) and isasciistr(s): |
523 if isinstance(s, localstr): |
|
524 # assume that the original UTF-8 sequence would never contain |
|
525 # invalid characters in U+DCxx range |
|
526 return s._utf8 |
|
527 elif isinstance(s, safelocalstr): |
|
528 # already verified that s is non-lossy in legacy encoding, which |
|
529 # shouldn't contain characters in U+DCxx range |
|
530 return fromlocal(s) |
|
531 elif isasciistr(s): |
513 return s |
532 return s |
514 if "\xed" not in s: |
533 if "\xed" not in s: |
515 if isinstance(s, localstr): |
|
516 return s._utf8 |
|
517 try: |
534 try: |
518 s.decode('utf-8', _utf8strict) |
535 s.decode('utf-8', _utf8strict) |
519 return s |
536 return s |
520 except UnicodeDecodeError: |
537 except UnicodeDecodeError: |
521 pass |
538 pass |