equal
deleted
inserted
replaced
461 arbitrary bytes into an internal Unicode format that can be |
461 arbitrary bytes into an internal Unicode format that can be |
462 re-encoded back into the original. Here we are exposing the |
462 re-encoded back into the original. Here we are exposing the |
463 internal surrogate encoding as a UTF-8 string.) |
463 internal surrogate encoding as a UTF-8 string.) |
464 ''' |
464 ''' |
465 |
465 |
466 if isinstance(s, localstr): |
466 if "\xed" not in s: |
467 return s._utf8 |
467 if isinstance(s, localstr): |
468 |
468 return s._utf8 |
469 try: |
469 try: |
470 s.decode('utf-8') |
470 s.decode('utf-8') |
471 return s |
471 return s |
472 except UnicodeDecodeError: |
472 except UnicodeDecodeError: |
473 pass |
473 pass |
474 |
474 |
475 r = "" |
475 r = "" |
476 pos = 0 |
476 pos = 0 |
477 l = len(s) |
477 l = len(s) |
478 while pos < l: |
478 while pos < l: |
479 try: |
479 try: |
480 c = getutf8char(s, pos) |
480 c = getutf8char(s, pos) |
481 pos += len(c) |
481 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
|
482 # have to re-escape existing U+DCxx characters |
|
483 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
|
484 pos += 1 |
|
485 else: |
|
486 pos += len(c) |
482 except UnicodeDecodeError: |
487 except UnicodeDecodeError: |
483 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
488 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
484 pos += 1 |
489 pos += 1 |
485 r += c |
490 r += c |
486 return r |
491 return r |