byteify-strings: do not rewrite system string literals to u''
author Yuya Nishihara <yuya@tcha.org>
date Thu, 31 May 2018 23:44:35 +0900
changeset 38389 1d68fd5f614a
parent 38388 f701bc936e7f
child 38390 47dd23e6b116
byteify-strings: do not rewrite system string literals to u''

It would make things worse on Python 2 because unicode processing is generally slower than byte string processing. We should just leave system strings unmodified.
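The mechanics are easy to miss in the diff: rather than rewriting the token text to u'...', the transformer now records the token object in the sysstrtokens set and consults that set before adding the b prefix. Below is a minimal, self-contained sketch of that set-based technique; the byteify() function and its simplified .encode()/.decode() detection are illustrative stand-ins for the script's real helpers such as _findargnofcall(), and docstring handling is omitted.

    import io
    import token
    import tokenize

    def byteify(source):
        """Prefix plain string literals with b'', skipping system strings."""
        readline = io.BytesIO(source.encode('utf-8')).readline
        tokens = list(tokenize.tokenize(readline))
        sysstrtokens = set()  # tokens to leave as native (system) strings

        # First pass: remember string arguments that must stay system
        # strings. Only the simple .encode('...')/.decode('...') shape is
        # detected here; the real script covers more call sites.
        for i, t in enumerate(tokens):
            if (t.type == token.NAME and t.string in ('encode', 'decode')
                    and i > 0 and tokens[i - 1].string == '.'
                    and i + 2 < len(tokens) and tokens[i + 1].string == '('
                    and tokens[i + 2].type == token.STRING):
                sysstrtokens.add(tokens[i + 2])

        # Second pass: every other plain string literal gains the b prefix;
        # remembered tokens pass through unmodified.
        out = []
        for t in tokens:
            if (t.type == token.STRING and t not in sysstrtokens
                    and t.string.startswith(("'", '"'))):
                t = t._replace(string='b%s' % t.string)
            out.append(t)
        return tokenize.untokenize(out).decode('utf-8')

For example, byteify("s = s.encode('utf-8') + 'raw'\n") leaves 'utf-8' untouched while 'raw' becomes b'raw'. Because tokenize.TokenInfo is a named tuple compared by type, string, and position, set membership identifies exactly the occurrences remembered by the first pass.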
contrib/byteify-strings.py
--- a/contrib/byteify-strings.py	Thu May 31 22:34:23 2018 +0900
+++ b/contrib/byteify-strings.py	Thu May 31 23:44:35 2018 +0900
@@ -27,6 +27,8 @@
         The input token list may be mutated as part of processing. However,
         its changes do not necessarily match the output token stream.
         """
+        sysstrtokens = set()
+
         # The following utility functions access the tokens list and i index of
         # the for i, t in enumerate(tokens) loop below
         def _isop(j, *o):
@@ -62,11 +64,11 @@
 
             return None
 
-        def _ensureunicode(j):
-            """Make sure the token at j is a unicode string
+        def _ensuresysstr(j):
+            """Make sure the token at j is a system string
 
-            This rewrites a string token to include the unicode literal prefix
-            so the string transformer won't add the byte prefix.
+            Remember the given token so the string transformer won't add
+            the byte prefix.
 
             Ignores tokens that are not strings. Assumes bounds checking has
             already been done.
@@ -74,7 +76,7 @@
             """
             st = tokens[j]
             if st.type == token.STRING and st.string.startswith(("'", '"')):
-                tokens[j] = st._replace(string='u%s' % st.string)
+                sysstrtokens.add(st)
 
         for i, t in enumerate(tokens):
             # Convert most string literals to byte literals. String literals
@@ -83,7 +85,7 @@
             # Rather than rewrite all string literals to use ``b''`` to indicate
             # byte strings, we apply this token transformer to insert the ``b``
             # prefix nearly everywhere.
-            if t.type == token.STRING:
+            if t.type == token.STRING and t not in sysstrtokens:
                 s = t.string
 
                 # Preserve docstrings as string literals. This is inconsistent
@@ -117,7 +119,7 @@
                         not _isop(i - 1, '.')):
                     arg1idx = _findargnofcall(1)
                     if arg1idx is not None:
-                        _ensureunicode(arg1idx)
+                        _ensuresysstr(arg1idx)
 
                 # .encode() and .decode() on str/bytes/unicode don't accept
                 # byte strings on Python 3.
@@ -125,7 +127,7 @@
                     for argn in range(2):
                         argidx = _findargnofcall(argn)
                         if argidx is not None:
-                            _ensureunicode(argidx)
+                            _ensuresysstr(argidx)
 
                 # It changes iteritems/values to items/values as they are
                 # not present in Python 3.
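
The net effect of the change, illustrated on two toy lines (hand-written for illustration, not actual script output):

    source:            getattr(obj, 'name')
    old transformer:   getattr(obj, u'name')   # u'' literal, slower on Python 2
    new transformer:   getattr(obj, 'name')    # left as a system string

    source:            ui.write('hello')
    transformed:       ui.write(b'hello')      # ordinary literals still get b''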