changeset 38384:1d9c97db465f

byteify-strings: fork py3 code transformer to make it a standalone command

I'm thinking of making a one-off s/''/b''/g change for the overall codebase
to make the linter happy. We could do that without maintaining the script,
but I think it will be somewhat useful for extension authors. So it is in
contrib.
author Yuya Nishihara <yuya@tcha.org>
date Thu, 31 May 2018 22:07:04 +0900
parents 23dc901cdf13
children a2976c27dac4
files contrib/byteify-strings.py
diffstat 1 files changed, 154 insertions(+), 0 deletions(-)
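As a quick illustration of what the transformer below does (the sample input
is mine, not part of the changeset), a Python 2 snippet such as:

    name = 'foo'
    getattr(obj, 'close')()
    for k, v in d.iteritems():
        print(k)

would be rewritten to:

    name = b'foo'
    getattr(obj, u'close')()
    for k, v in d.items():
        print(k)

Plain string literals gain a b prefix, the second argument of the *attr()
builtins is forced to a unicode literal instead, and iteritems/itervalues
calls are renamed to their Python 3 spellings.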
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/contrib/byteify-strings.py	Thu May 31 22:07:04 2018 +0900
@@ -0,0 +1,154 @@
+# byteify-strings.py - transform string literals to be Python 3 safe
+#
+# Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import io
+import token
+import tokenize
+
+if True:  # no-op guard; keeps the forked code at its original indentation
+    def replacetokens(tokens, fullname):
+        """Transform a stream of tokens from raw to Python 3.
+
+        Returns a generator of possibly rewritten tokens.
+
+        The input token list may be mutated as part of processing. However,
+        its changes do not necessarily match the output token stream.
+        """
+        futureimpline = False
+
+        # The following utility functions access the tokens list and the i
+        # index of the for i, t in enumerate(tokens) loop below
+        def _isop(j, *o):
+            """Assert that tokens[j] is an OP with one of the given values"""
+            try:
+                return tokens[j].type == token.OP and tokens[j].string in o
+            except IndexError:
+                return False
+
+        def _findargnofcall(n):
+            """Find arg n of a call expression (start at 0)
+
+            Returns index of the first token of that argument, or None if
+            there are not that many arguments.
+
+            Assumes that tokens[i + 1] is '('.
+
+            """
+            nested = 0
+            for j in range(i + 2, len(tokens)):
+                if _isop(j, ')', ']', '}'):
+                    # end of call, tuple, subscription or dict / set
+                    nested -= 1
+                    if nested < 0:
+                        return None
+                elif n == 0:
+                    # this is the starting position of arg
+                    return j
+                elif _isop(j, '(', '[', '{'):
+                    nested += 1
+                elif _isop(j, ',') and nested == 0:
+                    n -= 1
+
+            return None
+
+        def _ensureunicode(j):
+            """Make sure the token at j is a unicode string
+
+            This rewrites a string token to include the unicode literal prefix
+            so the string transformer won't add the byte prefix.
+
+            Ignores tokens that are not strings. Assumes bounds checking has
+            already been done.
+
+            """
+            st = tokens[j]
+            if st.type == token.STRING and st.string.startswith(("'", '"')):
+                tokens[j] = st._replace(string='u%s' % st.string)
+
+        for i, t in enumerate(tokens):
+            # Convert most string literals to byte literals. String literals
+            # in Python 2 are bytes. String literals in Python 3 are unicode.
+            # Most strings in Mercurial are bytes and unicode strings are rare.
+            # Rather than rewrite all string literals to use ``b''`` to indicate
+            # byte strings, we apply this token transformer to insert the ``b``
+            # prefix nearly everywhere.
+            if t.type == token.STRING:
+                s = t.string
+
+                # Preserve docstrings as string literals. This is inconsistent
+                # with regular unprefixed strings. However, the
+                # "from __future__" parsing (which allows a module docstring to
+                # exist before it) doesn't properly handle the docstring if it
+                # is b''' prefixed, leading to a SyntaxError. We leave all
+                # docstrings as unprefixed to avoid this. This means Mercurial
+                # components touching docstrings need to handle unicode,
+                # unfortunately.
+                if s[0:3] in ("'''", '"""'):
+                    yield t
+                    continue
+
+                # If the first character isn't a quote, it is likely a string
+                # prefixing character (such as 'b', 'u', or 'r'). Ignore.
+                if s[0] not in ("'", '"'):
+                    yield t
+                    continue
+
+                # String literal. Prefix to make a b'' string.
+                yield t._replace(string='b%s' % t.string)
+                continue
+
+            # Insert compatibility imports at "from __future__ import" line.
+            # No '\n' should be added to preserve line numbers.
+            if (t.type == token.NAME and t.string == 'import' and
+                all(u.type == token.NAME for u in tokens[i - 2:i]) and
+                [u.string for u in tokens[i - 2:i]] == ['from', '__future__']):
+                futureimpline = True
+            if t.type == token.NEWLINE and futureimpline:
+                futureimpline = False
+                if fullname == 'mercurial.pycompat':
+                    yield t
+                    continue
+                r, c = t.start
+                l = (b'; from mercurial.pycompat import '
+                     b'delattr, getattr, hasattr, setattr, xrange, '
+                     b'open, unicode\n')
+                for u in tokenize.tokenize(io.BytesIO(l).readline):
+                    if u.type in (tokenize.ENCODING, token.ENDMARKER):
+                        continue
+                    yield u._replace(
+                        start=(r, c + u.start[1]), end=(r, c + u.end[1]))
+                continue
+
+            # This looks like a function call.
+            if t.type == token.NAME and _isop(i + 1, '('):
+                fn = t.string
+
+                # *attr() builtins don't accept byte strings as their 2nd argument.
+                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
+                        not _isop(i - 1, '.')):
+                    arg1idx = _findargnofcall(1)
+                    if arg1idx is not None:
+                        _ensureunicode(arg1idx)
+
+                # .encode() and .decode() on str/bytes/unicode don't accept
+                # byte strings on Python 3.
+                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
+                    for argn in range(2):
+                        argidx = _findargnofcall(argn)
+                        if argidx is not None:
+                            _ensureunicode(argidx)
+
+                # Rename iteritems/itervalues to items/values, as the former
+                # are not present in Python 3.
+                elif fn in ('iteritems', 'itervalues'):
+                    yield t._replace(string=fn[4:])
+                    continue
+
+            # Emit unmodified token.
+            yield t
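
The changeset itself stops at the replacetokens() generator; no command-line
front end is included yet. A minimal sketch of how it could be driven under
Python 3 (whose tokenize yields the named tuples the transformer expects);
the loading strategy, the byteify() helper, and the CLI handling here are my
assumptions, not code from this changeset:

    import importlib.util
    import io
    import sys
    import tokenize

    # The hyphen in byteify-strings.py rules out a plain import, so load the
    # module from its file path instead.
    spec = importlib.util.spec_from_file_location(
        'byteify_strings', 'contrib/byteify-strings.py')
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    def byteify(source, fullname):
        # tokenize.tokenize() wants a readline callable over bytes; the
        # leading ENCODING token passes through replacetokens() untouched,
        # so untokenize() round-trips back to bytes.
        tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
        return tokenize.untokenize(mod.replacetokens(tokens, fullname))

    if __name__ == '__main__':
        with open(sys.argv[1], 'rb') as f:
            data = f.read()
        sys.stdout.buffer.write(byteify(data, sys.argv[1]))

Because replacetokens() only rewrites tokens in place on their original rows
(no '\n' is ever added, as the comment in the code notes), the untokenized
output keeps the input's line numbers, which is what makes the one-off
s/''/b''/g pass mentioned in the description practical.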