--- a/contrib/byteify-strings.py Fri Jun 01 00:13:55 2018 +0900
+++ b/contrib/byteify-strings.py Sun Jun 03 18:19:54 2018 +0900
@@ -23,155 +23,166 @@
     return t._replace(start=(t.start[0], t.start[1] + ofs),
                       end=(t.end[0], t.end[1] + ofs))
-if True:
-    def replacetokens(tokens, opts):
-        """Transform a stream of tokens from raw to Python 3.
+def replacetokens(tokens, opts):
+ """Transform a stream of tokens from raw to Python 3.
+
+ Returns a generator of possibly rewritten tokens.
+
+ The input token list may be mutated as part of processing. However,
+ its changes do not necessarily match the output token stream.
+ """
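+    # tokens marked as system strings by _ensuresysstr(); the transformer
+    # below leaves these unprefixed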
+ sysstrtokens = set()
-        Returns a generator of possibly rewritten tokens.
+    # The following utility functions access the tokens list and the i index
+    # of the "for i, t in enumerate(tokens)" loop below
+ def _isop(j, *o):
+        """Check whether tokens[j] is an OP with one of the given values"""
+ try:
+ return tokens[j].type == token.OP and tokens[j].string in o
+ except IndexError:
+ return False
-        The input token list may be mutated as part of processing. However,
-        its changes do not necessarily match the output token stream.
-        """
-        sysstrtokens = set()
+ def _findargnofcall(n):
+ """Find arg n of a call expression (start at 0)
+
+        Returns the index of the first token of that argument, or None if
+        there are not that many arguments.
+
+        Assumes that tokens[i + 1] is '('.
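+
+        For example, given the tokens of "foo(a, (b, c), d)" with i at
+        "foo", _findargnofcall(2) returns the index of the "d" token.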
-        # The following utility functions access the tokens list and i index of
-        # the for i, t enumerate(tokens) loop below
-        def _isop(j, *o):
-            """Assert that tokens[j] is an OP with one of the given values"""
-            try:
-                return tokens[j].type == token.OP and tokens[j].string in o
-            except IndexError:
-                return False
+ """
+ nested = 0
+ for j in range(i + 2, len(tokens)):
+ if _isop(j, ')', ']', '}'):
+ # end of call, tuple, subscription or dict / set
+ nested -= 1
+ if nested < 0:
+ return None
+            elif n == 0:
+ # this is the starting position of arg
+ return j
+ elif _isop(j, '(', '[', '{'):
+ nested += 1
+ elif _isop(j, ',') and nested == 0:
+ n -= 1
-        def _findargnofcall(n):
-            """Find arg n of a call expression (start at 0)
+ return None
+
+ def _ensuresysstr(j):
+ """Make sure the token at j is a system string
-            Returns index of the first token of that argument, or None if
-            there is not that many arguments.
+ Remember the given token so the string transformer won't add
+ the byte prefix.
-            Assumes that token[i + 1] is '('.
+ Ignores tokens that are not strings. Assumes bounds checking has
+ already been done.
- """
- nested = 0
- for j in range(i + 2, len(tokens)):
- if _isop(j, ')', ']', '}'):
- # end of call, tuple, subscription or dict / set
- nested -= 1
- if nested < 0:
- return None
- elif n == 0:
- # this is the starting position of arg
- return j
- elif _isop(j, '(', '[', '{'):
- nested += 1
- elif _isop(j, ',') and nested == 0:
- n -= 1
+ """
+ st = tokens[j]
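+        # already-prefixed literals such as b'' or r'' are left alone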
+ if st.type == token.STRING and st.string.startswith(("'", '"')):
+ sysstrtokens.add(st)
-            return None
-
-        def _ensuresysstr(j):
-            """Make sure the token at j is a system string
-
-            Remember the given token so the string transformer won't add
-            the byte prefix.
-
-            Ignores tokens that are not strings. Assumes bounds checking has
-            already been done.
+ coldelta = 0 # column increment for new opening parens
+ coloffset = -1 # column offset for the current line (-1: TBD)
+ parens = [(0, 0, 0)] # stack of (line, end-column, column-offset)
+ for i, t in enumerate(tokens):
+ # Compute the column offset for the current line, such that
+ # the current line will be aligned to the last opening paren
+ # as before.
+ if coloffset < 0:
+ if t.start[1] == parens[-1][1]:
+ coloffset = parens[-1][2]
+ elif t.start[1] + 1 == parens[-1][1]:
+                # fix indent misaligned by one column, e.g. left over
+                # from an old s/util.Abort/error.Abort/ rename
+                coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
+ else:
+ coloffset = 0
- """
- st = tokens[j]
- if st.type == token.STRING and st.string.startswith(("'", '"')):
- sysstrtokens.add(st)
+ # Reset per-line attributes at EOL.
+ if t.type in (token.NEWLINE, tokenize.NL):
+ yield adjusttokenpos(t, coloffset)
+ coldelta = 0
+ coloffset = -1
+ continue
+
+ # Remember the last paren position.
+ if _isop(i, '(', '[', '{'):
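+            # include coldelta so lines aligned to this paren also account
+            # for any b'' prefixes inserted earlier on this line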
+ parens.append(t.end + (coloffset + coldelta,))
+ elif _isop(i, ')', ']', '}'):
+ parens.pop()
-        coldelta = 0 # column increment for new opening parens
-        coloffset = -1 # column offset for the current line (-1: TBD)
-        parens = [(0, 0, 0)] # stack of (line, end-column, column-offset)
-        for i, t in enumerate(tokens):
-            # Compute the column offset for the current line, such that
-            # the current line will be aligned to the last opening paren
-            # as before.
-            if coloffset < 0:
-                if t.start[1] == parens[-1][1]:
-                    coloffset = parens[-1][2]
-                elif t.start[1] + 1 == parens[-1][1]:
-                    # fix misaligned indent of s/util.Abort/error.Abort/
-                    coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
-                else:
-                    coloffset = 0
+ # Convert most string literals to byte literals. String literals
+ # in Python 2 are bytes. String literals in Python 3 are unicode.
+ # Most strings in Mercurial are bytes and unicode strings are rare.
+ # Rather than rewrite all string literals to use ``b''`` to indicate
+ # byte strings, we apply this token transformer to insert the ``b``
+ # prefix nearly everywhere.
+ if t.type == token.STRING and t not in sysstrtokens:
+ s = t.string
-            # Reset per-line attributes at EOL.
-            if t.type in (token.NEWLINE, tokenize.NL):
+ # Preserve docstrings as string literals. This is inconsistent
+ # with regular unprefixed strings. However, the
+ # "from __future__" parsing (which allows a module docstring to
+ # exist before it) doesn't properly handle the docstring if it
+ # is b''' prefixed, leading to a SyntaxError. We leave all
+ # docstrings as unprefixed to avoid this. This means Mercurial
+ # components touching docstrings need to handle unicode,
+ # unfortunately.
+ if s[0:3] in ("'''", '"""'):
+                yield adjusttokenpos(t, coloffset)
-                coldelta = 0
-                coloffset = -1
+                continue
-            # Remember the last paren position.
-            if _isop(i, '(', '[', '{'):
-                parens.append(t.end + (coloffset + coldelta,))
-            elif _isop(i, ')', ']', '}'):
-                parens.pop()
-
-            # Convert most string literals to byte literals. String literals
-            # in Python 2 are bytes. String literals in Python 3 are unicode.
-            # Most strings in Mercurial are bytes and unicode strings are rare.
-            # Rather than rewrite all string literals to use ``b''`` to indicate
-            # byte strings, we apply this token transformer to insert the ``b``
-            # prefix nearly everywhere.
-            if t.type == token.STRING and t not in sysstrtokens:
-                s = t.string
-
-                # Preserve docstrings as string literals. This is inconsistent
-                # with regular unprefixed strings. However, the
-                # "from __future__" parsing (which allows a module docstring to
-                # exist before it) doesn't properly handle the docstring if it
-                # is b''' prefixed, leading to a SyntaxError. We leave all
-                # docstrings as unprefixed to avoid this. This means Mercurial
-                # components touching docstrings need to handle unicode,
-                # unfortunately.
-                if s[0:3] in ("'''", '"""'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # If the first character isn't a quote, it is likely a string
-                # prefixing character (such as 'b', 'u', or 'r'. Ignore.
-                if s[0] not in ("'", '"'):
-                    yield adjusttokenpos(t, coloffset)
-                    continue
-
-                # String literal. Prefix to make a b'' string.
-                yield adjusttokenpos(t._replace(string='b%s' % t.string),
-                                     coloffset)
-                coldelta += 1
+ # If the first character isn't a quote, it is likely a string
+            # prefixing character (such as 'b', 'u', or 'r'). Ignore.
+ if s[0] not in ("'", '"'):
+ yield adjusttokenpos(t, coloffset)
+                continue
-            # This looks like a function call.
-            if t.type == token.NAME and _isop(i + 1, '('):
-                fn = t.string
+ # String literal. Prefix to make a b'' string.
+ yield adjusttokenpos(t._replace(string='b%s' % t.string),
+ coloffset)
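+            # the inserted 'b' shifts the rest of this line one column right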
+ coldelta += 1
+ continue
-                # *attr() builtins don't accept byte strings to 2nd argument.
-                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
-                        not _isop(i - 1, '.')):
-                    arg1idx = _findargnofcall(1)
-                    if arg1idx is not None:
-                        _ensuresysstr(arg1idx)
+ # This looks like a function call.
+ if t.type == token.NAME and _isop(i + 1, '('):
+ fn = t.string
+
+            # *attr() builtins don't accept byte strings as their 2nd argument.
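+            # (e.g. getattr(obj, 'foo') must keep 'foo' a str)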
+ if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
+ not _isop(i - 1, '.')):
+ arg1idx = _findargnofcall(1)
+ if arg1idx is not None:
+ _ensuresysstr(arg1idx)
-                # .encode() and .decode() on str/bytes/unicode don't accept
-                # byte strings on Python 3.
-                elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
-                    for argn in range(2):
-                        argidx = _findargnofcall(argn)
-                        if argidx is not None:
-                            _ensuresysstr(argidx)
+ # .encode() and .decode() on str/bytes/unicode don't accept
+ # byte strings on Python 3.
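+            # (e.g. u.encode('utf-8') needs a str encoding name)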
+ elif fn in ('encode', 'decode') and _isop(i - 1, '.'):
+ for argn in range(2):
+ argidx = _findargnofcall(argn)
+ if argidx is not None:
+ _ensuresysstr(argidx)
-                # It changes iteritems/values to items/values as they are not
-                # present in Python 3 world.
-                elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
-                    yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
-                    continue
+            # Rewrite iteritems and itervalues to items and values, as the
+            # former do not exist in Python 3.
+ elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
+ yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
+ continue
-            # Emit unmodified token.
-            yield adjusttokenpos(t, coloffset)
+ # Emit unmodified token.
+ yield adjusttokenpos(t, coloffset)
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)