Mercurial > hg
changeset 11747:40d5633889bb
hgfixes: add a fixer to convert plain strings to bytestrings
This patch implements a 2to3 fixer that converts all plain strings in a python
source file to byte strings syntax. Example:
foo = 'Normal string'
would become
foo = b'Normal string'
The motivation behind this fixer can be found in
http://selenic.com/pipermail/mercurial-devel/2010-June/022363.html or, in other
words: the current hg source assumes that _most_ strings are "meant" to be byte
sequences, so it makes sense to make the convertion implemented by this patch.
As mentioned above, not all mercurial modules want to use strings as bytes,
examples include i18n (which uses unicode), and demandimport (in py3k, module
names are normal strings, thus unicode, and there's no need for a convertion).
Therefore, these modules are blacklisted in the fixer. There are also a few
functions that can take only unicode arguments, thus the convertion shouldn't
be done for those.
author | Renato Cunha <renatoc@gmail.com> |
---|---|
date | Tue, 03 Aug 2010 13:41:47 -0300 |
parents | 46ac30b17978 |
children | 37a70a784397 |
files | contrib/hgfixes/fix_bytes.py |
diffstat | 1 files changed, 96 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/hgfixes/fix_bytes.py Tue Aug 03 13:41:47 2010 -0300 @@ -0,0 +1,96 @@ +"""Fixer that changes plain strings to bytes strings.""" + +import re + +from lib2to3 import fixer_base +from lib2to3.pgen2 import token +from lib2to3.fixer_util import Name +from lib2to3.pygram import python_symbols as syms + +_re = re.compile(r'[rR]?[\'\"]') + +# XXX: Implementing a blacklist in 2to3 turned out to be more troublesome than +# blacklisting some modules inside the fixers. So, this is what I came with. + +blacklist = ['mercurial/demandimport.py', + 'mercurial/i18n.py', + ] + +def isdocstring(node): + def isclassorfunction(ancestor): + symbols = (syms.funcdef, syms.classdef) + # if the current node is a child of a function definition, a class + # definition or a file, then it is a docstring + if ancestor.type == syms.simple_stmt: + try: + while True: + if ancestor.type in symbols: + return True + ancestor = ancestor.parent + except AttributeError: + return False + return False + + def ismodule(ancestor): + # Our child is a docstring if we are a simple statement, and our + # ancestor is file_input. In other words, our child is a lone string in + # the source file. + try: + if (ancestor.type == syms.simple_stmt and + ancestor.parent.type == syms.file_input): + return True + except AttributeError: + return False + + def isdocassignment(ancestor): + # Assigning to __doc__, definitely a string + try: + while True: + if (ancestor.type == syms.expr_stmt and + Name('__doc__') in ancestor.children): + return True + ancestor = ancestor.parent + except AttributeError: + return False + + if ismodule(node.parent) or \ + isdocassignment(node.parent) or \ + isclassorfunction(node.parent): + return True + return False + +def shouldtransform(node): + specialnames = ['__main__'] + + if node.value in specialnames: + return False + + ggparent = node.parent.parent.parent + sggparent = str(ggparent) + + if 'getattr' in sggparent or \ + 'hasattr' in sggparent or \ + 'setattr' in sggparent or \ + 'encode' in sggparent or \ + 'decode' in sggparent: + return False + + return True + +class FixBytes(fixer_base.BaseFix): + + PATTERN = 'STRING' + + def transform(self, node, results): + if self.filename in blacklist: + return + if node.type == token.STRING: + if _re.match(node.value): + if isdocstring(node): + return + if not shouldtransform(node): + return + new = node.clone() + new.value = 'b' + new.value + return new +