contrib/hgfixes/fix_bytes.py
author Brodie Rao <brodie@sf.io>
Sun, 17 Nov 2013 18:04:28 -0500
changeset 20179 5bb3826bdac4
parent 17299 e51d4aedace9
child 21637 48ef68004ec9
permissions -rw-r--r--
revlog: read/cache chunks in fixed windows of 64 KB

When reading a revlog chunk, instead of reading up to 64 KB ahead of the request offset and caching that, this change caches a fixed window before and after the requested data that falls on 64 KB boundaries. This increases cache hits when reading revlogs backwards.

Running perfmoonwalk on the Mercurial repo (with almost 20,000 changesets) on Mac OS X with an SSD, before this change:

  $ hg perfmoonwalk
  ! wall 2.307994 comb 2.310000 user 2.120000 sys 0.190000 (best of 5)

(Each run has 10,668 cache hits and 9,304 misses.)

After this change:

  $ hg perfmoonwalk
  ! wall 1.814117 comb 1.810000 user 1.810000 sys 0.000000 (best of 6)

(19,931 cache hits, 62 misses.)

On a busy NFS share, before this change:

  $ hg perfmoonwalk
  ! wall 17.000034 comb 4.100000 user 3.270000 sys 0.830000 (best of 3)

After:

  $ hg perfmoonwalk
  ! wall 1.746115 comb 1.670000 user 1.660000 sys 0.010000 (best of 5)

"""Fixer that changes plain strings to bytes strings."""

import re

from lib2to3 import fixer_base
from lib2to3.pgen2 import token
from lib2to3.fixer_util import Name
from lib2to3.pygram import python_symbols as syms

# Matches the opening of a plain string literal: an optional raw prefix
# (r or R) immediately followed by a quote character. It is applied with
# match(), so a token that starts with any other prefix does not match.
_re = re.compile(r'[rR]?[\'\"]')

# XXX: Implementing a blacklist in 2to3 itself turned out to be more
# troublesome than blacklisting some modules inside the fixers, so the files
# this fixer must leave alone are listed here.
blacklist = [
    'mercurial/demandimport.py',
    'mercurial/py3kcompat.py',  # valid python 3 already
    'mercurial/i18n.py',
]

def isdocstring(node):
    """Tell whether the string leaf ``node`` should be treated as a docstring.

    The checks below are run, in this order, starting from ``node.parent``:

    - the parent is a simple statement located directly under ``file_input``
      (a lone string at module level);
    - the parent, or any node above it, is an ``expr_stmt`` that has the name
      ``__doc__`` among its direct children (an assignment to ``__doc__``);
    - the parent is a simple statement and a function or class definition is
      found somewhere above it.

    Returns True as soon as one check succeeds, False otherwise.
    """
    holder = node.parent

    def lonemodulestring(stmt):
        # A simple statement whose own parent is file_input: the string is a
        # lone string in the source file.
        try:
            return (stmt.type == syms.simple_stmt and
                    stmt.parent.type == syms.file_input)
        except AttributeError:
            return False

    def assignstodoc(current):
        # Assigning to __doc__, definitely a string. Climb the tree until an
        # expression statement mentioning __doc__ shows up.
        docname = Name('__doc__')
        try:
            while not (current.type == syms.expr_stmt and
                       docname in current.children):
                current = current.parent
        except AttributeError:
            # Walked past the top of the tree without finding one.
            return False
        return True

    def insidedeforclass(current):
        # Only a string hanging directly off a simple statement qualifies.
        if current.type != syms.simple_stmt:
            return False
        scopes = (syms.funcdef, syms.classdef)
        try:
            # Climb until a function or class definition is reached.
            while current.type not in scopes:
                current = current.parent
        except AttributeError:
            # Walked past the top of the tree: no enclosing definition.
            return False
        return True

    return (lonemodulestring(holder) or
            assignstodoc(holder) or
            insidedeforclass(holder))

# Splits a string token into optional prefix letters, the quote style and the
# text between the quotes (group 2).
_strtoken = re.compile(r'''^[a-zA-Z]*('{3}|"{3}|'|")(.*)\1$''', re.DOTALL)

def shouldtransform(node):
    """Decide whether the string leaf ``node`` may get a bytes prefix.

    Returns False when the literal spells one of the special names (currently
    only ``__main__``), or when the textual form of the node's
    great-grandparent contains ``getattr``, ``hasattr``, ``setattr``,
    ``encode`` or ``decode``. Returns True otherwise.

    Raises AttributeError if the node has fewer than two ancestors, because
    the great-grandparent lookup is done unconditionally.
    """
    specialnames = ['__main__']

    # node.value is the token as written in the source, quotes (and optional
    # prefix) included, so comparing it directly with '__main__' never
    # matches a real literal. Compare the text between the quotes as well;
    # the bare comparison is kept for values that carry no quotes.
    literal = _strtoken.match(node.value)
    content = literal.group(2) if literal else node.value
    if node.value in specialnames or content in specialnames:
        return False

    ggparent = node.parent.parent.parent
    sggparent = str(ggparent)

    # Plain substring heuristic on the surrounding code: attribute names
    # handed to getattr/hasattr/setattr and arguments of encode/decode have
    # to stay native strings.
    markers = ('getattr', 'hasattr', 'setattr', 'encode', 'decode')
    if any(marker in sggparent for marker in markers):
        return False

    return True

class FixBytes(fixer_base.BaseFix):
    """Fixer that puts a ``b`` prefix in front of plain string literals."""

    PATTERN = 'STRING'

    def transform(self, node, results):
        """Return a ``b``-prefixed clone of ``node``, or None to skip it.

        A leaf is skipped when the current file is blacklisted, when it is
        not a STRING token, when it does not start with an optional raw
        prefix followed by a quote, when it is a docstring, or when
        shouldtransform() rejects it. ``results`` is not used.
        """
        # Blacklisted files are never touched.
        if self.filename in blacklist:
            return None
        # Only string tokens that open with [rR]?quote are candidates.
        if node.type != token.STRING or not _re.match(node.value):
            return None
        # Docstrings and strings in sensitive contexts stay native strings.
        if isdocstring(node) or not shouldtransform(node):
            return None

        bytesnode = node.clone()
        bytesnode.value = 'b' + bytesnode.value
        return bytesnode