tests/test-parseindex2.py
author Brodie Rao <brodie@bitheap.org>
Mon, 19 Sep 2011 15:58:03 -0700
changeset 15141 16dc9a32ca04
parent 13254 5ef5eb1f3515
child 16363 2cdd7e63211b
permissions -rw-r--r--
mdiff: speed up showfunc for large diffs This addresses the following issues with showfunc: - Silly usage of regular expressions. - Doing str.rstrip() needlessly in an inner loop. - Doing catastrophic backtracking when trying to find a function line. Finding function text is now at worst O(n lines in the old file), and at best close to O(n hunks). Given a diff like this[1]: src/main/antlr3/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunker.g | 4 +- src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerLexer.java | 2 +- src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerParser.java | 29189 +++++---- 3 files changed, 14741 insertions(+), 14454 deletions(-) [1]: https://bitbucket.org/wwmm/chemicaltagger/changeset/d2bfbaecd4fc/raw Without this change, hg log --stat --config diff.showfunc=1 takes an absurdly long time to complete: CallCount Recursive Total(ms) Inline(ms) module:lineno(function) 32813 0 80.3546 40.6086 mercurial.mdiff:160(yieldhunk) +65062746 0 25.7227 25.7227 +<method 'match' of '_sre.SRE_Pattern' objects> +65062746 0 14.0221 14.0221 +<method 'rstrip' of 'str' objects> +1809 0 0.0009 0.0009 +mercurial.mdiff:148(contextend) +1809 0 0.0003 0.0003 +<len> 65062746 0 25.7227 25.7227 <method 'match' of '_sre.SRE_Pattern' objects> 65062763 0 14.0221 14.0221 <method 'rstrip' of 'str' objects> 543 0 0.1631 0.1631 <zlib.decompress> 3 0 0.0505 0.0505 <mercurial.bdiff.blocks> 31007 0 80.4564 0.0477 mercurial.mdiff:147(_unidiff) +32813 0 80.3546 40.6086 +mercurial.mdiff:160(yieldhunk) +3 0 0.0505 0.0505 +<mercurial.bdiff.blocks> +3618 0 0.0022 0.0022 +mercurial.mdiff:154(contextstart) +5427 0 0.0013 0.0013 +<len> +3 0 0.0001 0.0000 +re:188(compile) 1 0 80.8381 0.0322 mercurial.patch:1777(diffstatdata) +107499 0 0.0235 0.0235 +<method 'startswith' of 'str' objects> +31014 0 80.7820 0.0071 +mercurial.util:1284(iterlines) +3 0 0.0000 0.0000 +<method 'search' of '_sre.SRE_Pattern' objects> +4 0 0.0000 0.0000 +mercurial.patch:1783(addresult) +3 0 0.0000 0.0000 +<method 'group' of '_sre.SRE_Match' objects> 6 0 0.0444 0.0283 mercurial.mdiff:12(splitnewlines) +6 0 0.0160 0.0160 +<method 'split' of 'str' objects> 32 0 0.0246 0.0246 <method 'update' of '_hashlib.HASH' objects> 11 0 0.0236 0.0236 <method 'read' of 'file' objects> Time: real 80.880 secs (user 80.200+0.000 sys 0.380+0.000) With this change, it's almost as fast as not using showfunc at all: CallCount Recursive Total(ms) Inline(ms) module:lineno(function) 543 0 0.1699 0.1699 <zlib.decompress> 3 0 0.0501 0.0501 <mercurial.bdiff.blocks> 32813 0 0.0415 0.0348 mercurial.mdiff:161(yieldhunk) +70837 0 0.0058 0.0058 +<method 'isalnum' of 'str' objects> +1809 0 0.0006 0.0006 +mercurial.mdiff:148(contextend) +1809 0 0.0002 0.0002 +<len> 1 0 0.4879 0.0310 mercurial.patch:1777(diffstatdata) +107499 0 0.0230 0.0230 +<method 'startswith' of 'str' objects> +31014 0 0.4335 0.0065 +mercurial.util:1284(iterlines) +3 0 0.0000 0.0000 +<method 'search' of '_sre.SRE_Pattern' objects> +4 0 0.0000 0.0000 +mercurial.patch:1783(addresult) +1 0 0.0004 0.0000 +re:188(compile) 32 0 0.0293 0.0293 <method 'update' of '_hashlib.HASH' objects> 6 0 0.0427 0.0279 mercurial.mdiff:12(splitnewlines) +6 0 0.0147 0.0147 +<method 'split' of 'str' objects> 31007 0 0.1169 0.0235 mercurial.mdiff:147(_unidiff) +3 0 0.0501 0.0501 +<mercurial.bdiff.blocks> +32813 0 0.0415 0.0348 +mercurial.mdiff:161(yieldhunk) +3618 0 0.0012 0.0012 +mercurial.mdiff:154(contextstart) +5427 0 0.0006 0.0006 +<len> 107597 0 0.0230 0.0230 <method 'startswith' of 'str' objects> 16 0 0.0213 0.0213 <mercurial.mpatch.patches> 194 0 0.0149 0.0149 <method 'split' of 'str' objects> Time: real 0.530 secs (user 0.450+0.000 sys 0.070+0.000)

from mercurial import parsers
from mercurial.node import nullid, nullrev
import struct

# This unit test compares the return value of the original Python
# implementation of parseindex and the new C implementation for
# an index file with and without inlined data

# original python implementation
def gettype(q):
    return int(q & 0xFFFF)

def offset_type(offset, type):
    return long(long(offset) << 16 | type)

indexformatng = ">Qiiiiii20s12x"

def py_parseindex(data, inline) :
    s = 64
    cache = None
    index = []
    nodemap =  {nullid: nullrev}
    n = off = 0

    l = len(data) - s
    append = index.append
    if inline:
        cache = (0, data)
        while off <= l:
            e = struct.unpack(indexformatng, data[off:off + s])
            nodemap[e[7]] = n
            append(e)
            n += 1
            if e[1] < 0:
                break
            off += e[1] + s
    else:
        while off <= l:
            e = struct.unpack(indexformatng, data[off:off + s])
            nodemap[e[7]] = n
            append(e)
            n += 1
            off += s

    e = list(index[0])
    type = gettype(e[0])
    e[0] = offset_type(0, type)
    index[0] = tuple(e)

    # add the magic null revision at -1
    index.append((0, 0, 0, -1, -1, -1, -1, nullid))

    return index, cache


data_inlined = '\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x01\x8c' \
    '\x00\x00\x04\x07\x00\x00\x00\x00\x00\x00\x15\x15\xff\xff\xff' \
    '\xff\xff\xff\xff\xff\xebG\x97\xb7\x1fB\x04\xcf\x13V\x81\tw\x1b' \
    'w\xdduR\xda\xc6\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' \
    'x\x9c\x9d\x93?O\xc30\x10\xc5\xf7|\x8a\xdb\x9a\xa8m\x06\xd8*\x95' \
    '\x81B\xa1\xa2\xa2R\xcb\x86Pd\x9a\x0b5$vd_\x04\xfd\xf6\x9c\xff@' \
    '\x11!\x0b\xd9\xec\xf7\xbbw\xe7gG6\xad6\x04\xdaN\xc0\x92\xa0$)' \
    '\xb1\x82\xa2\xd1%\x16\xa4\x8b7\xa9\xca\xd4-\xb2Y\x02\xfc\xc9' \
    '\xcaS\xf9\xaeX\xed\xb6\xd77Q\x02\x83\xd4\x19\xf5--Y\xea\xe1W' \
    '\xab\xed\x10\xceR\x0f_\xdf\xdf\r\xe1,\xf5\xf0\xcb\xf5 \xceR\x0f' \
    '_\xdc\x0e\x0e\xc3R\x0f_\xae\x96\x9b!\x9e\xa5\x1e\xbf\xdb,\x06' \
    '\xc7q\x9a/\x88\x82\xc3B\xea\xb5\xb4TJ\x93\xb6\x82\x0e\xe16\xe6' \
    'KQ\xdb\xaf\xecG\xa3\xd1 \x01\xd3\x0b_^\xe8\xaa\xa0\xae\xad\xd1' \
    '&\xbef\x1bz\x08\xb0|\xc9Xz\x06\xf6Z\x91\x90J\xaa\x17\x90\xaa' \
    '\xd2\xa6\x11$5C\xcf\xba#\xa0\x03\x02*2\x92-\xfc\xb1\x94\xdf\xe2' \
    '\xae\xb8\'m\x8ey0^\x85\xd3\x82\xb4\xf0`:\x9c\x00\x8a\xfd\x01' \
    '\xb0\xc6\x86\x8b\xdd\xae\x80\xf3\xa9\x9fd\x16\n\x00R%\x1a\x06' \
    '\xe9\xd8b\x98\x1d\xf4\xf3+\x9bf\x01\xd8p\x1b\xf3.\xed\x9f^g\xc3' \
    '^\xd9W81T\xdb\xd5\x04sx|\xf2\xeb\xd6`%?x\xed"\x831\xbf\xf3\xdc' \
    'b\xeb%gaY\xe1\xad\x9f\xb9f\'1w\xa9\xa5a\x83s\x82J\xb98\xbc4\x8b' \
    '\x83\x00\x9f$z\xb8#\xa5\xb1\xdf\x98\xd9\xec\x1b\x89O\xe3Ts\x9a4' \
    '\x17m\x8b\xfc\x8f\xa5\x95\x9a\xfc\xfa\xed,\xe5|\xa1\xfe\x15\xb9' \
    '\xbc\xb2\x93\x1f\xf2\x95\xff\xdf,\x1a\xc5\xe7\x17*\x93Oz:>\x0e'

data_non_inlined = '\x00\x00\x00\x01\x00\x00\x00\x00\x00\x01D\x19' \
    '\x00\x07e\x12\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff' \
    '\xff\xff\xff\xff\xd1\xf4\xbb\xb0\xbe\xfc\x13\xbd\x8c\xd3\x9d' \
    '\x0f\xcd\xd9;\x8c\x07\x8cJ/\x00\x00\x00\x00\x00\x00\x00\x00\x00' \
    '\x00\x00\x00\x00\x00\x00\x01D\x19\x00\x00\x00\x00\x00\xdf\x00' \
    '\x00\x01q\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x00\xff' \
    '\xff\xff\xff\xc1\x12\xb9\x04\x96\xa4Z1t\x91\xdfsJ\x90\xf0\x9bh' \
    '\x07l&\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' \
    '\x00\x01D\xf8\x00\x00\x00\x00\x01\x1b\x00\x00\x01\xb8\x00\x00' \
    '\x00\x01\x00\x00\x00\x02\x00\x00\x00\x01\xff\xff\xff\xff\x02\n' \
    '\x0e\xc6&\xa1\x92\xae6\x0b\x02i\xfe-\xe5\xbao\x05\xd1\xe7\x00' \
    '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01F' \
    '\x13\x00\x00\x00\x00\x01\xec\x00\x00\x03\x06\x00\x00\x00\x01' \
    '\x00\x00\x00\x03\x00\x00\x00\x02\xff\xff\xff\xff\x12\xcb\xeby1' \
    '\xb6\r\x98B\xcb\x07\xbd`\x8f\x92\xd9\xc4\x84\xbdK\x00\x00\x00' \
    '\x00\x00\x00\x00\x00\x00\x00\x00\x00'

def runtest() :

    py_res_1 = py_parseindex(data_inlined, True)
    c_res_1 = parsers.parse_index2(data_inlined, True)

    py_res_2 = py_parseindex(data_non_inlined, False)
    c_res_2 = parsers.parse_index2(data_non_inlined, False)

    if py_res_1 != c_res_1:
        print "Parse index result (with inlined data) differs!"

    if py_res_2 != c_res_2:
        print "Parse index result (no inlined data) differs!"

    print "done"

runtest()