mdiff: speed up showfunc for large diffs
This addresses the following issues with showfunc:
- Silly usage of regular expressions.
- Doing str.rstrip() needlessly in an inner loop.
- Doing catastrophic backtracking when trying to find a function line.
Finding function text is now at worst O(n lines in the old file), and
at best close to O(n hunks).
Given a diff like this[1]:
src/main/antlr3/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunker.g | 4 +-
src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerLexer.java | 2 +-
src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerParser.java | 29189 +++++----
3 files changed, 14741 insertions(+), 14454 deletions(-)
[1]: https://bitbucket.org/wwmm/chemicaltagger/changeset/
d2bfbaecd4fc/raw
Without this change, hg log --stat --config diff.showfunc=1 takes an
absurdly long time to complete:
CallCount Recursive Total(ms) Inline(ms) module:lineno(function)
32813 0 80.3546 40.6086 mercurial.mdiff:160(yieldhunk)
+
65062746 0 25.7227 25.7227 +<method 'match' of '_sre.SRE_Pattern' objects>
+
65062746 0 14.0221 14.0221 +<method 'rstrip' of 'str' objects>
+1809 0 0.0009 0.0009 +mercurial.mdiff:148(contextend)
+1809 0 0.0003 0.0003 +<len>
65062746 0 25.7227 25.7227 <method 'match' of '_sre.SRE_Pattern' objects>
65062763 0 14.0221 14.0221 <method 'rstrip' of 'str' objects>
543 0 0.1631 0.1631 <zlib.decompress>
3 0 0.0505 0.0505 <mercurial.bdiff.blocks>
31007 0 80.4564 0.0477 mercurial.mdiff:147(_unidiff)
+32813 0 80.3546 40.6086 +mercurial.mdiff:160(yieldhunk)
+3 0 0.0505 0.0505 +<mercurial.bdiff.blocks>
+3618 0 0.0022 0.0022 +mercurial.mdiff:154(contextstart)
+5427 0 0.0013 0.0013 +<len>
+3 0 0.0001 0.0000 +re:188(compile)
1 0 80.8381 0.0322 mercurial.patch:1777(diffstatdata)
+107499 0 0.0235 0.0235 +<method 'startswith' of 'str' objects>
+31014 0 80.7820 0.0071 +mercurial.util:1284(iterlines)
+3 0 0.0000 0.0000 +<method 'search' of '_sre.SRE_Pattern' objects>
+4 0 0.0000 0.0000 +mercurial.patch:1783(addresult)
+3 0 0.0000 0.0000 +<method 'group' of '_sre.SRE_Match' objects>
6 0 0.0444 0.0283 mercurial.mdiff:12(splitnewlines)
+6 0 0.0160 0.0160 +<method 'split' of 'str' objects>
32 0 0.0246 0.0246 <method 'update' of '_hashlib.HASH' objects>
11 0 0.0236 0.0236 <method 'read' of 'file' objects>
Time: real 80.880 secs (user 80.200+0.000 sys 0.380+0.000)
With this change, it's almost as fast as not using showfunc at all:
CallCount Recursive Total(ms) Inline(ms) module:lineno(function)
543 0 0.1699 0.1699 <zlib.decompress>
3 0 0.0501 0.0501 <mercurial.bdiff.blocks>
32813 0 0.0415 0.0348 mercurial.mdiff:161(yieldhunk)
+70837 0 0.0058 0.0058 +<method 'isalnum' of 'str' objects>
+1809 0 0.0006 0.0006 +mercurial.mdiff:148(contextend)
+1809 0 0.0002 0.0002 +<len>
1 0 0.4879 0.0310 mercurial.patch:1777(diffstatdata)
+107499 0 0.0230 0.0230 +<method 'startswith' of 'str' objects>
+31014 0 0.4335 0.0065 +mercurial.util:1284(iterlines)
+3 0 0.0000 0.0000 +<method 'search' of '_sre.SRE_Pattern' objects>
+4 0 0.0000 0.0000 +mercurial.patch:1783(addresult)
+1 0 0.0004 0.0000 +re:188(compile)
32 0 0.0293 0.0293 <method 'update' of '_hashlib.HASH' objects>
6 0 0.0427 0.0279 mercurial.mdiff:12(splitnewlines)
+6 0 0.0147 0.0147 +<method 'split' of 'str' objects>
31007 0 0.1169 0.0235 mercurial.mdiff:147(_unidiff)
+3 0 0.0501 0.0501 +<mercurial.bdiff.blocks>
+32813 0 0.0415 0.0348 +mercurial.mdiff:161(yieldhunk)
+3618 0 0.0012 0.0012 +mercurial.mdiff:154(contextstart)
+5427 0 0.0006 0.0006 +<len>
107597 0 0.0230 0.0230 <method 'startswith' of 'str' objects>
16 0 0.0213 0.0213 <mercurial.mpatch.patches>
194 0 0.0149 0.0149 <method 'split' of 'str' objects>
Time: real 0.530 secs (user 0.450+0.000 sys 0.070+0.000)
import os, sys, textwrap
# import from the live mercurial repo
sys.path.insert(0, "..")
# fall back to pure modules if required C extensions are not available
sys.path.append(os.path.join('..', 'mercurial', 'pure'))
from mercurial import demandimport; demandimport.enable()
from mercurial import encoding
from mercurial.commands import table, globalopts
from mercurial.i18n import _
from mercurial.help import helptable
from mercurial import extensions
from mercurial import util
def get_desc(docstr):
if not docstr:
return "", ""
# sanitize
docstr = docstr.strip("\n")
docstr = docstr.rstrip()
shortdesc = docstr.splitlines()[0].strip()
i = docstr.find("\n")
if i != -1:
desc = docstr[i + 2:]
else:
desc = shortdesc
desc = textwrap.dedent(desc)
return (shortdesc, desc)
def get_opts(opts):
for opt in opts:
if len(opt) == 5:
shortopt, longopt, default, desc, optlabel = opt
else:
shortopt, longopt, default, desc = opt
allopts = []
if shortopt:
allopts.append("-%s" % shortopt)
if longopt:
allopts.append("--%s" % longopt)
desc += default and _(" (default: %s)") % default or ""
yield (", ".join(allopts), desc)
def get_cmd(cmd, cmdtable):
d = {}
attr = cmdtable[cmd]
cmds = cmd.lstrip("^").split("|")
d['cmd'] = cmds[0]
d['aliases'] = cmd.split("|")[1:]
d['desc'] = get_desc(attr[0].__doc__)
d['opts'] = list(get_opts(attr[1]))
s = 'hg ' + cmds[0]
if len(attr) > 2:
if not attr[2].startswith('hg'):
s += ' ' + attr[2]
else:
s = attr[2]
d['synopsis'] = s.strip()
return d
def section(ui, s):
ui.write("%s\n%s\n\n" % (s, "-" * encoding.colwidth(s)))
def subsection(ui, s):
ui.write("%s\n%s\n\n" % (s, '"' * encoding.colwidth(s)))
def subsubsection(ui, s):
ui.write("%s\n%s\n\n" % (s, "." * encoding.colwidth(s)))
def subsubsubsection(ui, s):
ui.write("%s\n%s\n\n" % (s, "#" * encoding.colwidth(s)))
def show_doc(ui):
# print options
section(ui, _("Options"))
for optstr, desc in get_opts(globalopts):
ui.write("%s\n %s\n\n" % (optstr, desc))
# print cmds
section(ui, _("Commands"))
commandprinter(ui, table, subsection)
# print topics
for names, sec, doc in helptable:
if names[0] == "config":
# The config help topic is included in the hgrc.5 man
# page.
continue
for name in names:
ui.write(".. _%s:\n" % name)
ui.write("\n")
section(ui, sec)
if util.safehasattr(doc, '__call__'):
doc = doc()
ui.write(doc)
ui.write("\n")
section(ui, _("Extensions"))
ui.write(_("This section contains help for extensions that are distributed "
"together with Mercurial. Help for other extensions is available "
"in the help system."))
ui.write("\n\n"
".. contents::\n"
" :class: htmlonly\n"
" :local:\n"
" :depth: 1\n\n")
for extensionname in sorted(allextensionnames()):
mod = extensions.load(None, extensionname, None)
subsection(ui, extensionname)
ui.write("%s\n\n" % mod.__doc__)
cmdtable = getattr(mod, 'cmdtable', None)
if cmdtable:
subsubsection(ui, _('Commands'))
commandprinter(ui, cmdtable, subsubsubsection)
def commandprinter(ui, cmdtable, sectionfunc):
h = {}
for c, attr in cmdtable.items():
f = c.split("|")[0]
f = f.lstrip("^")
h[f] = c
cmds = h.keys()
cmds.sort()
for f in cmds:
if f.startswith("debug"):
continue
d = get_cmd(h[f], cmdtable)
sectionfunc(ui, d['cmd'])
# synopsis
ui.write("::\n\n")
synopsislines = d['synopsis'].splitlines()
for line in synopsislines:
# some commands (such as rebase) have a multi-line
# synopsis
ui.write(" %s\n" % line)
ui.write('\n')
# description
ui.write("%s\n\n" % d['desc'][1])
# options
opt_output = list(d['opts'])
if opt_output:
opts_len = max([len(line[0]) for line in opt_output])
ui.write(_("Options:\n\n"))
for optstr, desc in opt_output:
if desc:
s = "%-*s %s" % (opts_len, optstr, desc)
else:
s = optstr
ui.write("%s\n" % s)
ui.write("\n")
# aliases
if d['aliases']:
ui.write(_(" aliases: %s\n\n") % " ".join(d['aliases']))
def allextensionnames():
return extensions.enabled().keys() + extensions.disabled().keys()
if __name__ == "__main__":
show_doc(sys.stdout)