# HG changeset patch
# User Patrick Mezard
# Date 1321622583 -3600
# Node ID eeac5e17924354c4afeb6001ed69a06805189520
# Parent b35cf47286a6bf7fe6e7f63f1d56e1b5f96bf4b3
mdiff: replace wscleanup() regexps with C loops

On my system it reduces:

  hg annotate -w mercurial/commands.py

from 36s to less than 8s, to be compared with 6.3s when run without
whitespace options.

diff -r b35cf47286a6 -r eeac5e179243 mercurial/bdiff.c
--- a/mercurial/bdiff.c	Fri Nov 18 14:16:47 2011 +0100
+++ b/mercurial/bdiff.c	Fri Nov 18 14:23:03 2011 +0100
@@ -425,11 +425,56 @@
 	return result ? result : PyErr_NoMemory();
 }
 
+/*
+ * If allws != 0, remove all whitespace (' ', \t and \r). Otherwise,
+ * reduce whitespace sequences to a single space and trim remaining whitespace
+ * from end of lines.
+ */
+static PyObject *fixws(PyObject *self, PyObject *args)
+{
+	PyObject *s, *result = NULL;
+	char allws, c;
+	const char *r;
+	int i, rlen, wlen = 0;
+	char *w;
+
+	if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws))
+		return NULL;
+	r = PyBytes_AsString(s);
+	rlen = PyBytes_Size(s);
+
+	/* malloc(0) may return NULL: never request zero bytes for empty input */
+	w = (char *)malloc(rlen ? rlen : 1);
+	if (!w)
+		goto nomem;
+
+	for (i = 0; i != rlen; i++) {
+		c = r[i];
+		if (c == ' ' || c == '\t' || c == '\r') {
+			if (!allws && (wlen == 0 || w[wlen - 1] != ' '))
+				w[wlen++] = ' ';
+		} else if (c == '\n' && !allws
+			&& wlen > 0 && w[wlen - 1] == ' ') {
+			w[wlen - 1] = '\n';
+		} else {
+			w[wlen++] = c;
+		}
+	}
+
+	result = PyBytes_FromStringAndSize(w, wlen);
+
+nomem:
+	free(w);
+	return result ? result : PyErr_NoMemory();
+}
+
+
 static char mdiff_doc[] = "Efficient binary diff.";
 
 static PyMethodDef methods[] = {
 	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
 	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
+	{"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"},
 	{NULL, NULL}
 };
 
diff -r b35cf47286a6 -r eeac5e179243 mercurial/mdiff.py
--- a/mercurial/mdiff.py	Fri Nov 18 14:16:47 2011 +0100
+++ b/mercurial/mdiff.py	Fri Nov 18 14:23:03 2011 +0100
@@ -67,10 +67,9 @@
 
 def wsclean(opts, text, blank=True):
     if opts.ignorews:
-        text = re.sub('[ \t\r]+', '', text)
+        text = bdiff.fixws(text, 1)
     elif opts.ignorewsamount:
-        text = re.sub('[ \t\r]+', ' ', text)
-        text = text.replace(' \n', '\n')
+        text = bdiff.fixws(text, 0)
     if blank and opts.ignoreblanklines:
         text = re.sub('\n+', '\n', text).strip('\n')
     return text
diff -r b35cf47286a6 -r eeac5e179243 mercurial/pure/bdiff.py
--- a/mercurial/pure/bdiff.py	Fri Nov 18 14:16:47 2011 +0100
+++ b/mercurial/pure/bdiff.py	Fri Nov 18 14:23:03 2011 +0100
@@ -5,7 +5,7 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
-import struct, difflib
+import struct, difflib, re
 
 def splitnewlines(text):
     '''like str.splitlines, but only split on newlines.'''
@@ -78,3 +78,10 @@
     d = _normalizeblocks(an, bn, d)
     return [(i, i + n, j, j + n) for (i, j, n) in d]
 
+def fixws(text, allws):
+    if allws:
+        text = re.sub('[ \t\r]+', '', text)
+    else:
+        text = re.sub('[ \t\r]+', ' ', text)
+        text = text.replace(' \n', '\n')
+    return text
diff -r b35cf47286a6 -r eeac5e179243 tests/test-bdiff.py
--- a/tests/test-bdiff.py	Fri Nov 18 14:16:47 2011 +0100
+++ b/tests/test-bdiff.py	Fri Nov 18 14:23:03 2011 +0100
@@ -50,3 +50,17 @@
 showdiff("x\n\nx\n\nx\n\nx\n\nz\n", "x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n")
 
 print "done"
+
+def testfixws(a, b, allws):
+    c = bdiff.fixws(a, allws)
+    if c != b:
+        print "*** fixws", repr(a), repr(b), allws
+        print "got:"
+        print repr(c)
+
+testfixws(" \ta\r b\t\n", "ab\n", 1)
+testfixws(" \ta\r b\t\n", " a b\n", 0)
+testfixws("", "", 1)
+testfixws("", "", 0)
+
+print "done"
diff -r b35cf47286a6 -r eeac5e179243 tests/test-bdiff.py.out
--- a/tests/test-bdiff.py.out	Fri Nov 18 14:16:47 2011 +0100
+++ b/tests/test-bdiff.py.out	Fri Nov 18 14:23:03 2011 +0100
@@ -21,3 +21,4 @@
 6 6 'y\n\n'
 9 9 'y\n\n'
 done
+done