# HG changeset patch # User Bryan O'Sullivan # Date 1206551530 25200 # Node ID 0231f763ebc8d01133c050afebbc155e3aeb64bc # Parent 1e4ddc9ac9f7abbaeaefc79995fc6abc8e43236f manifest: improve parsing performance by 8x via a new C extension diff -r 1e4ddc9ac9f7 -r 0231f763ebc8 mercurial/manifest.py --- a/mercurial/manifest.py Mon Mar 24 18:05:30 2008 -0500 +++ b/mercurial/manifest.py Wed Mar 26 10:12:10 2008 -0700 @@ -8,7 +8,7 @@ from node import bin, hex, nullid from revlog import revlog, RevlogError from i18n import _ -import array, struct, mdiff +import array, struct, mdiff, parsers class manifestdict(dict): def __init__(self, mapping=None, flags=None): @@ -39,14 +39,7 @@ def parse(self, lines): mfdict = manifestdict() - fdict = mfdict._flags - for l in lines.splitlines(): - f, n = l.split('\0') - if len(n) > 40: - fdict[f] = n[40:] - mfdict[f] = bin(n[:40]) - else: - mfdict[f] = bin(n) + parsers.parse_manifest(mfdict, mfdict._flags, lines) return mfdict def readdelta(self, node): diff -r 1e4ddc9ac9f7 -r 0231f763ebc8 mercurial/parsers.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/parsers.c Wed Mar 26 10:12:10 2008 -0700 @@ -0,0 +1,168 @@ +/* + parsers.c - efficient content parsing + + Copyright 2008 Matt Mackall and others + + This software may be used and distributed according to the terms of + the GNU General Public License, incorporated herein by reference. +*/ + +#include +#include +#include + +static int hexdigit(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + + return -1; +} + +/* + * Turn a hex-encoded string into binary. + */ +static PyObject *unhexlify(const char *str, int len) +{ + PyObject *ret = NULL; + char *c, *d; + + if (len % 2) { + PyErr_SetString(PyExc_ValueError, + "input is not even in length"); + goto bail; + } + + ret = PyString_FromStringAndSize(NULL, len / 2); + if (!ret) + goto bail; + + d = PyString_AsString(ret); + if (!d) + goto bail; + + for (c = str; c < str + len;) { + int hi = hexdigit(*c++); + int lo = hexdigit(*c++); + + if (hi == -1 || lo == -1) { + PyErr_SetString(PyExc_ValueError, + "input contains non-hex character"); + goto bail; + } + + *d++ = (hi << 4) | lo; + } + + goto done; + +bail: + Py_XDECREF(ret); + ret = NULL; +done: + return ret; +} + +/* + * This code assumes that a manifest is stitched together with newline + * ('\n') characters. + */ +static PyObject *parse_manifest(PyObject *self, PyObject *args) +{ + PyObject *mfdict, *fdict; + char *str, *cur, *start, *zero; + int len; + + if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest", + &PyDict_Type, &mfdict, + &PyDict_Type, &fdict, + &str, &len)) + goto quit; + + for (start = cur = str, zero = NULL; cur < str + len; cur++) { + PyObject *file = NULL, *node = NULL; + PyObject *flags = NULL; + int nlen; + + if (!*cur) { + zero = cur; + continue; + } + else if (*cur != '\n') + continue; + + if (!zero) { + PyErr_SetString(PyExc_ValueError, + "manifest entry has no separator"); + goto quit; + } + + file = PyString_FromStringAndSize(start, zero - start); + if (!file) + goto bail; + + nlen = cur - zero - 1; + + node = unhexlify(zero + 1, nlen > 40 ? 40 : nlen); + if (!node) + goto bail; + + if (nlen > 40) { + PyObject *flags; + + flags = PyString_FromStringAndSize(zero + 41, + nlen - 40); + if (!flags) + goto bail; + + if (PyDict_SetItem(fdict, file, flags) == -1) + goto bail; + } + + if (PyDict_SetItem(mfdict, file, node) == -1) + goto bail; + + start = cur + 1; + zero = NULL; + + Py_XDECREF(flags); + Py_XDECREF(node); + Py_XDECREF(file); + continue; + bail: + Py_XDECREF(flags); + Py_XDECREF(node); + Py_XDECREF(file); + goto quit; + } + + if (len > 0 && *(cur - 1) != '\n') { + PyErr_SetString(PyExc_ValueError, + "manifest contains trailing garbage"); + goto quit; + } + + Py_INCREF(Py_None); + return Py_None; + +quit: + return NULL; +} + +static char parsers_doc[] = "Efficient content parsing."; + +static PyMethodDef methods[] = { + {"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"}, + {NULL, NULL} +}; + +PyMODINIT_FUNC initparsers(void) +{ + Py_InitModule3("parsers", methods, parsers_doc); +} diff -r 1e4ddc9ac9f7 -r 0231f763ebc8 setup.py --- a/setup.py Mon Mar 24 18:05:30 2008 -0500 +++ b/setup.py Wed Mar 26 10:12:10 2008 -0700 @@ -88,10 +88,11 @@ cmdclass = {'install_data': install_package_data} ext_modules=[ - Extension('mercurial.mpatch', ['mercurial/mpatch.c']), + Extension('mercurial.base85', ['mercurial/base85.c']), Extension('mercurial.bdiff', ['mercurial/bdiff.c']), - Extension('mercurial.base85', ['mercurial/base85.c']), - Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c']) + Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c']), + Extension('mercurial.mpatch', ['mercurial/mpatch.c']), + Extension('mercurial.parsers', ['mercurial/parsers.c']), ] packages = ['mercurial', 'mercurial.hgweb', 'hgext', 'hgext.convert']