revlog: speed up prefix matching against nodes
authorBryan O'Sullivan <bryano@fb.com>
Sat, 12 May 2012 10:55:08 +0200
changeset 16665 e410be860393
parent 16664 5bc6edf71b39
child 16674 76c744e0c5bb
revlog: speed up prefix matching against nodes The radix tree already contains all the information we need to determine whether a short string is an unambiguous node identifier. We now make use of this information. In a kernel tree, this improves the performance of "hg log -q -r24bf01de75" from 0.27 seconds to 0.06.
mercurial/parsers.c
mercurial/revlog.py
--- a/mercurial/parsers.c	Sat May 12 10:55:08 2012 +0200
+++ b/mercurial/parsers.c	Sat May 12 10:55:08 2012 +0200
@@ -566,7 +566,7 @@
 		return -2;
 
 	if (hex)
-		maxlevel = nodelen > 40 ? 40 : nodelen;
+		maxlevel = nodelen > 40 ? 40 : (int)nodelen;
 	else
 		maxlevel = nodelen > 20 ? 40 : ((int)nodelen * 2);
 
@@ -795,6 +795,77 @@
 	return NULL;
 }
 
+static int nt_partialmatch(indexObject *self, const char *node,
+			   Py_ssize_t nodelen)
+{
+	int rev;
+
+	if (nt_init(self) == -1)
+		return -3;
+
+	if (self->ntrev > 0) {
+		/* ensure that the radix tree is fully populated */
+		for (rev = self->ntrev - 1; rev >= 0; rev--) {
+			const char *n = index_node(self, rev);
+			if (n == NULL)
+				return -2;
+			if (nt_insert(self, n, rev) == -1)
+				return -3;
+		}
+		self->ntrev = rev;
+	}
+
+	return nt_find(self, node, nodelen, 1);
+}
+
+static PyObject *index_partialmatch(indexObject *self, PyObject *args)
+{
+	const char *fullnode;
+	int nodelen;
+	char *node;
+	int rev, i;
+
+	if (!PyArg_ParseTuple(args, "s#", &node, &nodelen))
+		return NULL;
+
+	if (nodelen < 4) {
+		PyErr_SetString(PyExc_ValueError, "key too short");
+		return NULL;
+	}
+
+	if (nodelen > 40)
+		nodelen = 40;
+
+	for (i = 0; i < nodelen; i++)
+		hexdigit(node, i);
+	if (PyErr_Occurred()) {
+		/* input contains non-hex characters */
+		PyErr_Clear();
+		Py_RETURN_NONE;
+	}
+
+	rev = nt_partialmatch(self, node, nodelen);
+
+	switch (rev) {
+	case -4:
+		raise_revlog_error();
+	case -3:
+		return NULL;
+	case -2:
+		Py_RETURN_NONE;
+	case -1:
+		return PyString_FromStringAndSize(nullid, 20);
+	}
+
+	fullnode = index_node(self, rev);
+	if (fullnode == NULL) {
+		PyErr_Format(PyExc_IndexError,
+			     "could not access rev %d", rev);
+		return NULL;
+	}
+	return PyString_FromStringAndSize(fullnode, 20);
+}
+
 static PyObject *index_m_get(indexObject *self, PyObject *args)
 {
 	char *node;
@@ -1077,6 +1148,8 @@
 	 "get an index entry"},
 	{"insert", (PyCFunction)index_insert, METH_VARARGS,
 	 "insert an index entry"},
+	{"partialmatch", (PyCFunction)index_partialmatch, METH_VARARGS,
+	 "match a potentially ambiguous node ID"},
 	{"stats", (PyCFunction)index_stats, METH_NOARGS,
 	 "stats for the index"},
 	{NULL} /* Sentinel */
--- a/mercurial/revlog.py	Sat May 12 10:55:08 2012 +0200
+++ b/mercurial/revlog.py	Sat May 12 10:55:08 2012 +0200
@@ -756,6 +756,15 @@
                 pass
 
     def _partialmatch(self, id):
+        try:
+            return self.index.partialmatch(id)
+        except RevlogError:
+            # parsers.c radix tree lookup gave multiple matches
+            raise LookupError(id, self.indexfile, _("ambiguous identifier"))
+        except (AttributeError, ValueError):
+            # we are pure python, or key was too short to search radix tree
+            pass
+
         if id in self._pcache:
             return self._pcache[id]