--- a/mercurial/parsers.c Thu Apr 12 20:22:18 2012 -0500
+++ b/mercurial/parsers.c Thu Apr 12 14:05:59 2012 -0700
@@ -215,10 +215,27 @@
}
/*
- * A list-like object that decodes the contents of a RevlogNG index
- * file on demand. It has limited support for insert and delete at the
- * last element before the end. The last entry is always a sentinel
- * nullid.
+ * A base-16 trie for fast node->rev mapping.
+ *
+ * Positive value is index of the next node in the trie
+ * Negative value is a leaf: -(rev + 1)
+ * Zero is empty
+ */
+typedef struct {
+ int children[16];
+} nodetree;
+
+/*
+ * This class has two behaviours.
+ *
+ * When used in a list-like way (with integer keys), we decode an
+ * entry in a RevlogNG index file on demand. Our last entry is a
+ * sentinel, always a nullid. We have limited support for
+ * integer-keyed insert and delete, only at elements right before the
+ * sentinel.
+ *
+ * With string keys, we lazily perform a reverse mapping from node to
+ * rev, using a base-16 trie.
*/
typedef struct {
PyObject_HEAD
@@ -229,10 +246,18 @@
Py_ssize_t raw_length; /* original number of elements */
Py_ssize_t length; /* current number of elements */
PyObject *added; /* populated on demand */
+ nodetree *nt; /* base-16 trie */
+ int ntlength; /* # nodes in use */
+ int ntcapacity; /* # nodes allocated */
+ int ntdepth; /* maximum depth of tree */
+ int ntsplits; /* # splits performed */
+ int ntrev; /* last rev scanned */
+ int ntlookups; /* # lookups */
+ int ntmisses; /* # lookups that miss the cache */
int inlined;
} indexObject;
-static Py_ssize_t index_length(indexObject *self)
+static Py_ssize_t index_length(const indexObject *self)
{
if (self->added == NULL)
return self->length;
@@ -240,6 +265,7 @@
}
static PyObject *nullentry;
+static const char nullid[20];
static long inline_scan(indexObject *self, const char **offsets);
@@ -249,7 +275,27 @@
static char *tuple_format = "kiiiiiis#";
#endif
-/* RevlogNG format (all in big endian, data may be inlined):
+/*
+ * Return a pointer to the beginning of a RevlogNG record.
+ */
+static const char *index_deref(indexObject *self, Py_ssize_t pos)
+{
+ if (self->inlined && pos > 0) {
+ if (self->offsets == NULL) {
+ self->offsets = malloc(self->raw_length *
+ sizeof(*self->offsets));
+ if (self->offsets == NULL)
+ return (const char *)PyErr_NoMemory();
+ inline_scan(self, self->offsets);
+ }
+ return self->offsets[pos];
+ }
+
+ return PyString_AS_STRING(self->data) + pos * 64;
+}
+
+/*
+ * RevlogNG format (all in big endian, data may be inlined):
* 6 bytes: offset
* 2 bytes: flags
* 4 bytes: compressed length
@@ -270,7 +316,10 @@
Py_ssize_t length = index_length(self);
PyObject *entry;
- if (pos >= length) {
+ if (pos < 0)
+ pos += length;
+
+ if (pos < 0 || pos >= length) {
PyErr_SetString(PyExc_IndexError, "revlog index out of range");
return NULL;
}
@@ -298,17 +347,9 @@
return PyErr_NoMemory();
}
- if (self->inlined && pos > 0) {
- if (self->offsets == NULL) {
- self->offsets = malloc(self->raw_length *
- sizeof(*self->offsets));
- if (self->offsets == NULL)
- return PyErr_NoMemory();
- inline_scan(self, self->offsets);
- }
- data = self->offsets[pos];
- } else
- data = PyString_AS_STRING(self->data) + pos * 64;
+ data = index_deref(self, pos);
+ if (data == NULL)
+ return NULL;
memcpy(decode, data, 8 * sizeof(uint32_t));
@@ -341,26 +382,60 @@
return entry;
}
+/*
+ * Return the 20-byte SHA of the node corresponding to the given rev.
+ */
+static const char *index_node(indexObject *self, Py_ssize_t pos)
+{
+ Py_ssize_t length = index_length(self);
+ const char *data;
+
+ if (pos == length - 1)
+ return nullid;
+
+ if (pos >= length)
+ return NULL;
+
+ if (pos >= self->length - 1) {
+ PyObject *tuple, *str;
+ tuple = PyList_GET_ITEM(self->added, pos - self->length + 1);
+ str = PyTuple_GetItem(tuple, 7);
+ return str ? PyString_AS_STRING(str) : NULL;
+ }
+
+ data = index_deref(self, pos);
+ return data ? data + 32 : NULL;
+}
+
+static int nt_insert(indexObject *self, const char *node, int rev);
+
+static int node_check(PyObject *obj, char **node, Py_ssize_t *nodelen)
+{
+ if (PyString_AsStringAndSize(obj, node, nodelen) == -1)
+ return -1;
+ if (*nodelen == 20)
+ return 0;
+ PyErr_SetString(PyExc_ValueError, "20-byte hash required");
+ return -1;
+}
+
static PyObject *index_insert(indexObject *self, PyObject *args)
{
- PyObject *obj, *node;
+ PyObject *obj;
+ char *node;
long offset;
- Py_ssize_t len;
+ Py_ssize_t len, nodelen;
if (!PyArg_ParseTuple(args, "lO", &offset, &obj))
return NULL;
if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) {
- PyErr_SetString(PyExc_ValueError, "8-tuple required");
+ PyErr_SetString(PyExc_TypeError, "8-tuple required");
return NULL;
}
- node = PyTuple_GET_ITEM(obj, 7);
- if (!PyString_Check(node) || PyString_GET_SIZE(node) != 20) {
- PyErr_SetString(PyExc_ValueError,
- "20-byte hash required as last element");
+ if (node_check(PyTuple_GET_ITEM(obj, 7), &node, &nodelen) == -1)
return NULL;
- }
len = index_length(self);
@@ -373,6 +448,12 @@
return NULL;
}
+ if (offset > INT_MAX) {
+ PyErr_SetString(PyExc_ValueError,
+ "currently only 2**31 revs supported");
+ return NULL;
+ }
+
if (self->added == NULL) {
self->added = PyList_New(0);
if (self->added == NULL)
@@ -382,6 +463,9 @@
if (PyList_Append(self->added, obj) == -1)
return NULL;
+ if (self->nt)
+ nt_insert(self, node, (int)offset);
+
Py_RETURN_NONE;
}
@@ -401,26 +485,356 @@
free(self->offsets);
self->offsets = NULL;
}
+ if (self->nt) {
+ free(self->nt);
+ self->nt = NULL;
+ }
}
static PyObject *index_clearcaches(indexObject *self)
{
_index_clearcaches(self);
+ self->ntlength = self->ntcapacity = 0;
+ self->ntdepth = self->ntsplits = 0;
+ self->ntrev = -1;
+ self->ntlookups = self->ntmisses = 0;
Py_RETURN_NONE;
}
-static int index_assign_subscript(indexObject *self, PyObject *item,
- PyObject *value)
+static PyObject *index_stats(indexObject *self)
+{
+ PyObject *obj = PyDict_New();
+
+ if (obj == NULL)
+ return NULL;
+
+#define istat(__n, __d) \
+ if (PyDict_SetItemString(obj, __d, PyInt_FromLong(self->__n)) == -1) \
+ goto bail;
+
+ if (self->added) {
+ Py_ssize_t len = PyList_GET_SIZE(self->added);
+ if (PyDict_SetItemString(obj, "index entries added",
+ PyInt_FromLong(len)) == -1)
+ goto bail;
+ }
+
+ if (self->raw_length != self->length - 1)
+ istat(raw_length, "revs on disk");
+ istat(length, "revs in memory");
+ istat(ntcapacity, "node trie capacity");
+ istat(ntdepth, "node trie depth");
+ istat(ntlength, "node trie count");
+ istat(ntlookups, "node trie lookups");
+ istat(ntmisses, "node trie misses");
+ istat(ntrev, "node trie last rev scanned");
+ istat(ntsplits, "node trie splits");
+
+#undef istat
+
+ return obj;
+
+bail:
+ Py_XDECREF(obj);
+ return NULL;
+}
+
+static inline int nt_level(const char *node, int level)
+{
+ int v = node[level>>1];
+ if (!(level & 1))
+ v >>= 4;
+ return v & 0xf;
+}
+
+static int nt_find(indexObject *self, const char *node, Py_ssize_t nodelen)
+{
+ int level, off;
+
+ if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0)
+ return -1;
+
+ if (self->nt == NULL)
+ return -2;
+
+ for (level = off = 0; level < nodelen; level++) {
+ int k = nt_level(node, level);
+ nodetree *n = &self->nt[off];
+ int v = n->children[k];
+
+ if (v < 0) {
+ const char *n;
+ v = -v - 1;
+ n = index_node(self, v);
+ if (n == NULL)
+ return -2;
+ return memcmp(node, n, nodelen > 20 ? 20 : nodelen)
+ ? -2 : v;
+ }
+ if (v == 0)
+ return -2;
+ off = v;
+ }
+ return -2;
+}
+
+static int nt_new(indexObject *self)
+{
+ if (self->ntlength == self->ntcapacity) {
+ self->ntcapacity *= 2;
+ self->nt = realloc(self->nt,
+ self->ntcapacity * sizeof(nodetree));
+ if (self->nt == NULL) {
+ PyErr_SetString(PyExc_MemoryError, "out of memory");
+ return -1;
+ }
+ memset(&self->nt[self->ntlength], 0,
+ sizeof(nodetree) * (self->ntcapacity - self->ntlength));
+ }
+ return self->ntlength++;
+}
+
+static int nt_insert(indexObject *self, const char *node, int rev)
+{
+ int level = 0;
+ int off = 0;
+
+ while (level < 20) {
+ int k = nt_level(node, level);
+ nodetree *n;
+ int v;
+
+ n = &self->nt[off];
+ v = n->children[k];
+
+ if (v == 0) {
+ n->children[k] = -rev - 1;
+ return 0;
+ }
+ if (v < 0) {
+ const char *oldnode = index_node(self, -v - 1);
+ int noff;
+
+ if (!oldnode || !memcmp(oldnode, node, 20)) {
+ n->children[k] = -rev - 1;
+ return 0;
+ }
+ noff = nt_new(self);
+ if (noff == -1)
+ return -1;
+ /* self->nt may have been changed by realloc */
+ self->nt[off].children[k] = noff;
+ off = noff;
+ n = &self->nt[off];
+ n->children[nt_level(oldnode, ++level)] = v;
+ if (level > self->ntdepth)
+ self->ntdepth = level;
+ self->ntsplits += 1;
+ } else {
+ level += 1;
+ off = v;
+ }
+ }
+
+ return -1;
+}
+
+/*
+ * Return values:
+ *
+ * -3: error (exception set)
+ * -2: not found (no exception set)
+ * rest: valid rev
+ */
+static int index_find_node(indexObject *self,
+ const char *node, Py_ssize_t nodelen)
+{
+ int rev;
+
+ self->ntlookups++;
+ rev = nt_find(self, node, nodelen);
+ if (rev >= -1)
+ return rev;
+
+ if (self->nt == NULL) {
+ self->ntcapacity = self->raw_length < 4
+ ? 4 : self->raw_length / 2;
+ self->nt = calloc(self->ntcapacity, sizeof(nodetree));
+ if (self->nt == NULL) {
+ PyErr_SetString(PyExc_MemoryError, "out of memory");
+ return -3;
+ }
+ self->ntlength = 1;
+ self->ntrev = (int)index_length(self) - 1;
+ self->ntlookups = 1;
+ self->ntmisses = 0;
+ }
+
+ /*
+ * For the first handful of lookups, we scan the entire index,
+ * and cache only the matching nodes. This optimizes for cases
+ * like "hg tip", where only a few nodes are accessed.
+ *
+ * After that, we cache every node we visit, using a single
+ * scan amortized over multiple lookups. This gives the best
+ * bulk performance, e.g. for "hg log".
+ */
+ if (self->ntmisses++ < 4) {
+ for (rev = self->ntrev - 1; rev >= 0; rev--) {
+ const char *n = index_node(self, rev);
+ if (n == NULL)
+ return -2;
+ if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
+ if (nt_insert(self, n, rev) == -1)
+ return -3;
+ break;
+ }
+ }
+ } else {
+ for (rev = self->ntrev - 1; rev >= 0; rev--) {
+ const char *n = index_node(self, rev);
+ if (n == NULL)
+ return -2;
+ if (nt_insert(self, n, rev) == -1)
+ return -3;
+ if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) {
+ break;
+ }
+ }
+ self->ntrev = rev;
+ }
+
+ if (rev >= 0)
+ return rev;
+ return -2;
+}
+
+static PyObject *raise_revlog_error(void)
+{
+ static PyObject *errclass;
+ PyObject *mod = NULL, *errobj;
+
+ if (errclass == NULL) {
+ PyObject *dict;
+
+ mod = PyImport_ImportModule("mercurial.error");
+ if (mod == NULL)
+ goto classfail;
+
+ dict = PyModule_GetDict(mod);
+ if (dict == NULL)
+ goto classfail;
+
+ errclass = PyDict_GetItemString(dict, "RevlogError");
+ if (errclass == NULL) {
+ PyErr_SetString(PyExc_SystemError,
+ "could not find RevlogError");
+ goto classfail;
+ }
+ Py_INCREF(errclass);
+ }
+
+ errobj = PyObject_CallFunction(errclass, NULL);
+ if (errobj == NULL)
+ return NULL;
+ PyErr_SetObject(errclass, errobj);
+ return errobj;
+
+classfail:
+ Py_XDECREF(mod);
+ return NULL;
+}
+
+static PyObject *index_getitem(indexObject *self, PyObject *value)
+{
+ char *node;
+ Py_ssize_t nodelen;
+ int rev;
+
+ if (PyInt_Check(value))
+ return index_get(self, PyInt_AS_LONG(value));
+
+ if (PyString_AsStringAndSize(value, &node, &nodelen) == -1)
+ return NULL;
+ rev = index_find_node(self, node, nodelen);
+ if (rev >= -1)
+ return PyInt_FromLong(rev);
+ if (rev == -2)
+ raise_revlog_error();
+ return NULL;
+}
+
+static PyObject *index_m_get(indexObject *self, PyObject *args)
+{
+ char *node;
+ int nodelen, rev;
+
+ if (!PyArg_ParseTuple(args, "s#", &node, &nodelen))
+ return NULL;
+
+ rev = index_find_node(self, node, nodelen);
+ if (rev == -3)
+ return NULL;
+ if (rev == -2)
+ Py_RETURN_NONE;
+ return PyInt_FromLong(rev);
+}
+
+static int index_contains(indexObject *self, PyObject *value)
+{
+ char *node;
+ Py_ssize_t nodelen;
+
+ if (PyInt_Check(value)) {
+ long rev = PyInt_AS_LONG(value);
+ return rev >= -1 && rev < index_length(self);
+ }
+
+ if (!PyString_Check(value))
+ return 0;
+
+ node = PyString_AS_STRING(value);
+ nodelen = PyString_GET_SIZE(value);
+
+ switch (index_find_node(self, node, nodelen)) {
+ case -3:
+ return -1;
+ case -2:
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+/*
+ * Invalidate any trie entries introduced by added revs.
+ */
+static void nt_invalidate_added(indexObject *self, Py_ssize_t start)
+{
+ Py_ssize_t i, len = PyList_GET_SIZE(self->added);
+
+ for (i = start; i < len; i++) {
+ PyObject *tuple = PyList_GET_ITEM(self->added, i);
+ PyObject *node = PyTuple_GET_ITEM(tuple, 7);
+
+ nt_insert(self, PyString_AS_STRING(node), -1);
+ }
+
+ if (start == 0) {
+ Py_DECREF(self->added);
+ self->added = NULL;
+ }
+}
+
+/*
+ * Delete a numeric range of revs, which must be at the end of the
+ * range, but exclude the sentinel nullid entry.
+ */
+static int index_slice_del(indexObject *self, PyObject *item)
{
Py_ssize_t start, stop, step, slicelength;
Py_ssize_t length = index_length(self);
- if (!PySlice_Check(item) || value != NULL) {
- PyErr_SetString(PyExc_TypeError,
- "revlog index only supports slice deletion");
- return -1;
- }
-
if (PySlice_GetIndicesEx((PySliceObject*)item, length,
&start, &stop, &step, &slicelength) < 0)
return -1;
@@ -449,20 +863,71 @@
return -1;
}
- if (start < self->length) {
+ if (start < self->length - 1) {
+ if (self->nt) {
+ Py_ssize_t i;
+
+ for (i = start + 1; i < self->length - 1; i++) {
+ const char *node = index_node(self, i);
+
+ if (node)
+ nt_insert(self, node, -1);
+ }
+ if (self->added)
+ nt_invalidate_added(self, 0);
+ if (self->ntrev > start)
+ self->ntrev = (int)start;
+ }
self->length = start + 1;
- if (self->added) {
- Py_DECREF(self->added);
- self->added = NULL;
- }
return 0;
}
- return PyList_SetSlice(self->added, start - self->length + 1,
- PyList_GET_SIZE(self->added),
- NULL);
+ if (self->nt) {
+ nt_invalidate_added(self, start - self->length + 1);
+ if (self->ntrev > start)
+ self->ntrev = (int)start;
+ }
+ return self->added
+ ? PyList_SetSlice(self->added, start - self->length + 1,
+ PyList_GET_SIZE(self->added), NULL)
+ : 0;
}
+/*
+ * Supported ops:
+ *
+ * slice deletion
+ * string assignment (extend node->rev mapping)
+ * string deletion (shrink node->rev mapping)
+ */
+static int index_assign_subscript(indexObject *self, PyObject *item,
+ PyObject *value)
+{
+ char *node;
+ Py_ssize_t nodelen;
+ long rev;
+
+ if (PySlice_Check(item) && value == NULL)
+ return index_slice_del(self, item);
+
+ if (node_check(item, &node, &nodelen) == -1)
+ return -1;
+
+ if (value == NULL)
+ return self->nt ? nt_insert(self, node, -1) : 0;
+ rev = PyInt_AsLong(value);
+ if (rev > INT_MAX || rev < 0) {
+ if (!PyErr_Occurred())
+ PyErr_SetString(PyExc_ValueError, "rev out of range");
+ return -1;
+ }
+ return nt_insert(self, node, (int)rev);
+}
+
+/*
+ * Find all RevlogNG entries in an index that has inline data. Update
+ * the optional "offsets" table with those entries.
+ */
static long inline_scan(indexObject *self, const char **offsets)
{
const char *data = PyString_AS_STRING(self->data);
@@ -506,6 +971,11 @@
self->added = NULL;
self->offsets = NULL;
+ self->nt = NULL;
+ self->ntlength = self->ntcapacity = 0;
+ self->ntdepth = self->ntsplits = 0;
+ self->ntlookups = self->ntmisses = 0;
+ self->ntrev = -1;
Py_INCREF(self->data);
if (self->inlined) {
@@ -541,6 +1011,11 @@
PyTuple_GET_ITEM(args, 0));
}
+static PyObject *index_nodemap(indexObject *self)
+{
+ return (PyObject *)self;
+}
+
static void index_dealloc(indexObject *self)
{
_index_clearcaches(self);
@@ -554,19 +1029,32 @@
0, /* sq_concat */
0, /* sq_repeat */
(ssizeargfunc)index_get, /* sq_item */
+ 0, /* sq_slice */
+ 0, /* sq_ass_item */
+ 0, /* sq_ass_slice */
+ (objobjproc)index_contains, /* sq_contains */
};
static PyMappingMethods index_mapping_methods = {
(lenfunc)index_length, /* mp_length */
- NULL, /* mp_subscript */
+ (binaryfunc)index_getitem, /* mp_subscript */
(objobjargproc)index_assign_subscript, /* mp_ass_subscript */
};
static PyMethodDef index_methods[] = {
{"clearcaches", (PyCFunction)index_clearcaches, METH_NOARGS,
"clear the index caches"},
+ {"get", (PyCFunction)index_m_get, METH_VARARGS,
+ "get an index entry"},
{"insert", (PyCFunction)index_insert, METH_VARARGS,
"insert an index entry"},
+ {"stats", (PyCFunction)index_stats, METH_NOARGS,
+ "stats for the index"},
+ {NULL} /* Sentinel */
+};
+
+static PyGetSetDef index_getset[] = {
+ {"nodemap", (getter)index_nodemap, NULL, "nodemap", NULL},
{NULL} /* Sentinel */
};
@@ -601,7 +1089,7 @@
0, /* tp_iternext */
index_methods, /* tp_methods */
0, /* tp_members */
- 0, /* tp_getset */
+ index_getset, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
@@ -613,10 +1101,10 @@
};
/*
- * returns a tuple of the form (index, None, cache) with elements as
+ * returns a tuple of the form (index, index, cache) with elements as
* follows:
*
- * index: an index object that lazily parses the RevlogNG records
+ * index: an index object that lazily parses RevlogNG records
* cache: if data is inlined, a tuple (index_file_content, 0), else None
*
* added complications are for backwards compatibility
@@ -651,6 +1139,8 @@
Py_INCREF(cache);
}
+ Py_INCREF(idx);
+
tuple = Py_BuildValue("NN", idx, cache);
if (!tuple)
goto bail;
@@ -674,8 +1164,6 @@
static void module_init(PyObject *mod)
{
- static const char nullid[20];
-
if (PyType_Ready(&indexType) < 0)
return;
Py_INCREF(&indexType);
@@ -710,4 +1198,3 @@
module_init(mod);
}
#endif
-