Mercurial > hg
view mercurial/parsers.c @ 19738:93b8544c4482
hgweb: add nextentry variable for easy pagination in changelog
nextentry always contains the first entry not shown on current page (if there is
such entry)
author | Alexander Plavin <alexander@plav.in> |
---|---|
date | Fri, 06 Sep 2013 13:30:57 +0400 |
parents | 3daabd2da78b |
children | e57c532c3835 40b7c6e4b993 |
line wrap: on
line source
/* parsers.c - efficient content parsing Copyright 2008 Matt Mackall <mpm@selenic.com> and others This software may be used and distributed according to the terms of the GNU General Public License, incorporated herein by reference. */ #include <Python.h> #include <ctype.h> #include <stddef.h> #include <string.h> #include "util.h" static int8_t hextable[256] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, /* 0-9 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* A-F */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* a-f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; static inline int hexdigit(const char *p, Py_ssize_t off) { int8_t val = hextable[(unsigned char)p[off]]; if (val >= 0) { return val; } PyErr_SetString(PyExc_ValueError, "input contains non-hex character"); return 0; } /* * Turn a hex-encoded string into binary. */ static PyObject *unhexlify(const char *str, int len) { PyObject *ret; char *d; int i; ret = PyBytes_FromStringAndSize(NULL, len / 2); if (!ret) return NULL; d = PyBytes_AsString(ret); for (i = 0; i < len;) { int hi = hexdigit(str, i++); int lo = hexdigit(str, i++); *d++ = (hi << 4) | lo; } return ret; } /* * This code assumes that a manifest is stitched together with newline * ('\n') characters. */ static PyObject *parse_manifest(PyObject *self, PyObject *args) { PyObject *mfdict, *fdict; char *str, *start, *end; int len; if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest", &PyDict_Type, &mfdict, &PyDict_Type, &fdict, &str, &len)) goto quit; start = str; end = str + len; while (start < end) { PyObject *file = NULL, *node = NULL; PyObject *flags = NULL; char *zero = NULL, *newline = NULL; ptrdiff_t nlen; zero = memchr(start, '\0', end - start); if (!zero) { PyErr_SetString(PyExc_ValueError, "manifest entry has no separator"); goto quit; } newline = memchr(zero + 1, '\n', end - (zero + 1)); if (!newline) { PyErr_SetString(PyExc_ValueError, "manifest contains trailing garbage"); goto quit; } file = PyBytes_FromStringAndSize(start, zero - start); if (!file) goto bail; nlen = newline - zero - 1; node = unhexlify(zero + 1, nlen > 40 ? 40 : (int)nlen); if (!node) goto bail; if (nlen > 40) { flags = PyBytes_FromStringAndSize(zero + 41, nlen - 40); if (!flags) goto bail; if (PyDict_SetItem(fdict, file, flags) == -1) goto bail; } if (PyDict_SetItem(mfdict, file, node) == -1) goto bail; start = newline + 1; Py_XDECREF(flags); Py_XDECREF(node); Py_XDECREF(file); continue; bail: Py_XDECREF(flags); Py_XDECREF(node); Py_XDECREF(file); goto quit; } Py_INCREF(Py_None); return Py_None; quit: return NULL; } static PyObject *parse_dirstate(PyObject *self, PyObject *args) { PyObject *dmap, *cmap, *parents = NULL, *ret = NULL; PyObject *fname = NULL, *cname = NULL, *entry = NULL; char state, *str, *cur, *end, *cpos; int mode, size, mtime; unsigned int flen; int len; if (!PyArg_ParseTuple(args, "O!O!s#:parse_dirstate", &PyDict_Type, &dmap, &PyDict_Type, &cmap, &str, &len)) goto quit; /* read parents */ if (len < 40) goto quit; parents = Py_BuildValue("s#s#", str, 20, str + 20, 20); if (!parents) goto quit; /* read filenames */ cur = str + 40; end = str + len; while (cur < end - 17) { /* unpack header */ state = *cur; mode = getbe32(cur + 1); size = getbe32(cur + 5); mtime = getbe32(cur + 9); flen = getbe32(cur + 13); cur += 17; if (cur + flen > end || cur + flen < cur) { PyErr_SetString(PyExc_ValueError, "overflow in dirstate"); goto quit; } entry = Py_BuildValue("ciii", state, mode, size, mtime); if (!entry) goto quit; PyObject_GC_UnTrack(entry); /* don't waste time with this */ cpos = memchr(cur, 0, flen); if (cpos) { fname = PyBytes_FromStringAndSize(cur, cpos - cur); cname = PyBytes_FromStringAndSize(cpos + 1, flen - (cpos - cur) - 1); if (!fname || !cname || PyDict_SetItem(cmap, fname, cname) == -1 || PyDict_SetItem(dmap, fname, entry) == -1) goto quit; Py_DECREF(cname); } else { fname = PyBytes_FromStringAndSize(cur, flen); if (!fname || PyDict_SetItem(dmap, fname, entry) == -1) goto quit; } cur += flen; Py_DECREF(fname); Py_DECREF(entry); fname = cname = entry = NULL; } ret = parents; Py_INCREF(ret); quit: Py_XDECREF(fname); Py_XDECREF(cname); Py_XDECREF(entry); Py_XDECREF(parents); return ret; } static inline int getintat(PyObject *tuple, int off, uint32_t *v) { PyObject *o = PyTuple_GET_ITEM(tuple, off); long val; if (PyInt_Check(o)) val = PyInt_AS_LONG(o); else if (PyLong_Check(o)) { val = PyLong_AsLong(o); if (val == -1 && PyErr_Occurred()) return -1; } else { PyErr_SetString(PyExc_TypeError, "expected an int or long"); return -1; } if (LONG_MAX > INT_MAX && (val > INT_MAX || val < INT_MIN)) { PyErr_SetString(PyExc_OverflowError, "Python value to large to convert to uint32_t"); return -1; } *v = (uint32_t)val; return 0; } static PyObject *dirstate_unset; /* * Efficiently pack a dirstate object into its on-disk format. */ static PyObject *pack_dirstate(PyObject *self, PyObject *args) { PyObject *packobj = NULL; PyObject *map, *copymap, *pl; Py_ssize_t nbytes, pos, l; PyObject *k, *v, *pn; char *p, *s; double now; if (!PyArg_ParseTuple(args, "O!O!Od:pack_dirstate", &PyDict_Type, &map, &PyDict_Type, ©map, &pl, &now)) return NULL; if (!PySequence_Check(pl) || PySequence_Size(pl) != 2) { PyErr_SetString(PyExc_TypeError, "expected 2-element sequence"); return NULL; } /* Figure out how much we need to allocate. */ for (nbytes = 40, pos = 0; PyDict_Next(map, &pos, &k, &v);) { PyObject *c; if (!PyString_Check(k)) { PyErr_SetString(PyExc_TypeError, "expected string key"); goto bail; } nbytes += PyString_GET_SIZE(k) + 17; c = PyDict_GetItem(copymap, k); if (c) { if (!PyString_Check(c)) { PyErr_SetString(PyExc_TypeError, "expected string key"); goto bail; } nbytes += PyString_GET_SIZE(c) + 1; } } packobj = PyString_FromStringAndSize(NULL, nbytes); if (packobj == NULL) goto bail; p = PyString_AS_STRING(packobj); pn = PySequence_ITEM(pl, 0); if (PyString_AsStringAndSize(pn, &s, &l) == -1 || l != 20) { PyErr_SetString(PyExc_TypeError, "expected a 20-byte hash"); goto bail; } memcpy(p, s, l); p += 20; pn = PySequence_ITEM(pl, 1); if (PyString_AsStringAndSize(pn, &s, &l) == -1 || l != 20) { PyErr_SetString(PyExc_TypeError, "expected a 20-byte hash"); goto bail; } memcpy(p, s, l); p += 20; for (pos = 0; PyDict_Next(map, &pos, &k, &v); ) { uint32_t mode, size, mtime; Py_ssize_t len, l; PyObject *o; char *s, *t; if (!PyTuple_Check(v) || PyTuple_GET_SIZE(v) != 4) { PyErr_SetString(PyExc_TypeError, "expected a 4-tuple"); goto bail; } o = PyTuple_GET_ITEM(v, 0); if (PyString_AsStringAndSize(o, &s, &l) == -1 || l != 1) { PyErr_SetString(PyExc_TypeError, "expected one byte"); goto bail; } *p++ = *s; if (getintat(v, 1, &mode) == -1) goto bail; if (getintat(v, 2, &size) == -1) goto bail; if (getintat(v, 3, &mtime) == -1) goto bail; if (*s == 'n' && mtime == (uint32_t)now) { /* See pure/parsers.py:pack_dirstate for why we do * this. */ if (PyDict_SetItem(map, k, dirstate_unset) == -1) goto bail; mtime = -1; } putbe32(mode, p); putbe32(size, p + 4); putbe32(mtime, p + 8); t = p + 12; p += 16; len = PyString_GET_SIZE(k); memcpy(p, PyString_AS_STRING(k), len); p += len; o = PyDict_GetItem(copymap, k); if (o) { *p++ = '\0'; l = PyString_GET_SIZE(o); memcpy(p, PyString_AS_STRING(o), l); p += l; len += l + 1; } putbe32((uint32_t)len, t); } pos = p - PyString_AS_STRING(packobj); if (pos != nbytes) { PyErr_Format(PyExc_SystemError, "bad dirstate size: %ld != %ld", (long)pos, (long)nbytes); goto bail; } return packobj; bail: Py_XDECREF(packobj); return NULL; } /* * A base-16 trie for fast node->rev mapping. * * Positive value is index of the next node in the trie * Negative value is a leaf: -(rev + 1) * Zero is empty */ typedef struct { int children[16]; } nodetree; /* * This class has two behaviours. * * When used in a list-like way (with integer keys), we decode an * entry in a RevlogNG index file on demand. Our last entry is a * sentinel, always a nullid. We have limited support for * integer-keyed insert and delete, only at elements right before the * sentinel. * * With string keys, we lazily perform a reverse mapping from node to * rev, using a base-16 trie. */ typedef struct { PyObject_HEAD /* Type-specific fields go here. */ PyObject *data; /* raw bytes of index */ PyObject **cache; /* cached tuples */ const char **offsets; /* populated on demand */ Py_ssize_t raw_length; /* original number of elements */ Py_ssize_t length; /* current number of elements */ PyObject *added; /* populated on demand */ PyObject *headrevs; /* cache, invalidated on changes */ nodetree *nt; /* base-16 trie */ int ntlength; /* # nodes in use */ int ntcapacity; /* # nodes allocated */ int ntdepth; /* maximum depth of tree */ int ntsplits; /* # splits performed */ int ntrev; /* last rev scanned */ int ntlookups; /* # lookups */ int ntmisses; /* # lookups that miss the cache */ int inlined; } indexObject; static Py_ssize_t index_length(const indexObject *self) { if (self->added == NULL) return self->length; return self->length + PyList_GET_SIZE(self->added); } static PyObject *nullentry; static const char nullid[20]; static long inline_scan(indexObject *self, const char **offsets); #if LONG_MAX == 0x7fffffffL static char *tuple_format = "Kiiiiiis#"; #else static char *tuple_format = "kiiiiiis#"; #endif /* A RevlogNG v1 index entry is 64 bytes long. */ static const long v1_hdrsize = 64; /* * Return a pointer to the beginning of a RevlogNG record. */ static const char *index_deref(indexObject *self, Py_ssize_t pos) { if (self->inlined && pos > 0) { if (self->offsets == NULL) { self->offsets = malloc(self->raw_length * sizeof(*self->offsets)); if (self->offsets == NULL) return (const char *)PyErr_NoMemory(); inline_scan(self, self->offsets); } return self->offsets[pos]; } return PyString_AS_STRING(self->data) + pos * v1_hdrsize; } /* * RevlogNG format (all in big endian, data may be inlined): * 6 bytes: offset * 2 bytes: flags * 4 bytes: compressed length * 4 bytes: uncompressed length * 4 bytes: base revision * 4 bytes: link revision * 4 bytes: parent 1 revision * 4 bytes: parent 2 revision * 32 bytes: nodeid (only 20 bytes used) */ static PyObject *index_get(indexObject *self, Py_ssize_t pos) { uint64_t offset_flags; int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2; const char *c_node_id; const char *data; Py_ssize_t length = index_length(self); PyObject *entry; if (pos < 0) pos += length; if (pos < 0 || pos >= length) { PyErr_SetString(PyExc_IndexError, "revlog index out of range"); return NULL; } if (pos == length - 1) { Py_INCREF(nullentry); return nullentry; } if (pos >= self->length - 1) { PyObject *obj; obj = PyList_GET_ITEM(self->added, pos - self->length + 1); Py_INCREF(obj); return obj; } if (self->cache) { if (self->cache[pos]) { Py_INCREF(self->cache[pos]); return self->cache[pos]; } } else { self->cache = calloc(self->raw_length, sizeof(PyObject *)); if (self->cache == NULL) return PyErr_NoMemory(); } data = index_deref(self, pos); if (data == NULL) return NULL; offset_flags = getbe32(data + 4); if (pos == 0) /* mask out version number for the first entry */ offset_flags &= 0xFFFF; else { uint32_t offset_high = getbe32(data); offset_flags |= ((uint64_t)offset_high) << 32; } comp_len = getbe32(data + 8); uncomp_len = getbe32(data + 12); base_rev = getbe32(data + 16); link_rev = getbe32(data + 20); parent_1 = getbe32(data + 24); parent_2 = getbe32(data + 28); c_node_id = data + 32; entry = Py_BuildValue(tuple_format, offset_flags, comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2, c_node_id, 20); if (entry) { PyObject_GC_UnTrack(entry); Py_INCREF(entry); } self->cache[pos] = entry; return entry; } /* * Return the 20-byte SHA of the node corresponding to the given rev. */ static const char *index_node(indexObject *self, Py_ssize_t pos) { Py_ssize_t length = index_length(self); const char *data; if (pos == length - 1 || pos == INT_MAX) return nullid; if (pos >= length) return NULL; if (pos >= self->length - 1) { PyObject *tuple, *str; tuple = PyList_GET_ITEM(self->added, pos - self->length + 1); str = PyTuple_GetItem(tuple, 7); return str ? PyString_AS_STRING(str) : NULL; } data = index_deref(self, pos); return data ? data + 32 : NULL; } static int nt_insert(indexObject *self, const char *node, int rev); static int node_check(PyObject *obj, char **node, Py_ssize_t *nodelen) { if (PyString_AsStringAndSize(obj, node, nodelen) == -1) return -1; if (*nodelen == 20) return 0; PyErr_SetString(PyExc_ValueError, "20-byte hash required"); return -1; } static PyObject *index_insert(indexObject *self, PyObject *args) { PyObject *obj; char *node; long offset; Py_ssize_t len, nodelen; if (!PyArg_ParseTuple(args, "lO", &offset, &obj)) return NULL; if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) { PyErr_SetString(PyExc_TypeError, "8-tuple required"); return NULL; } if (node_check(PyTuple_GET_ITEM(obj, 7), &node, &nodelen) == -1) return NULL; len = index_length(self); if (offset < 0) offset += len; if (offset != len - 1) { PyErr_SetString(PyExc_IndexError, "insert only supported at index -1"); return NULL; } if (offset > INT_MAX) { PyErr_SetString(PyExc_ValueError, "currently only 2**31 revs supported"); return NULL; } if (self->added == NULL) { self->added = PyList_New(0); if (self->added == NULL) return NULL; } if (PyList_Append(self->added, obj) == -1) return NULL; if (self->nt) nt_insert(self, node, (int)offset); Py_CLEAR(self->headrevs); Py_RETURN_NONE; } static void _index_clearcaches(indexObject *self) { if (self->cache) { Py_ssize_t i; for (i = 0; i < self->raw_length; i++) Py_CLEAR(self->cache[i]); free(self->cache); self->cache = NULL; } if (self->offsets) { free(self->offsets); self->offsets = NULL; } if (self->nt) { free(self->nt); self->nt = NULL; } Py_CLEAR(self->headrevs); } static PyObject *index_clearcaches(indexObject *self) { _index_clearcaches(self); self->ntlength = self->ntcapacity = 0; self->ntdepth = self->ntsplits = 0; self->ntrev = -1; self->ntlookups = self->ntmisses = 0; Py_RETURN_NONE; } static PyObject *index_stats(indexObject *self) { PyObject *obj = PyDict_New(); if (obj == NULL) return NULL; #define istat(__n, __d) \ if (PyDict_SetItemString(obj, __d, PyInt_FromSsize_t(self->__n)) == -1) \ goto bail; if (self->added) { Py_ssize_t len = PyList_GET_SIZE(self->added); if (PyDict_SetItemString(obj, "index entries added", PyInt_FromSsize_t(len)) == -1) goto bail; } if (self->raw_length != self->length - 1) istat(raw_length, "revs on disk"); istat(length, "revs in memory"); istat(ntcapacity, "node trie capacity"); istat(ntdepth, "node trie depth"); istat(ntlength, "node trie count"); istat(ntlookups, "node trie lookups"); istat(ntmisses, "node trie misses"); istat(ntrev, "node trie last rev scanned"); istat(ntsplits, "node trie splits"); #undef istat return obj; bail: Py_XDECREF(obj); return NULL; } /* * When we cache a list, we want to be sure the caller can't mutate * the cached copy. */ static PyObject *list_copy(PyObject *list) { Py_ssize_t len = PyList_GET_SIZE(list); PyObject *newlist = PyList_New(len); Py_ssize_t i; if (newlist == NULL) return NULL; for (i = 0; i < len; i++) { PyObject *obj = PyList_GET_ITEM(list, i); Py_INCREF(obj); PyList_SET_ITEM(newlist, i, obj); } return newlist; } static PyObject *index_headrevs(indexObject *self) { Py_ssize_t i, len, addlen; char *nothead = NULL; PyObject *heads; if (self->headrevs) return list_copy(self->headrevs); len = index_length(self) - 1; heads = PyList_New(0); if (heads == NULL) goto bail; if (len == 0) { PyObject *nullid = PyInt_FromLong(-1); if (nullid == NULL || PyList_Append(heads, nullid) == -1) { Py_XDECREF(nullid); goto bail; } goto done; } nothead = calloc(len, 1); if (nothead == NULL) goto bail; for (i = 0; i < self->raw_length; i++) { const char *data = index_deref(self, i); int parent_1 = getbe32(data + 24); int parent_2 = getbe32(data + 28); if (parent_1 >= 0) nothead[parent_1] = 1; if (parent_2 >= 0) nothead[parent_2] = 1; } addlen = self->added ? PyList_GET_SIZE(self->added) : 0; for (i = 0; i < addlen; i++) { PyObject *rev = PyList_GET_ITEM(self->added, i); PyObject *p1 = PyTuple_GET_ITEM(rev, 5); PyObject *p2 = PyTuple_GET_ITEM(rev, 6); long parent_1, parent_2; if (!PyInt_Check(p1) || !PyInt_Check(p2)) { PyErr_SetString(PyExc_TypeError, "revlog parents are invalid"); goto bail; } parent_1 = PyInt_AS_LONG(p1); parent_2 = PyInt_AS_LONG(p2); if (parent_1 >= 0) nothead[parent_1] = 1; if (parent_2 >= 0) nothead[parent_2] = 1; } for (i = 0; i < len; i++) { PyObject *head; if (nothead[i]) continue; head = PyInt_FromLong(i); if (head == NULL || PyList_Append(heads, head) == -1) { Py_XDECREF(head); goto bail; } } done: self->headrevs = heads; free(nothead); return list_copy(self->headrevs); bail: Py_XDECREF(heads); free(nothead); return NULL; } static inline int nt_level(const char *node, Py_ssize_t level) { int v = node[level>>1]; if (!(level & 1)) v >>= 4; return v & 0xf; } /* * Return values: * * -4: match is ambiguous (multiple candidates) * -2: not found * rest: valid rev */ static int nt_find(indexObject *self, const char *node, Py_ssize_t nodelen, int hex) { int (*getnybble)(const char *, Py_ssize_t) = hex ? hexdigit : nt_level; int level, maxlevel, off; if (nodelen == 20 && node[0] == '\0' && memcmp(node, nullid, 20) == 0) return -1; if (self->nt == NULL) return -2; if (hex) maxlevel = nodelen > 40 ? 40 : (int)nodelen; else maxlevel = nodelen > 20 ? 40 : ((int)nodelen * 2); for (level = off = 0; level < maxlevel; level++) { int k = getnybble(node, level); nodetree *n = &self->nt[off]; int v = n->children[k]; if (v < 0) { const char *n; Py_ssize_t i; v = -v - 1; n = index_node(self, v); if (n == NULL) return -2; for (i = level; i < maxlevel; i++) if (getnybble(node, i) != nt_level(n, i)) return -2; return v; } if (v == 0) return -2; off = v; } /* multiple matches against an ambiguous prefix */ return -4; } static int nt_new(indexObject *self) { if (self->ntlength == self->ntcapacity) { self->ntcapacity *= 2; self->nt = realloc(self->nt, self->ntcapacity * sizeof(nodetree)); if (self->nt == NULL) { PyErr_SetString(PyExc_MemoryError, "out of memory"); return -1; } memset(&self->nt[self->ntlength], 0, sizeof(nodetree) * (self->ntcapacity - self->ntlength)); } return self->ntlength++; } static int nt_insert(indexObject *self, const char *node, int rev) { int level = 0; int off = 0; while (level < 40) { int k = nt_level(node, level); nodetree *n; int v; n = &self->nt[off]; v = n->children[k]; if (v == 0) { n->children[k] = -rev - 1; return 0; } if (v < 0) { const char *oldnode = index_node(self, -v - 1); int noff; if (!oldnode || !memcmp(oldnode, node, 20)) { n->children[k] = -rev - 1; return 0; } noff = nt_new(self); if (noff == -1) return -1; /* self->nt may have been changed by realloc */ self->nt[off].children[k] = noff; off = noff; n = &self->nt[off]; n->children[nt_level(oldnode, ++level)] = v; if (level > self->ntdepth) self->ntdepth = level; self->ntsplits += 1; } else { level += 1; off = v; } } return -1; } static int nt_init(indexObject *self) { if (self->nt == NULL) { self->ntcapacity = self->raw_length < 4 ? 4 : self->raw_length / 2; self->nt = calloc(self->ntcapacity, sizeof(nodetree)); if (self->nt == NULL) { PyErr_NoMemory(); return -1; } self->ntlength = 1; self->ntrev = (int)index_length(self) - 1; self->ntlookups = 1; self->ntmisses = 0; if (nt_insert(self, nullid, INT_MAX) == -1) return -1; } return 0; } /* * Return values: * * -3: error (exception set) * -2: not found (no exception set) * rest: valid rev */ static int index_find_node(indexObject *self, const char *node, Py_ssize_t nodelen) { int rev; self->ntlookups++; rev = nt_find(self, node, nodelen, 0); if (rev >= -1) return rev; if (nt_init(self) == -1) return -3; /* * For the first handful of lookups, we scan the entire index, * and cache only the matching nodes. This optimizes for cases * like "hg tip", where only a few nodes are accessed. * * After that, we cache every node we visit, using a single * scan amortized over multiple lookups. This gives the best * bulk performance, e.g. for "hg log". */ if (self->ntmisses++ < 4) { for (rev = self->ntrev - 1; rev >= 0; rev--) { const char *n = index_node(self, rev); if (n == NULL) return -2; if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) { if (nt_insert(self, n, rev) == -1) return -3; break; } } } else { for (rev = self->ntrev - 1; rev >= 0; rev--) { const char *n = index_node(self, rev); if (n == NULL) { self->ntrev = rev + 1; return -2; } if (nt_insert(self, n, rev) == -1) { self->ntrev = rev + 1; return -3; } if (memcmp(node, n, nodelen > 20 ? 20 : nodelen) == 0) { break; } } self->ntrev = rev; } if (rev >= 0) return rev; return -2; } static PyObject *raise_revlog_error(void) { static PyObject *errclass; PyObject *mod = NULL, *errobj; if (errclass == NULL) { PyObject *dict; mod = PyImport_ImportModule("mercurial.error"); if (mod == NULL) goto classfail; dict = PyModule_GetDict(mod); if (dict == NULL) goto classfail; errclass = PyDict_GetItemString(dict, "RevlogError"); if (errclass == NULL) { PyErr_SetString(PyExc_SystemError, "could not find RevlogError"); goto classfail; } Py_INCREF(errclass); } errobj = PyObject_CallFunction(errclass, NULL); if (errobj == NULL) return NULL; PyErr_SetObject(errclass, errobj); return errobj; classfail: Py_XDECREF(mod); return NULL; } static PyObject *index_getitem(indexObject *self, PyObject *value) { char *node; Py_ssize_t nodelen; int rev; if (PyInt_Check(value)) return index_get(self, PyInt_AS_LONG(value)); if (node_check(value, &node, &nodelen) == -1) return NULL; rev = index_find_node(self, node, nodelen); if (rev >= -1) return PyInt_FromLong(rev); if (rev == -2) raise_revlog_error(); return NULL; } static int nt_partialmatch(indexObject *self, const char *node, Py_ssize_t nodelen) { int rev; if (nt_init(self) == -1) return -3; if (self->ntrev > 0) { /* ensure that the radix tree is fully populated */ for (rev = self->ntrev - 1; rev >= 0; rev--) { const char *n = index_node(self, rev); if (n == NULL) return -2; if (nt_insert(self, n, rev) == -1) return -3; } self->ntrev = rev; } return nt_find(self, node, nodelen, 1); } static PyObject *index_partialmatch(indexObject *self, PyObject *args) { const char *fullnode; int nodelen; char *node; int rev, i; if (!PyArg_ParseTuple(args, "s#", &node, &nodelen)) return NULL; if (nodelen < 4) { PyErr_SetString(PyExc_ValueError, "key too short"); return NULL; } if (nodelen > 40) { PyErr_SetString(PyExc_ValueError, "key too long"); return NULL; } for (i = 0; i < nodelen; i++) hexdigit(node, i); if (PyErr_Occurred()) { /* input contains non-hex characters */ PyErr_Clear(); Py_RETURN_NONE; } rev = nt_partialmatch(self, node, nodelen); switch (rev) { case -4: raise_revlog_error(); case -3: return NULL; case -2: Py_RETURN_NONE; case -1: return PyString_FromStringAndSize(nullid, 20); } fullnode = index_node(self, rev); if (fullnode == NULL) { PyErr_Format(PyExc_IndexError, "could not access rev %d", rev); return NULL; } return PyString_FromStringAndSize(fullnode, 20); } static PyObject *index_m_get(indexObject *self, PyObject *args) { Py_ssize_t nodelen; PyObject *val; char *node; int rev; if (!PyArg_ParseTuple(args, "O", &val)) return NULL; if (node_check(val, &node, &nodelen) == -1) return NULL; rev = index_find_node(self, node, nodelen); if (rev == -3) return NULL; if (rev == -2) Py_RETURN_NONE; return PyInt_FromLong(rev); } static int index_contains(indexObject *self, PyObject *value) { char *node; Py_ssize_t nodelen; if (PyInt_Check(value)) { long rev = PyInt_AS_LONG(value); return rev >= -1 && rev < index_length(self); } if (node_check(value, &node, &nodelen) == -1) return -1; switch (index_find_node(self, node, nodelen)) { case -3: return -1; case -2: return 0; default: return 1; } } static inline void index_get_parents(indexObject *self, int rev, int *ps) { if (rev >= self->length - 1) { PyObject *tuple = PyList_GET_ITEM(self->added, rev - self->length + 1); ps[0] = (int)PyInt_AS_LONG(PyTuple_GET_ITEM(tuple, 5)); ps[1] = (int)PyInt_AS_LONG(PyTuple_GET_ITEM(tuple, 6)); } else { const char *data = index_deref(self, rev); ps[0] = getbe32(data + 24); ps[1] = getbe32(data + 28); } } typedef uint64_t bitmask; /* * Given a disjoint set of revs, return all candidates for the * greatest common ancestor. In revset notation, this is the set * "heads(::a and ::b and ...)" */ static PyObject *find_gca_candidates(indexObject *self, const int *revs, int revcount) { const bitmask allseen = (1ull << revcount) - 1; const bitmask poison = 1ull << revcount; PyObject *gca = PyList_New(0); int i, v, interesting, left; int maxrev = -1; long sp; bitmask *seen; if (gca == NULL) return PyErr_NoMemory(); for (i = 0; i < revcount; i++) { if (revs[i] > maxrev) maxrev = revs[i]; } seen = calloc(sizeof(*seen), maxrev + 1); if (seen == NULL) { Py_DECREF(gca); return PyErr_NoMemory(); } for (i = 0; i < revcount; i++) seen[revs[i]] = 1ull << i; interesting = left = revcount; for (v = maxrev; v >= 0 && interesting; v--) { long sv = seen[v]; int parents[2]; if (!sv) continue; if (sv < poison) { interesting -= 1; if (sv == allseen) { PyObject *obj = PyInt_FromLong(v); if (obj == NULL) goto bail; if (PyList_Append(gca, obj) == -1) { Py_DECREF(obj); goto bail; } sv |= poison; for (i = 0; i < revcount; i++) { if (revs[i] == v) { if (--left <= 1) goto done; break; } } } } index_get_parents(self, v, parents); for (i = 0; i < 2; i++) { int p = parents[i]; if (p == -1) continue; sp = seen[p]; if (sv < poison) { if (sp == 0) { seen[p] = sv; interesting++; } else if (sp != sv) seen[p] |= sv; } else { if (sp && sp < poison) interesting--; seen[p] = sv; } } } done: free(seen); return gca; bail: free(seen); Py_XDECREF(gca); return NULL; } /* * Given a disjoint set of revs, return the subset with the longest * path to the root. */ static PyObject *find_deepest(indexObject *self, PyObject *revs) { const Py_ssize_t revcount = PyList_GET_SIZE(revs); static const Py_ssize_t capacity = 24; int *depth, *interesting = NULL; int i, j, v, ninteresting; PyObject *dict = NULL, *keys; long *seen = NULL; int maxrev = -1; long final; if (revcount > capacity) { PyErr_Format(PyExc_OverflowError, "bitset size (%ld) > capacity (%ld)", (long)revcount, (long)capacity); return NULL; } for (i = 0; i < revcount; i++) { int n = (int)PyInt_AsLong(PyList_GET_ITEM(revs, i)); if (n > maxrev) maxrev = n; } depth = calloc(sizeof(*depth), maxrev + 1); if (depth == NULL) return PyErr_NoMemory(); seen = calloc(sizeof(*seen), maxrev + 1); if (seen == NULL) { PyErr_NoMemory(); goto bail; } interesting = calloc(sizeof(*interesting), 2 << revcount); if (interesting == NULL) { PyErr_NoMemory(); goto bail; } if (PyList_Sort(revs) == -1) goto bail; for (i = 0; i < revcount; i++) { int n = (int)PyInt_AsLong(PyList_GET_ITEM(revs, i)); long b = 1l << i; depth[n] = 1; seen[n] = b; interesting[b] = 1; } ninteresting = (int)revcount; for (v = maxrev; v >= 0 && ninteresting > 1; v--) { int dv = depth[v]; int parents[2]; long sv; if (dv == 0) continue; sv = seen[v]; index_get_parents(self, v, parents); for (i = 0; i < 2; i++) { int p = parents[i]; long nsp, sp; int dp; if (p == -1) continue; dp = depth[p]; nsp = sp = seen[p]; if (dp <= dv) { depth[p] = dv + 1; if (sp != sv) { interesting[sv] += 1; nsp = seen[p] = sv; if (sp) { interesting[sp] -= 1; if (interesting[sp] == 0) ninteresting -= 1; } } } else if (dv == dp - 1) { nsp = sp | sv; if (nsp == sp) continue; seen[p] = nsp; interesting[sp] -= 1; if (interesting[sp] == 0 && interesting[nsp] > 0) ninteresting -= 1; interesting[nsp] += 1; } } interesting[sv] -= 1; if (interesting[sv] == 0) ninteresting -= 1; } final = 0; j = ninteresting; for (i = 0; i < (int)(2 << revcount) && j > 0; i++) { if (interesting[i] == 0) continue; final |= i; j -= 1; } if (final == 0) return PyList_New(0); dict = PyDict_New(); if (dict == NULL) goto bail; for (i = 0; i < revcount; i++) { PyObject *key; if ((final & (1 << i)) == 0) continue; key = PyList_GET_ITEM(revs, i); Py_INCREF(key); Py_INCREF(Py_None); if (PyDict_SetItem(dict, key, Py_None) == -1) { Py_DECREF(key); Py_DECREF(Py_None); goto bail; } } keys = PyDict_Keys(dict); free(depth); free(seen); free(interesting); Py_DECREF(dict); return keys; bail: free(depth); free(seen); free(interesting); Py_XDECREF(dict); return NULL; } /* * Given a (possibly overlapping) set of revs, return the greatest * common ancestors: those with the longest path to the root. */ static PyObject *index_ancestors(indexObject *self, PyObject *args) { PyObject *ret = NULL, *gca = NULL; Py_ssize_t argcount, i, len; bitmask repeat = 0; int revcount = 0; int *revs; argcount = PySequence_Length(args); revs = malloc(argcount * sizeof(*revs)); if (argcount > 0 && revs == NULL) return PyErr_NoMemory(); len = index_length(self) - 1; for (i = 0; i < argcount; i++) { static const int capacity = 24; PyObject *obj = PySequence_GetItem(args, i); bitmask x; long val; if (!PyInt_Check(obj)) { PyErr_SetString(PyExc_TypeError, "arguments must all be ints"); goto bail; } val = PyInt_AsLong(obj); if (val == -1) { ret = PyList_New(0); goto done; } if (val < 0 || val >= len) { PyErr_SetString(PyExc_IndexError, "index out of range"); goto bail; } /* this cheesy bloom filter lets us avoid some more * expensive duplicate checks in the common set-is-disjoint * case */ x = 1ull << (val & 0x3f); if (repeat & x) { int k; for (k = 0; k < revcount; k++) { if (val == revs[k]) goto duplicate; } } else repeat |= x; if (revcount >= capacity) { PyErr_Format(PyExc_OverflowError, "bitset size (%d) > capacity (%d)", revcount, capacity); goto bail; } revs[revcount++] = (int)val; duplicate:; } if (revcount == 0) { ret = PyList_New(0); goto done; } if (revcount == 1) { PyObject *obj; ret = PyList_New(1); if (ret == NULL) goto bail; obj = PyInt_FromLong(revs[0]); if (obj == NULL) goto bail; PyList_SET_ITEM(ret, 0, obj); goto done; } gca = find_gca_candidates(self, revs, revcount); if (gca == NULL) goto bail; if (PyList_GET_SIZE(gca) <= 1) { ret = gca; Py_INCREF(gca); } else if (PyList_GET_SIZE(gca) == 1) { ret = PyList_GET_ITEM(gca, 0); Py_INCREF(ret); } else ret = find_deepest(self, gca); done: free(revs); Py_XDECREF(gca); return ret; bail: free(revs); Py_XDECREF(gca); Py_XDECREF(ret); return NULL; } /* * Invalidate any trie entries introduced by added revs. */ static void nt_invalidate_added(indexObject *self, Py_ssize_t start) { Py_ssize_t i, len = PyList_GET_SIZE(self->added); for (i = start; i < len; i++) { PyObject *tuple = PyList_GET_ITEM(self->added, i); PyObject *node = PyTuple_GET_ITEM(tuple, 7); nt_insert(self, PyString_AS_STRING(node), -1); } if (start == 0) Py_CLEAR(self->added); } /* * Delete a numeric range of revs, which must be at the end of the * range, but exclude the sentinel nullid entry. */ static int index_slice_del(indexObject *self, PyObject *item) { Py_ssize_t start, stop, step, slicelength; Py_ssize_t length = index_length(self); int ret = 0; if (PySlice_GetIndicesEx((PySliceObject*)item, length, &start, &stop, &step, &slicelength) < 0) return -1; if (slicelength <= 0) return 0; if ((step < 0 && start < stop) || (step > 0 && start > stop)) stop = start; if (step < 0) { stop = start + 1; start = stop + step*(slicelength - 1) - 1; step = -step; } if (step != 1) { PyErr_SetString(PyExc_ValueError, "revlog index delete requires step size of 1"); return -1; } if (stop != length - 1) { PyErr_SetString(PyExc_IndexError, "revlog index deletion indices are invalid"); return -1; } if (start < self->length - 1) { if (self->nt) { Py_ssize_t i; for (i = start + 1; i < self->length - 1; i++) { const char *node = index_node(self, i); if (node) nt_insert(self, node, -1); } if (self->added) nt_invalidate_added(self, 0); if (self->ntrev > start) self->ntrev = (int)start; } self->length = start + 1; if (start < self->raw_length) { if (self->cache) { Py_ssize_t i; for (i = start; i < self->raw_length; i++) Py_CLEAR(self->cache[i]); } self->raw_length = start; } goto done; } if (self->nt) { nt_invalidate_added(self, start - self->length + 1); if (self->ntrev > start) self->ntrev = (int)start; } if (self->added) ret = PyList_SetSlice(self->added, start - self->length + 1, PyList_GET_SIZE(self->added), NULL); done: Py_CLEAR(self->headrevs); return ret; } /* * Supported ops: * * slice deletion * string assignment (extend node->rev mapping) * string deletion (shrink node->rev mapping) */ static int index_assign_subscript(indexObject *self, PyObject *item, PyObject *value) { char *node; Py_ssize_t nodelen; long rev; if (PySlice_Check(item) && value == NULL) return index_slice_del(self, item); if (node_check(item, &node, &nodelen) == -1) return -1; if (value == NULL) return self->nt ? nt_insert(self, node, -1) : 0; rev = PyInt_AsLong(value); if (rev > INT_MAX || rev < 0) { if (!PyErr_Occurred()) PyErr_SetString(PyExc_ValueError, "rev out of range"); return -1; } return nt_insert(self, node, (int)rev); } /* * Find all RevlogNG entries in an index that has inline data. Update * the optional "offsets" table with those entries. */ static long inline_scan(indexObject *self, const char **offsets) { const char *data = PyString_AS_STRING(self->data); const char *end = data + PyString_GET_SIZE(self->data); long incr = v1_hdrsize; Py_ssize_t len = 0; while (data + v1_hdrsize <= end) { uint32_t comp_len; const char *old_data; /* 3rd element of header is length of compressed inline data */ comp_len = getbe32(data + 8); incr = v1_hdrsize + comp_len; if (incr < v1_hdrsize) break; if (offsets) offsets[len] = data; len++; old_data = data; data += incr; if (data <= old_data) break; } if (data != end && data + v1_hdrsize != end) { if (!PyErr_Occurred()) PyErr_SetString(PyExc_ValueError, "corrupt index file"); return -1; } return len; } static int index_init(indexObject *self, PyObject *args) { PyObject *data_obj, *inlined_obj; Py_ssize_t size; if (!PyArg_ParseTuple(args, "OO", &data_obj, &inlined_obj)) return -1; if (!PyString_Check(data_obj)) { PyErr_SetString(PyExc_TypeError, "data is not a string"); return -1; } size = PyString_GET_SIZE(data_obj); self->inlined = inlined_obj && PyObject_IsTrue(inlined_obj); self->data = data_obj; self->cache = NULL; self->added = NULL; self->headrevs = NULL; self->offsets = NULL; self->nt = NULL; self->ntlength = self->ntcapacity = 0; self->ntdepth = self->ntsplits = 0; self->ntlookups = self->ntmisses = 0; self->ntrev = -1; Py_INCREF(self->data); if (self->inlined) { long len = inline_scan(self, NULL); if (len == -1) goto bail; self->raw_length = len; self->length = len + 1; } else { if (size % v1_hdrsize) { PyErr_SetString(PyExc_ValueError, "corrupt index file"); goto bail; } self->raw_length = size / v1_hdrsize; self->length = self->raw_length + 1; } return 0; bail: return -1; } static PyObject *index_nodemap(indexObject *self) { Py_INCREF(self); return (PyObject *)self; } static void index_dealloc(indexObject *self) { _index_clearcaches(self); Py_DECREF(self->data); Py_XDECREF(self->added); PyObject_Del(self); } static PySequenceMethods index_sequence_methods = { (lenfunc)index_length, /* sq_length */ 0, /* sq_concat */ 0, /* sq_repeat */ (ssizeargfunc)index_get, /* sq_item */ 0, /* sq_slice */ 0, /* sq_ass_item */ 0, /* sq_ass_slice */ (objobjproc)index_contains, /* sq_contains */ }; static PyMappingMethods index_mapping_methods = { (lenfunc)index_length, /* mp_length */ (binaryfunc)index_getitem, /* mp_subscript */ (objobjargproc)index_assign_subscript, /* mp_ass_subscript */ }; static PyMethodDef index_methods[] = { {"ancestors", (PyCFunction)index_ancestors, METH_VARARGS, "return the gca set of the given revs"}, {"clearcaches", (PyCFunction)index_clearcaches, METH_NOARGS, "clear the index caches"}, {"get", (PyCFunction)index_m_get, METH_VARARGS, "get an index entry"}, {"headrevs", (PyCFunction)index_headrevs, METH_NOARGS, "get head revisions"}, {"insert", (PyCFunction)index_insert, METH_VARARGS, "insert an index entry"}, {"partialmatch", (PyCFunction)index_partialmatch, METH_VARARGS, "match a potentially ambiguous node ID"}, {"stats", (PyCFunction)index_stats, METH_NOARGS, "stats for the index"}, {NULL} /* Sentinel */ }; static PyGetSetDef index_getset[] = { {"nodemap", (getter)index_nodemap, NULL, "nodemap", NULL}, {NULL} /* Sentinel */ }; static PyTypeObject indexType = { PyObject_HEAD_INIT(NULL) 0, /* ob_size */ "parsers.index", /* tp_name */ sizeof(indexObject), /* tp_basicsize */ 0, /* tp_itemsize */ (destructor)index_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ &index_sequence_methods, /* tp_as_sequence */ &index_mapping_methods, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ "revlog index", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ index_methods, /* tp_methods */ 0, /* tp_members */ index_getset, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)index_init, /* tp_init */ 0, /* tp_alloc */ }; /* * returns a tuple of the form (index, index, cache) with elements as * follows: * * index: an index object that lazily parses RevlogNG records * cache: if data is inlined, a tuple (index_file_content, 0), else None * * added complications are for backwards compatibility */ static PyObject *parse_index2(PyObject *self, PyObject *args) { PyObject *tuple = NULL, *cache = NULL; indexObject *idx; int ret; idx = PyObject_New(indexObject, &indexType); if (idx == NULL) goto bail; ret = index_init(idx, args); if (ret == -1) goto bail; if (idx->inlined) { cache = Py_BuildValue("iO", 0, idx->data); if (cache == NULL) goto bail; } else { cache = Py_None; Py_INCREF(cache); } tuple = Py_BuildValue("NN", idx, cache); if (!tuple) goto bail; return tuple; bail: Py_XDECREF(idx); Py_XDECREF(cache); Py_XDECREF(tuple); return NULL; } static char parsers_doc[] = "Efficient content parsing."; PyObject *encodedir(PyObject *self, PyObject *args); PyObject *pathencode(PyObject *self, PyObject *args); PyObject *lowerencode(PyObject *self, PyObject *args); static PyMethodDef methods[] = { {"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"}, {"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"}, {"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"}, {"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"}, {"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"}, {"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"}, {"lowerencode", lowerencode, METH_VARARGS, "lower-encode a path\n"}, {NULL, NULL} }; void dirs_module_init(PyObject *mod); static void module_init(PyObject *mod) { dirs_module_init(mod); indexType.tp_new = PyType_GenericNew; if (PyType_Ready(&indexType) < 0) return; Py_INCREF(&indexType); PyModule_AddObject(mod, "index", (PyObject *)&indexType); nullentry = Py_BuildValue("iiiiiiis#", 0, 0, 0, -1, -1, -1, -1, nullid, 20); if (nullentry) PyObject_GC_UnTrack(nullentry); dirstate_unset = Py_BuildValue("ciii", 'n', 0, -1, -1); } #ifdef IS_PY3K static struct PyModuleDef parsers_module = { PyModuleDef_HEAD_INIT, "parsers", parsers_doc, -1, methods }; PyMODINIT_FUNC PyInit_parsers(void) { PyObject *mod = PyModule_Create(&parsers_module); module_init(mod); return mod; } #else PyMODINIT_FUNC initparsers(void) { PyObject *mod = Py_InitModule3("parsers", methods, parsers_doc); module_init(mod); } #endif