cext: add support for revlogv2
authorRaphaël Gomès <rgomes@octobus.net>
Wed, 20 Jan 2021 18:35:12 +0100
changeset 46708 358737abeeef
parent 46707 eed42f1c22d6
child 46709 3d740058b467
cext: add support for revlogv2 This enables the C code to retrieve/create entries in the revlog v2 format. This is mainly a matter of taking into account the additional slots for sidedata, etc. Differential Revision: https://phab.mercurial-scm.org/D9846
mercurial/cext/parsers.c
mercurial/cext/revlog.c
--- a/mercurial/cext/parsers.c	Mon Jan 18 10:43:12 2021 +0100
+++ b/mercurial/cext/parsers.c	Wed Jan 20 18:35:12 2021 +0100
@@ -638,7 +638,7 @@
 PyObject *encodedir(PyObject *self, PyObject *args);
 PyObject *pathencode(PyObject *self, PyObject *args);
 PyObject *lowerencode(PyObject *self, PyObject *args);
-PyObject *parse_index2(PyObject *self, PyObject *args);
+PyObject *parse_index2(PyObject *self, PyObject *args, PyObject *kwargs);
 
 static PyMethodDef methods[] = {
     {"pack_dirstate", pack_dirstate, METH_VARARGS, "pack a dirstate\n"},
@@ -646,7 +646,8 @@
      "create a set containing non-normal and other parent entries of given "
      "dirstate\n"},
     {"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
-    {"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
+    {"parse_index2", (PyCFunction)parse_index2, METH_VARARGS | METH_KEYWORDS,
+     "parse a revlog index\n"},
     {"isasciistr", isasciistr, METH_VARARGS, "check if an ASCII string\n"},
     {"asciilower", asciilower, METH_VARARGS, "lowercase an ASCII string\n"},
     {"asciiupper", asciiupper, METH_VARARGS, "uppercase an ASCII string\n"},
--- a/mercurial/cext/revlog.c	Mon Jan 18 10:43:12 2021 +0100
+++ b/mercurial/cext/revlog.c	Wed Jan 20 18:35:12 2021 +0100
@@ -98,6 +98,7 @@
 	int ntlookups;          /* # lookups */
 	int ntmisses;           /* # lookups that miss the cache */
 	int inlined;
+	long hdrsize; /* size of index headers. Differs in v1 v.s. v2 format */
 };
 
 static Py_ssize_t index_length(const indexObject *self)
@@ -113,14 +114,21 @@
 static int index_find_node(indexObject *self, const char *node);
 
 #if LONG_MAX == 0x7fffffffL
-static const char *const tuple_format = PY23("Kiiiiiis#", "Kiiiiiiy#");
+static const char *const v1_tuple_format = PY23("Kiiiiiis#", "Kiiiiiiy#");
+static const char *const v2_tuple_format =
+    PY23("Kiiiiiis#Ki", "Kiiiiiiy#Ki");
 #else
-static const char *const tuple_format = PY23("kiiiiiis#", "kiiiiiiy#");
+static const char *const v1_tuple_format = PY23("kiiiiiis#", "kiiiiiiy#");
+static const char *const v2_tuple_format =
+    PY23("kiiiiiis#ki", "kiiiiiiy#ki");
 #endif
 
 /* A RevlogNG v1 index entry is 64 bytes long. */
 static const long v1_hdrsize = 64;
 
+/* A Revlogv2 index entry is 96 bytes long. */
+static const long v2_hdrsize = 96;
+
 static void raise_revlog_error(void)
 {
 	PyObject *mod = NULL, *dict = NULL, *errclass = NULL;
@@ -157,7 +165,7 @@
 static const char *index_deref(indexObject *self, Py_ssize_t pos)
 {
 	if (pos >= self->length)
-		return self->added + (pos - self->length) * v1_hdrsize;
+		return self->added + (pos - self->length) * self->hdrsize;
 
 	if (self->inlined && pos > 0) {
 		if (self->offsets == NULL) {
@@ -174,7 +182,7 @@
 		return self->offsets[pos];
 	}
 
-	return (const char *)(self->buf.buf) + pos * v1_hdrsize;
+	return (const char *)(self->buf.buf) + pos * self->hdrsize;
 }
 
 /*
@@ -280,8 +288,9 @@
  */
 static PyObject *index_get(indexObject *self, Py_ssize_t pos)
 {
-	uint64_t offset_flags;
-	int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
+	uint64_t offset_flags, sidedata_offset;
+	int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2,
+	    sidedata_comp_len;
 	const char *c_node_id;
 	const char *data;
 	Py_ssize_t length = index_length(self);
@@ -320,9 +329,19 @@
 	parent_2 = getbe32(data + 28);
 	c_node_id = data + 32;
 
-	return Py_BuildValue(tuple_format, offset_flags, comp_len, uncomp_len,
-	                     base_rev, link_rev, parent_1, parent_2, c_node_id,
-	                     self->nodelen);
+	if (self->hdrsize == v1_hdrsize) {
+		return Py_BuildValue(v1_tuple_format, offset_flags, comp_len,
+		                     uncomp_len, base_rev, link_rev, parent_1,
+		                     parent_2, c_node_id, self->nodelen);
+	} else {
+		sidedata_offset = getbe64(data + 64);
+		sidedata_comp_len = getbe32(data + 72);
+
+		return Py_BuildValue(v2_tuple_format, offset_flags, comp_len,
+		                     uncomp_len, base_rev, link_rev, parent_1,
+		                     parent_2, c_node_id, self->nodelen,
+		                     sidedata_offset, sidedata_comp_len);
+	}
 }
 
 /*
@@ -373,18 +392,30 @@
 
 static PyObject *index_append(indexObject *self, PyObject *obj)
 {
-	uint64_t offset_flags;
+	uint64_t offset_flags, sidedata_offset;
 	int rev, comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
-	Py_ssize_t c_node_id_len;
+	Py_ssize_t c_node_id_len, sidedata_comp_len;
 	const char *c_node_id;
 	char *data;
 
-	if (!PyArg_ParseTuple(obj, tuple_format, &offset_flags, &comp_len,
-	                      &uncomp_len, &base_rev, &link_rev, &parent_1,
-	                      &parent_2, &c_node_id, &c_node_id_len)) {
-		PyErr_SetString(PyExc_TypeError, "8-tuple required");
-		return NULL;
+	if (self->hdrsize == v1_hdrsize) {
+		if (!PyArg_ParseTuple(obj, v1_tuple_format, &offset_flags,
+		                      &comp_len, &uncomp_len, &base_rev,
+		                      &link_rev, &parent_1, &parent_2,
+		                      &c_node_id, &c_node_id_len)) {
+			PyErr_SetString(PyExc_TypeError, "8-tuple required");
+			return NULL;
+		}
+	} else {
+		if (!PyArg_ParseTuple(
+		        obj, v2_tuple_format, &offset_flags, &comp_len,
+		        &uncomp_len, &base_rev, &link_rev, &parent_1, &parent_2,
+		        &c_node_id, &c_node_id_len, &sidedata_offset, &sidedata_comp_len)) {
+			PyErr_SetString(PyExc_TypeError, "10-tuple required");
+			return NULL;
+		}
 	}
+
 	if (c_node_id_len != self->nodelen) {
 		PyErr_SetString(PyExc_TypeError, "invalid node");
 		return NULL;
@@ -393,15 +424,15 @@
 	if (self->new_length == self->added_length) {
 		size_t new_added_length =
 		    self->added_length ? self->added_length * 2 : 4096;
-		void *new_added =
-		    PyMem_Realloc(self->added, new_added_length * v1_hdrsize);
+		void *new_added = PyMem_Realloc(self->added, new_added_length *
+		                                                 self->hdrsize);
 		if (!new_added)
 			return PyErr_NoMemory();
 		self->added = new_added;
 		self->added_length = new_added_length;
 	}
 	rev = self->length + self->new_length;
-	data = self->added + v1_hdrsize * self->new_length++;
+	data = self->added + self->hdrsize * self->new_length++;
 	putbe32(offset_flags >> 32, data);
 	putbe32(offset_flags & 0xffffffffU, data + 4);
 	putbe32(comp_len, data + 8);
@@ -411,7 +442,14 @@
 	putbe32(parent_1, data + 24);
 	putbe32(parent_2, data + 28);
 	memcpy(data + 32, c_node_id, c_node_id_len);
+	/* Padding since SHA-1 is only 20 bytes for now */
 	memset(data + 32 + c_node_id_len, 0, 32 - c_node_id_len);
+	if (self->hdrsize != v1_hdrsize) {
+		putbe64(sidedata_offset, data + 64);
+		putbe32(sidedata_comp_len, data + 72);
+		/* Padding for 96 bytes alignment */
+		memset(data + 76, 0, self->hdrsize - 76);
+	}
 
 	if (self->ntinitialized)
 		nt_insert(&self->nt, c_node_id, rev);
@@ -2563,14 +2601,17 @@
 	const char *data = (const char *)self->buf.buf;
 	Py_ssize_t pos = 0;
 	Py_ssize_t end = self->buf.len;
-	long incr = v1_hdrsize;
+	long incr = self->hdrsize;
 	Py_ssize_t len = 0;
 
-	while (pos + v1_hdrsize <= end && pos >= 0) {
-		uint32_t comp_len;
+	while (pos + self->hdrsize <= end && pos >= 0) {
+		uint32_t comp_len, sidedata_comp_len = 0;
 		/* 3rd element of header is length of compressed inline data */
 		comp_len = getbe32(data + pos + 8);
-		incr = v1_hdrsize + comp_len;
+		if (self->hdrsize == v2_hdrsize) {
+			sidedata_comp_len = getbe32(data + pos + 72);
+		}
+		incr = self->hdrsize + comp_len + sidedata_comp_len;
 		if (offsets)
 			offsets[len] = data + pos;
 		len++;
@@ -2586,11 +2627,13 @@
 	return len;
 }
 
-static int index_init(indexObject *self, PyObject *args)
+static int index_init(indexObject *self, PyObject *args, PyObject *kwargs)
 {
-	PyObject *data_obj, *inlined_obj;
+	PyObject *data_obj, *inlined_obj, *revlogv2;
 	Py_ssize_t size;
 
+	static char *kwlist[] = {"data", "inlined", "revlogv2", NULL};
+
 	/* Initialize before argument-checking to avoid index_dealloc() crash.
 	 */
 	self->added = NULL;
@@ -2606,7 +2649,9 @@
 	self->nodelen = 20;
 	self->nullentry = NULL;
 
-	if (!PyArg_ParseTuple(args, "OO", &data_obj, &inlined_obj))
+	revlogv2 = NULL;
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O", kwlist,
+	                                 &data_obj, &inlined_obj, &revlogv2))
 		return -1;
 	if (!PyObject_CheckBuffer(data_obj)) {
 		PyErr_SetString(PyExc_TypeError,
@@ -2618,8 +2663,22 @@
 		return -1;
 	}
 
-	self->nullentry = Py_BuildValue(PY23("iiiiiiis#", "iiiiiiiy#"), 0, 0, 0,
-	                                -1, -1, -1, -1, nullid, self->nodelen);
+	if (revlogv2 && PyObject_IsTrue(revlogv2)) {
+		self->hdrsize = v2_hdrsize;
+	} else {
+		self->hdrsize = v1_hdrsize;
+	}
+
+	if (self->hdrsize == v1_hdrsize) {
+		self->nullentry =
+		    Py_BuildValue(PY23("iiiiiiis#", "iiiiiiiy#"), 0, 0, 0, -1,
+		                  -1, -1, -1, nullid, self->nodelen);
+	} else {
+		self->nullentry = Py_BuildValue(
+		    PY23("iiiiiiis#ii", "iiiiiiiy#ii"), 0, 0, 0, -1, -1, -1,
+		    -1, nullid, self->nodelen, 0, 0);
+	}
+
 	if (!self->nullentry)
 		return -1;
 	PyObject_GC_UnTrack(self->nullentry);
@@ -2641,11 +2700,11 @@
 			goto bail;
 		self->length = len;
 	} else {
-		if (size % v1_hdrsize) {
+		if (size % self->hdrsize) {
 			PyErr_SetString(PyExc_ValueError, "corrupt index file");
 			goto bail;
 		}
-		self->length = size / v1_hdrsize;
+		self->length = size / self->hdrsize;
 	}
 
 	return 0;
@@ -2797,16 +2856,16 @@
 };
 
 /*
- * returns a tuple of the form (index, index, cache) with elements as
+ * returns a tuple of the form (index, cache) with elements as
  * follows:
  *
- * index: an index object that lazily parses RevlogNG records
+ * index: an index object that lazily parses Revlog (v1 or v2) records
  * cache: if data is inlined, a tuple (0, index_file_content), else None
  *        index_file_content could be a string, or a buffer
  *
  * added complications are for backwards compatibility
  */
-PyObject *parse_index2(PyObject *self, PyObject *args)
+PyObject *parse_index2(PyObject *self, PyObject *args, PyObject *kwargs)
 {
 	PyObject *cache = NULL;
 	indexObject *idx;
@@ -2816,7 +2875,7 @@
 	if (idx == NULL)
 		goto bail;
 
-	ret = index_init(idx, args);
+	ret = index_init(idx, args, kwargs);
 	if (ret == -1)
 		goto bail;