revlog: split revlog v1 and revlog v2 handling
author pacien <pacien.trangirard@pacien.net>
Wed, 26 Jan 2022 13:18:48 +0100
changeset 48722 92b5a2c4d637
parent 48721 a9364de9be29
child 48723 27fe84a8dd60
revlog: split revlog v1 and revlog v2 handling Explicitly splitting the packing and unpacking of their fields makes it easier to extend the existing C implementation to handle the new changelog format, whose fields and offsets are not simply a superset of the revlog. Differential Revision: https://phab.mercurial-scm.org/D12137
mercurial/cext/revlog.c
--- a/mercurial/cext/revlog.c	Wed Jan 26 13:08:36 2022 +0100
+++ b/mercurial/cext/revlog.c	Wed Jan 26 13:18:48 2022 +0100
@@ -136,19 +136,29 @@
 static const long format_v1 = 1; /* Internal only, could be any number */
 static const long format_v2 = 2; /* Internal only, could be any number */
 
-static const long entry_offset_high = 0;
-static const long entry_offset_offset_flags = 4;
-static const long entry_offset_comp_len = 8;
-static const long entry_offset_uncomp_len = 12;
-static const long entry_offset_base_rev = 16;
-static const long entry_offset_link_rev = 20;
-static const long entry_offset_parent_1 = 24;
-static const long entry_offset_parent_2 = 28;
-static const long entry_offset_node_id = 32;
-static const long entry_offset_sidedata_offset = 64;
-static const long entry_offset_sidedata_comp_len = 72;
-static const long entry_offset_all_comp_mode = 76;
-static const long entry_offset_padding_start = 77;
+static const long entry_v1_offset_high = 0;
+static const long entry_v1_offset_offset_flags = 4;
+static const long entry_v1_offset_comp_len = 8;
+static const long entry_v1_offset_uncomp_len = 12;
+static const long entry_v1_offset_base_rev = 16;
+static const long entry_v1_offset_link_rev = 20;
+static const long entry_v1_offset_parent_1 = 24;
+static const long entry_v1_offset_parent_2 = 28;
+static const long entry_v1_offset_node_id = 32;
+
+static const long entry_v2_offset_high = 0;
+static const long entry_v2_offset_offset_flags = 4;
+static const long entry_v2_offset_comp_len = 8;
+static const long entry_v2_offset_uncomp_len = 12;
+static const long entry_v2_offset_base_rev = 16;
+static const long entry_v2_offset_link_rev = 20;
+static const long entry_v2_offset_parent_1 = 24;
+static const long entry_v2_offset_parent_2 = 28;
+static const long entry_v2_offset_node_id = 32;
+static const long entry_v2_offset_sidedata_offset = 64;
+static const long entry_v2_offset_sidedata_comp_len = 72;
+static const long entry_v2_offset_all_comp_mode = 76;
+static const long entry_v2_offset_padding_start = 77;
 
 static const char comp_mode_inline = 2;
 static const char rank_unknown = -1;
@@ -220,8 +230,16 @@
 {
 	const char *data = index_deref(self, rev);
 
-	ps[0] = getbe32(data + entry_offset_parent_1);
-	ps[1] = getbe32(data + entry_offset_parent_2);
+	if (self->format_version == format_v1) {
+		ps[0] = getbe32(data + entry_v1_offset_parent_1);
+		ps[1] = getbe32(data + entry_v1_offset_parent_2);
+	} else if (self->format_version == format_v2) {
+		ps[0] = getbe32(data + entry_v2_offset_parent_1);
+		ps[1] = getbe32(data + entry_v2_offset_parent_2);
+	} else {
+		raise_revlog_error();
+		return -1;
+	}
 
 	/* If index file is corrupted, ps[] may point to invalid revisions. So
 	 * there is a risk of buffer overflow to trust them unconditionally. */
@@ -268,14 +286,32 @@
 		return 0;
 
 	data = index_deref(self, rev);
-	offset = getbe32(data + entry_offset_offset_flags);
-	if (rev == 0) {
-		/* mask out version number for the first entry */
-		offset &= 0xFFFF;
+
+	if (self->format_version == format_v1) {
+		offset = getbe32(data + entry_v1_offset_offset_flags);
+		if (rev == 0) {
+			/* mask out version number for the first entry */
+			offset &= 0xFFFF;
+		} else {
+			uint32_t offset_high =
+			    getbe32(data + entry_v1_offset_high);
+			offset |= ((uint64_t)offset_high) << 32;
+		}
+	} else if (self->format_version == format_v2) {
+		offset = getbe32(data + entry_v2_offset_offset_flags);
+		if (rev == 0) {
+			/* mask out version number for the first entry */
+			offset &= 0xFFFF;
+		} else {
+			uint32_t offset_high =
+			    getbe32(data + entry_v2_offset_high);
+			offset |= ((uint64_t)offset_high) << 32;
+		}
 	} else {
-		uint32_t offset_high = getbe32(data + entry_offset_high);
-		offset |= ((uint64_t)offset_high) << 32;
+		raise_revlog_error();
+		return -1;
 	}
+
 	return (int64_t)(offset >> 16);
 }
 
@@ -289,7 +325,14 @@
 
 	data = index_deref(self, rev);
 
-	tmp = (int)getbe32(data + entry_offset_comp_len);
+	if (self->format_version == format_v1) {
+		tmp = (int)getbe32(data + entry_v1_offset_comp_len);
+	} else if (self->format_version == format_v2) {
+		tmp = (int)getbe32(data + entry_v2_offset_comp_len);
+	} else {
+		raise_revlog_error();
+		return -1;
+	}
 	if (tmp < 0) {
 		PyErr_Format(PyExc_OverflowError,
 		             "revlog entry size out of bound (%d)", tmp);
@@ -334,38 +377,66 @@
 	if (data == NULL)
 		return NULL;
 
-	offset_flags = getbe32(data + entry_offset_offset_flags);
-	/*
-	 * The first entry on-disk needs the version number masked out,
-	 * but this doesn't apply if entries are added to an empty index.
-	 */
-	if (self->length && pos == 0)
-		offset_flags &= 0xFFFF;
-	else {
-		uint32_t offset_high = getbe32(data + entry_offset_high);
-		offset_flags |= ((uint64_t)offset_high) << 32;
-	}
-
-	comp_len = getbe32(data + entry_offset_comp_len);
-	uncomp_len = getbe32(data + entry_offset_uncomp_len);
-	base_rev = getbe32(data + entry_offset_base_rev);
-	link_rev = getbe32(data + entry_offset_link_rev);
-	parent_1 = getbe32(data + entry_offset_parent_1);
-	parent_2 = getbe32(data + entry_offset_parent_2);
-	c_node_id = data + entry_offset_node_id;
-
 	if (self->format_version == format_v1) {
+		offset_flags = getbe32(data + entry_v1_offset_offset_flags);
+		/*
+		 * The first entry on-disk needs the version number masked out,
+		 * but this doesn't apply if entries are added to an empty
+		 * index.
+		 */
+		if (self->length && pos == 0)
+			offset_flags &= 0xFFFF;
+		else {
+			uint32_t offset_high =
+			    getbe32(data + entry_v1_offset_high);
+			offset_flags |= ((uint64_t)offset_high) << 32;
+		}
+
+		comp_len = getbe32(data + entry_v1_offset_comp_len);
+		uncomp_len = getbe32(data + entry_v1_offset_uncomp_len);
+		base_rev = getbe32(data + entry_v1_offset_base_rev);
+		link_rev = getbe32(data + entry_v1_offset_link_rev);
+		parent_1 = getbe32(data + entry_v1_offset_parent_1);
+		parent_2 = getbe32(data + entry_v1_offset_parent_2);
+		c_node_id = data + entry_v1_offset_node_id;
+
 		sidedata_offset = 0;
 		sidedata_comp_len = 0;
 		data_comp_mode = comp_mode_inline;
 		sidedata_comp_mode = comp_mode_inline;
-	} else {
-		sidedata_offset = getbe64(data + entry_offset_sidedata_offset);
+	} else if (self->format_version == format_v2) {
+		offset_flags = getbe32(data + entry_v2_offset_offset_flags);
+		/*
+		 * The first entry on-disk needs the version number masked out,
+		 * but this doesn't apply if entries are added to an empty
+		 * index.
+		 */
+		if (self->length && pos == 0)
+			offset_flags &= 0xFFFF;
+		else {
+			uint32_t offset_high =
+			    getbe32(data + entry_v2_offset_high);
+			offset_flags |= ((uint64_t)offset_high) << 32;
+		}
+
+		comp_len = getbe32(data + entry_v2_offset_comp_len);
+		uncomp_len = getbe32(data + entry_v2_offset_uncomp_len);
+		base_rev = getbe32(data + entry_v2_offset_base_rev);
+		link_rev = getbe32(data + entry_v2_offset_link_rev);
+		parent_1 = getbe32(data + entry_v2_offset_parent_1);
+		parent_2 = getbe32(data + entry_v2_offset_parent_2);
+		c_node_id = data + entry_v2_offset_node_id;
+
+		sidedata_offset =
+		    getbe64(data + entry_v2_offset_sidedata_offset);
 		sidedata_comp_len =
-		    getbe32(data + entry_offset_sidedata_comp_len);
-		data_comp_mode = data[entry_offset_all_comp_mode] & 3;
+		    getbe32(data + entry_v2_offset_sidedata_comp_len);
+		data_comp_mode = data[entry_v2_offset_all_comp_mode] & 3;
 		sidedata_comp_mode =
-		    ((data[entry_offset_all_comp_mode] >> 2) & 3);
+		    ((data[entry_v2_offset_all_comp_mode] >> 2) & 3);
+	} else {
+		raise_revlog_error();
+		return NULL;
 	}
 
 	return Py_BuildValue(tuple_format, offset_flags, comp_len, uncomp_len,
@@ -429,6 +500,7 @@
 {
 	Py_ssize_t length = index_length(self);
 	const char *data;
+	const char *node_id;
 
 	if (pos == nullrev)
 		return nullid;
@@ -437,7 +509,17 @@
 		return NULL;
 
 	data = index_deref(self, pos);
-	return data ? data + entry_offset_node_id : NULL;
+
+	if (self->format_version == format_v1) {
+		node_id = data + entry_v1_offset_node_id;
+	} else if (self->format_version == format_v2) {
+		node_id = data + entry_v2_offset_node_id;
+	} else {
+		raise_revlog_error();
+		return NULL;
+	}
+
+	return data ? node_id : NULL;
 }
 
 /*
@@ -520,28 +602,50 @@
 	}
 	rev = self->length + self->new_length;
 	data = self->added + self->entry_size * self->new_length++;
-	putbe32(offset_flags >> 32, data + entry_offset_high);
-	putbe32(offset_flags & 0xffffffffU, data + entry_offset_offset_flags);
-	putbe32(comp_len, data + entry_offset_comp_len);
-	putbe32(uncomp_len, data + entry_offset_uncomp_len);
-	putbe32(base_rev, data + entry_offset_base_rev);
-	putbe32(link_rev, data + entry_offset_link_rev);
-	putbe32(parent_1, data + entry_offset_parent_1);
-	putbe32(parent_2, data + entry_offset_parent_2);
-	memcpy(data + entry_offset_node_id, c_node_id, c_node_id_len);
-	/* Padding since SHA-1 is only 20 bytes for now */
-	memset(data + entry_offset_node_id + c_node_id_len, 0,
-	       entry_offset_node_id - c_node_id_len);
-	if (self->format_version == format_v2) {
-		putbe64(sidedata_offset, data + entry_offset_sidedata_offset);
+
+	if (self->format_version == format_v1) {
+		putbe32(offset_flags >> 32, data + entry_v1_offset_high);
+		putbe32(offset_flags & 0xffffffffU,
+		        data + entry_v1_offset_offset_flags);
+		putbe32(comp_len, data + entry_v1_offset_comp_len);
+		putbe32(uncomp_len, data + entry_v1_offset_uncomp_len);
+		putbe32(base_rev, data + entry_v1_offset_base_rev);
+		putbe32(link_rev, data + entry_v1_offset_link_rev);
+		putbe32(parent_1, data + entry_v1_offset_parent_1);
+		putbe32(parent_2, data + entry_v1_offset_parent_2);
+		memcpy(data + entry_v1_offset_node_id, c_node_id,
+		       c_node_id_len);
+		/* Padding since SHA-1 is only 20 bytes for now */
+		memset(data + entry_v1_offset_node_id + c_node_id_len, 0,
+		       entry_v1_offset_node_id - c_node_id_len);
+	} else if (self->format_version == format_v2) {
+		putbe32(offset_flags >> 32, data + entry_v2_offset_high);
+		putbe32(offset_flags & 0xffffffffU,
+		        data + entry_v2_offset_offset_flags);
+		putbe32(comp_len, data + entry_v2_offset_comp_len);
+		putbe32(uncomp_len, data + entry_v2_offset_uncomp_len);
+		putbe32(base_rev, data + entry_v2_offset_base_rev);
+		putbe32(link_rev, data + entry_v2_offset_link_rev);
+		putbe32(parent_1, data + entry_v2_offset_parent_1);
+		putbe32(parent_2, data + entry_v2_offset_parent_2);
+		memcpy(data + entry_v2_offset_node_id, c_node_id,
+		       c_node_id_len);
+		/* Padding since SHA-1 is only 20 bytes for now */
+		memset(data + entry_v2_offset_node_id + c_node_id_len, 0,
+		       entry_v2_offset_node_id - c_node_id_len);
+		putbe64(sidedata_offset,
+		        data + entry_v2_offset_sidedata_offset);
 		putbe32(sidedata_comp_len,
-		        data + entry_offset_sidedata_comp_len);
+		        data + entry_v2_offset_sidedata_comp_len);
 		comp_field = data_comp_mode & 3;
 		comp_field = comp_field | (sidedata_comp_mode & 3) << 2;
-		data[entry_offset_all_comp_mode] = comp_field;
+		data[entry_v2_offset_all_comp_mode] = comp_field;
 		/* Padding for 96 bytes alignment */
-		memset(data + entry_offset_padding_start, 0,
-		       self->entry_size - entry_offset_padding_start);
+		memset(data + entry_v2_offset_padding_start, 0,
+		       self->entry_size - entry_v2_offset_padding_start);
+	} else {
+		raise_revlog_error();
+		return NULL;
 	}
 
 	if (self->ntinitialized)
@@ -596,11 +700,11 @@
 	/* Find the newly added node, offset from the "already on-disk" length
 	 */
 	data = self->added + self->entry_size * (rev - self->length);
-	putbe64(offset_flags, data + entry_offset_high);
-	putbe64(sidedata_offset, data + entry_offset_sidedata_offset);
-	putbe32(sidedata_comp_len, data + entry_offset_sidedata_comp_len);
-	data[entry_offset_all_comp_mode] =
-	    (data[entry_offset_all_comp_mode] & ~(3 << 2)) |
+	putbe64(offset_flags, data + entry_v2_offset_high);
+	putbe64(sidedata_offset, data + entry_v2_offset_sidedata_offset);
+	putbe32(sidedata_comp_len, data + entry_v2_offset_sidedata_comp_len);
+	data[entry_v2_offset_all_comp_mode] =
+	    (data[entry_v2_offset_all_comp_mode] & ~(3 << 2)) |
 	    ((comp_mode & 3) << 2);
 
 	Py_RETURN_NONE;
@@ -1144,7 +1248,15 @@
 	data = index_deref(self, rev);
 	if (data == NULL)
 		return -2;
-	result = getbe32(data + entry_offset_base_rev);
+
+	if (self->format_version == format_v1) {
+		result = getbe32(data + entry_v1_offset_base_rev);
+	} else if (self->format_version == format_v2) {
+		result = getbe32(data + entry_v2_offset_base_rev);
+	} else {
+		raise_revlog_error();
+		return -1;
+	}
 
 	if (result > rev) {
 		PyErr_Format(
@@ -2756,10 +2868,18 @@
 	while (pos + self->entry_size <= end && pos >= 0) {
 		uint32_t comp_len, sidedata_comp_len = 0;
 		/* 3rd element of header is length of compressed inline data */
-		comp_len = getbe32(data + pos + entry_offset_comp_len);
-		if (self->entry_size == v2_entry_size) {
+		if (self->format_version == format_v1) {
+			comp_len =
+			    getbe32(data + pos + entry_v1_offset_comp_len);
+			sidedata_comp_len = 0;
+		} else if (self->format_version == format_v2) {
+			comp_len =
+			    getbe32(data + pos + entry_v2_offset_comp_len);
 			sidedata_comp_len = getbe32(
-			    data + pos + entry_offset_sidedata_comp_len);
+			    data + pos + entry_v2_offset_sidedata_comp_len);
+		} else {
+			raise_revlog_error();
+			return -1;
 		}
 		incr = self->entry_size + comp_len + sidedata_comp_len;
 		if (offsets)