manifest: tigher manifest parsing and flag use
authorJoerg Sonnenberger <joerg@bec.de>
Mon, 06 Jul 2020 03:43:32 +0200
changeset 45118 d0ef8c1dddd4
parent 45117 b1e51ef4e536
child 45119 19748c73c208
manifest: tigher manifest parsing and flag use In the manifest line, flags are put directly after the hash, so the parser has been guessing the presence of flags based on the length of the hash. Replace this assumption by an enumeration of the valid flags and removing them from the hash first as they are distinct input values. Consistently handle the expected 256bit length of the SHA1-replacement in the pure Python parser. Check that setting flags will use one of the blessed values. Extend write logic in the C version to handle 256bit hashes as well. Verify that hashes always have exactly the expected length. Since 1070df141718 we should no longer depend on the old extra-byte hack. Differential Revision: https://phab.mercurial-scm.org/D8679
mercurial/cext/manifest.c
mercurial/manifest.py
mercurial/merge.py
tests/test-manifest.py
--- a/mercurial/cext/manifest.c	Wed Jul 08 00:15:15 2020 +0200
+++ b/mercurial/cext/manifest.c	Mon Jul 06 03:43:32 2020 +0200
@@ -49,23 +49,35 @@
 }
 
 /* get the node value of a single line */
-static PyObject *nodeof(line *l)
+static PyObject *nodeof(line *l, char *flag)
 {
 	char *s = l->start;
 	Py_ssize_t llen = pathlen(l);
 	Py_ssize_t hlen = l->len - llen - 2;
-	Py_ssize_t hlen_raw = 20;
+	Py_ssize_t hlen_raw;
 	PyObject *hash;
 	if (llen + 1 + 40 + 1 > l->len) { /* path '\0' hash '\n' */
 		PyErr_SetString(PyExc_ValueError, "manifest line too short");
 		return NULL;
 	}
+	/* Detect flags after the hash first. */
+	switch (s[llen + hlen]) {
+	case 'l':
+	case 't':
+	case 'x':
+		*flag = s[llen + hlen];
+		--hlen;
+		break;
+	default:
+		*flag = '\0';
+		break;
+	}
+
 	switch (hlen) {
 	case 40: /* sha1 */
-	case 41: /* sha1 with cruft for a merge */
+		hlen_raw = 20;
 		break;
 	case 64: /* new hash */
-	case 65: /* new hash with cruft for a merge */
 		hlen_raw = 32;
 		break;
 	default:
@@ -89,9 +101,8 @@
 /* get the node hash and flags of a line as a tuple */
 static PyObject *hashflags(line *l)
 {
-	char *s = l->start;
-	Py_ssize_t plen = pathlen(l);
-	PyObject *hash = nodeof(l);
+	char flag;
+	PyObject *hash = nodeof(l, &flag);
 	ssize_t hlen;
 	Py_ssize_t hplen, flen;
 	PyObject *flags;
@@ -99,14 +110,7 @@
 
 	if (!hash)
 		return NULL;
-	/* hash is either 20 or 21 bytes for an old hash, so we use a
-	   ternary here to get the "real" hexlified sha length. */
-	hlen = PyBytes_GET_SIZE(hash) < 22 ? 40 : 64;
-	/* 1 for null byte, 1 for newline */
-	hplen = plen + hlen + 2;
-	flen = l->len - hplen;
-
-	flags = PyBytes_FromStringAndSize(s + hplen - 1, flen);
+	flags = PyBytes_FromStringAndSize(&flag, flag ? 1 : 0);
 	if (!flags) {
 		Py_DECREF(hash);
 		return NULL;
@@ -291,6 +295,7 @@
 {
 	Py_ssize_t pl;
 	line *l;
+	char flag;
 	Py_ssize_t consumed;
 	PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL;
 	l = lmiter_nextline((lmIter *)o);
@@ -299,13 +304,11 @@
 	}
 	pl = pathlen(l);
 	path = PyBytes_FromStringAndSize(l->start, pl);
-	hash = nodeof(l);
+	hash = nodeof(l, &flag);
 	if (!path || !hash) {
 		goto done;
 	}
-	consumed = pl + 41;
-	flags = PyBytes_FromStringAndSize(l->start + consumed,
-					   l->len - consumed - 1);
+	flags = PyBytes_FromStringAndSize(&flag, flag ? 1 : 0);
 	if (!flags) {
 		goto done;
 	}
@@ -568,19 +571,13 @@
 	pyhash = PyTuple_GetItem(value, 0);
 	if (!PyBytes_Check(pyhash)) {
 		PyErr_Format(PyExc_TypeError,
-			     "node must be a 20-byte string");
+			     "node must be a 20 or 32 bytes string");
 		return -1;
 	}
 	hlen = PyBytes_Size(pyhash);
-	/* Some parts of the codebase try and set 21 or 22
-	 * byte "hash" values in order to perturb things for
-	 * status. We have to preserve at least the 21st
-	 * byte. Sigh. If there's a 22nd byte, we drop it on
-	 * the floor, which works fine.
-	 */
-	if (hlen != 20 && hlen != 21 && hlen != 22) {
+	if (hlen != 20 && hlen != 32) {
 		PyErr_Format(PyExc_TypeError,
-			     "node must be a 20-byte string");
+			     "node must be a 20 or 32 bytes string");
 		return -1;
 	}
 	hash = PyBytes_AsString(pyhash);
@@ -588,28 +585,39 @@
 	pyflags = PyTuple_GetItem(value, 1);
 	if (!PyBytes_Check(pyflags) || PyBytes_Size(pyflags) > 1) {
 		PyErr_Format(PyExc_TypeError,
-			     "flags must a 0 or 1 byte string");
+			     "flags must a 0 or 1 bytes string");
 		return -1;
 	}
 	if (PyBytes_AsStringAndSize(pyflags, &flags, &flen) == -1) {
 		return -1;
 	}
+	if (flen == 1) {
+		switch (*flags) {
+		case 'l':
+		case 't':
+		case 'x':
+			break;
+		default:
+			PyErr_Format(PyExc_TypeError, "invalid manifest flag");
+			return -1;
+		}
+	}
 	/* one null byte and one newline */
-	dlen = plen + 41 + flen + 1;
+	dlen = plen + hlen * 2 + 1 + flen + 1;
 	dest = malloc(dlen);
 	if (!dest) {
 		PyErr_NoMemory();
 		return -1;
 	}
 	memcpy(dest, path, plen + 1);
-	for (i = 0; i < 20; i++) {
+	for (i = 0; i < hlen; i++) {
 		/* Cast to unsigned, so it will not get sign-extended when promoted
 		 * to int (as is done when passing to a variadic function)
 		 */
 		sprintf(dest + plen + 1 + (i * 2), "%02x", (unsigned char)hash[i]);
 	}
-	memcpy(dest + plen + 41, flags, flen);
-	dest[plen + 41 + flen] = '\n';
+	memcpy(dest + plen + 2 * hlen + 1, flags, flen);
+	dest[plen + 2 * hlen + 1 + flen] = '\n';
 	new.start = dest;
 	new.len = dlen;
 	new.hash_suffix = '\0';
--- a/mercurial/manifest.py	Wed Jul 08 00:15:15 2020 +0200
+++ b/mercurial/manifest.py	Mon Jul 06 03:43:32 2020 +0200
@@ -121,8 +121,20 @@
             self.pos += 1
             return data
         zeropos = data.find(b'\x00', pos)
-        hashval = unhexlify(data, self.lm.extrainfo[self.pos], zeropos + 1, 40)
-        flags = self.lm._getflags(data, self.pos, zeropos)
+        nlpos = data.find(b'\n', pos)
+        if zeropos == -1 or nlpos == -1 or nlpos < zeropos:
+            raise error.StorageError(b'Invalid manifest line')
+        flags = data[nlpos - 1 : nlpos]
+        if flags in _manifestflags:
+            hlen = nlpos - zeropos - 2
+        else:
+            hlen = nlpos - zeropos - 1
+            flags = b''
+        if hlen not in (40, 64):
+            raise error.StorageError(b'Invalid manifest line')
+        hashval = unhexlify(
+            data, self.lm.extrainfo[self.pos], zeropos + 1, hlen
+        )
         self.pos += 1
         return (data[pos:zeropos], hashval, flags)
 
@@ -140,6 +152,9 @@
     return (a > b) - (a < b)
 
 
+_manifestflags = {b'', b'l', b't', b'x'}
+
+
 class _lazymanifest(object):
     """A pure python manifest backed by a byte string.  It is supplimented with
     internal lists as it is modified, until it is compacted back to a pure byte
@@ -251,15 +266,6 @@
     def __contains__(self, key):
         return self.bsearch(key) != -1
 
-    def _getflags(self, data, needle, pos):
-        start = pos + 41
-        end = data.find(b"\n", start)
-        if end == -1:
-            end = len(data) - 1
-        if start == end:
-            return b''
-        return self.data[start:end]
-
     def __getitem__(self, key):
         if not isinstance(key, bytes):
             raise TypeError(b"getitem: manifest keys must be a bytes.")
@@ -273,13 +279,17 @@
         nlpos = data.find(b'\n', zeropos)
         assert 0 <= needle <= len(self.positions)
         assert len(self.extrainfo) == len(self.positions)
+        if zeropos == -1 or nlpos == -1 or nlpos < zeropos:
+            raise error.StorageError(b'Invalid manifest line')
         hlen = nlpos - zeropos - 1
-        # Hashes sometimes have an extra byte tucked on the end, so
-        # detect that.
-        if hlen % 2:
+        flags = data[nlpos - 1 : nlpos]
+        if flags in _manifestflags:
             hlen -= 1
+        else:
+            flags = b''
+        if hlen not in (40, 64):
+            raise error.StorageError(b'Invalid manifest line')
         hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, hlen)
-        flags = self._getflags(data, needle, zeropos)
         return (hashval, flags)
 
     def __delitem__(self, key):
@@ -408,9 +418,7 @@
 
     def _pack(self, d):
         n = d[1]
-        if len(n) == 21 or len(n) == 33:
-            n = n[:-1]
-        assert len(n) == 20 or len(n) == 32
+        assert len(n) in (20, 32)
         return d[0] + b'\x00' + hex(n) + d[2] + b'\n'
 
     def text(self):
@@ -609,6 +617,8 @@
         return self._lm.diff(m2._lm, clean)
 
     def setflag(self, key, flag):
+        if flag not in _manifestflags:
+            raise TypeError(b"Invalid manifest flag set.")
         self._lm[key] = self[key], flag
 
     def get(self, key, default=None):
@@ -1049,11 +1059,10 @@
             self._dirs[dir].__setitem__(subpath, n)
         else:
             # manifest nodes are either 20 bytes or 32 bytes,
-            # depending on the hash in use. An extra byte is
-            # occasionally used by hg, but won't ever be
-            # persisted. Trim to 21 or 33 bytes as appropriate.
-            trim = 21 if len(n) < 25 else 33
-            self._files[f] = n[:trim]  # to match manifestdict's behavior
+            # depending on the hash in use. Assert this as historically
+            # sometimes extra bytes were added.
+            assert len(n) in (20, 32)
+            self._files[f] = n
         self._dirty = True
 
     def _load(self):
@@ -1066,6 +1075,8 @@
 
     def setflag(self, f, flags):
         """Set the flags (symlink, executable) for path f."""
+        if flags not in _manifestflags:
+            raise TypeError(b"Invalid manifest flag set.")
         self._load()
         dir, subpath = _splittopdir(f)
         if dir:
--- a/mercurial/merge.py	Wed Jul 08 00:15:15 2020 +0200
+++ b/mercurial/merge.py	Mon Jul 06 03:43:32 2020 +0200
@@ -725,8 +725,7 @@
                             b'prompt changed/deleted',
                         )
                 elif n1 == addednodeid:
-                    # This extra 'a' is added by working copy manifest to mark
-                    # the file as locally added. We should forget it instead of
+                    # This file was locally added. We should forget it instead of
                     # deleting it.
                     actions[f] = (
                         mergestatemod.ACTION_FORGET,
--- a/tests/test-manifest.py	Wed Jul 08 00:15:15 2020 +0200
+++ b/tests/test-manifest.py	Mon Jul 06 03:43:32 2020 +0200
@@ -156,39 +156,6 @@
         with self.assertRaises(KeyError):
             m[b'foo']
 
-    def testSetGetNodeSuffix(self):
-        clean = self.parsemanifest(A_SHORT_MANIFEST)
-        m = self.parsemanifest(A_SHORT_MANIFEST)
-        h = m[b'foo']
-        f = m.flags(b'foo')
-        want = h + b'a'
-        # Merge code wants to set 21-byte fake hashes at times
-        m[b'foo'] = want
-        self.assertEqual(want, m[b'foo'])
-        self.assertEqual(
-            [(b'bar/baz/qux.py', BIN_HASH_2), (b'foo', BIN_HASH_1 + b'a')],
-            list(m.items()),
-        )
-        # Sometimes it even tries a 22-byte fake hash, but we can
-        # return 21 and it'll work out
-        m[b'foo'] = want + b'+'
-        self.assertEqual(want, m[b'foo'])
-        # make sure the suffix survives a copy
-        match = matchmod.match(util.localpath(b'/repo'), b'', [b're:foo'])
-        m2 = m._matches(match)
-        self.assertEqual(want, m2[b'foo'])
-        self.assertEqual(1, len(m2))
-        m2 = m.copy()
-        self.assertEqual(want, m2[b'foo'])
-        # suffix with iteration
-        self.assertEqual(
-            [(b'bar/baz/qux.py', BIN_HASH_2), (b'foo', want)], list(m.items())
-        )
-
-        # shows up in diff
-        self.assertEqual({b'foo': ((want, f), (h, b''))}, m.diff(clean))
-        self.assertEqual({b'foo': ((h, b''), (want, f))}, clean.diff(m))
-
     def testMatchException(self):
         m = self.parsemanifest(A_SHORT_MANIFEST)
         match = matchmod.match(util.localpath(b'/repo'), b'', [b're:.*'])