dirstate-v2: initial Python parser
The dirstate-v2 file format should be supported even if Rust extensions are
not enabled. This changeset adds parsing code that is not used yet.
Differential Revision: https://phab.mercurial-scm.org/D11518
--- a/mercurial/cext/parsers.c Fri Oct 08 13:15:22 2021 +0200
+++ b/mercurial/cext/parsers.c Sun Oct 03 13:18:03 2021 +0200
@@ -347,6 +347,33 @@
return (PyObject *)dirstate_item_from_v1_data(state, mode, size, mtime);
};
+/* Build a new DirstateItem from dirstate-v2 data.
+ *
+ * `args` is a (flags, size, mtime) tuple. The mode is not passed in: it is
+ * rebuilt from the flag bits when the item has meaningful data, and left at 0
+ * otherwise.
+ *
+ * Returns a new reference, or NULL with an exception set on failure. */
+static PyObject *dirstate_item_from_v2_meth(PyTypeObject *subtype,
+                                            PyObject *args)
+{
+	dirstateItemObject *t =
+	    PyObject_New(dirstateItemObject, &dirstateItemType);
+	if (!t) {
+		return NULL;
+	}
+	if (!PyArg_ParseTuple(args, "bii", &t->flags, &t->size, &t->mtime)) {
+		/* release the freshly allocated item, it would leak otherwise */
+		Py_DECREF(t);
+		return NULL;
+	}
+	t->mode = 0;
+	if (t->flags & dirstate_flag_has_meaningful_data) {
+		/* permission bits: only "executable or not" is recorded */
+		if (t->flags & dirstate_flag_mode_exec_perm) {
+			t->mode = 0755;
+		} else {
+			t->mode = 0644;
+		}
+		/* file type bits: symbolic link or regular file */
+		if (t->flags & dirstate_flag_mode_is_symlink) {
+			t->mode |= S_IFLNK;
+		} else {
+			t->mode |= S_IFREG;
+		}
+	}
+	return (PyObject *)t;
+};
+
/* This means the next status call will have to actually check its content
to make sure it is correct. */
static PyObject *dirstate_item_set_possibly_dirty(dirstateItemObject *self)
@@ -413,6 +440,8 @@
"True if the stored mtime would be ambiguous with the current time"},
{"from_v1_data", (PyCFunction)dirstate_item_from_v1_meth,
METH_VARARGS | METH_CLASS, "build a new DirstateItem object from V1 data"},
+ {"from_v2_data", (PyCFunction)dirstate_item_from_v2_meth,
+ METH_VARARGS | METH_CLASS, "build a new DirstateItem object from V2 data"},
{"set_possibly_dirty", (PyCFunction)dirstate_item_set_possibly_dirty,
METH_NOARGS, "mark a file as \"possibly dirty\""},
{"set_clean", (PyCFunction)dirstate_item_set_clean, METH_VARARGS,
--- a/mercurial/cext/util.h Fri Oct 08 13:15:22 2021 +0200
+++ b/mercurial/cext/util.h Sun Oct 03 13:18:03 2021 +0200
@@ -36,6 +36,8 @@
static const unsigned char dirstate_flag_p2_info = 1 << 2;
static const unsigned char dirstate_flag_has_meaningful_data = 1 << 3;
static const unsigned char dirstate_flag_has_meaningful_mtime = 1 << 4;
+/* the file has the executable permission (mode 0755 rather than 0644) */
+static const unsigned char dirstate_flag_mode_exec_perm = 1 << 5;
+/* the file is a symbolic link (S_IFLNK rather than S_IFREG) */
+static const unsigned char dirstate_flag_mode_is_symlink = 1 << 6;
extern PyTypeObject dirstateItemType;
#define dirstate_tuple_check(op) (Py_TYPE(op) == &dirstateItemType)
--- a/mercurial/dirstatemap.py Fri Oct 08 13:15:22 2021 +0200
+++ b/mercurial/dirstatemap.py Sun Oct 03 13:18:03 2021 +0200
@@ -20,6 +20,7 @@
from .dirstateutils import (
docket as docketmod,
+ v2,
)
parsers = policy.importmod('parsers')
--- a/mercurial/dirstateutils/docket.py Fri Oct 08 13:15:22 2021 +0200
+++ b/mercurial/dirstateutils/docket.py Sun Oct 03 13:18:03 2021 +0200
@@ -10,14 +10,10 @@
import struct
from ..revlogutils import docket as docket_mod
-
+from . import v2
V2_FORMAT_MARKER = b"dirstate-v2\n"
-# Must match the constant of the same name in
-# `rust/hg-core/src/dirstate_tree/on_disk.rs`
-TREE_METADATA_SIZE = 44
-
# * 12 bytes: format marker
# * 32 bytes: node ID of the working directory's first parent
# * 32 bytes: node ID of the working directory's second parent
@@ -29,7 +25,7 @@
# Node IDs are null-padded if shorter than 32 bytes.
# A data file shorter than the specified used size is corrupted (truncated)
HEADER = struct.Struct(
- ">{}s32s32s{}sLB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE)
+ ">{}s32s32s{}sLB".format(len(V2_FORMAT_MARKER), v2.TREE_METADATA_SIZE)
)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/dirstateutils/v2.py Sun Oct 03 13:18:03 2021 +0200
@@ -0,0 +1,118 @@
+# v2.py - Pure-Python implementation of the dirstate-v2 file format
+#
+# Copyright Mercurial Contributors
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import struct
+
+from .. import policy
+
+parsers = policy.importmod('parsers')
+
+
+# Must match the constant of the same name in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`
+TREE_METADATA_SIZE = 44
+NODE_SIZE = 43
+
+
+# Must match the `TreeMetadata` Rust struct in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`. See doc-comments there.
+#
+# * 4 bytes: start offset of root nodes
+# * 4 bytes: number of root nodes
+# * 4 bytes: total number of nodes in the tree that have an entry
+# * 4 bytes: total number of nodes in the tree that have a copy source
+# * 4 bytes: number of bytes in the data file that are not used anymore
+# * 4 bytes: unused
+# * 20 bytes: SHA-1 hash of ignore patterns
+TREE_METADATA = struct.Struct('>LLLLL4s20s')
+
+
+# Must match the `Node` Rust struct in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`. See doc-comments there.
+#
+# * 4 bytes: start offset of full path
+# * 2 bytes: length of the full path
+# * 2 bytes: length within the full path before its "base name"
+# * 4 bytes: start offset of the copy source if any, or zero for no copy source
+# * 2 bytes: length of the copy source if any, or unused
+# * 4 bytes: start offset of child nodes
+# * 4 bytes: number of child nodes
+# * 4 bytes: number of descendant nodes that have an entry
+# * 4 bytes: number of descendant nodes that have a "tracked" state
+# * 1 byte: flags
+# * 4 bytes: expected size
+# * 4 bytes: mtime seconds
+# * 4 bytes: mtime nanoseconds
+NODE = struct.Struct('>LHHLHLLLLBlll')
+
+
+# Fail at import time if the struct formats above drift away from the sizes
+# that are documented to match the Rust implementation.
+assert TREE_METADATA_SIZE == TREE_METADATA.size
+assert NODE_SIZE == NODE.size
+
+
+def parse_dirstate(map, copy_map, data, tree_metadata):
+ """parse a full v2-dirstate from a binary data into dictionnaries:
+
+ - map: a {path: entry} mapping that will be filled
+ - copy_map: a {path: copy-source} mapping that will be filled
+ - data: a binary blob contains v2 nodes data
+ - tree_metadata:: a binary blob of the top level node (from the docket)
+ """
+ (
+ root_nodes_start,
+ root_nodes_len,
+ _nodes_with_entry_count,
+ _nodes_with_copy_source_count,
+ _unreachable_bytes,
+ _unused,
+ _ignore_patterns_hash,
+ ) = TREE_METADATA.unpack(tree_metadata)
+ parse_nodes(map, copy_map, data, root_nodes_start, root_nodes_len)
+
+
+def parse_nodes(map, copy_map, data, start, len):
+ """parse <len> nodes from <data> starting at offset <start>
+
+ This is used by parse_dirstate to recursively fill `map` and `copy_map`.
+ """
+ for i in range(len):
+ node_start = start + NODE_SIZE * i
+ node_bytes = slice_with_len(data, node_start, NODE_SIZE)
+ (
+ path_start,
+ path_len,
+ _basename_start,
+ copy_source_start,
+ copy_source_len,
+ children_start,
+ children_count,
+ _descendants_with_entry_count,
+ _tracked_descendants_count,
+ flags,
+ size,
+ mtime_s,
+ _mtime_ns,
+ ) = NODE.unpack(node_bytes)
+
+ # Parse child nodes of this node recursively
+ parse_nodes(map, copy_map, data, children_start, children_count)
+
+ item = parsers.DirstateItem.from_v2_data(flags, size, mtime_s)
+ if not item.any_tracked:
+ continue
+ path = slice_with_len(data, path_start, path_len)
+ map[path] = item
+ if copy_source_start:
+ copy_map[path] = slice_with_len(
+ data, copy_source_start, copy_source_len
+ )
+
+
+def slice_with_len(data, start, len):
+ return data[start : start + len]
--- a/mercurial/pure/parsers.py Fri Oct 08 13:15:22 2021 +0200
+++ b/mercurial/pure/parsers.py Sun Oct 03 13:18:03 2021 +0200
@@ -7,6 +7,7 @@
from __future__ import absolute_import
+import stat
import struct
import zlib
@@ -43,6 +44,15 @@
# a special value used internally for `time` if the time is ambigeous
AMBIGUOUS_TIME = -1
+# Bits of the `flags` byte inside a node in the file format
+DIRSTATE_V2_WDIR_TRACKED = 1 << 0
+DIRSTATE_V2_P1_TRACKED = 1 << 1
+DIRSTATE_V2_P2_INFO = 1 << 2
+DIRSTATE_V2_HAS_MODE_AND_SIZE = 1 << 3
+DIRSTATE_V2_HAS_MTIME = 1 << 4
+DIRSTATE_V2_MODE_EXEC_PERM = 1 << 5
+DIRSTATE_V2_MODE_IS_SYMLINK = 1 << 6
+
@attr.s(slots=True, init=False)
class DirstateItem(object):
@@ -109,6 +119,30 @@
self._mtime = parentfiledata[2]
@classmethod
+ def from_v2_data(cls, flags, size, mtime):
+ """Build a new DirstateItem object from V2 data"""
+ has_mode_size = bool(flags & DIRSTATE_V2_HAS_MODE_AND_SIZE)
+ mode = None
+ if has_mode_size:
+ assert stat.S_IXUSR == 0o100
+ if flags & DIRSTATE_V2_MODE_EXEC_PERM:
+ mode = 0o755
+ else:
+ mode = 0o644
+ if flags & DIRSTATE_V2_MODE_IS_SYMLINK:
+ mode |= stat.S_IFLNK
+ else:
+ mode |= stat.S_IFREG
+ return cls(
+ wc_tracked=bool(flags & DIRSTATE_V2_WDIR_TRACKED),
+ p1_tracked=bool(flags & DIRSTATE_V2_P1_TRACKED),
+ p2_info=bool(flags & DIRSTATE_V2_P2_INFO),
+ has_meaningful_data=has_mode_size,
+ has_meaningful_mtime=bool(flags & DIRSTATE_V2_HAS_MTIME),
+ parentfiledata=(mode, size, mtime),
+ )
+
+ @classmethod
def from_v1_data(cls, state, mode, size, mtime):
"""Build a new DirstateItem object from V1 data
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs Fri Oct 08 13:15:22 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs Sun Oct 03 13:18:03 2021 +0200
@@ -31,10 +31,8 @@
pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
-/// Must match the constant of the same name in
-/// `mercurial/dirstateutils/docket.py`
+/// Must match constants of the same names in `mercurial/dirstateutils/v2.py`
const TREE_METADATA_SIZE: usize = 44;
-
const NODE_SIZE: usize = 43;
/// Make sure that size-affecting changes are made knowingly