--- a/mercurial/cext/parsers.c Sun Oct 03 13:18:03 2021 +0200
+++ b/mercurial/cext/parsers.c Thu Jul 22 17:31:37 2021 +0200
@@ -235,6 +235,23 @@
}
}
+static PyObject *dirstate_item_v2_data(dirstateItemObject *self)
+{
+ unsigned char flags = self->flags;
+ int mode = dirstate_item_c_v1_mode(self);
+ if ((mode & S_IXUSR) != 0) {
+ flags |= dirstate_flag_mode_exec_perm;
+ } else {
+ flags &= ~dirstate_flag_mode_exec_perm;
+ }
+ if (S_ISLNK(mode)) {
+ flags |= dirstate_flag_mode_is_symlink;
+ } else {
+ flags &= ~dirstate_flag_mode_is_symlink;
+ }
+ return Py_BuildValue("Bii", flags, self->size, self->mtime);
+};
+
static PyObject *dirstate_item_v1_state(dirstateItemObject *self)
{
char state = dirstate_item_c_v1_state(self);
@@ -428,6 +445,8 @@
Py_RETURN_NONE;
}
static PyMethodDef dirstate_item_methods[] = {
+ {"v2_data", (PyCFunction)dirstate_item_v2_data, METH_NOARGS,
+ "return data suitable for v2 serialization"},
{"v1_state", (PyCFunction)dirstate_item_v1_state, METH_NOARGS,
"return a \"state\" suitable for v1 serialization"},
{"v1_mode", (PyCFunction)dirstate_item_v1_mode, METH_NOARGS,
--- a/mercurial/dirstateutils/v2.py Sun Oct 03 13:18:03 2021 +0200
+++ b/mercurial/dirstateutils/v2.py Thu Jul 22 17:31:37 2021 +0200
@@ -9,7 +9,8 @@
import struct
-from .. import policy
+from ..thirdparty import attr
+from .. import error, policy
parsers = policy.importmod('parsers')
@@ -116,3 +117,295 @@
def slice_with_len(data, start, len):
return data[start : start + len]
+
+
+@attr.s
+class Node(object):
+ path = attr.ib()
+ entry = attr.ib()
+ parent = attr.ib(default=None)
+ children_count = attr.ib(default=0)
+ children_offset = attr.ib(default=0)
+ descendants_with_entry = attr.ib(default=0)
+ tracked_descendants = attr.ib(default=0)
+
+ def pack(self, copy_map, paths_offset):
+ path = self.path
+ copy = copy_map.get(path)
+ entry = self.entry
+
+ path_start = paths_offset
+ path_len = len(path)
+ basename_start = path.rfind(b'/') + 1 # 0 if rfind returns -1
+ if copy is not None:
+ copy_source_start = paths_offset + len(path)
+ copy_source_len = len(copy)
+ else:
+ copy_source_start = 0
+ copy_source_len = 0
+ if entry is not None:
+ flags, size, mtime_s = entry.v2_data()
+ mtime_ns = 0
+ else:
+ # There are no mtime-cached directories in the Python implementation
+ flags = 0
+ mode = 0
+ size = 0
+ mtime_s = 0
+ mtime_ns = 0
+ return NODE.pack(
+ path_start,
+ path_len,
+ basename_start,
+ copy_source_start,
+ copy_source_len,
+ self.children_offset,
+ self.children_count,
+ self.descendants_with_entry,
+ self.tracked_descendants,
+ flags,
+ size,
+ mtime_s,
+ mtime_ns,
+ )
+
+
+def pack_dirstate(map, copy_map, now):
+ """
+ Pack `map` and `copy_map` into the dirstate v2 binary format and return
+ the bytearray.
+ `now` is a timestamp of the current filesystem time used to detect race
+ conditions in writing the dirstate to disk, see inline comment.
+
+ The on-disk format expects a tree-like structure where the leaves are
+ written first (and sorted per-directory), going up levels until the root
+ node and writing that one to the docket. See more details on the on-disk
+ format in `mercurial/helptext/internals/dirstate-v2`.
+
+ Since both `map` and `copy_map` are flat dicts we need to figure out the
+ hierarchy. This algorithm does so without having to build the entire tree
+ in-memory: it only keeps the minimum number of nodes around to satisfy the
+ format.
+
+ # Algorithm explanation
+
+ This explanation does not talk about the different counters for tracked
+ descendents and storing the copies, but that work is pretty simple once this
+ algorithm is in place.
+
+ ## Building a subtree
+
+ First, sort `map`: this makes it so the leaves of the tree are contiguous
+ per directory (i.e. a/b/c and a/b/d will be next to each other in the list),
+ and enables us to use the ordering of folders to have a "cursor" of the
+ current folder we're in without ever going twice in the same branch of the
+ tree. The cursor is a node that remembers its parent and any information
+ relevant to the format (see the `Node` class), building the relevant part
+ of the tree lazily.
+ Then, for each file in `map`, move the cursor into the tree to the
+ corresponding folder of the file: for example, if the very first file
+ is "a/b/c", we start from `Node[""]`, create `Node["a"]` which points to
+ its parent `Node[""]`, then create `Node["a/b"]`, which points to its parent
+ `Node["a"]`. These nodes are kept around in a stack.
+ If the next file in `map` is in the same subtree ("a/b/d" or "a/b/e/f"), we
+ add it to the stack and keep looping with the same logic of creating the
+ tree nodes as needed. If however the next file in `map` is *not* in the same
+ subtree ("a/other", if we're still in the "a/b" folder), then we know that
+ the subtree we're in is complete.
+
+ ## Writing the subtree
+
+ We have the entire subtree in the stack, so we start writing it to disk
+ folder by folder. The way we write a folder is to pop the stack into a list
+ until the folder changes, revert this list of direct children (to satisfy
+ the format requirement that children be sorted). This process repeats until
+ we hit the "other" subtree.
+
+ An example:
+ a
+ dir1/b
+ dir1/c
+ dir2/dir3/d
+ dir2/dir3/e
+ dir2/f
+
+ Would have us:
+ - add to the stack until "dir2/dir3/e"
+ - realize that "dir2/f" is in a different subtree
+ - pop "dir2/dir3/e", "dir2/dir3/d", reverse them so they're sorted and
+ pack them since the next entry is "dir2/dir3"
+ - go back up to "dir2"
+ - add "dir2/f" to the stack
+ - realize we're done with the map
+ - pop "dir2/f", "dir2/dir3" from the stack, reverse and pack them
+ - go up to the root node, do the same to write "a", "dir1" and "dir2" in
+ that order
+
+ ## Special case for the root node
+
+ The root node is not serialized in the format, but its information is
+ written to the docket. Again, see more details on the on-disk format in
+ `mercurial/helptext/internals/dirstate-v2`.
+ """
+ now = int(now)
+ data = bytearray()
+ root_nodes_start = 0
+ root_nodes_len = 0
+ nodes_with_entry_count = 0
+ nodes_with_copy_source_count = 0
+ # Will always be 0 since this implementation always re-writes everything
+ # to disk
+ unreachable_bytes = 0
+ unused = b'\x00' * 4
+ # This is an optimization that's only useful for the Rust implementation
+ ignore_patterns_hash = b'\x00' * 20
+
+ if len(map) == 0:
+ tree_metadata = TREE_METADATA.pack(
+ root_nodes_start,
+ root_nodes_len,
+ nodes_with_entry_count,
+ nodes_with_copy_source_count,
+ unreachable_bytes,
+ unused,
+ ignore_patterns_hash,
+ )
+ return data, tree_metadata
+
+ sorted_map = sorted(map.items(), key=lambda x: x[0])
+
+ # Use a stack to not have to only remember the nodes we currently need
+ # instead of building the entire tree in memory
+ stack = []
+ current_node = Node(b"", None)
+ stack.append(current_node)
+
+ for index, (path, entry) in enumerate(sorted_map, 1):
+ if entry.need_delay(now):
+ # The file was last modified "simultaneously" with the current
+ # write to dirstate (i.e. within the same second for file-
+ # systems with a granularity of 1 sec). This commonly happens
+ # for at least a couple of files on 'update'.
+ # The user could change the file without changing its size
+ # within the same second. Invalidate the file's mtime in
+ # dirstate, forcing future 'status' calls to compare the
+ # contents of the file if the size is the same. This prevents
+ # mistakenly treating such files as clean.
+ entry.set_possibly_dirty()
+ nodes_with_entry_count += 1
+ if path in copy_map:
+ nodes_with_copy_source_count += 1
+ current_folder = get_folder(path)
+ current_node = move_to_correct_node_in_tree(
+ current_folder, current_node, stack
+ )
+
+ current_node.children_count += 1
+ # Entries from `map` are never `None`
+ if entry.tracked:
+ current_node.tracked_descendants += 1
+ current_node.descendants_with_entry += 1
+ stack.append(Node(path, entry, current_node))
+
+ should_pack = True
+ next_path = None
+ if index < len(sorted_map):
+ # Determine if the next entry is in the same sub-tree, if so don't
+ # pack yet
+ next_path = sorted_map[index][0]
+ should_pack = not get_folder(next_path).startswith(current_folder)
+ if should_pack:
+ pack_directory_children(current_node, copy_map, data, stack)
+ while stack and current_node.path != b"":
+ # Go up the tree and write until we reach the folder of the next
+ # entry (if any, otherwise the root)
+ parent = current_node.parent
+ in_parent_folder_of_next_entry = next_path is not None and (
+ get_folder(next_path).startswith(get_folder(stack[-1].path))
+ )
+ if parent is None or in_parent_folder_of_next_entry:
+ break
+ pack_directory_children(parent, copy_map, data, stack)
+ current_node = parent
+
+ # Special case for the root node since we don't write it to disk, only its
+ # children to the docket
+ current_node = stack.pop()
+ assert current_node.path == b"", current_node.path
+ assert len(stack) == 0, len(stack)
+
+ tree_metadata = TREE_METADATA.pack(
+ current_node.children_offset,
+ current_node.children_count,
+ nodes_with_entry_count,
+ nodes_with_copy_source_count,
+ unreachable_bytes,
+ unused,
+ ignore_patterns_hash,
+ )
+
+ return data, tree_metadata
+
+
+def get_folder(path):
+ """
+ Return the folder of the path that's given, an empty string for root paths.
+ """
+ return path.rsplit(b'/', 1)[0] if b'/' in path else b''
+
+
+def move_to_correct_node_in_tree(target_folder, current_node, stack):
+ """
+ Move inside the dirstate node tree to the node corresponding to
+ `target_folder`, creating the missing nodes along the way if needed.
+ """
+ while target_folder != current_node.path:
+ if target_folder.startswith(current_node.path):
+ # We need to go down a folder
+ prefix = target_folder[len(current_node.path) :].lstrip(b'/')
+ subfolder_name = prefix.split(b'/', 1)[0]
+ if current_node.path:
+ subfolder_path = current_node.path + b'/' + subfolder_name
+ else:
+ subfolder_path = subfolder_name
+ next_node = stack[-1]
+ if next_node.path == target_folder:
+ # This folder is now a file and only contains removed entries
+ # merge with the last node
+ current_node = next_node
+ else:
+ current_node.children_count += 1
+ current_node = Node(subfolder_path, None, current_node)
+ stack.append(current_node)
+ else:
+ # We need to go up a folder
+ current_node = current_node.parent
+ return current_node
+
+
+def pack_directory_children(node, copy_map, data, stack):
+ """
+ Write the binary representation of the direct sorted children of `node` to
+ `data`
+ """
+ direct_children = []
+
+ while stack[-1].path != b"" and get_folder(stack[-1].path) == node.path:
+ direct_children.append(stack.pop())
+ if not direct_children:
+ raise error.ProgrammingError(b"no direct children for %r" % node.path)
+
+ # Reverse the stack to get the correct sorted order
+ direct_children.reverse()
+ packed_children = bytearray()
+ # Write the paths to `data`. Pack child nodes but don't write them yet
+ for child in direct_children:
+ packed = child.pack(copy_map=copy_map, paths_offset=len(data))
+ packed_children.extend(packed)
+ data.extend(child.path)
+ data.extend(copy_map.get(child.path, b""))
+ node.tracked_descendants += child.tracked_descendants
+ node.descendants_with_entry += child.descendants_with_entry
+ # Write the fixed-size child nodes all together
+ node.children_offset = len(data)
+ data.extend(packed_children)