Mercurial > hg
changeset 48222:7e78c72ee3ea
dirstate-v2: Initial Python serializer
This adds code seralizing a `map` and `copy_map` dicts into dirstate-v2
file formate. This is not used yet.
Differential Revision: https://phab.mercurial-scm.org/D11519
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Thu, 22 Jul 2021 17:31:37 +0200 |
parents | a32a96079e2d |
children | b4f83c9e7905 |
files | mercurial/cext/parsers.c mercurial/dirstateutils/v2.py mercurial/pure/parsers.py |
diffstat | 3 files changed, 332 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/cext/parsers.c Sun Oct 03 13:18:03 2021 +0200 +++ b/mercurial/cext/parsers.c Thu Jul 22 17:31:37 2021 +0200 @@ -235,6 +235,23 @@ } } +static PyObject *dirstate_item_v2_data(dirstateItemObject *self) +{ + unsigned char flags = self->flags; + int mode = dirstate_item_c_v1_mode(self); + if ((mode & S_IXUSR) != 0) { + flags |= dirstate_flag_mode_exec_perm; + } else { + flags &= ~dirstate_flag_mode_exec_perm; + } + if (S_ISLNK(mode)) { + flags |= dirstate_flag_mode_is_symlink; + } else { + flags &= ~dirstate_flag_mode_is_symlink; + } + return Py_BuildValue("Bii", flags, self->size, self->mtime); +}; + static PyObject *dirstate_item_v1_state(dirstateItemObject *self) { char state = dirstate_item_c_v1_state(self); @@ -428,6 +445,8 @@ Py_RETURN_NONE; } static PyMethodDef dirstate_item_methods[] = { + {"v2_data", (PyCFunction)dirstate_item_v2_data, METH_NOARGS, + "return data suitable for v2 serialization"}, {"v1_state", (PyCFunction)dirstate_item_v1_state, METH_NOARGS, "return a \"state\" suitable for v1 serialization"}, {"v1_mode", (PyCFunction)dirstate_item_v1_mode, METH_NOARGS,
--- a/mercurial/dirstateutils/v2.py Sun Oct 03 13:18:03 2021 +0200 +++ b/mercurial/dirstateutils/v2.py Thu Jul 22 17:31:37 2021 +0200 @@ -9,7 +9,8 @@ import struct -from .. import policy +from ..thirdparty import attr +from .. import error, policy parsers = policy.importmod('parsers') @@ -116,3 +117,295 @@ def slice_with_len(data, start, len): return data[start : start + len] + + +@attr.s +class Node(object): + path = attr.ib() + entry = attr.ib() + parent = attr.ib(default=None) + children_count = attr.ib(default=0) + children_offset = attr.ib(default=0) + descendants_with_entry = attr.ib(default=0) + tracked_descendants = attr.ib(default=0) + + def pack(self, copy_map, paths_offset): + path = self.path + copy = copy_map.get(path) + entry = self.entry + + path_start = paths_offset + path_len = len(path) + basename_start = path.rfind(b'/') + 1 # 0 if rfind returns -1 + if copy is not None: + copy_source_start = paths_offset + len(path) + copy_source_len = len(copy) + else: + copy_source_start = 0 + copy_source_len = 0 + if entry is not None: + flags, size, mtime_s = entry.v2_data() + mtime_ns = 0 + else: + # There are no mtime-cached directories in the Python implementation + flags = 0 + mode = 0 + size = 0 + mtime_s = 0 + mtime_ns = 0 + return NODE.pack( + path_start, + path_len, + basename_start, + copy_source_start, + copy_source_len, + self.children_offset, + self.children_count, + self.descendants_with_entry, + self.tracked_descendants, + flags, + size, + mtime_s, + mtime_ns, + ) + + +def pack_dirstate(map, copy_map, now): + """ + Pack `map` and `copy_map` into the dirstate v2 binary format and return + the bytearray. + `now` is a timestamp of the current filesystem time used to detect race + conditions in writing the dirstate to disk, see inline comment. + + The on-disk format expects a tree-like structure where the leaves are + written first (and sorted per-directory), going up levels until the root + node and writing that one to the docket. See more details on the on-disk + format in `mercurial/helptext/internals/dirstate-v2`. + + Since both `map` and `copy_map` are flat dicts we need to figure out the + hierarchy. This algorithm does so without having to build the entire tree + in-memory: it only keeps the minimum number of nodes around to satisfy the + format. + + # Algorithm explanation + + This explanation does not talk about the different counters for tracked + descendents and storing the copies, but that work is pretty simple once this + algorithm is in place. + + ## Building a subtree + + First, sort `map`: this makes it so the leaves of the tree are contiguous + per directory (i.e. a/b/c and a/b/d will be next to each other in the list), + and enables us to use the ordering of folders to have a "cursor" of the + current folder we're in without ever going twice in the same branch of the + tree. The cursor is a node that remembers its parent and any information + relevant to the format (see the `Node` class), building the relevant part + of the tree lazily. + Then, for each file in `map`, move the cursor into the tree to the + corresponding folder of the file: for example, if the very first file + is "a/b/c", we start from `Node[""]`, create `Node["a"]` which points to + its parent `Node[""]`, then create `Node["a/b"]`, which points to its parent + `Node["a"]`. These nodes are kept around in a stack. + If the next file in `map` is in the same subtree ("a/b/d" or "a/b/e/f"), we + add it to the stack and keep looping with the same logic of creating the + tree nodes as needed. If however the next file in `map` is *not* in the same + subtree ("a/other", if we're still in the "a/b" folder), then we know that + the subtree we're in is complete. + + ## Writing the subtree + + We have the entire subtree in the stack, so we start writing it to disk + folder by folder. The way we write a folder is to pop the stack into a list + until the folder changes, revert this list of direct children (to satisfy + the format requirement that children be sorted). This process repeats until + we hit the "other" subtree. + + An example: + a + dir1/b + dir1/c + dir2/dir3/d + dir2/dir3/e + dir2/f + + Would have us: + - add to the stack until "dir2/dir3/e" + - realize that "dir2/f" is in a different subtree + - pop "dir2/dir3/e", "dir2/dir3/d", reverse them so they're sorted and + pack them since the next entry is "dir2/dir3" + - go back up to "dir2" + - add "dir2/f" to the stack + - realize we're done with the map + - pop "dir2/f", "dir2/dir3" from the stack, reverse and pack them + - go up to the root node, do the same to write "a", "dir1" and "dir2" in + that order + + ## Special case for the root node + + The root node is not serialized in the format, but its information is + written to the docket. Again, see more details on the on-disk format in + `mercurial/helptext/internals/dirstate-v2`. + """ + now = int(now) + data = bytearray() + root_nodes_start = 0 + root_nodes_len = 0 + nodes_with_entry_count = 0 + nodes_with_copy_source_count = 0 + # Will always be 0 since this implementation always re-writes everything + # to disk + unreachable_bytes = 0 + unused = b'\x00' * 4 + # This is an optimization that's only useful for the Rust implementation + ignore_patterns_hash = b'\x00' * 20 + + if len(map) == 0: + tree_metadata = TREE_METADATA.pack( + root_nodes_start, + root_nodes_len, + nodes_with_entry_count, + nodes_with_copy_source_count, + unreachable_bytes, + unused, + ignore_patterns_hash, + ) + return data, tree_metadata + + sorted_map = sorted(map.items(), key=lambda x: x[0]) + + # Use a stack to not have to only remember the nodes we currently need + # instead of building the entire tree in memory + stack = [] + current_node = Node(b"", None) + stack.append(current_node) + + for index, (path, entry) in enumerate(sorted_map, 1): + if entry.need_delay(now): + # The file was last modified "simultaneously" with the current + # write to dirstate (i.e. within the same second for file- + # systems with a granularity of 1 sec). This commonly happens + # for at least a couple of files on 'update'. + # The user could change the file without changing its size + # within the same second. Invalidate the file's mtime in + # dirstate, forcing future 'status' calls to compare the + # contents of the file if the size is the same. This prevents + # mistakenly treating such files as clean. + entry.set_possibly_dirty() + nodes_with_entry_count += 1 + if path in copy_map: + nodes_with_copy_source_count += 1 + current_folder = get_folder(path) + current_node = move_to_correct_node_in_tree( + current_folder, current_node, stack + ) + + current_node.children_count += 1 + # Entries from `map` are never `None` + if entry.tracked: + current_node.tracked_descendants += 1 + current_node.descendants_with_entry += 1 + stack.append(Node(path, entry, current_node)) + + should_pack = True + next_path = None + if index < len(sorted_map): + # Determine if the next entry is in the same sub-tree, if so don't + # pack yet + next_path = sorted_map[index][0] + should_pack = not get_folder(next_path).startswith(current_folder) + if should_pack: + pack_directory_children(current_node, copy_map, data, stack) + while stack and current_node.path != b"": + # Go up the tree and write until we reach the folder of the next + # entry (if any, otherwise the root) + parent = current_node.parent + in_parent_folder_of_next_entry = next_path is not None and ( + get_folder(next_path).startswith(get_folder(stack[-1].path)) + ) + if parent is None or in_parent_folder_of_next_entry: + break + pack_directory_children(parent, copy_map, data, stack) + current_node = parent + + # Special case for the root node since we don't write it to disk, only its + # children to the docket + current_node = stack.pop() + assert current_node.path == b"", current_node.path + assert len(stack) == 0, len(stack) + + tree_metadata = TREE_METADATA.pack( + current_node.children_offset, + current_node.children_count, + nodes_with_entry_count, + nodes_with_copy_source_count, + unreachable_bytes, + unused, + ignore_patterns_hash, + ) + + return data, tree_metadata + + +def get_folder(path): + """ + Return the folder of the path that's given, an empty string for root paths. + """ + return path.rsplit(b'/', 1)[0] if b'/' in path else b'' + + +def move_to_correct_node_in_tree(target_folder, current_node, stack): + """ + Move inside the dirstate node tree to the node corresponding to + `target_folder`, creating the missing nodes along the way if needed. + """ + while target_folder != current_node.path: + if target_folder.startswith(current_node.path): + # We need to go down a folder + prefix = target_folder[len(current_node.path) :].lstrip(b'/') + subfolder_name = prefix.split(b'/', 1)[0] + if current_node.path: + subfolder_path = current_node.path + b'/' + subfolder_name + else: + subfolder_path = subfolder_name + next_node = stack[-1] + if next_node.path == target_folder: + # This folder is now a file and only contains removed entries + # merge with the last node + current_node = next_node + else: + current_node.children_count += 1 + current_node = Node(subfolder_path, None, current_node) + stack.append(current_node) + else: + # We need to go up a folder + current_node = current_node.parent + return current_node + + +def pack_directory_children(node, copy_map, data, stack): + """ + Write the binary representation of the direct sorted children of `node` to + `data` + """ + direct_children = [] + + while stack[-1].path != b"" and get_folder(stack[-1].path) == node.path: + direct_children.append(stack.pop()) + if not direct_children: + raise error.ProgrammingError(b"no direct children for %r" % node.path) + + # Reverse the stack to get the correct sorted order + direct_children.reverse() + packed_children = bytearray() + # Write the paths to `data`. Pack child nodes but don't write them yet + for child in direct_children: + packed = child.pack(copy_map=copy_map, paths_offset=len(data)) + packed_children.extend(packed) + data.extend(child.path) + data.extend(copy_map.get(child.path, b"")) + node.tracked_descendants += child.tracked_descendants + node.descendants_with_entry += child.descendants_with_entry + # Write the fixed-size child nodes all together + node.children_offset = len(data) + data.extend(packed_children)
--- a/mercurial/pure/parsers.py Sun Oct 03 13:18:03 2021 +0200 +++ b/mercurial/pure/parsers.py Thu Jul 22 17:31:37 2021 +0200 @@ -313,6 +313,25 @@ """True if the file has been removed""" return not self._wc_tracked and (self._p1_tracked or self._p2_info) + def v2_data(self): + """Returns (flags, mode, size, mtime) for v2 serialization""" + flags = 0 + if self._wc_tracked: + flags |= DIRSTATE_V2_WDIR_TRACKED + if self._p1_tracked: + flags |= DIRSTATE_V2_P1_TRACKED + if self._p2_info: + flags |= DIRSTATE_V2_P2_INFO + if self.mode is not None and self.size is not None: + flags |= DIRSTATE_V2_HAS_MODE_AND_SIZE + if self.mode & stat.S_IXUSR: + flags |= DIRSTATE_V2_MODE_EXEC_PERM + if stat.S_ISLNK(self.mode): + flags |= DIRSTATE_V2_MODE_IS_SYMLINK + if self.mtime is not None: + flags |= DIRSTATE_V2_HAS_MTIME + return (flags, self.size or 0, self.mtime or 0) + def v1_state(self): """return a "state" suitable for v1 serialization""" if not self.any_tracked: