comparison mercurial/dirstateutils/v2.py @ 48222:7e78c72ee3ea

dirstate-v2: Initial Python serializer This adds code seralizing a `map` and `copy_map` dicts into dirstate-v2 file formate. This is not used yet. Differential Revision: https://phab.mercurial-scm.org/D11519
author Raphaël Gomès <rgomes@octobus.net>
date Thu, 22 Jul 2021 17:31:37 +0200
parents a32a96079e2d
children 50dca3aa5c3b
comparison
equal deleted inserted replaced
48221:a32a96079e2d 48222:7e78c72ee3ea
7 7
8 from __future__ import absolute_import 8 from __future__ import absolute_import
9 9
10 import struct 10 import struct
11 11
12 from .. import policy 12 from ..thirdparty import attr
13 from .. import error, policy
13 14
14 parsers = policy.importmod('parsers') 15 parsers = policy.importmod('parsers')
15 16
16 17
17 # Must match the constant of the same name in 18 # Must match the constant of the same name in
114 ) 115 )
115 116
116 117
117 def slice_with_len(data, start, len): 118 def slice_with_len(data, start, len):
118 return data[start : start + len] 119 return data[start : start + len]
120
121
122 @attr.s
123 class Node(object):
124 path = attr.ib()
125 entry = attr.ib()
126 parent = attr.ib(default=None)
127 children_count = attr.ib(default=0)
128 children_offset = attr.ib(default=0)
129 descendants_with_entry = attr.ib(default=0)
130 tracked_descendants = attr.ib(default=0)
131
132 def pack(self, copy_map, paths_offset):
133 path = self.path
134 copy = copy_map.get(path)
135 entry = self.entry
136
137 path_start = paths_offset
138 path_len = len(path)
139 basename_start = path.rfind(b'/') + 1 # 0 if rfind returns -1
140 if copy is not None:
141 copy_source_start = paths_offset + len(path)
142 copy_source_len = len(copy)
143 else:
144 copy_source_start = 0
145 copy_source_len = 0
146 if entry is not None:
147 flags, size, mtime_s = entry.v2_data()
148 mtime_ns = 0
149 else:
150 # There are no mtime-cached directories in the Python implementation
151 flags = 0
152 mode = 0
153 size = 0
154 mtime_s = 0
155 mtime_ns = 0
156 return NODE.pack(
157 path_start,
158 path_len,
159 basename_start,
160 copy_source_start,
161 copy_source_len,
162 self.children_offset,
163 self.children_count,
164 self.descendants_with_entry,
165 self.tracked_descendants,
166 flags,
167 size,
168 mtime_s,
169 mtime_ns,
170 )
171
172
173 def pack_dirstate(map, copy_map, now):
174 """
175 Pack `map` and `copy_map` into the dirstate v2 binary format and return
176 the bytearray.
177 `now` is a timestamp of the current filesystem time used to detect race
178 conditions in writing the dirstate to disk, see inline comment.
179
180 The on-disk format expects a tree-like structure where the leaves are
181 written first (and sorted per-directory), going up levels until the root
182 node and writing that one to the docket. See more details on the on-disk
183 format in `mercurial/helptext/internals/dirstate-v2`.
184
185 Since both `map` and `copy_map` are flat dicts we need to figure out the
186 hierarchy. This algorithm does so without having to build the entire tree
187 in-memory: it only keeps the minimum number of nodes around to satisfy the
188 format.
189
190 # Algorithm explanation
191
192 This explanation does not talk about the different counters for tracked
193 descendents and storing the copies, but that work is pretty simple once this
194 algorithm is in place.
195
196 ## Building a subtree
197
198 First, sort `map`: this makes it so the leaves of the tree are contiguous
199 per directory (i.e. a/b/c and a/b/d will be next to each other in the list),
200 and enables us to use the ordering of folders to have a "cursor" of the
201 current folder we're in without ever going twice in the same branch of the
202 tree. The cursor is a node that remembers its parent and any information
203 relevant to the format (see the `Node` class), building the relevant part
204 of the tree lazily.
205 Then, for each file in `map`, move the cursor into the tree to the
206 corresponding folder of the file: for example, if the very first file
207 is "a/b/c", we start from `Node[""]`, create `Node["a"]` which points to
208 its parent `Node[""]`, then create `Node["a/b"]`, which points to its parent
209 `Node["a"]`. These nodes are kept around in a stack.
210 If the next file in `map` is in the same subtree ("a/b/d" or "a/b/e/f"), we
211 add it to the stack and keep looping with the same logic of creating the
212 tree nodes as needed. If however the next file in `map` is *not* in the same
213 subtree ("a/other", if we're still in the "a/b" folder), then we know that
214 the subtree we're in is complete.
215
216 ## Writing the subtree
217
218 We have the entire subtree in the stack, so we start writing it to disk
219 folder by folder. The way we write a folder is to pop the stack into a list
220 until the folder changes, revert this list of direct children (to satisfy
221 the format requirement that children be sorted). This process repeats until
222 we hit the "other" subtree.
223
224 An example:
225 a
226 dir1/b
227 dir1/c
228 dir2/dir3/d
229 dir2/dir3/e
230 dir2/f
231
232 Would have us:
233 - add to the stack until "dir2/dir3/e"
234 - realize that "dir2/f" is in a different subtree
235 - pop "dir2/dir3/e", "dir2/dir3/d", reverse them so they're sorted and
236 pack them since the next entry is "dir2/dir3"
237 - go back up to "dir2"
238 - add "dir2/f" to the stack
239 - realize we're done with the map
240 - pop "dir2/f", "dir2/dir3" from the stack, reverse and pack them
241 - go up to the root node, do the same to write "a", "dir1" and "dir2" in
242 that order
243
244 ## Special case for the root node
245
246 The root node is not serialized in the format, but its information is
247 written to the docket. Again, see more details on the on-disk format in
248 `mercurial/helptext/internals/dirstate-v2`.
249 """
250 now = int(now)
251 data = bytearray()
252 root_nodes_start = 0
253 root_nodes_len = 0
254 nodes_with_entry_count = 0
255 nodes_with_copy_source_count = 0
256 # Will always be 0 since this implementation always re-writes everything
257 # to disk
258 unreachable_bytes = 0
259 unused = b'\x00' * 4
260 # This is an optimization that's only useful for the Rust implementation
261 ignore_patterns_hash = b'\x00' * 20
262
263 if len(map) == 0:
264 tree_metadata = TREE_METADATA.pack(
265 root_nodes_start,
266 root_nodes_len,
267 nodes_with_entry_count,
268 nodes_with_copy_source_count,
269 unreachable_bytes,
270 unused,
271 ignore_patterns_hash,
272 )
273 return data, tree_metadata
274
275 sorted_map = sorted(map.items(), key=lambda x: x[0])
276
277 # Use a stack to not have to only remember the nodes we currently need
278 # instead of building the entire tree in memory
279 stack = []
280 current_node = Node(b"", None)
281 stack.append(current_node)
282
283 for index, (path, entry) in enumerate(sorted_map, 1):
284 if entry.need_delay(now):
285 # The file was last modified "simultaneously" with the current
286 # write to dirstate (i.e. within the same second for file-
287 # systems with a granularity of 1 sec). This commonly happens
288 # for at least a couple of files on 'update'.
289 # The user could change the file without changing its size
290 # within the same second. Invalidate the file's mtime in
291 # dirstate, forcing future 'status' calls to compare the
292 # contents of the file if the size is the same. This prevents
293 # mistakenly treating such files as clean.
294 entry.set_possibly_dirty()
295 nodes_with_entry_count += 1
296 if path in copy_map:
297 nodes_with_copy_source_count += 1
298 current_folder = get_folder(path)
299 current_node = move_to_correct_node_in_tree(
300 current_folder, current_node, stack
301 )
302
303 current_node.children_count += 1
304 # Entries from `map` are never `None`
305 if entry.tracked:
306 current_node.tracked_descendants += 1
307 current_node.descendants_with_entry += 1
308 stack.append(Node(path, entry, current_node))
309
310 should_pack = True
311 next_path = None
312 if index < len(sorted_map):
313 # Determine if the next entry is in the same sub-tree, if so don't
314 # pack yet
315 next_path = sorted_map[index][0]
316 should_pack = not get_folder(next_path).startswith(current_folder)
317 if should_pack:
318 pack_directory_children(current_node, copy_map, data, stack)
319 while stack and current_node.path != b"":
320 # Go up the tree and write until we reach the folder of the next
321 # entry (if any, otherwise the root)
322 parent = current_node.parent
323 in_parent_folder_of_next_entry = next_path is not None and (
324 get_folder(next_path).startswith(get_folder(stack[-1].path))
325 )
326 if parent is None or in_parent_folder_of_next_entry:
327 break
328 pack_directory_children(parent, copy_map, data, stack)
329 current_node = parent
330
331 # Special case for the root node since we don't write it to disk, only its
332 # children to the docket
333 current_node = stack.pop()
334 assert current_node.path == b"", current_node.path
335 assert len(stack) == 0, len(stack)
336
337 tree_metadata = TREE_METADATA.pack(
338 current_node.children_offset,
339 current_node.children_count,
340 nodes_with_entry_count,
341 nodes_with_copy_source_count,
342 unreachable_bytes,
343 unused,
344 ignore_patterns_hash,
345 )
346
347 return data, tree_metadata
348
349
350 def get_folder(path):
351 """
352 Return the folder of the path that's given, an empty string for root paths.
353 """
354 return path.rsplit(b'/', 1)[0] if b'/' in path else b''
355
356
357 def move_to_correct_node_in_tree(target_folder, current_node, stack):
358 """
359 Move inside the dirstate node tree to the node corresponding to
360 `target_folder`, creating the missing nodes along the way if needed.
361 """
362 while target_folder != current_node.path:
363 if target_folder.startswith(current_node.path):
364 # We need to go down a folder
365 prefix = target_folder[len(current_node.path) :].lstrip(b'/')
366 subfolder_name = prefix.split(b'/', 1)[0]
367 if current_node.path:
368 subfolder_path = current_node.path + b'/' + subfolder_name
369 else:
370 subfolder_path = subfolder_name
371 next_node = stack[-1]
372 if next_node.path == target_folder:
373 # This folder is now a file and only contains removed entries
374 # merge with the last node
375 current_node = next_node
376 else:
377 current_node.children_count += 1
378 current_node = Node(subfolder_path, None, current_node)
379 stack.append(current_node)
380 else:
381 # We need to go up a folder
382 current_node = current_node.parent
383 return current_node
384
385
386 def pack_directory_children(node, copy_map, data, stack):
387 """
388 Write the binary representation of the direct sorted children of `node` to
389 `data`
390 """
391 direct_children = []
392
393 while stack[-1].path != b"" and get_folder(stack[-1].path) == node.path:
394 direct_children.append(stack.pop())
395 if not direct_children:
396 raise error.ProgrammingError(b"no direct children for %r" % node.path)
397
398 # Reverse the stack to get the correct sorted order
399 direct_children.reverse()
400 packed_children = bytearray()
401 # Write the paths to `data`. Pack child nodes but don't write them yet
402 for child in direct_children:
403 packed = child.pack(copy_map=copy_map, paths_offset=len(data))
404 packed_children.extend(packed)
405 data.extend(child.path)
406 data.extend(copy_map.get(child.path, b""))
407 node.tracked_descendants += child.tracked_descendants
408 node.descendants_with_entry += child.descendants_with_entry
409 # Write the fixed-size child nodes all together
410 node.children_offset = len(data)
411 data.extend(packed_children)