Mercurial > hg
comparison mercurial/dirstateutils/v2.py @ 48222:7e78c72ee3ea
dirstate-v2: Initial Python serializer
This adds code seralizing a `map` and `copy_map` dicts into dirstate-v2
file formate. This is not used yet.
Differential Revision: https://phab.mercurial-scm.org/D11519
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Thu, 22 Jul 2021 17:31:37 +0200 |
parents | a32a96079e2d |
children | 50dca3aa5c3b |
comparison
equal
deleted
inserted
replaced
48221:a32a96079e2d | 48222:7e78c72ee3ea |
---|---|
7 | 7 |
8 from __future__ import absolute_import | 8 from __future__ import absolute_import |
9 | 9 |
10 import struct | 10 import struct |
11 | 11 |
12 from .. import policy | 12 from ..thirdparty import attr |
13 from .. import error, policy | |
13 | 14 |
14 parsers = policy.importmod('parsers') | 15 parsers = policy.importmod('parsers') |
15 | 16 |
16 | 17 |
17 # Must match the constant of the same name in | 18 # Must match the constant of the same name in |
114 ) | 115 ) |
115 | 116 |
116 | 117 |
117 def slice_with_len(data, start, len): | 118 def slice_with_len(data, start, len): |
118 return data[start : start + len] | 119 return data[start : start + len] |
120 | |
121 | |
122 @attr.s | |
123 class Node(object): | |
124 path = attr.ib() | |
125 entry = attr.ib() | |
126 parent = attr.ib(default=None) | |
127 children_count = attr.ib(default=0) | |
128 children_offset = attr.ib(default=0) | |
129 descendants_with_entry = attr.ib(default=0) | |
130 tracked_descendants = attr.ib(default=0) | |
131 | |
132 def pack(self, copy_map, paths_offset): | |
133 path = self.path | |
134 copy = copy_map.get(path) | |
135 entry = self.entry | |
136 | |
137 path_start = paths_offset | |
138 path_len = len(path) | |
139 basename_start = path.rfind(b'/') + 1 # 0 if rfind returns -1 | |
140 if copy is not None: | |
141 copy_source_start = paths_offset + len(path) | |
142 copy_source_len = len(copy) | |
143 else: | |
144 copy_source_start = 0 | |
145 copy_source_len = 0 | |
146 if entry is not None: | |
147 flags, size, mtime_s = entry.v2_data() | |
148 mtime_ns = 0 | |
149 else: | |
150 # There are no mtime-cached directories in the Python implementation | |
151 flags = 0 | |
152 mode = 0 | |
153 size = 0 | |
154 mtime_s = 0 | |
155 mtime_ns = 0 | |
156 return NODE.pack( | |
157 path_start, | |
158 path_len, | |
159 basename_start, | |
160 copy_source_start, | |
161 copy_source_len, | |
162 self.children_offset, | |
163 self.children_count, | |
164 self.descendants_with_entry, | |
165 self.tracked_descendants, | |
166 flags, | |
167 size, | |
168 mtime_s, | |
169 mtime_ns, | |
170 ) | |
171 | |
172 | |
173 def pack_dirstate(map, copy_map, now): | |
174 """ | |
175 Pack `map` and `copy_map` into the dirstate v2 binary format and return | |
176 the bytearray. | |
177 `now` is a timestamp of the current filesystem time used to detect race | |
178 conditions in writing the dirstate to disk, see inline comment. | |
179 | |
180 The on-disk format expects a tree-like structure where the leaves are | |
181 written first (and sorted per-directory), going up levels until the root | |
182 node and writing that one to the docket. See more details on the on-disk | |
183 format in `mercurial/helptext/internals/dirstate-v2`. | |
184 | |
185 Since both `map` and `copy_map` are flat dicts we need to figure out the | |
186 hierarchy. This algorithm does so without having to build the entire tree | |
187 in-memory: it only keeps the minimum number of nodes around to satisfy the | |
188 format. | |
189 | |
190 # Algorithm explanation | |
191 | |
192 This explanation does not talk about the different counters for tracked | |
193 descendents and storing the copies, but that work is pretty simple once this | |
194 algorithm is in place. | |
195 | |
196 ## Building a subtree | |
197 | |
198 First, sort `map`: this makes it so the leaves of the tree are contiguous | |
199 per directory (i.e. a/b/c and a/b/d will be next to each other in the list), | |
200 and enables us to use the ordering of folders to have a "cursor" of the | |
201 current folder we're in without ever going twice in the same branch of the | |
202 tree. The cursor is a node that remembers its parent and any information | |
203 relevant to the format (see the `Node` class), building the relevant part | |
204 of the tree lazily. | |
205 Then, for each file in `map`, move the cursor into the tree to the | |
206 corresponding folder of the file: for example, if the very first file | |
207 is "a/b/c", we start from `Node[""]`, create `Node["a"]` which points to | |
208 its parent `Node[""]`, then create `Node["a/b"]`, which points to its parent | |
209 `Node["a"]`. These nodes are kept around in a stack. | |
210 If the next file in `map` is in the same subtree ("a/b/d" or "a/b/e/f"), we | |
211 add it to the stack and keep looping with the same logic of creating the | |
212 tree nodes as needed. If however the next file in `map` is *not* in the same | |
213 subtree ("a/other", if we're still in the "a/b" folder), then we know that | |
214 the subtree we're in is complete. | |
215 | |
216 ## Writing the subtree | |
217 | |
218 We have the entire subtree in the stack, so we start writing it to disk | |
219 folder by folder. The way we write a folder is to pop the stack into a list | |
220 until the folder changes, revert this list of direct children (to satisfy | |
221 the format requirement that children be sorted). This process repeats until | |
222 we hit the "other" subtree. | |
223 | |
224 An example: | |
225 a | |
226 dir1/b | |
227 dir1/c | |
228 dir2/dir3/d | |
229 dir2/dir3/e | |
230 dir2/f | |
231 | |
232 Would have us: | |
233 - add to the stack until "dir2/dir3/e" | |
234 - realize that "dir2/f" is in a different subtree | |
235 - pop "dir2/dir3/e", "dir2/dir3/d", reverse them so they're sorted and | |
236 pack them since the next entry is "dir2/dir3" | |
237 - go back up to "dir2" | |
238 - add "dir2/f" to the stack | |
239 - realize we're done with the map | |
240 - pop "dir2/f", "dir2/dir3" from the stack, reverse and pack them | |
241 - go up to the root node, do the same to write "a", "dir1" and "dir2" in | |
242 that order | |
243 | |
244 ## Special case for the root node | |
245 | |
246 The root node is not serialized in the format, but its information is | |
247 written to the docket. Again, see more details on the on-disk format in | |
248 `mercurial/helptext/internals/dirstate-v2`. | |
249 """ | |
250 now = int(now) | |
251 data = bytearray() | |
252 root_nodes_start = 0 | |
253 root_nodes_len = 0 | |
254 nodes_with_entry_count = 0 | |
255 nodes_with_copy_source_count = 0 | |
256 # Will always be 0 since this implementation always re-writes everything | |
257 # to disk | |
258 unreachable_bytes = 0 | |
259 unused = b'\x00' * 4 | |
260 # This is an optimization that's only useful for the Rust implementation | |
261 ignore_patterns_hash = b'\x00' * 20 | |
262 | |
263 if len(map) == 0: | |
264 tree_metadata = TREE_METADATA.pack( | |
265 root_nodes_start, | |
266 root_nodes_len, | |
267 nodes_with_entry_count, | |
268 nodes_with_copy_source_count, | |
269 unreachable_bytes, | |
270 unused, | |
271 ignore_patterns_hash, | |
272 ) | |
273 return data, tree_metadata | |
274 | |
275 sorted_map = sorted(map.items(), key=lambda x: x[0]) | |
276 | |
277 # Use a stack to not have to only remember the nodes we currently need | |
278 # instead of building the entire tree in memory | |
279 stack = [] | |
280 current_node = Node(b"", None) | |
281 stack.append(current_node) | |
282 | |
283 for index, (path, entry) in enumerate(sorted_map, 1): | |
284 if entry.need_delay(now): | |
285 # The file was last modified "simultaneously" with the current | |
286 # write to dirstate (i.e. within the same second for file- | |
287 # systems with a granularity of 1 sec). This commonly happens | |
288 # for at least a couple of files on 'update'. | |
289 # The user could change the file without changing its size | |
290 # within the same second. Invalidate the file's mtime in | |
291 # dirstate, forcing future 'status' calls to compare the | |
292 # contents of the file if the size is the same. This prevents | |
293 # mistakenly treating such files as clean. | |
294 entry.set_possibly_dirty() | |
295 nodes_with_entry_count += 1 | |
296 if path in copy_map: | |
297 nodes_with_copy_source_count += 1 | |
298 current_folder = get_folder(path) | |
299 current_node = move_to_correct_node_in_tree( | |
300 current_folder, current_node, stack | |
301 ) | |
302 | |
303 current_node.children_count += 1 | |
304 # Entries from `map` are never `None` | |
305 if entry.tracked: | |
306 current_node.tracked_descendants += 1 | |
307 current_node.descendants_with_entry += 1 | |
308 stack.append(Node(path, entry, current_node)) | |
309 | |
310 should_pack = True | |
311 next_path = None | |
312 if index < len(sorted_map): | |
313 # Determine if the next entry is in the same sub-tree, if so don't | |
314 # pack yet | |
315 next_path = sorted_map[index][0] | |
316 should_pack = not get_folder(next_path).startswith(current_folder) | |
317 if should_pack: | |
318 pack_directory_children(current_node, copy_map, data, stack) | |
319 while stack and current_node.path != b"": | |
320 # Go up the tree and write until we reach the folder of the next | |
321 # entry (if any, otherwise the root) | |
322 parent = current_node.parent | |
323 in_parent_folder_of_next_entry = next_path is not None and ( | |
324 get_folder(next_path).startswith(get_folder(stack[-1].path)) | |
325 ) | |
326 if parent is None or in_parent_folder_of_next_entry: | |
327 break | |
328 pack_directory_children(parent, copy_map, data, stack) | |
329 current_node = parent | |
330 | |
331 # Special case for the root node since we don't write it to disk, only its | |
332 # children to the docket | |
333 current_node = stack.pop() | |
334 assert current_node.path == b"", current_node.path | |
335 assert len(stack) == 0, len(stack) | |
336 | |
337 tree_metadata = TREE_METADATA.pack( | |
338 current_node.children_offset, | |
339 current_node.children_count, | |
340 nodes_with_entry_count, | |
341 nodes_with_copy_source_count, | |
342 unreachable_bytes, | |
343 unused, | |
344 ignore_patterns_hash, | |
345 ) | |
346 | |
347 return data, tree_metadata | |
348 | |
349 | |
350 def get_folder(path): | |
351 """ | |
352 Return the folder of the path that's given, an empty string for root paths. | |
353 """ | |
354 return path.rsplit(b'/', 1)[0] if b'/' in path else b'' | |
355 | |
356 | |
357 def move_to_correct_node_in_tree(target_folder, current_node, stack): | |
358 """ | |
359 Move inside the dirstate node tree to the node corresponding to | |
360 `target_folder`, creating the missing nodes along the way if needed. | |
361 """ | |
362 while target_folder != current_node.path: | |
363 if target_folder.startswith(current_node.path): | |
364 # We need to go down a folder | |
365 prefix = target_folder[len(current_node.path) :].lstrip(b'/') | |
366 subfolder_name = prefix.split(b'/', 1)[0] | |
367 if current_node.path: | |
368 subfolder_path = current_node.path + b'/' + subfolder_name | |
369 else: | |
370 subfolder_path = subfolder_name | |
371 next_node = stack[-1] | |
372 if next_node.path == target_folder: | |
373 # This folder is now a file and only contains removed entries | |
374 # merge with the last node | |
375 current_node = next_node | |
376 else: | |
377 current_node.children_count += 1 | |
378 current_node = Node(subfolder_path, None, current_node) | |
379 stack.append(current_node) | |
380 else: | |
381 # We need to go up a folder | |
382 current_node = current_node.parent | |
383 return current_node | |
384 | |
385 | |
386 def pack_directory_children(node, copy_map, data, stack): | |
387 """ | |
388 Write the binary representation of the direct sorted children of `node` to | |
389 `data` | |
390 """ | |
391 direct_children = [] | |
392 | |
393 while stack[-1].path != b"" and get_folder(stack[-1].path) == node.path: | |
394 direct_children.append(stack.pop()) | |
395 if not direct_children: | |
396 raise error.ProgrammingError(b"no direct children for %r" % node.path) | |
397 | |
398 # Reverse the stack to get the correct sorted order | |
399 direct_children.reverse() | |
400 packed_children = bytearray() | |
401 # Write the paths to `data`. Pack child nodes but don't write them yet | |
402 for child in direct_children: | |
403 packed = child.pack(copy_map=copy_map, paths_offset=len(data)) | |
404 packed_children.extend(packed) | |
405 data.extend(child.path) | |
406 data.extend(copy_map.get(child.path, b"")) | |
407 node.tracked_descendants += child.tracked_descendants | |
408 node.descendants_with_entry += child.descendants_with_entry | |
409 # Write the fixed-size child nodes all together | |
410 node.children_offset = len(data) | |
411 data.extend(packed_children) |