Mercurial > hg
changeset 52060:8b7123c8947b
update: add a Rust fast-path when updating from null (and clean)
This case is easy to detect and we have all we need to generate a valid
working copy and dirstate entirely in Rust, which speeds things up
considerably:
On my machine updating a repo of ~300k files goes from 10.00s down to 4.2s,
all while consuming 50% less system time, with all caches hot.
Something to note is that further improvements will probably happen
with the upcoming `InnerRevlog` series that does smarter
mmap handling, especially for filelogs.
Here are benchmark numbers on a machine with only 4 cores (and no SMT enabled)
```
### data-env-vars.name = heptapod-public-2024-03-25-ds2-pnm
# benchmark.name = hg.command.update
# bin-env-vars.hg.py-re2-module = default
# bin-env-vars.hg.changeset.node = <this change>
# benchmark.variants.atomic-update = no
# benchmark.variants.scenario = null-to-tip
# benchmark.variants.worker = default
default: 5.328762 ~~~~~
rust: 1.308654 (-75.44%, -4.02)
### data-env-vars.name = mercurial-devel-2024-03-22-ds2-pnm
# benchmark.name = hg.command.update
# bin-env-vars.hg.py-re2-module = default
# bin-env-vars.hg.changeset.node = <this change>
# benchmark.variants.atomic-update = no
# benchmark.variants.scenario = null-to-tip
# benchmark.variants.worker = default
default: 1.693271 ~~~~~
rust: 1.151053 (-32.02%, -0.54)
### data-env-vars.name = mozilla-unified-2024-03-22-ds2-pnm
# benchmark.name = hg.command.update
# bin-env-vars.hg.py-re2-module = default
# bin-env-vars.hg.changeset.node = <this change>
# benchmark.variants.atomic-update = no
# benchmark.variants.scenario = null-to-tip
# benchmark.variants.worker = default
default: 38.901613 ~~~~~
rust: 11.637880 (-70.08%, -27.26)
### data-env-vars.name = netbsd-xsrc-public-2024-09-19-ds2-pnm
# benchmark.name = hg.command.update
# bin-env-vars.hg.py-re2-module = default
# bin-env-vars.hg.changeset.node = <this change>
# benchmark.variants.atomic-update = no
# benchmark.variants.scenario = null-to-tip
# benchmark.variants.worker = default
default: 4.793727 ~~~~~
rust: 1.505905 (-68.59%, -3.29)
```
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Tue, 01 Oct 2024 13:49:11 +0200 |
parents | b332ae615714 |
children | 43e15277498e |
files | hgext/eol.py hgext/journal.py hgext/keyword.py hgext/largefiles/__init__.py hgext/lfs/__init__.py mercurial/merge.py rust/Cargo.lock rust/hg-core/Cargo.toml rust/hg-core/src/dirstate_tree/on_disk.rs rust/hg-core/src/lib.rs rust/hg-core/src/revlog/mod.rs rust/hg-core/src/update.rs rust/hg-cpython/src/lib.rs rust/hg-cpython/src/update.rs tests/test-audit-path.t tests/test-clone.t tests/test-simple-update.t tests/test-stream-bundle-v2.t tests/test-subrepo-deep-nested-change.t tests/test-symlink-placeholder.t |
diffstat | 20 files changed, 741 insertions(+), 67 deletions(-) [+] |
line wrap: on
line diff
--- a/hgext/eol.py Wed Oct 16 19:14:30 2024 +0200 +++ b/hgext/eol.py Tue Oct 01 13:49:11 2024 +0200 @@ -101,6 +101,7 @@ error as errormod, extensions, match, + merge, pycompat, registrar, scmutil, @@ -376,6 +377,7 @@ def reposetup(ui, repo): uisetup(repo.ui) + merge.MAYBE_USE_RUST_UPDATE = False if not repo.local(): return
--- a/hgext/journal.py Wed Oct 16 19:14:30 2024 +0200 +++ b/hgext/journal.py Tue Oct 01 13:49:11 2024 +0200 @@ -34,6 +34,7 @@ localrepo, lock, logcmdutil, + merge, pycompat, registrar, util, @@ -68,6 +69,7 @@ # Journal recording, register hooks and storage object def extsetup(ui): + merge.MAYBE_USE_RUST_UPDATE = False extensions.wrapfunction(dispatch, 'runcommand', runcommand) extensions.wrapfunction(bookmarks.bmstore, '_write', recordbookmarks) extensions.wrapfilecache(
--- a/hgext/keyword.py Wed Oct 16 19:14:30 2024 +0200 +++ b/hgext/keyword.py Tue Oct 01 13:49:11 2024 +0200 @@ -102,6 +102,7 @@ localrepo, logcmdutil, match, + merge, patch, pathutil, pycompat, @@ -818,6 +819,7 @@ extensions.wrapfunction(cmdutil, 'dorecord', kw_dorecord) for c in nokwwebcommands.split(): extensions.wrapfunction(webcommands, c, kwweb_skip) + merge.MAYBE_USE_RUST_UPDATE = False def reposetup(ui, repo):
--- a/hgext/largefiles/__init__.py Wed Oct 16 19:14:30 2024 +0200 +++ b/hgext/largefiles/__init__.py Tue Oct 01 13:49:11 2024 +0200 @@ -114,6 +114,7 @@ exthelper, hg, localrepo, + merge, wireprotov1server, ) @@ -165,6 +166,7 @@ @eh.uisetup def _uisetup(ui): + merge.MAYBE_USE_RUST_UPDATE = False localrepo.featuresetupfuncs.add(featuresetup) hg.wirepeersetupfuncs.append(proto.wirereposetup)
--- a/hgext/lfs/__init__.py Wed Oct 16 19:14:30 2024 +0200 +++ b/hgext/lfs/__init__.py Tue Oct 01 13:49:11 2024 +0200 @@ -138,6 +138,7 @@ filesetlang, localrepo, logcmdutil, + merge, minifileset, pycompat, revlog, @@ -234,6 +235,7 @@ @eh.uisetup def _uisetup(ui): + merge.MAYBE_USE_RUST_UPDATE = False localrepo.featuresetupfuncs.add(featuresetup)
--- a/mercurial/merge.py Wed Oct 16 19:14:30 2024 +0200 +++ b/mercurial/merge.py Tue Oct 01 13:49:11 2024 +0200 @@ -8,6 +8,7 @@ from __future__ import annotations import collections +import os import struct import typing from typing import Dict, Optional, Tuple @@ -34,12 +35,15 @@ pathutil, policy, pycompat, + requirements, scmutil, subrepoutil, util, worker, ) +rust_update_mod = policy.importrust("update") + _pack = struct.pack _unpack = struct.unpack @@ -147,6 +151,8 @@ dircache = dict() dirstate = repo.dirstate wvfs = repo.wvfs + # wouldn't it be easier to loop over unknown files (and dirs)? + if not force: def collectconflicts(conflicts, config): @@ -1835,6 +1841,12 @@ UPDATECHECK_LINEAR = b'linear' UPDATECHECK_NO_CONFLICT = b'noconflict' +# Let extensions turn off any Rust code in the update code if that interferes +# will their patching. +# This being `True` does not mean that you have Rust extensions installed or +# that the Rust path will be taken for any given invocation. +MAYBE_USE_RUST_UPDATE = True + def _update( repo, @@ -2008,6 +2020,60 @@ if not branchmerge and not wc.dirty(missing=True): followcopies = False + update_from_null = False + update_from_null_fallback = False + if ( + MAYBE_USE_RUST_UPDATE + and rust_update_mod is not None + and p1.rev() == nullrev + and not branchmerge + # TODO it's probably not too hard to pass down the transaction and + # respect the write patterns from Rust. But since it doesn't affect + # a simple update from null, then it doesn't matter yet. + and repo.currenttransaction() is None + and matcher is None + and not wc.mergestate().active() + and b'.hgsubstate' not in p2 + ): + working_dir_iter = os.scandir(repo.root) + maybe_hg_folder = next(working_dir_iter) + assert maybe_hg_folder is not None + if maybe_hg_folder.name == b".hg": + try: + next(working_dir_iter) + except StopIteration: + update_from_null = True + + if update_from_null: + # Check the narrowspec and sparsespec here to display warnings + # more easily. 
+ # TODO figure out of a way of bubbling up warnings to Python + # while not polluting the Rust code (probably a channel) + repo.narrowmatch() + sparse.matcher(repo, [nullrev, p2.rev()]) + repo.hook(b'preupdate', throw=True, parent1=xp1, parent2=xp2) + # note that we're in the middle of an update + repo.vfs.write(b'updatestate', p2.hex()) + try: + updated_count = rust_update_mod.update_from_null( + repo.root, p2.rev() + ) + except rust_update_mod.FallbackError: + update_from_null_fallback = True + else: + # We've changed the dirstate from Rust, we need to tell Python + repo.dirstate.invalidate() + # This includes setting the parents, since they are not read + # again on invalidation + with repo.dirstate.changing_parents(repo): + repo.dirstate.setparents(fp2) + repo.dirstate.setbranch(p2.branch(), repo.currenttransaction()) + sparse.prunetemporaryincludes(repo) + repo.hook(b'update', parent1=xp1, parent2=xp2, error=0) + # update completed, clear state + util.unlink(repo.vfs.join(b'updatestate')) + return updateresult(updated_count, 0, 0, 0) + ### calculate phase mresult = calculateupdates( repo, @@ -2131,11 +2197,13 @@ # the dirstate. 
always = matcher is None or matcher.always() updatedirstate = updatedirstate and always and not wc.isinmemory() - if updatedirstate: + # If we're in the fallback case, we've already done this + if updatedirstate and not update_from_null_fallback: repo.hook(b'preupdate', throw=True, parent1=xp1, parent2=xp2) # note that we're in the middle of an update repo.vfs.write(b'updatestate', p2.hex()) + # TODO don't run if Rust is available _advertisefsmonitor( repo, mresult.len((mergestatemod.ACTION_GET,)), p1.node() ) @@ -2172,14 +2240,14 @@ mergestatemod.recordupdates( repo, mresult.actionsdict, branchmerge, getfiledata ) - # update completed, clear state - util.unlink(repo.vfs.join(b'updatestate')) - if not branchmerge: repo.dirstate.setbranch( p2.branch(), repo.currenttransaction() ) + # update completed, clear state + util.unlink(repo.vfs.join(b'updatestate')) + # If we're updating to a location, clean up any stale temporary includes # (ex: this happens during hg rebase --abort). if not branchmerge:
--- a/rust/Cargo.lock Wed Oct 16 19:14:30 2024 +0200 +++ b/rust/Cargo.lock Tue Oct 01 13:49:11 2024 +0200 @@ -650,6 +650,7 @@ "thread_local", "toml", "twox-hash", + "uuid", "zstd", ] @@ -1530,6 +1531,15 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] +name = "uuid" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +dependencies = [ + "getrandom 0.2.8", +] + +[[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index"
--- a/rust/hg-core/Cargo.toml Wed Oct 16 19:14:30 2024 +0200 +++ b/rust/hg-core/Cargo.toml Tue Oct 01 13:49:11 2024 +0200 @@ -44,6 +44,7 @@ chrono = "0.4.34" dyn-clone = "1.0.16" filetime = "0.2.23" +uuid = { version = "1.10", features = ["v4"] } # We don't use the `miniz-oxide` backend to not change rhg benchmarks and until # we have a clearer view of which backend is the fastest.
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs Wed Oct 16 19:14:30 2024 +0200 +++ b/rust/hg-core/src/dirstate_tree/on_disk.rs Tue Oct 01 13:49:11 2024 +0200 @@ -8,7 +8,9 @@ self, DirstateMap, DirstateMapWriteMode, NodeRef, }; use crate::dirstate_tree::path_with_basename::WithBasename; -use crate::errors::HgError; +use crate::errors::{HgError, IoResultExt}; +use crate::repo::Repo; +use crate::requirements::DIRSTATE_TRACKED_HINT_V1; use crate::utils::hg_path::HgPath; use crate::DirstateEntry; use crate::DirstateError; @@ -20,6 +22,7 @@ use rand::Rng; use std::borrow::Cow; use std::fmt::Write; +use uuid::Uuid; use super::dirstate_map::DirstateIdentity; @@ -916,3 +919,22 @@ } } } + +/// Write a new tracked key to disk. +/// See `format.use-dirstate-tracked-hint` config help for more details. +pub fn write_tracked_key(repo: &Repo) -> Result<(), HgError> { + // TODO move this to the dirstate itself once it grows a `dirty` flag and + // can reason about which context it needs to write this in. + // For now, only this fast-path needs to think about the tracked hint. + // Use [`crate::dirstate_tree::dirstate_map::DirstateMap:: + // use_tracked_hint`] instead of looking at the requirements once + // refactored. + if !repo.requirements().contains(DIRSTATE_TRACKED_HINT_V1) { + return Ok(()); + } + // TODO use `hg_vfs` once the `InnerRevlog` is in. + let path = repo + .working_directory_path() + .join(".hg/dirstate-tracked-hint"); + std::fs::write(&path, Uuid::new_v4().as_bytes()).when_writing_file(&path) +}
--- a/rust/hg-core/src/lib.rs Wed Oct 16 19:14:30 2024 +0200 +++ b/rust/hg-core/src/lib.rs Tue Oct 01 13:49:11 2024 +0200 @@ -37,6 +37,7 @@ pub mod operations; pub mod progress; pub mod revset; +pub mod update; pub mod utils; pub mod vfs;
--- a/rust/hg-core/src/revlog/mod.rs Wed Oct 16 19:14:30 2024 +0200 +++ b/rust/hg-core/src/revlog/mod.rs Tue Oct 01 13:49:11 2024 +0200 @@ -1180,7 +1180,7 @@ } else { if (self.flags & REVISION_FLAG_ELLIPSIS) != 0 { return Err(HgError::unsupported( - "ellipsis revisions are not supported by rhg", + "support for ellipsis nodes is missing", ) .into()); }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/update.rs Tue Oct 01 13:49:11 2024 +0200 @@ -0,0 +1,491 @@ +//! Tools for moving the repository to a given revision + +use std::{ + fs::Permissions, + io::Write, + os::unix::fs::{MetadataExt, PermissionsExt}, + path::Path, + time::Duration, +}; + +use crate::{ + dirstate::{ParentFileData, TruncatedTimestamp}, + dirstate_tree::{ + dirstate_map::DirstateEntryReset, on_disk::write_tracked_key, + }, + errors::{HgError, IoResultExt}, + exit_codes, + filelog::Filelog, + narrow, + node::NULL_NODE, + operations::{list_rev_tracked_files, ExpandedManifestEntry}, + progress::Progress, + repo::Repo, + sparse, + utils::{ + files::{filesystem_now, get_path_from_bytes}, + hg_path::{hg_path_to_path_buf, HgPath, HgPathError}, + path_auditor::PathAuditor, + }, + vfs::{is_on_nfs_mount, VfsImpl}, + DirstateParents, RevlogError, RevlogOpenOptions, UncheckedRevision, +}; +use crossbeam_channel::{Receiver, Sender}; +use rayon::prelude::*; + +fn write_dirstate(repo: &Repo) -> Result<(), HgError> { + repo.write_dirstate() + .map_err(|e| HgError::abort(e.to_string(), exit_codes::ABORT, None))?; + write_tracked_key(repo) +} + +/// Update the current working copy of `repo` to the given revision `to`, from +/// the null revision and set + write out the dirstate to reflect that. +/// +/// Do not call this outside of a Python context. This does *not* handle any +/// of the checks, hooks, lock taking needed to setup and get out of this +/// update from the null revision. 
+pub fn update_from_null( + repo: &Repo, + to: UncheckedRevision, + progress: &dyn Progress, +) -> Result<usize, HgError> { + // Ignore the warnings, they've been displayed by Python already + // TODO non-Python clients: display narrow warnings + let (narrow_matcher, _) = narrow::matcher(repo)?; + + let files_for_rev = list_rev_tracked_files(repo, to, narrow_matcher) + .map_err(handle_revlog_error)?; + repo.manually_set_parents(DirstateParents { + p1: repo.node(to).expect("update target should exist"), + p2: NULL_NODE, + })?; + + // Filter the working copy according to the sparse spec + let tracked_files: Result<Vec<_>, _> = if !repo.has_sparse() { + files_for_rev.iter().collect() + } else { + // Ignore the warnings, they've been displayed by Python already + // TODO non-Python clients: display sparse warnings + let (sparse_matcher, _) = sparse::matcher(repo)?; + files_for_rev + .iter() + .filter(|f| { + match f { + Ok(f) => sparse_matcher.matches(f.0), + Err(_) => true, // Errors stop the update, include them + } + }) + .collect() + }; + let tracked_files = tracked_files?; + + if tracked_files.is_empty() { + // Still write the dirstate because we might not be in the null + // revision. + // This can happen in narrow repos where all paths are excluded in + // this revision. 
+ write_dirstate(repo)?; + return Ok(0); + } + let store_vfs = &repo.store_vfs(); + let options = repo.default_revlog_options(crate::RevlogType::Filelog)?; + let (errors_sender, errors_receiver) = crossbeam_channel::unbounded(); + let (files_sender, files_receiver) = crossbeam_channel::unbounded(); + let working_directory_path = &repo.working_directory_path(); + + let files_count = tracked_files.len(); + let chunks = chunk_tracked_files(tracked_files); + progress.update(0, Some(files_count as u64)); + + create_working_copy( + chunks, + working_directory_path, + store_vfs, + options, + files_sender, + errors_sender, + progress, + ); + + let errors: Vec<HgError> = errors_receiver.iter().collect(); + if !errors.is_empty() { + log::debug!("{} errors during update (see trace logs)", errors.len()); + for error in errors.iter() { + log::trace!("{}", error); + } + // Best we can do is raise the first error (in order of the channel) + return Err(errors.into_iter().next().expect("can never be empty")); + } + + // TODO try to run this concurrently to update the dirstate while we're + // still writing out the working copy to see if that improves performance. + let total = update_dirstate(repo, files_receiver)?; + + write_dirstate(repo)?; + + Ok(total) +} + +fn handle_revlog_error(e: RevlogError) -> HgError { + match e { + crate::RevlogError::Other(hg_error) => hg_error, + e => HgError::abort( + format!("revlog error: {}", e), + exit_codes::ABORT, + None, + ), + } +} + +/// Preallocated size of Vec holding directory contents. This aims at +/// preventing the need for re-allocating the Vec in most cases. +/// +/// The value is arbitrarily picked as a little over an average number of files +/// per directory done by looking at a few larger open-source repos. +/// Most of the runtime is IO anyway, so this doesn't matter too much. 
+const FILES_PER_DIRECTORY: usize = 16; + +/// Chunk files per directory prefix, so almost every directory is handled +/// in a separate thread, which works around the FS inode mutex. +/// Chunking less (and doing approximately `files_count`/`threads`) actually +/// ends up being less performant: my hypothesis is `rayon`'s work stealing +/// being more efficient with tasks of varying lengths. +#[logging_timer::time("trace")] +fn chunk_tracked_files( + tracked_files: Vec<ExpandedManifestEntry>, +) -> Vec<(&HgPath, Vec<ExpandedManifestEntry>)> { + let files_count = tracked_files.len(); + + let mut chunks = Vec::with_capacity(files_count / FILES_PER_DIRECTORY); + + let mut current_chunk = Vec::with_capacity(FILES_PER_DIRECTORY); + let mut last_directory = tracked_files[0].0.parent(); + + for file_info in tracked_files { + let current_directory = file_info.0.parent(); + let different_directory = current_directory != last_directory; + if different_directory { + chunks.push((last_directory, current_chunk)); + current_chunk = Vec::with_capacity(FILES_PER_DIRECTORY); + } + current_chunk.push(file_info); + last_directory = current_directory; + } + chunks.push((last_directory, current_chunk)); + chunks +} + +#[logging_timer::time("trace")] +fn create_working_copy<'a: 'b, 'b>( + chunks: Vec<(&HgPath, Vec<ExpandedManifestEntry<'a>>)>, + working_directory_path: &Path, + store_vfs: &VfsImpl, + options: RevlogOpenOptions, + files_sender: Sender<(&'b HgPath, u32, usize, TruncatedTimestamp)>, + error_sender: Sender<HgError>, + progress: &dyn Progress, +) { + let auditor = PathAuditor::new(working_directory_path); + chunks.into_par_iter().for_each(|(dir_path, chunk)| { + if let Err(e) = working_copy_worker( + dir_path, + chunk, + working_directory_path, + store_vfs, + options, + &files_sender, + progress, + &auditor, + ) { + error_sender + .send(e) + .expect("channel should not be disconnected") + } + }); +} + +/// Represents a work unit for a single thread, responsible for this set 
of +/// files and restoring them to the working copy. +#[allow(clippy::too_many_arguments)] +fn working_copy_worker<'a: 'b, 'b>( + dir_path: &HgPath, + chunk: Vec<ExpandedManifestEntry<'a>>, + working_directory_path: &Path, + store_vfs: &VfsImpl, + options: RevlogOpenOptions, + files_sender: &Sender<(&'b HgPath, u32, usize, TruncatedTimestamp)>, + progress: &dyn Progress, + auditor: &PathAuditor, +) -> Result<(), HgError> { + let dir_path = + hg_path_to_path_buf(dir_path).expect("invalid path in manifest"); + let dir_path = working_directory_path.join(dir_path); + std::fs::create_dir_all(&dir_path).when_writing_file(&dir_path)?; + + for (file, file_node, flags) in chunk { + auditor.audit_path(file)?; + let flags = flags.map(|f| f.into()); + let path = + working_directory_path.join(get_path_from_bytes(file.as_bytes())); + + // Treemanifest is not supported + assert!(flags != Some(b't')); + + let filelog = Filelog::open_vfs(store_vfs, file, options)?; + let filelog_revision_data = &filelog + .data_for_node(file_node) + .map_err(handle_revlog_error)?; + let file_data = filelog_revision_data.file_data()?; + + if flags == Some(b'l') { + let target = get_path_from_bytes(file_data); + if let Err(e) = std::os::unix::fs::symlink(target, &path) { + // If the path already exists either: + // - another process created this file while ignoring the + // lock => error + // - our check for the fast path is incorrect => error + // - this is a malicious repo/bundle and this is symlink that + // tries to write things where it shouldn't be able to. 
+ match e.kind() { + std::io::ErrorKind::AlreadyExists => { + let metadata = std::fs::symlink_metadata(&path) + .when_reading_file(&path)?; + if metadata.is_dir() { + return Err(HgError::Path( + HgPathError::TraversesSymbolicLink { + // Technically it should be one of the + // children, but good enough + path: file + .join(HgPath::new(b"*")) + .to_owned(), + symlink: file.to_owned(), + }, + )); + } + return Err(e).when_writing_file(&path); + } + _ => return Err(e).when_writing_file(&path), + } + } + } else { + let mut f = + std::fs::File::create(&path).when_writing_file(&path)?; + f.write_all(file_data).when_writing_file(&path)?; + } + if flags == Some(b'x') { + std::fs::set_permissions(&path, Permissions::from_mode(0o755)) + .when_writing_file(&path)?; + } + let metadata = + std::fs::symlink_metadata(&path).when_reading_file(&path)?; + + let mode = metadata.mode(); + + files_sender + .send(( + file, + mode, + file_data.len(), + TruncatedTimestamp::for_mtime_of(&metadata) + .when_reading_file(&path)?, + )) + .expect("channel should not be closed"); + progress.increment(1, None); + } + Ok(()) +} + +#[logging_timer::time("trace")] +fn update_dirstate( + repo: &Repo, + files_receiver: Receiver<(&HgPath, u32, usize, TruncatedTimestamp)>, +) -> Result<usize, HgError> { + let mut dirstate = repo + .dirstate_map_mut() + .map_err(|e| HgError::abort(e.to_string(), exit_codes::ABORT, None))?; + + // (see the comments in `filter_ambiguous_files` in `merge.py` for more) + // It turns out that (on Linux at least) the filesystem resolution time + // for most filesystems is based on the HZ kernel config. Their internal + // clocks do return nanoseconds if the hardware clock is precise enough, + // which should be the case on most recent computers but are only updated + // every few milliseconds at best (every "jiffy"). 
+ // + // We are still not concerned with fixing the race with other + // processes that might modify the working copy right after it was created + // within the same tick, because it is impossible to catch. + // However, we might as well not race with operations that could run right + // after this one, especially other Mercurial operations that could be + // waiting for the wlock to change file contents and the dirstate. + // + // Thus: wait until the filesystem clock has ticked to filter ambiguous + // entries and write the dirstate, but only for dirstate-v2, since v1 only + // has second-level granularity and waiting for a whole second is too much + // of a penalty in the general case. + // Although we're assuming that people running dirstate-v2 on Linux + // don't have a second-granularity FS (with the exclusion of NFS), users + // can be surprising, and at some point in the future dirstate-v2 will + // become the default. To that end, we limit the wait time to 100ms and + // fall back to the filter method in case of a timeout. 
+ // + // +------------+------+--------------+ + // | version | wait | filter level | + // +------------+------+--------------+ + // | V1 | No | Second | + // | V2 | Yes | Nanosecond | + // | V2-slow-fs | No | Second | + // +------------+------+--------------+ + let dirstate_v2 = repo.use_dirstate_v2(); + + // Let's ignore NFS right off the bat + let mut fast_enough_fs = !is_on_nfs_mount(repo.working_directory_path()); + let fs_time_now = if dirstate_v2 && fast_enough_fs { + match wait_until_fs_tick(repo.working_directory_path()) { + None => None, + Some(Ok(time)) => Some(time), + Some(Err(time)) => { + fast_enough_fs = false; + Some(time) + } + } + } else { + filesystem_now(repo.working_directory_path()) + .ok() + .map(TruncatedTimestamp::from) + }; + + let mut total = 0; + for (filename, mode, size, mtime) in files_receiver.into_iter() { + total += 1; + // When using dirstate-v2 on a filesystem with reasonable performance + // this is basically always true unless you get a mtime from the + // far future. + let has_meaningful_mtime = if let Some(fs_time) = fs_time_now { + mtime.for_reliable_mtime_of_self(&fs_time).is_some_and(|t| { + // Dirstate-v1 only has second-level information + !t.second_ambiguous || dirstate_v2 && fast_enough_fs + }) + } else { + // We somehow failed to write to the filesystem, so don't store + // the cache information. 
+ false + }; + let reset = DirstateEntryReset { + filename, + wc_tracked: true, + p1_tracked: true, + p2_info: false, + has_meaningful_mtime, + parent_file_data_opt: Some(ParentFileData { + mode_size: Some(( + mode, + size.try_into().expect("invalid file size in manifest"), + )), + mtime: Some(mtime), + }), + from_empty: true, + }; + dirstate.reset_state(reset).map_err(|e| { + HgError::abort(e.to_string(), exit_codes::ABORT, None) + })?; + } + + Ok(total) +} + +/// Wait until the next update from the filesystem time by writing in a loop +/// a new temporary file inside the working directory and checking if its time +/// differs from the first one observed. +/// +/// Returns `None` if we are unable to get the filesystem time, +/// `Some(Err(timestamp))` if we've timed out waiting for the filesystem clock +/// to tick, and `Some(Ok(timestamp))` if we've waited successfully. +/// +/// On Linux, your average tick is going to be a "jiffy", or 1/HZ. +/// HZ is your kernel's tick rate (if it has one configured) and the value +/// is the one returned by `grep 'CONFIG_HZ=' /boot/config-$(uname -r)`, +/// again assuming a normal setup. +/// +/// In my case (Alphare) at the time of writing, I get `CONFIG_HZ=250`, +/// which equates to 4ms. 
+/// +/// This might change with a series that could make it to Linux 6.12: +/// https://lore.kernel.org/all/20241002-mgtime-v10-8-d1c4717f5284@kernel.org +fn wait_until_fs_tick( + working_directory_path: &Path, +) -> Option<Result<TruncatedTimestamp, TruncatedTimestamp>> { + let start = std::time::Instant::now(); + let old_fs_time = filesystem_now(working_directory_path).ok()?; + let mut fs_time = filesystem_now(working_directory_path).ok()?; + + const FS_TICK_WAIT_TIMEOUT: Duration = Duration::from_millis(100); + + while fs_time == old_fs_time { + if std::time::Instant::now() - start > FS_TICK_WAIT_TIMEOUT { + log::trace!( + "timed out waiting for the fs clock to tick after {:?}", + FS_TICK_WAIT_TIMEOUT + ); + return Some(Err(TruncatedTimestamp::from(old_fs_time))); + } + fs_time = filesystem_now(working_directory_path).ok()?; + } + log::trace!( + "waited for {:?} before writing the dirstate", + fs_time.duration_since(old_fs_time) + ); + Some(Ok(TruncatedTimestamp::from(fs_time))) +} + +#[cfg(test)] +mod test { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn test_chunk_tracked_files() { + fn chunk(v: Vec<&'static str>) -> Vec<ExpandedManifestEntry> { + v.into_iter() + .map(|f| (HgPath::new(f.as_bytes()), NULL_NODE, None)) + .collect() + } + let p = HgPath::new; + + let files = chunk(vec!["a"]); + let expected = vec![(p(""), chunk(vec!["a"]))]; + assert_eq!(chunk_tracked_files(files), expected); + + let files = chunk(vec!["a", "b", "c"]); + let expected = vec![(p(""), chunk(vec!["a", "b", "c"]))]; + assert_eq!(chunk_tracked_files(files), expected); + + let files = chunk(vec![ + "dir/a-new", + "dir/a/mut", + "dir/a/mut-mut", + "dir/albert", + "dir/b", + "dir/subdir/c", + "dir/subdir/d", + "file", + ]); + let expected = vec![ + (p("dir"), chunk(vec!["dir/a-new"])), + (p("dir/a"), chunk(vec!["dir/a/mut", "dir/a/mut-mut"])), + (p("dir"), chunk(vec!["dir/albert", "dir/b"])), + (p("dir/subdir"), chunk(vec!["dir/subdir/c", "dir/subdir/d"])), + 
(p(""), chunk(vec!["file"])), + ]; + assert_eq!(chunk_tracked_files(files), expected); + + // Doesn't get split + let large_dir = vec![ + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", + ]; + let files = chunk(large_dir.clone()); + let expected = vec![(p(""), chunk(large_dir))]; + assert_eq!(chunk_tracked_files(files), expected); + } +}
--- a/rust/hg-cpython/src/lib.rs Wed Oct 16 19:14:30 2024 +0200 +++ b/rust/hg-cpython/src/lib.rs Tue Oct 01 13:49:11 2024 +0200 @@ -45,6 +45,7 @@ pub mod exceptions; mod pybytes_deref; pub mod revlog; +pub mod update; pub mod utils; /// Revision as exposed to/from the Python layer. @@ -100,6 +101,7 @@ m.add(py, "discovery", discovery::init_module(py, &dotted_name)?)?; m.add(py, "dirstate", dirstate::init_module(py, &dotted_name)?)?; m.add(py, "revlog", revlog::init_module(py, &dotted_name)?)?; + m.add(py, "update", update::init_module(py, &dotted_name)?)?; m.add(py, "GraphError", py.get_type::<exceptions::GraphError>())?; Ok(()) });
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-cpython/src/update.rs Tue Oct 01 13:49:11 2024 +0200 @@ -0,0 +1,53 @@ +// debug.rs +// +// Copyright 2024 Mercurial developers +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +//! Module for updating a repository. +use cpython::{PyDict, PyModule, PyObject, PyResult, Python}; +use hg::{ + progress::{HgProgressBar, Progress}, + update::update_from_null, + BaseRevision, +}; + +use crate::{ + exceptions::FallbackError, + utils::{hgerror_to_pyerr, repo_from_path}, +}; + +pub fn update_from_null_fast_path( + py: Python, + repo_path: PyObject, + to: BaseRevision, +) -> PyResult<usize> { + log::trace!("Using update from null fastpath"); + let repo = repo_from_path(py, repo_path)?; + let progress: &dyn Progress = &HgProgressBar::new("updating"); + hgerror_to_pyerr(py, update_from_null(&repo, to.into(), progress)) +} + +pub fn init_module(py: Python, package: &str) -> PyResult<PyModule> { + let dotted_name = &format!("{}.update", package); + let m = PyModule::new(py, dotted_name)?; + + m.add(py, "__package__", package)?; + m.add(py, "__doc__", "Rust module for updating a repository")?; + m.add(py, "FallbackError", py.get_type::<FallbackError>())?; + m.add( + py, + "update_from_null", + py_fn!( + py, + update_from_null_fast_path(repo_path: PyObject, to: BaseRevision,) + ), + )?; + + let sys = PyModule::import(py, "sys")?; + let sys_modules: PyDict = sys.get(py, "modules")?.extract(py)?; + sys_modules.set_item(py, dotted_name, &m)?; + + Ok(m) +}
--- a/tests/test-audit-path.t Wed Oct 16 19:14:30 2024 +0200 +++ b/tests/test-audit-path.t Tue Oct 01 13:49:11 2024 +0200 @@ -91,7 +91,8 @@ $ hg manifest -r0 .hg/test $ hg update -Cr0 - abort: path contains illegal component: .hg/test + abort: path contains illegal component: .hg/test (no-rust !) + abort: path '.hg/test' is inside the '.hg' folder (rust !) [10] attack foo/.hg/test @@ -107,6 +108,7 @@ $ hg manifest -r2 back back/test + #if symlink $ hg update -Cr2 abort: path 'back/test' traverses symbolic link 'back' @@ -220,17 +222,30 @@ 'a' and 'a/b' are taken as good paths. still applyupdates() should fail. $ hg up -qC null +#if rust + $ hg up 1 + abort: path 'a/*' traverses symbolic link 'a' + [10] +#endif + +#if no-rust $ hg up 1 abort: path 'a/b' traverses symbolic link 'a' [255] +#endif $ ls ../update-symlink-out try branch update replacing directory with symlink, and its content: the path 'a' is audited as a directory first, which should be audited again as a symlink. +#if rust + $ rm -rf a +#else $ rm -f a +#endif $ hg up -qC 2 + $ hg up 1 abort: path 'a/b' traverses symbolic link 'a' [255]
--- a/tests/test-clone.t Wed Oct 16 19:14:30 2024 +0200 +++ b/tests/test-clone.t Tue Oct 01 13:49:11 2024 +0200 @@ -1235,7 +1235,7 @@ #if linuxormacos no-fsmonitor $ hg clone a nofsmonitor updating to bookmark @ on branch stable - (warning: large working directory being used without fsmonitor enabled; enable fsmonitor to improve performance; see "hg help -e fsmonitor") + (warning: large working directory being used without fsmonitor enabled; enable fsmonitor to improve performance; see "hg help -e fsmonitor") (no-rust !) 3 files updated, 0 files merged, 0 files removed, 0 files unresolved #else $ hg clone a nofsmonitor @@ -1284,7 +1284,7 @@ #if linuxormacos no-fsmonitor $ hg up cf0fe1914066 - (warning: large working directory being used without fsmonitor enabled; enable fsmonitor to improve performance; see "hg help -e fsmonitor") + (warning: large working directory being used without fsmonitor enabled; enable fsmonitor to improve performance; see "hg help -e fsmonitor") (no-rust !) 2 files updated, 0 files merged, 0 files removed, 0 files unresolved #else $ hg up cf0fe1914066
--- a/tests/test-simple-update.t Wed Oct 16 19:14:30 2024 +0200 +++ b/tests/test-simple-update.t Tue Oct 01 13:49:11 2024 +0200 @@ -96,11 +96,12 @@ #if no-windows $ cat <<EOF > forceworker.py - > from mercurial import extensions, worker + > from mercurial import extensions, merge, worker > def nocost(orig, ui, costperop, nops, threadsafe=True): > return worker._numworkers(ui) > 1 > def uisetup(ui): > extensions.wrapfunction(worker, 'worthwhile', nocost) + > merge.MAYBE_USE_RUST_UPDATE = False > EOF $ hg init worker
--- a/tests/test-stream-bundle-v2.t Wed Oct 16 19:14:30 2024 +0200 +++ b/tests/test-stream-bundle-v2.t Tue Oct 01 13:49:11 2024 +0200 @@ -152,19 +152,19 @@ bundle2-input-bundle: 2 parts total checking for updated bookmarks updating to branch default - resolving manifests - branchmerge: False, force: False, partial: False - ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 - A: remote created -> g - getting A - B: remote created -> g - getting B - C: remote created -> g - getting C - D: remote created -> g - getting D - E: remote created -> g - getting E + resolving manifests (no-rust !) + branchmerge: False, force: False, partial: False (no-rust !) + ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 (no-rust !) + A: remote created -> g (no-rust !) + getting A (no-rust !) + B: remote created -> g (no-rust !) + getting B (no-rust !) + C: remote created -> g (no-rust !) + getting C (no-rust !) + D: remote created -> g (no-rust !) + getting D (no-rust !) + E: remote created -> g (no-rust !) + getting E (no-rust !) 5 files updated, 0 files merged, 0 files removed, 0 files unresolved updating the branch cache (sent 4 HTTP requests and * bytes; received * bytes in responses) (glob) @@ -215,19 +215,19 @@ bundle2-input-bundle: 2 parts total checking for updated bookmarks updating to branch default - resolving manifests - branchmerge: False, force: False, partial: False - ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 - A: remote created -> g - getting A - B: remote created -> g - getting B - C: remote created -> g - getting C - D: remote created -> g - getting D - E: remote created -> g - getting E + resolving manifests (no-rust !) + branchmerge: False, force: False, partial: False (no-rust !) + ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 (no-rust !) + A: remote created -> g (no-rust !) + getting A (no-rust !) + B: remote created -> g (no-rust !) + getting B (no-rust !) 
+ C: remote created -> g (no-rust !) + getting C (no-rust !) + D: remote created -> g (no-rust !) + getting D (no-rust !) + E: remote created -> g (no-rust !) + getting E (no-rust !) 5 files updated, 0 files merged, 0 files removed, 0 files unresolved updating the branch cache (sent 4 HTTP requests and * bytes; received * bytes in responses) (glob) @@ -280,19 +280,19 @@ bundle2-input-bundle: 2 parts total checking for updated bookmarks updating to branch default - resolving manifests - branchmerge: False, force: False, partial: False - ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 - A: remote created -> g - getting A - B: remote created -> g - getting B - C: remote created -> g - getting C - D: remote created -> g - getting D - E: remote created -> g - getting E + resolving manifests (no-rust !) + branchmerge: False, force: False, partial: False (no-rust !) + ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 (no-rust !) + A: remote created -> g (no-rust !) + getting A (no-rust !) + B: remote created -> g (no-rust !) + getting B (no-rust !) + C: remote created -> g (no-rust !) + getting C (no-rust !) + D: remote created -> g (no-rust !) + getting D (no-rust !) + E: remote created -> g (no-rust !) + getting E (no-rust !) 5 files updated, 0 files merged, 0 files removed, 0 files unresolved updating the branch cache (sent 4 HTTP requests and * bytes; received * bytes in responses) (glob) @@ -342,19 +342,19 @@ bundle2-input-bundle: 2 parts total checking for updated bookmarks updating to branch default - resolving manifests - branchmerge: False, force: False, partial: False - ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 - A: remote created -> g - getting A - B: remote created -> g - getting B - C: remote created -> g - getting C - D: remote created -> g - getting D - E: remote created -> g - getting E + resolving manifests (no-rust !) + branchmerge: False, force: False, partial: False (no-rust !) 
+ ancestor: 000000000000, local: 000000000000+, remote: 9bc730a19041 (no-rust !) + A: remote created -> g (no-rust !) + getting A (no-rust !) + B: remote created -> g (no-rust !) + getting B (no-rust !) + C: remote created -> g (no-rust !) + getting C (no-rust !) + D: remote created -> g (no-rust !) + getting D (no-rust !) + E: remote created -> g (no-rust !) + getting E (no-rust !) 5 files updated, 0 files merged, 0 files removed, 0 files unresolved updating the branch cache (sent 4 HTTP requests and * bytes; received * bytes in responses) (glob)
--- a/tests/test-subrepo-deep-nested-change.t Wed Oct 16 19:14:30 2024 +0200 +++ b/tests/test-subrepo-deep-nested-change.t Tue Oct 01 13:49:11 2024 +0200 @@ -45,9 +45,9 @@ linking [=======================================> ] 8/9\r (no-eol) (esc) (rust !) linking [============================================>] 9/9\r (no-eol) (esc) (rust !) \r (no-eol) (esc) - \r (no-eol) (esc) - updating [===========================================>] 1/1\r (no-eol) (esc) - \r (no-eol) (esc) + \r (no-eol) (esc) (no-rust !) + updating [===========================================>] 1/1\r (no-eol) (esc) (no-rust !) + \r (no-eol) (esc) (no-rust !) updating to branch default 1 files updated, 0 files merged, 0 files removed, 0 files unresolved $ hg add -R sub1 @@ -89,7 +89,6 @@ linking [==================================> ] 7/9\r (no-eol) (esc) linking [=======================================> ] 8/9\r (no-eol) (esc) linking [============================================>] 9/9\r (no-eol) (esc) - updating [===========================================>] 1/1\r (no-eol) (esc) \r (no-eol) (esc) updating to branch default cloning subrepo sub2 from $TESTTMP/sub2
--- a/tests/test-symlink-placeholder.t Wed Oct 16 19:14:30 2024 +0200 +++ b/tests/test-symlink-placeholder.t Tue Oct 01 13:49:11 2024 +0200 @@ -3,12 +3,13 @@ Create extension that can disable symlink support: $ cat > nolink.py <<EOF - > from mercurial import extensions, util + > from mercurial import extensions, merge, util > def setflags(orig, f, l, x): > pass > def checklink(orig, path): > return False > def extsetup(ui): + > merge.MAYBE_USE_RUST_UPDATE = False > extensions.wrapfunction(util, 'setflags', setflags) > extensions.wrapfunction(util, 'checklink', checklink) > EOF