view rust/hg-core/src/utils/path_auditor.rs @ 46155:fce2f20a54ce

copies-rust: start recording overwrite as they happens If a revision has information overwriting data from another revision, the overwriting revision is a descendant of the overwritten one. So we could warm the Oracle cache with such information to avoid potential future `is_ancestors` call. This provide us with a large speedup in the most expensive cases: Repo Case Source-Rev Dest-Rev # of revisions old time new time Difference Factor time per rev --------------------------------------------------------------------------------------------------------------------------------------------------------------- mozilla-try x00000_revs_x00000_added_x0000_copies 1b661134e2ca 1ae03d022d6d : 228985 revs, 41.113063 s, 36.001255 s, -5.111808 s, × 0.8757, 157 µs/rev mozilla-try x00000_revs_x00000_added_x000_copies 9b2a99adc05e 8e29777b48e6 : 382065 revs, 27.891612 s, 14.340641 s, -13.550971 s, × 0.5142, 37 µs/rev Full comparison below: Repo Case Source-Rev Dest-Rev # of revisions old time new time Difference Factor time per rev --------------------------------------------------------------------------------------------------------------------------------------------------------------- mercurial x_revs_x_added_0_copies ad6b123de1c7 39cfcef4f463 : 1 revs, 0.000042 s, 0.000042 s, +0.000000 s, × 1.0000, 42 µs/rev mercurial x_revs_x_added_x_copies 2b1c78674230 0c1d10351869 : 6 revs, 0.000114 s, 0.000109 s, -0.000005 s, × 0.9561, 18 µs/rev mercurial x000_revs_x000_added_x_copies 81f8ff2a9bf2 dd3267698d84 : 1032 revs, 0.004934 s, 0.004953 s, +0.000019 s, × 1.0039, 4 µs/rev pypy x_revs_x_added_0_copies aed021ee8ae8 099ed31b181b : 9 revs, 0.000195 s, 0.000237 s, +0.000042 s, × 1.2154, 26 µs/rev pypy x_revs_x000_added_0_copies 4aa4e1f8e19a 359343b9ac0e : 1 revs, 0.000050 s, 0.000050 s, +0.000000 s, × 1.0000, 50 µs/rev pypy x_revs_x_added_x_copies ac52eb7bbbb0 72e022663155 : 7 revs, 0.000113 s, 0.000113 s, +0.000000 s, × 1.0000, 16 µs/rev pypy x_revs_x00_added_x_copies c3b14617fbd7 ace7255d9a26 : 1 revs, 0.6f1f4a s, 0.6f1f4a s, +0.000000 s, × 1.0000, 322 µs/rev pypy x_revs_x000_added_x000_copies df6f7a526b60 a83dc6a2d56f : 6 revs, 0.010788 s, 0.010702 s, -0.000086 s, × 0.9920, 1783 µs/rev pypy x000_revs_xx00_added_0_copies 89a76aede314 2f22446ff07e : 4785 revs, 0.050880 s, 0.050504 s, -0.000376 s, × 0.9926, 10 µs/rev pypy x000_revs_x000_added_x_copies 8a3b5bfd266e 2c68e87c3efe : 6780 revs, 0.081760 s, 0.080159 s, -0.001601 s, × 0.9804, 11 µs/rev pypy x000_revs_x000_added_x000_copies 89a76aede314 7b3dda341c84 : 5441 revs, 0.061382 s, 0.060058 s, -0.001324 s, × 0.9784, 11 µs/rev pypy x0000_revs_x_added_0_copies d1defd0dc478 c9cb1334cc78 : 43645 revs, 0.585802 s, 0.536950 s, -0.048852 s, × 0.9166, 12 µs/rev pypy x0000_revs_xx000_added_0_copies bf2c629d0071 4ffed77c095c : 2 revs, 0.012803 s, 0.012868 s, +0.000065 s, × 1.0051, 6434 µs/rev pypy x0000_revs_xx000_added_x000_copies 08ea3258278e d9fa043f30c0 : 11316 revs, 0.113558 s, 0.112806 s, -0.000752 s, × 0.9934, 9 µs/rev netbeans x_revs_x_added_0_copies fb0955ffcbcd a01e9239f9e7 : 2 revs, 0.000085 s, 0.000084 s, -0.000001 s, × 0.9882, 42 µs/rev netbeans x_revs_x000_added_0_copies 6f360122949f 20eb231cc7d0 : 2 revs, 0.000106 s, 0.000106 s, +0.000000 s, × 1.0000, 53 µs/rev netbeans x_revs_x_added_x_copies 1ada3faf6fb6 5a39d12eecf4 : 3 revs, 0.000175 s, 0.000174 s, -0.000001 s, × 0.9943, 58 µs/rev netbeans x_revs_x00_added_x_copies 35be93ba1e2c 9eec5e90c05f : 9 revs, 0.000721 s, 0.000726 s, +0.000005 s, × 1.0069, 80 µs/rev netbeans x000_revs_xx00_added_0_copies eac3045b4fdd 51d4ae7f1290 : 1421 revs, 0.010127 s, 0.010105 s, -0.000022 s, × 0.9978, 7 µs/rev netbeans x000_revs_x000_added_x_copies e2063d266acd 6081d72689dc : 1533 revs, 0.015616 s, 0.015748 s, +0.000132 s, × 1.0085, 10 µs/rev netbeans x000_revs_x000_added_x000_copies ff453e9fee32 411350406ec2 : 5750 revs, 0.061341 s, 0.060357 s, -0.000984 s, × 0.9840, 10 µs/rev netbeans x0000_revs_xx000_added_x000_copies 588c2d1ced70 1aad62e59ddd : 66949 revs, 0.542214 s, 0.499356 s, -0.042858 s, × 0.9210, 7 µs/rev mozilla-central x_revs_x_added_0_copies 3697f962bb7b 7015fcdd43a2 : 2 revs, 0.000089 s, 0.000092 s, +0.000003 s, × 1.0337, 46 µs/rev mozilla-central x_revs_x000_added_0_copies dd390860c6c9 40d0c5bed75d : 8 revs, 0.000279 s, 0.000279 s, +0.000000 s, × 1.0000, 34 µs/rev mozilla-central x_revs_x_added_x_copies 8d198483ae3b 14207ffc2b2f : 9 revs, 0.000184 s, 0.000186 s, +0.000002 s, × 1.0109, 20 µs/rev mozilla-central x_revs_x00_added_x_copies 98cbc58cc6bc 446a150332c3 : 7 revs, 0.000661 s, 0.000660 s, -0.000001 s, × 0.9985, 94 µs/rev mozilla-central x_revs_x000_added_x000_copies 3c684b4b8f68 0a5e72d1b479 : 3 revs, 0.003377 s, 0.003372 s, -0.000005 s, × 0.9985, 1124 µs/rev mozilla-central x_revs_x0000_added_x0000_copies effb563bb7e5 c07a39dc4e80 : 6 revs, 0.070508 s, 0.070294 s, -0.000214 s, × 0.9970, 11715 µs/rev mozilla-central x000_revs_xx00_added_0_copies 6100d773079a 04a55431795e : 1593 revs, 0.006576 s, 0.006545 s, -0.000031 s, × 0.9953, 4 µs/rev mozilla-central x000_revs_x000_added_x_copies 9f17a6fc04f9 2d37b966abed : 41 revs, 0.004809 s, 0.004998 s, +0.000189 s, × 1.0393, 121 µs/rev mozilla-central x000_revs_x000_added_x000_copies 7c97034feb78 4407bd0c6330 : 7839 revs, 0.064872 s, 0.063348 s, -0.001524 s, × 0.9765, 8 µs/rev mozilla-central x0000_revs_xx000_added_0_copies 9eec5917337d 67118cc6dcad : 615 revs, 0.026142 s, 0.026154 s, +0.000012 s, × 1.0005, 42 µs/rev mozilla-central x0000_revs_xx000_added_x000_copies f78c615a656c 96a38b690156 : 30263 revs, 0.203956 s, 0.199063 s, -0.004893 s, × 0.9760, 6 µs/rev mozilla-central x00000_revs_x0000_added_x0000_copies 6832ae71433c 4c222a1d9a00 : 153721 revs, 1.763853 s, 1.277320 s, -0.486533 s, × 0.7242, 8 µs/rev mozilla-central x00000_revs_x00000_added_x000_copies 76caed42cf7c 1daa622bbe42 : 204976 revs, 2.609761 s, 1.698794 s, -0.910967 s, × 0.6509, 8 µs/rev mozilla-try x_revs_x_added_0_copies aaf6dde0deb8 9790f499805a : 2 revs, 0.000847 s, 0.000842 s, -0.000005 s, × 0.9941, 421 µs/rev mozilla-try x_revs_x000_added_0_copies d8d0222927b4 5bb8ce8c7450 : 2 revs, 0.000867 s, 0.000865 s, -0.000002 s, × 0.9977, 432 µs/rev mozilla-try x_revs_x_added_x_copies 092fcca11bdb 936255a0384a : 4 revs, 0.000161 s, 0.000160 s, -0.000001 s, × 0.9938, 40 µs/rev mozilla-try x_revs_x00_added_x_copies b53d2fadbdb5 017afae788ec : 2 revs, 0.001131 s, 0.001122 s, -0.000009 s, × 0.9920, 561 µs/rev mozilla-try x_revs_x000_added_x000_copies 20408ad61ce5 6f0ee96e21ad : 1 revs, 0.033114 s, 0.032743 s, -0.000371 s, × 0.9888, 32743 µs/rev mozilla-try x_revs_x0000_added_x0000_copies effb563bb7e5 c07a39dc4e80 : 6 revs, 0.071092 s, 0.071529 s, +0.000437 s, × 1.0061, 11921 µs/rev mozilla-try x000_revs_xx00_added_0_copies 6100d773079a 04a55431795e : 1593 revs, 0.006554 s, 0.006593 s, +0.000039 s, × 1.0060, 4 µs/rev mozilla-try x000_revs_x000_added_x_copies 9f17a6fc04f9 2d37b966abed : 41 revs, 0.005160 s, 0.005311 s, +0.000151 s, × 1.0293, 129 µs/rev mozilla-try x000_revs_x000_added_x000_copies 1346fd0130e4 4c65cbdabc1f : 6657 revs, 0.065063 s, 0.063063 s, -0.002000 s, × 0.9693, 9 µs/rev mozilla-try x0000_revs_x_added_0_copies 63519bfd42ee a36a2a865d92 : 40314 revs, 0.297118 s, 0.312363 s, +0.015245 s, × 1.0513, 7 µs/rev mozilla-try x0000_revs_x_added_x_copies 9fe69ff0762d bcabf2a78927 : 38690 revs, 0.284002 s, 0.283106 s, -0.000896 s, × 0.9968, 7 µs/rev mozilla-try x0000_revs_xx000_added_x_copies 156f6e2674f2 4d0f2c178e66 : 8598 revs, 0.086311 s, 0.083817 s, -0.002494 s, × 0.9711, 9 µs/rev mozilla-try x0000_revs_xx000_added_0_copies 9eec5917337d 67118cc6dcad : 615 revs, 0.026738 s, 0.026516 s, -0.000222 s, × 0.9917, 43 µs/rev mozilla-try x0000_revs_xx000_added_x000_copies 89294cd501d9 7ccb2fc7ccb5 : 97052 revs, 1.514270 s, 1.304865 s, -0.209405 s, × 0.8617, 13 µs/rev mozilla-try x0000_revs_x0000_added_x0000_copies e928c65095ed e951f4ad123a : 52031 revs, 0.735875 s, 0.681088 s, -0.054787 s, × 0.9255, 13 µs/rev mozilla-try x00000_revs_x_added_0_copies 6a320851d377 1ebb79acd503 : 363753 revs, 4.843329 s, 4.454320 s, -0.389009 s, × 0.9197, 12 µs/rev mozilla-try x00000_revs_x00000_added_0_copies dc8a3ca7010e d16fde900c9c : 34414 revs, 0.591752 s, 0.567913 s, -0.023839 s, × 0.9597, 16 µs/rev mozilla-try x00000_revs_x_added_x_copies 5173c4b6f97c 95d83ee7242d : 362229 revs, 4.760563 s, 4.547043 s, -0.213520 s, × 0.9551, 12 µs/rev mozilla-try x00000_revs_x000_added_x_copies 9126823d0e9c ca82787bb23c : 359344 revs, 4.751942 s, 4.378579 s, -0.373363 s, × 0.9214, 12 µs/rev mozilla-try x00000_revs_x0000_added_x0000_copies 8d3fafa80d4b eb884023b810 : 192665 revs, 2.605014 s, 1.703622 s, -0.901392 s, × 0.6540, 8 µs/rev mozilla-try x00000_revs_x00000_added_x0000_copies 1b661134e2ca 1ae03d022d6d : 228985 revs, 41.113063 s, 36.001255 s, -5.111808 s, × 0.8757, 157 µs/rev mozilla-try x00000_revs_x00000_added_x000_copies 9b2a99adc05e 8e29777b48e6 : 382065 revs, 27.891612 s, 14.340641 s, -13.550971 s, × 0.5142, 37 µs/rev Differential Revision: https://phab.mercurial-scm.org/D9497
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Sat, 21 Nov 2020 17:00:32 +0100
parents 26114bd6ec60
children db2bc9e667a1
line wrap: on
line source

// path_auditor.rs
//
// Copyright 2020
// Raphaël Gomès <rgomes@octobus.net>,
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.

use crate::utils::{
    files::lower_clean,
    find_slice_in_slice,
    hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
};
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, RwLock};

/// Ensures that a path is valid for use in the repository i.e. does not use
/// any banned components, does not traverse a symlink, etc.
#[derive(Debug, Default)]
pub struct PathAuditor {
    audited: Mutex<HashSet<HgPathBuf>>,
    audited_dirs: RwLock<HashSet<HgPathBuf>>,
    root: PathBuf,
}

impl PathAuditor {
    pub fn new(root: impl AsRef<Path>) -> Self {
        Self {
            root: root.as_ref().to_owned(),
            ..Default::default()
        }
    }
    pub fn audit_path(
        &self,
        path: impl AsRef<HgPath>,
    ) -> Result<(), HgPathError> {
        // TODO windows "localpath" normalization
        let path = path.as_ref();
        if path.is_empty() {
            return Ok(());
        }
        // TODO case normalization
        if self.audited.lock().unwrap().contains(path) {
            return Ok(());
        }
        // AIX ignores "/" at end of path, others raise EISDIR.
        let last_byte = path.as_bytes()[path.len() - 1];
        if last_byte == b'/' || last_byte == b'\\' {
            return Err(HgPathError::EndsWithSlash(path.to_owned()));
        }
        let parts: Vec<_> = path
            .as_bytes()
            .split(|b| std::path::is_separator(*b as char))
            .collect();

        let first_component = lower_clean(parts[0]);
        let first_component = first_component.as_slice();
        if !path.split_drive().0.is_empty()
            || (first_component == b".hg"
                || first_component == b".hg."
                || first_component == b"")
            || parts.iter().any(|c| c == b"..")
        {
            return Err(HgPathError::InsideDotHg(path.to_owned()));
        }

        // Windows shortname aliases
        for part in parts.iter() {
            if part.contains(&b'~') {
                let mut split = part.splitn(2, |b| *b == b'~');
                let first =
                    split.next().unwrap().to_owned().to_ascii_uppercase();
                let last = split.next().unwrap();
                if last.iter().all(u8::is_ascii_digit)
                    && (first == b"HG" || first == b"HG8B6C")
                {
                    return Err(HgPathError::ContainsIllegalComponent(
                        path.to_owned(),
                    ));
                }
            }
        }
        let lower_path = lower_clean(path.as_bytes());
        if find_slice_in_slice(&lower_path, b".hg").is_some() {
            let lower_parts: Vec<_> = path
                .as_bytes()
                .split(|b| std::path::is_separator(*b as char))
                .collect();
            for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
                if let Some(pos) = lower_parts[1..]
                    .iter()
                    .position(|part| part == &pattern.as_slice())
                {
                    let base = lower_parts[..=pos]
                        .iter()
                        .fold(HgPathBuf::new(), |acc, p| {
                            acc.join(HgPath::new(p))
                        });
                    return Err(HgPathError::IsInsideNestedRepo {
                        path: path.to_owned(),
                        nested_repo: base,
                    });
                }
            }
        }

        let parts = &parts[..parts.len().saturating_sub(1)];

        // We don't want to add "foo/bar/baz" to `audited_dirs` before checking
        // if there's a "foo/.hg" directory. This also means we won't
        // accidentally traverse a symlink into some other filesystem (which
        // is potentially expensive to access).
        for index in 0..parts.len() {
            let prefix = &parts[..=index].join(&b'/');
            let prefix = HgPath::new(prefix);
            if self.audited_dirs.read().unwrap().contains(prefix) {
                continue;
            }
            self.check_filesystem(&prefix, &path)?;
            self.audited_dirs.write().unwrap().insert(prefix.to_owned());
        }

        self.audited.lock().unwrap().insert(path.to_owned());

        Ok(())
    }

    pub fn check_filesystem(
        &self,
        prefix: impl AsRef<HgPath>,
        path: impl AsRef<HgPath>,
    ) -> Result<(), HgPathError> {
        let prefix = prefix.as_ref();
        let path = path.as_ref();
        let current_path = self.root.join(
            hg_path_to_path_buf(prefix)
                .map_err(|_| HgPathError::NotFsCompliant(path.to_owned()))?,
        );
        match std::fs::symlink_metadata(&current_path) {
            Err(e) => {
                // EINVAL can be raised as invalid path syntax under win32.
                if e.kind() != std::io::ErrorKind::NotFound
                    && e.kind() != std::io::ErrorKind::InvalidInput
                    && e.raw_os_error() != Some(20)
                {
                    // Rust does not yet have an `ErrorKind` for
                    // `NotADirectory` (errno 20)
                    // It happens if the dirstate contains `foo/bar` and
                    // foo is not a directory
                    return Err(HgPathError::NotFsCompliant(path.to_owned()));
                }
            }
            Ok(meta) => {
                if meta.file_type().is_symlink() {
                    return Err(HgPathError::TraversesSymbolicLink {
                        path: path.to_owned(),
                        symlink: prefix.to_owned(),
                    });
                }
                if meta.file_type().is_dir()
                    && current_path.join(".hg").is_dir()
                {
                    return Err(HgPathError::IsInsideNestedRepo {
                        path: path.to_owned(),
                        nested_repo: prefix.to_owned(),
                    });
                }
            }
        };

        Ok(())
    }

    pub fn check(&self, path: impl AsRef<HgPath>) -> bool {
        self.audit_path(path).is_ok()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::files::get_path_from_bytes;
    use crate::utils::hg_path::path_to_hg_path_buf;

    #[test]
    fn test_path_auditor() {
        let auditor = PathAuditor::new(get_path_from_bytes(b"/tmp"));

        let path = HgPath::new(b".hg/00changelog.i");
        assert_eq!(
            auditor.audit_path(path),
            Err(HgPathError::InsideDotHg(path.to_owned()))
        );
        let path = HgPath::new(b"this/is/nested/.hg/thing.txt");
        assert_eq!(
            auditor.audit_path(path),
            Err(HgPathError::IsInsideNestedRepo {
                path: path.to_owned(),
                nested_repo: HgPathBuf::from_bytes(b"this/is/nested")
            })
        );

        use std::fs::{create_dir, File};
        use tempfile::tempdir;

        let base_dir = tempdir().unwrap();
        let base_dir_path = base_dir.path();
        let a = base_dir_path.join("a");
        let b = base_dir_path.join("b");
        create_dir(&a).unwrap();
        let in_a_path = a.join("in_a");
        File::create(in_a_path).unwrap();

        // TODO make portable
        std::os::unix::fs::symlink(&a, &b).unwrap();

        let buf = b.join("in_a").components().skip(2).collect::<PathBuf>();
        eprintln!("buf: {}", buf.display());
        let path = path_to_hg_path_buf(buf).unwrap();
        assert_eq!(
            auditor.audit_path(&path),
            Err(HgPathError::TraversesSymbolicLink {
                path: path,
                symlink: path_to_hg_path_buf(
                    b.components().skip(2).collect::<PathBuf>()
                )
                .unwrap()
            })
        );
    }
}