rust-pathauditor: add Rust implementation of the `pathauditor`
It does not offer the same flexibility as the Python implementation, but
should check incoming paths just as well.
Differential Revision: https://phab.mercurial-scm.org/D7866
--- a/rust/Cargo.lock Wed Jan 22 03:17:06 2020 +0530
+++ b/rust/Cargo.lock Wed Feb 05 17:05:37 2020 +0100
@@ -146,6 +146,7 @@
"rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"twox-hash 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -393,6 +394,11 @@
]
[[package]]
+name = "redox_syscall"
+version = "0.1.56"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
name = "regex"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -409,6 +415,14 @@
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "remove_dir_all"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "rustc_version"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -435,6 +449,19 @@
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "tempfile"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
+ "rand 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)",
+ "remove_dir_all 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -519,12 +546,15 @@
"checksum rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
"checksum rayon-core 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
+"checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
"checksum regex 1.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b5508c1941e4e7cb19965abef075d35a9a8b5cdf0846f30b4050e9b55dc55e87"
"checksum regex-syntax 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "e734e891f5b408a29efbf8309e656876276f49ab6a6ac208600b4419bd893d90"
+"checksum remove_dir_all 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4a83fa3702a688b9359eccba92d153ac33fd2e8462f9e0e3fdf155239ea7792e"
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
+"checksum tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
"checksum thread_local 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
"checksum twox-hash 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3bfd5b7557925ce778ff9b9ef90e3ade34c524b5ff10e239c69a42d546d2af56"
"checksum wasi 0.9.0+wasi-snapshot-preview1 (registry+https://github.com/rust-lang/crates.io-index)" = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
--- a/rust/hg-core/Cargo.toml Wed Jan 22 03:17:06 2020 +0530
+++ b/rust/hg-core/Cargo.toml Wed Feb 05 17:05:37 2020 +0100
@@ -18,3 +18,6 @@
rayon = "1.3.0"
regex = "1.1.0"
twox-hash = "1.5.0"
+
+[dev-dependencies]
+tempfile = "3.1.0"
\ No newline at end of file
--- a/rust/hg-core/src/utils.rs Wed Jan 22 03:17:06 2020 +0530
+++ b/rust/hg-core/src/utils.rs Wed Feb 05 17:05:37 2020 +0100
@@ -9,6 +9,7 @@
pub mod files;
pub mod hg_path;
+pub mod path_auditor;
/// Useful until rust/issues/56345 is stable
///
--- a/rust/hg-core/src/utils/files.rs Wed Jan 22 03:17:06 2020 +0530
+++ b/rust/hg-core/src/utils/files.rs Wed Feb 05 17:05:37 2020 +0100
@@ -12,6 +12,8 @@
use crate::utils::hg_path::{HgPath, HgPathBuf};
use std::iter::FusedIterator;
+use crate::utils::replace_slice;
+use lazy_static::lazy_static;
use std::fs::Metadata;
use std::path::Path;
@@ -85,6 +87,41 @@
path.to_ascii_lowercase()
}
+lazy_static! {
+ static ref IGNORED_CHARS: Vec<Vec<u8>> = {
+ [
+ 0x200c, 0x200d, 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d,
+ 0x202e, 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
+ ]
+ .iter()
+ .map(|code| {
+ std::char::from_u32(*code)
+ .unwrap()
+ .encode_utf8(&mut [0; 3])
+ .bytes()
+ .collect()
+ })
+ .collect()
+ };
+}
+
+fn hfs_ignore_clean(bytes: &[u8]) -> Vec<u8> {
+ let mut buf = bytes.to_owned();
+ let needs_escaping = bytes.iter().any(|b| *b == b'\xe2' || *b == b'\xef');
+ if needs_escaping {
+ for forbidden in IGNORED_CHARS.iter() {
+ replace_slice(&mut buf, forbidden, &[])
+ }
+ buf
+ } else {
+ buf
+ }
+}
+
+pub fn lower_clean(bytes: &[u8]) -> Vec<u8> {
+ hfs_ignore_clean(&bytes.to_ascii_lowercase())
+}
+
#[derive(Eq, PartialEq, Ord, PartialOrd, Copy, Clone)]
pub struct HgMetadata {
pub st_dev: u64,
--- a/rust/hg-core/src/utils/hg_path.rs Wed Jan 22 03:17:06 2020 +0530
+++ b/rust/hg-core/src/utils/hg_path.rs Wed Feb 05 17:05:37 2020 +0100
@@ -15,12 +15,35 @@
pub enum HgPathError {
/// Bytes from the invalid `HgPath`
LeadingSlash(Vec<u8>),
- /// Bytes and index of the second slash
- ConsecutiveSlashes(Vec<u8>, usize),
- /// Bytes and index of the null byte
- ContainsNullByte(Vec<u8>, usize),
+ ConsecutiveSlashes {
+ bytes: Vec<u8>,
+ second_slash_index: usize,
+ },
+ ContainsNullByte {
+ bytes: Vec<u8>,
+ null_byte_index: usize,
+ },
/// Bytes
DecodeError(Vec<u8>),
+ /// The rest come from audit errors
+ EndsWithSlash(HgPathBuf),
+ ContainsIllegalComponent(HgPathBuf),
+ /// Path is inside the `.hg` folder
+ InsideDotHg(HgPathBuf),
+ IsInsideNestedRepo {
+ path: HgPathBuf,
+ nested_repo: HgPathBuf,
+ },
+ TraversesSymbolicLink {
+ path: HgPathBuf,
+ symlink: HgPathBuf,
+ },
+ NotFsCompliant(HgPathBuf),
+ /// `path` is the smallest invalid path
+ NotUnderRoot {
+ path: PathBuf,
+ root: PathBuf,
+ },
}
impl ToString for HgPathError {
@@ -29,17 +52,55 @@
HgPathError::LeadingSlash(bytes) => {
format!("Invalid HgPath '{:?}': has a leading slash.", bytes)
}
- HgPathError::ConsecutiveSlashes(bytes, pos) => format!(
- "Invalid HgPath '{:?}': consecutive slahes at pos {}.",
+ HgPathError::ConsecutiveSlashes {
+ bytes,
+ second_slash_index: pos,
+ } => format!(
+ "Invalid HgPath '{:?}': consecutive slashes at pos {}.",
bytes, pos
),
- HgPathError::ContainsNullByte(bytes, pos) => format!(
+ HgPathError::ContainsNullByte {
+ bytes,
+ null_byte_index: pos,
+ } => format!(
"Invalid HgPath '{:?}': contains null byte at pos {}.",
bytes, pos
),
HgPathError::DecodeError(bytes) => {
format!("Invalid HgPath '{:?}': could not be decoded.", bytes)
}
+ HgPathError::EndsWithSlash(path) => {
+ format!("Audit failed for '{}': ends with a slash.", path)
+ }
+ HgPathError::ContainsIllegalComponent(path) => format!(
+ "Audit failed for '{}': contains an illegal component.",
+ path
+ ),
+ HgPathError::InsideDotHg(path) => format!(
+ "Audit failed for '{}': is inside the '.hg' folder.",
+ path
+ ),
+ HgPathError::IsInsideNestedRepo {
+ path,
+ nested_repo: nested,
+ } => format!(
+ "Audit failed for '{}': is inside a nested repository '{}'.",
+ path, nested
+ ),
+ HgPathError::TraversesSymbolicLink { path, symlink } => format!(
+ "Audit failed for '{}': traverses symbolic link '{}'.",
+ path, symlink
+ ),
+ HgPathError::NotFsCompliant(path) => format!(
+ "Audit failed for '{}': cannot be turned into a \
+ filesystem path.",
+ path
+ ),
+ HgPathError::NotUnderRoot { path, root } => format!(
+ "Audit failed for '{}': not under root {}.",
+ path.display(),
+ root.display()
+ ),
}
}
}
@@ -229,17 +290,17 @@
for (index, byte) in bytes.iter().enumerate() {
match byte {
0 => {
- return Err(HgPathError::ContainsNullByte(
- bytes.to_vec(),
- index,
- ))
+ return Err(HgPathError::ContainsNullByte {
+ bytes: bytes.to_vec(),
+ null_byte_index: index,
+ })
}
b'/' => {
if previous_byte.is_some() && previous_byte == Some(b'/') {
- return Err(HgPathError::ConsecutiveSlashes(
- bytes.to_vec(),
- index,
- ));
+ return Err(HgPathError::ConsecutiveSlashes {
+ bytes: bytes.to_vec(),
+ second_slash_index: index,
+ });
}
}
_ => (),
@@ -431,11 +492,17 @@
HgPath::new(b"/").check_state()
);
assert_eq!(
- Err(HgPathError::ConsecutiveSlashes(b"a/b//c".to_vec(), 4)),
+ Err(HgPathError::ConsecutiveSlashes {
+ bytes: b"a/b//c".to_vec(),
+ second_slash_index: 4
+ }),
HgPath::new(b"a/b//c").check_state()
);
assert_eq!(
- Err(HgPathError::ContainsNullByte(b"a/b/\0c".to_vec(), 4)),
+ Err(HgPathError::ContainsNullByte {
+ bytes: b"a/b/\0c".to_vec(),
+ null_byte_index: 4
+ }),
HgPath::new(b"a/b/\0c").check_state()
);
// TODO test HgPathError::DecodeError for the Windows implementation.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/utils/path_auditor.rs Wed Feb 05 17:05:37 2020 +0100
@@ -0,0 +1,230 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès <rgomes@octobus.net>,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+ files::lower_clean,
+ find_slice_in_slice,
+ hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+ audited: HashSet<HgPathBuf>,
+ audited_dirs: HashSet<HgPathBuf>,
+ root: PathBuf,
+}
+
+impl PathAuditor {
+ pub fn new(root: impl AsRef<Path>) -> Self {
+ Self {
+ root: root.as_ref().to_owned(),
+ ..Default::default()
+ }
+ }
+ pub fn audit_path(
+ &mut self,
+ path: impl AsRef<HgPath>,
+ ) -> Result<(), HgPathError> {
+ // TODO windows "localpath" normalization
+ let path = path.as_ref();
+ if path.is_empty() {
+ return Ok(());
+ }
+ // TODO case normalization
+ if self.audited.contains(path) {
+ return Ok(());
+ }
+ // AIX ignores "/" at end of path, others raise EISDIR.
+ let last_byte = path.as_bytes()[path.len() - 1];
+ if last_byte == b'/' || last_byte == b'\\' {
+ return Err(HgPathError::EndsWithSlash(path.to_owned()));
+ }
+ let parts: Vec<_> = path
+ .as_bytes()
+ .split(|b| std::path::is_separator(*b as char))
+ .collect();
+
+ let first_component = lower_clean(parts[0]);
+ let first_component = first_component.as_slice();
+ if !path.split_drive().0.is_empty()
+ || (first_component == b".hg"
+ || first_component == b".hg."
+ || first_component == b"")
+ || parts.iter().any(|c| c == b"..")
+ {
+ return Err(HgPathError::InsideDotHg(path.to_owned()));
+ }
+
+ // Windows shortname aliases
+ for part in parts.iter() {
+ if part.contains(&b'~') {
+ let mut split = part.splitn(1, |b| *b == b'~');
+ let first =
+ split.next().unwrap().to_owned().to_ascii_uppercase();
+ let last = split.next().unwrap();
+ if last.iter().all(u8::is_ascii_digit)
+ && (first == b"HG" || first == b"HG8B6C")
+ {
+ return Err(HgPathError::ContainsIllegalComponent(
+ path.to_owned(),
+ ));
+ }
+ }
+ }
+ let lower_path = lower_clean(path.as_bytes());
+ if find_slice_in_slice(&lower_path, b".hg").is_some() {
+ let lower_parts: Vec<_> = path
+ .as_bytes()
+ .split(|b| std::path::is_separator(*b as char))
+ .collect();
+ for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+ if let Some(pos) = lower_parts[1..]
+ .iter()
+ .position(|part| part == &pattern.as_slice())
+ {
+ let base = lower_parts[..=pos]
+ .iter()
+ .fold(HgPathBuf::new(), |acc, p| {
+ acc.join(HgPath::new(p))
+ });
+ return Err(HgPathError::IsInsideNestedRepo {
+ path: path.to_owned(),
+ nested_repo: base,
+ });
+ }
+ }
+ }
+
+ let parts = &parts[..parts.len().saturating_sub(1)];
+
+ // We don't want to add "foo/bar/baz" to `audited_dirs` before checking
+ // if there's a "foo/.hg" directory. This also means we won't
+ // accidentally traverse a symlink into some other filesystem (which
+ // is potentially expensive to access).
+ for index in 0..parts.len() {
+ let prefix = &parts[..index + 1].join(&b'/');
+ let prefix = HgPath::new(prefix);
+ if self.audited_dirs.contains(prefix) {
+ continue;
+ }
+ self.check_filesystem(&prefix, &path)?;
+ }
+
+ self.audited.insert(path.to_owned());
+
+ Ok(())
+ }
+
+ pub fn check_filesystem(
+ &self,
+ prefix: impl AsRef<HgPath>,
+ path: impl AsRef<HgPath>,
+ ) -> Result<(), HgPathError> {
+ let prefix = prefix.as_ref();
+ let path = path.as_ref();
+ let current_path = self.root.join(
+ hg_path_to_path_buf(prefix)
+ .map_err(|_| HgPathError::NotFsCompliant(path.to_owned()))?,
+ );
+ match std::fs::symlink_metadata(¤t_path) {
+ Err(e) => {
+ // EINVAL can be raised as invalid path syntax under win32.
+ if e.kind() != std::io::ErrorKind::NotFound
+ && e.kind() != std::io::ErrorKind::InvalidInput
+ && e.raw_os_error() != Some(20)
+ {
+ // Rust does not yet have an `ErrorKind` for
+ // `NotADirectory` (errno 20)
+ // It happens if the dirstate contains `foo/bar` and
+ // foo is not a directory
+ return Err(HgPathError::NotFsCompliant(path.to_owned()));
+ }
+ }
+ Ok(meta) => {
+ if meta.file_type().is_symlink() {
+ return Err(HgPathError::TraversesSymbolicLink {
+ path: path.to_owned(),
+ symlink: prefix.to_owned(),
+ });
+ }
+ if meta.file_type().is_dir()
+ && current_path.join(".hg").is_dir()
+ {
+ return Err(HgPathError::IsInsideNestedRepo {
+ path: path.to_owned(),
+ nested_repo: prefix.to_owned(),
+ });
+ }
+ }
+ };
+
+ Ok(())
+ }
+
+ pub fn check(&mut self, path: impl AsRef<HgPath>) -> bool {
+ self.audit_path(path).is_ok()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::utils::files::get_path_from_bytes;
+ use crate::utils::hg_path::path_to_hg_path_buf;
+
+ #[test]
+ fn test_path_auditor() {
+ let mut auditor = PathAuditor::new(get_path_from_bytes(b"/tmp"));
+
+ let path = HgPath::new(b".hg/00changelog.i");
+ assert_eq!(
+ auditor.audit_path(path),
+ Err(HgPathError::InsideDotHg(path.to_owned()))
+ );
+ let path = HgPath::new(b"this/is/nested/.hg/thing.txt");
+ assert_eq!(
+ auditor.audit_path(path),
+ Err(HgPathError::IsInsideNestedRepo {
+ path: path.to_owned(),
+ nested_repo: HgPathBuf::from_bytes(b"this/is/nested")
+ })
+ );
+
+ use std::fs::{create_dir, File};
+ use tempfile::tempdir;
+
+ let base_dir = tempdir().unwrap();
+ let base_dir_path = base_dir.path();
+ let a = base_dir_path.join("a");
+ let b = base_dir_path.join("b");
+ create_dir(&a).unwrap();
+ let in_a_path = a.join("in_a");
+ File::create(in_a_path).unwrap();
+
+ // TODO make portable
+ std::os::unix::fs::symlink(&a, &b).unwrap();
+
+ let buf = b.join("in_a").components().skip(2).collect::<PathBuf>();
+ eprintln!("buf: {}", buf.display());
+ let path = path_to_hg_path_buf(buf).unwrap();
+ assert_eq!(
+ auditor.audit_path(&path),
+ Err(HgPathError::TraversesSymbolicLink {
+ path: path,
+ symlink: path_to_hg_path_buf(
+ b.components().skip(2).collect::<PathBuf>()
+ )
+ .unwrap()
+ })
+ );
+ }
+}