# HG changeset patch # User Raphaël Gomès # Date 1567363994 -7200 # Node ID 3fe40dd6355dbfd71ac191a7c9465ae006728083 # Parent 9668744c91223b4fc5461a432a5f61c02be4cfcd rust-hgpath: add HgPath and HgPathBuf structs to encapsulate handling of paths This change is a direct consequence of this discussion on the mailing list: https://www.mercurial-scm.org/pipermail/mercurial-devel/2019-August/133574.html The implementations of `HgPath` and `HgPathBuf` are, for the most part, taken directly from `OsStr` and `OsString` respectively from the standard library. What this change does *not* yet do is implement the Windows MBCS to WTF8 conversion, but it lays the basis for a very flexible interface for paths. Differential Revision: https://phab.mercurial-scm.org/D6773 diff -r 9668744c9122 -r 3fe40dd6355d rust/hg-core/src/utils.rs --- a/rust/hg-core/src/utils.rs Wed Sep 18 13:50:33 2019 -0700 +++ b/rust/hg-core/src/utils.rs Sun Sep 01 20:53:14 2019 +0200 @@ -8,6 +8,7 @@ //! Contains useful functions, traits, structs, etc. for use in core. pub mod files; +pub mod hg_path; /// Replaces the `from` slice with the `to` slice inside the `buf` slice. /// diff -r 9668744c9122 -r 3fe40dd6355d rust/hg-core/src/utils/hg_path.rs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/utils/hg_path.rs Sun Sep 01 20:53:14 2019 +0200 @@ -0,0 +1,409 @@ +// hg_path.rs +// +// Copyright 2019 Raphaël Gomès +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +use std::borrow::Borrow; +use std::ffi::{OsStr, OsString}; +use std::ops::Deref; +use std::path::{Path, PathBuf}; + +#[derive(Debug, Eq, PartialEq)] +pub enum HgPathError { + /// Bytes from the invalid `HgPath` + LeadingSlash(Vec), + /// Bytes and index of the second slash + ConsecutiveSlashes(Vec, usize), + /// Bytes and index of the null byte + ContainsNullByte(Vec, usize), + /// Bytes + DecodeError(Vec), +} + +impl ToString for HgPathError { + fn to_string(&self) -> String { + match self { + HgPathError::LeadingSlash(bytes) => { + format!("Invalid HgPath '{:?}': has a leading slash.", bytes) + } + HgPathError::ConsecutiveSlashes(bytes, pos) => format!( + "Invalid HgPath '{:?}': consecutive slahes at pos {}.", + bytes, pos + ), + HgPathError::ContainsNullByte(bytes, pos) => format!( + "Invalid HgPath '{:?}': contains null byte at pos {}.", + bytes, pos + ), + HgPathError::DecodeError(bytes) => { + format!("Invalid HgPath '{:?}': could not be decoded.", bytes) + } + } + } +} + +impl From for std::io::Error { + fn from(e: HgPathError) -> Self { + std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) + } +} + +/// This is a repository-relative path (or canonical path): +/// - no null characters +/// - `/` separates directories +/// - no consecutive slashes +/// - no leading slash, +/// - no `.` nor `..` of special meaning +/// - stored in repository and shared across platforms +/// +/// Note: there is no guarantee of any `HgPath` being well-formed at any point +/// in its lifetime for performance reasons and to ease ergonomics. It is +/// however checked using the `check_state` method before any file-system +/// operation. +/// +/// This allows us to be encoding-transparent as much as possible, until really +/// needed; `HgPath` can be transformed into a platform-specific path (`OsStr` +/// or `Path`) whenever more complex operations are needed: +/// On Unix, it's just byte-to-byte conversion. On Windows, it has to be +/// decoded from MBCS to WTF-8. If WindowsUTF8Plan is implemented, the source +/// character encoding will be determined on a per-repository basis. +// +// FIXME: (adapted from a comment in the stdlib) +// `HgPath::new()` current implementation relies on `Slice` being +// layout-compatible with `[u8]`. +// When attribute privacy is implemented, `Slice` should be annotated as +// `#[repr(transparent)]`. +// Anyway, `Slice` representation and layout are considered implementation +// detail, are not documented and must not be relied upon. +#[derive(Eq, Ord, PartialEq, PartialOrd, Debug, Hash)] +pub struct HgPath { + inner: [u8], +} + +impl HgPath { + pub fn new + ?Sized>(s: &S) -> &Self { + unsafe { &*(s.as_ref() as *const [u8] as *const Self) } + } + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + pub fn len(&self) -> usize { + self.inner.len() + } + fn to_hg_path_buf(&self) -> HgPathBuf { + HgPathBuf { + inner: self.inner.to_owned(), + } + } + pub fn bytes(&self) -> std::slice::Iter { + self.inner.iter() + } + pub fn to_ascii_uppercase(&self) -> HgPathBuf { + HgPathBuf::from(self.inner.to_ascii_uppercase()) + } + pub fn to_ascii_lowercase(&self) -> HgPathBuf { + HgPathBuf::from(self.inner.to_ascii_lowercase()) + } + pub fn as_bytes(&self) -> &[u8] { + &self.inner + } + pub fn contains(&self, other: u8) -> bool { + self.inner.contains(&other) + } + pub fn join>(&self, other: &T) -> HgPathBuf { + let mut inner = self.inner.to_owned(); + if inner.len() != 0 && inner.last() != Some(&b'/') { + inner.push(b'/'); + } + inner.extend(other.as_ref().bytes()); + HgPathBuf::from_bytes(&inner) + } + /// Checks for errors in the path, short-circuiting at the first one. + /// This generates fine-grained errors useful for debugging. + /// To simply check if the path is valid during tests, use `is_valid`. + pub fn check_state(&self) -> Result<(), HgPathError> { + if self.len() == 0 { + return Ok(()); + } + let bytes = self.as_bytes(); + let mut previous_byte = None; + + if bytes[0] == b'/' { + return Err(HgPathError::LeadingSlash(bytes.to_vec())); + } + for (index, byte) in bytes.iter().enumerate() { + match byte { + 0 => { + return Err(HgPathError::ContainsNullByte( + bytes.to_vec(), + index, + )) + } + b'/' => { + if previous_byte.is_some() && previous_byte == Some(b'/') { + return Err(HgPathError::ConsecutiveSlashes( + bytes.to_vec(), + index, + )); + } + } + _ => (), + }; + previous_byte = Some(*byte); + } + Ok(()) + } + + #[cfg(test)] + /// Only usable during tests to force developers to handle invalid states + fn is_valid(&self) -> bool { + self.check_state().is_ok() + } +} + +#[derive(Eq, Ord, Clone, PartialEq, PartialOrd, Debug, Hash)] +pub struct HgPathBuf { + inner: Vec, +} + +impl HgPathBuf { + pub fn new() -> Self { + Self { inner: Vec::new() } + } + pub fn push(&mut self, byte: u8) { + self.inner.push(byte); + } + pub fn from_bytes(s: &[u8]) -> HgPathBuf { + HgPath::new(s).to_owned() + } + pub fn into_vec(self) -> Vec { + self.inner + } + pub fn as_ref(&self) -> &[u8] { + self.inner.as_ref() + } +} + +impl Deref for HgPathBuf { + type Target = HgPath; + + #[inline] + fn deref(&self) -> &HgPath { + &HgPath::new(&self.inner) + } +} + +impl From> for HgPathBuf { + fn from(vec: Vec) -> Self { + Self { inner: vec } + } +} + +impl> From<&T> for HgPathBuf { + fn from(s: &T) -> HgPathBuf { + s.as_ref().to_owned() + } +} + +impl Into> for HgPathBuf { + fn into(self) -> Vec { + self.inner + } +} + +impl Borrow for HgPathBuf { + fn borrow(&self) -> &HgPath { + &HgPath::new(self.as_bytes()) + } +} + +impl ToOwned for HgPath { + type Owned = HgPathBuf; + + fn to_owned(&self) -> HgPathBuf { + self.to_hg_path_buf() + } +} + +impl AsRef for HgPath { + fn as_ref(&self) -> &HgPath { + self + } +} + +impl AsRef for HgPathBuf { + fn as_ref(&self) -> &HgPath { + self + } +} + +impl Extend for HgPathBuf { + fn extend>(&mut self, iter: T) { + self.inner.extend(iter); + } +} + +/// TODO: Once https://www.mercurial-scm.org/wiki/WindowsUTF8Plan is +/// implemented, these conversion utils will have to work differently depending +/// on the repository encoding: either `UTF-8` or `MBCS`. + +pub fn hg_path_to_os_string>( + hg_path: P, +) -> Result { + hg_path.as_ref().check_state()?; + let os_str; + #[cfg(unix)] + { + use std::os::unix::ffi::OsStrExt; + os_str = std::ffi::OsStr::from_bytes(&hg_path.as_ref().as_bytes()); + } + #[cfg(windows)] + { + // TODO: convert from Windows MBCS (ANSI encoding) to WTF8. + unimplemented!(); + } + Ok(os_str.to_os_string()) +} + +pub fn hg_path_to_path_buf>( + hg_path: P, +) -> Result { + Ok(Path::new(&hg_path_to_os_string(hg_path)?).to_path_buf()) +} + +pub fn os_string_to_hg_path_buf>( + os_string: S, +) -> Result { + let buf; + #[cfg(unix)] + { + use std::os::unix::ffi::OsStrExt; + buf = HgPathBuf::from_bytes(&os_string.as_ref().as_bytes()); + } + #[cfg(windows)] + { + // TODO: convert from WTF8 to Windows MBCS (ANSI encoding). + unimplemented!(); + } + buf.check_state()?; + Ok(buf) +} + +pub fn path_to_hg_path_buf>( + path: P, +) -> Result { + let buf; + let os_str = path.as_ref().as_os_str(); + #[cfg(unix)] + { + use std::os::unix::ffi::OsStrExt; + buf = HgPathBuf::from_bytes(&os_str.as_bytes()); + } + #[cfg(windows)] + { + // TODO: convert from WTF8 to Windows MBCS (ANSI encoding). + unimplemented!(); + } + buf.check_state()?; + Ok(buf) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_path_states() { + assert_eq!( + Err(HgPathError::LeadingSlash(b"/".to_vec())), + HgPath::new(b"/").check_state() + ); + assert_eq!( + Err(HgPathError::ConsecutiveSlashes(b"a/b//c".to_vec(), 4)), + HgPath::new(b"a/b//c").check_state() + ); + assert_eq!( + Err(HgPathError::ContainsNullByte(b"a/b/\0c".to_vec(), 4)), + HgPath::new(b"a/b/\0c").check_state() + ); + // TODO test HgPathError::DecodeError for the Windows implementation. + assert_eq!(true, HgPath::new(b"").is_valid()); + assert_eq!(true, HgPath::new(b"a/b/c").is_valid()); + // Backslashes in paths are not significant, but allowed + assert_eq!(true, HgPath::new(br"a\b/c").is_valid()); + // Dots in paths are not significant, but allowed + assert_eq!(true, HgPath::new(b"a/b/../c/").is_valid()); + assert_eq!(true, HgPath::new(b"./a/b/../c/").is_valid()); + } + + #[test] + fn test_iter() { + let path = HgPath::new(b"a"); + let mut iter = path.bytes(); + assert_eq!(Some(&b'a'), iter.next()); + assert_eq!(None, iter.next_back()); + assert_eq!(None, iter.next()); + + let path = HgPath::new(b"a"); + let mut iter = path.bytes(); + assert_eq!(Some(&b'a'), iter.next_back()); + assert_eq!(None, iter.next_back()); + assert_eq!(None, iter.next()); + + let path = HgPath::new(b"abc"); + let mut iter = path.bytes(); + assert_eq!(Some(&b'a'), iter.next()); + assert_eq!(Some(&b'c'), iter.next_back()); + assert_eq!(Some(&b'b'), iter.next_back()); + assert_eq!(None, iter.next_back()); + assert_eq!(None, iter.next()); + + let path = HgPath::new(b"abc"); + let mut iter = path.bytes(); + assert_eq!(Some(&b'a'), iter.next()); + assert_eq!(Some(&b'b'), iter.next()); + assert_eq!(Some(&b'c'), iter.next()); + assert_eq!(None, iter.next_back()); + assert_eq!(None, iter.next()); + + let path = HgPath::new(b"abc"); + let iter = path.bytes(); + let mut vec = Vec::new(); + vec.extend(iter); + assert_eq!(vec![b'a', b'b', b'c'], vec); + + let path = HgPath::new(b"abc"); + let mut iter = path.bytes(); + assert_eq!(Some(2), iter.rposition(|c| *c == b'c')); + + let path = HgPath::new(b"abc"); + let mut iter = path.bytes(); + assert_eq!(None, iter.rposition(|c| *c == b'd')); + } + + #[test] + fn test_join() { + let path = HgPathBuf::from_bytes(b"a").join(HgPath::new(b"b")); + assert_eq!(b"a/b", path.as_bytes()); + + let path = HgPathBuf::from_bytes(b"a/").join(HgPath::new(b"b/c")); + assert_eq!(b"a/b/c", path.as_bytes()); + + // No leading slash if empty before join + let path = HgPathBuf::new().join(HgPath::new(b"b/c")); + assert_eq!(b"b/c", path.as_bytes()); + + // The leading slash is an invalid representation of an `HgPath`, but + // it can happen. This creates another invalid representation of + // consecutive bytes. + // TODO What should be done in this case? Should we silently remove + // the extra slash? Should we change the signature to a problematic + // `Result`, or should we just keep it so and + // let the error happen upon filesystem interaction? + let path = HgPathBuf::from_bytes(b"a/").join(HgPath::new(b"/b")); + assert_eq!(b"a//b", path.as_bytes()); + let path = HgPathBuf::from_bytes(b"a").join(HgPath::new(b"/b")); + assert_eq!(b"a//b", path.as_bytes()); + } +}