Mercurial > hg
changeset 45539:aebc976fd7d5
hg-core: add path_encode
Differential Revision: https://phab.mercurial-scm.org/D9049
author | Antoine Cezar <antoine.cezar@octobus.net> |
---|---|
date | Tue, 15 Sep 2020 16:45:27 +0200 |
parents | 2d5dfc8fed55 |
children | f2de24c2b1f6 |
files | rust/hg-core/src/revlog.rs rust/hg-core/src/revlog/path_encode.rs |
diffstat | 2 files changed, 654 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/revlog.rs Tue Sep 15 10:52:28 2020 +0200 +++ b/rust/hg-core/src/revlog.rs Tue Sep 15 16:45:27 2020 +0200 @@ -7,6 +7,7 @@ pub mod node; pub mod nodemap; +pub mod path_encode; pub use node::{Node, NodeError, NodePrefix, NodePrefixRef}; pub mod changelog; pub mod index;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/revlog/path_encode.rs Tue Sep 15 16:45:27 2020 +0200 @@ -0,0 +1,653 @@ +use crypto::digest::Digest; +use crypto::sha1::Sha1; + +#[derive(PartialEq, Debug)] +#[allow(non_camel_case_types)] +enum path_state { + START, /* first byte of a path component */ + A, /* "AUX" */ + AU, + THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */ + C, /* "CON" or "COMn" */ + CO, + COMLPT, /* "COM" or "LPT" */ + COMLPTn, + L, + LP, + N, + NU, + P, /* "PRN" */ + PR, + LDOT, /* leading '.' */ + DOT, /* '.' in a non-leading position */ + H, /* ".h" */ + HGDI, /* ".hg", ".d", or ".i" */ + SPACE, + DEFAULT, /* byte of a path component after the first */ +} + +/* state machine for dir-encoding */ +#[allow(non_camel_case_types)] +enum dir_state { + DDOT, + DH, + DHGDI, + DDEFAULT, +} + +fn inset(bitset: &[u32; 8], c: u8) -> bool { + bitset[(c as usize) >> 5] & (1 << (c & 31)) != 0 +} + +fn charcopy(dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) { + if let Some(slice) = dest { + slice[*destlen] = c + } + *destlen += 1 +} + +fn memcopy(dest: Option<&mut [u8]>, destlen: &mut usize, src: &[u8]) { + if let Some(slice) = dest { + slice[*destlen..*destlen + src.len()].copy_from_slice(src) + } + *destlen += src.len(); +} + +fn rewrap_option<'a, 'b: 'a>( + x: &'a mut Option<&'b mut [u8]>, +) -> Option<&'a mut [u8]> { + match x { + None => None, + Some(y) => Some(y), + } +} + +fn hexencode<'a>(mut dest: Option<&'a mut [u8]>, destlen: &mut usize, c: u8) { + let hexdigit = b"0123456789abcdef"; + charcopy( + rewrap_option(&mut dest), + destlen, + hexdigit[(c as usize) >> 4], + ); + charcopy(dest, destlen, hexdigit[(c as usize) & 15]); +} + +/* 3-byte escape: tilde followed by two hex digits */ +fn escape3(mut dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) { + charcopy(rewrap_option(&mut dest), destlen, b'~'); + hexencode(dest, destlen, c); +} + +fn encode_dir(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize { + let mut state = dir_state::DDEFAULT; + let mut i = 0; + let mut destlen = 0; + + while i < src.len() { + match state { + dir_state::DDOT => match src[i] { + b'd' | b'i' => { + state = dir_state::DHGDI; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'h' => { + state = dir_state::DH; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + _ => { + state = dir_state::DDEFAULT; + } + }, + dir_state::DH => { + if src[i] == b'g' { + state = dir_state::DHGDI; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } else { + state = dir_state::DDEFAULT; + } + } + dir_state::DHGDI => { + if src[i] == b'/' { + memcopy(rewrap_option(&mut dest), &mut destlen, b".hg"); + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + state = dir_state::DDEFAULT; + } + dir_state::DDEFAULT => { + if src[i] == b'.' { + state = dir_state::DDOT + } + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + } + } + destlen +} + +fn _encode( + twobytes: &[u32; 8], + onebyte: &[u32; 8], + mut dest: Option<&mut [u8]>, + src: &[u8], + encodedir: bool, +) -> usize { + let mut state = path_state::START; + let mut i = 0; + let mut destlen = 0; + let len = src.len(); + + while i < len { + match state { + path_state::START => match src[i] { + b'/' => { + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'.' => { + state = path_state::LDOT; + escape3(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b' ' => { + state = path_state::DEFAULT; + escape3(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'a' => { + state = path_state::A; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'c' => { + state = path_state::C; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'l' => { + state = path_state::L; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'n' => { + state = path_state::N; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'p' => { + state = path_state::P; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + _ => { + state = path_state::DEFAULT; + } + }, + path_state::A => { + if src[i] == b'u' { + state = path_state::AU; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::AU => { + if src[i] == b'x' { + state = path_state::THIRD; + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::THIRD => { + state = path_state::DEFAULT; + match src[i] { + b'.' | b'/' | b'\0' => escape3( + rewrap_option(&mut dest), + &mut destlen, + src[i - 1], + ), + _ => i -= 1, + } + } + path_state::C => { + if src[i] == b'o' { + state = path_state::CO; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::CO => { + if src[i] == b'm' { + state = path_state::COMLPT; + i += 1; + } else if src[i] == b'n' { + state = path_state::THIRD; + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::COMLPT => { + if src[i] >= b'1' && src[i] <= b'9' { + state = path_state::COMLPTn; + i += 1; + } else { + state = path_state::DEFAULT; + charcopy( + rewrap_option(&mut dest), + &mut destlen, + src[i - 1], + ); + } + } + path_state::COMLPTn => { + state = path_state::DEFAULT; + match src[i] { + b'.' | b'/' | b'\0' => { + escape3( + rewrap_option(&mut dest), + &mut destlen, + src[i - 2], + ); + charcopy( + rewrap_option(&mut dest), + &mut destlen, + src[i - 1], + ); + } + _ => { + memcopy( + rewrap_option(&mut dest), + &mut destlen, + &src[i - 2..i], + ); + } + } + } + path_state::L => { + if src[i] == b'p' { + state = path_state::LP; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::LP => { + if src[i] == b't' { + state = path_state::COMLPT; + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::N => { + if src[i] == b'u' { + state = path_state::NU; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::NU => { + if src[i] == b'l' { + state = path_state::THIRD; + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::P => { + if src[i] == b'r' { + state = path_state::PR; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::PR => { + if src[i] == b'n' { + state = path_state::THIRD; + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::LDOT => match src[i] { + b'd' | b'i' => { + state = path_state::HGDI; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'h' => { + state = path_state::H; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + _ => { + state = path_state::DEFAULT; + } + }, + path_state::DOT => match src[i] { + b'/' | b'\0' => { + state = path_state::START; + memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e"); + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'd' | b'i' => { + state = path_state::HGDI; + charcopy(rewrap_option(&mut dest), &mut destlen, b'.'); + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + b'h' => { + state = path_state::H; + memcopy(rewrap_option(&mut dest), &mut destlen, b".h"); + i += 1; + } + _ => { + state = path_state::DEFAULT; + charcopy(rewrap_option(&mut dest), &mut destlen, b'.'); + } + }, + path_state::H => { + if src[i] == b'g' { + state = path_state::HGDI; + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } else { + state = path_state::DEFAULT; + } + } + path_state::HGDI => { + if src[i] == b'/' { + state = path_state::START; + if encodedir { + memcopy( + rewrap_option(&mut dest), + &mut destlen, + b".hg", + ); + } + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1 + } else { + state = path_state::DEFAULT; + } + } + path_state::SPACE => match src[i] { + b'/' | b'\0' => { + state = path_state::START; + memcopy(rewrap_option(&mut dest), &mut destlen, b"~20"); + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + _ => { + state = path_state::DEFAULT; + charcopy(rewrap_option(&mut dest), &mut destlen, b' '); + } + }, + path_state::DEFAULT => { + while i != len && inset(onebyte, src[i]) { + charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); + i += 1; + } + if i == len { + break; + } + match src[i] { + b'.' => { + state = path_state::DOT; + i += 1 + } + b' ' => { + state = path_state::SPACE; + i += 1 + } + b'/' => { + state = path_state::START; + charcopy(rewrap_option(&mut dest), &mut destlen, b'/'); + i += 1; + } + _ => { + if inset(onebyte, src[i]) { + loop { + charcopy( + rewrap_option(&mut dest), + &mut destlen, + src[i], + ); + i += 1; + if !(i < len && inset(onebyte, src[i])) { + break; + } + } + } else if inset(twobytes, src[i]) { + let c = src[i]; + i += 1; + charcopy( + rewrap_option(&mut dest), + &mut destlen, + b'_', + ); + charcopy( + rewrap_option(&mut dest), + &mut destlen, + if c == b'_' { b'_' } else { c + 32 }, + ); + } else { + escape3( + rewrap_option(&mut dest), + &mut destlen, + src[i], + ); + i += 1; + } + } + } + } + } + } + match state { + path_state::START => (), + path_state::A => (), + path_state::AU => (), + path_state::THIRD => { + escape3(rewrap_option(&mut dest), &mut destlen, src[i - 1]) + } + path_state::C => (), + path_state::CO => (), + path_state::COMLPT => { + charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1]) + } + path_state::COMLPTn => { + escape3(rewrap_option(&mut dest), &mut destlen, src[i - 2]); + charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1]); + } + path_state::L => (), + path_state::LP => (), + path_state::N => (), + path_state::NU => (), + path_state::P => (), + path_state::PR => (), + path_state::LDOT => (), + path_state::DOT => { + memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e"); + } + path_state::H => (), + path_state::HGDI => (), + path_state::SPACE => { + memcopy(rewrap_option(&mut dest), &mut destlen, b"~20"); + } + path_state::DEFAULT => (), + }; + destlen +} + +fn basic_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize { + let twobytes: [u32; 8] = [0, 0, 0x87ff_fffe, 0, 0, 0, 0, 0]; + let onebyte: [u32; 8] = + [1, 0x2bff_3bfa, 0x6800_0001, 0x2fff_ffff, 0, 0, 0, 0]; + _encode(&twobytes, &onebyte, dest, src, true) +} + +const MAXSTOREPATHLEN: usize = 120; + +fn lower_encode(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize { + let onebyte: [u32; 8] = + [1, 0x2bff_fbfb, 0xe800_0001, 0x2fff_ffff, 0, 0, 0, 0]; + let lower: [u32; 8] = [0, 0, 0x07ff_fffe, 0, 0, 0, 0, 0]; + let mut destlen = 0; + for c in src { + if inset(&onebyte, *c) { + charcopy(rewrap_option(&mut dest), &mut destlen, *c) + } else if inset(&lower, *c) { + charcopy(rewrap_option(&mut dest), &mut destlen, *c + 32) + } else { + escape3(rewrap_option(&mut dest), &mut destlen, *c) + } + } + destlen +} + +fn aux_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize { + let twobytes = [0; 8]; + let onebyte: [u32; 8] = [!0, 0xffff_3ffe, !0, !0, !0, !0, !0, !0]; + _encode(&twobytes, &onebyte, dest, src, false) +} + +fn hash_mangle(src: &[u8], sha: &[u8]) -> Vec<u8> { + let dirprefixlen = 8; + let maxshortdirslen = 68; + let mut destlen = 0; + + let last_slash = src.iter().rposition(|b| *b == b'/'); + let last_dot: Option<usize> = { + let s = last_slash.unwrap_or(0); + src[s..] + .iter() + .rposition(|b| *b == b'.') + .and_then(|i| Some(i + s)) + }; + + let mut dest = vec![0; MAXSTOREPATHLEN]; + memcopy(Some(&mut dest), &mut destlen, b"dh/"); + + { + let mut first = true; + for slice in src[..last_slash.unwrap_or_else(|| src.len())] + .split(|b| *b == b'/') + { + let slice = &slice[..std::cmp::min(slice.len(), dirprefixlen)]; + if destlen + (slice.len() + if first { 0 } else { 1 }) + > maxshortdirslen + 3 + { + break; + } else { + if !first { + charcopy(Some(&mut dest), &mut destlen, b'/') + }; + memcopy(Some(&mut dest), &mut destlen, slice); + if dest[destlen - 1] == b'.' || dest[destlen - 1] == b' ' { + dest[destlen - 1] = b'_' + } + } + first = false; + } + if !first { + charcopy(Some(&mut dest), &mut destlen, b'/'); + } + } + + let used = destlen + 40 + { + if let Some(l) = last_dot { + src.len() - l + } else { + 0 + } + }; + + if MAXSTOREPATHLEN > used { + let slop = MAXSTOREPATHLEN - used; + let basenamelen = match last_slash { + Some(l) => src.len() - l - 1, + None => src.len(), + }; + let basenamelen = std::cmp::min(basenamelen, slop); + if basenamelen > 0 { + let start = match last_slash { + Some(l) => l + 1, + None => 0, + }; + memcopy( + Some(&mut dest), + &mut destlen, + &src[start..][..basenamelen], + ) + } + } + for c in sha { + hexencode(Some(&mut dest), &mut destlen, *c); + } + if let Some(l) = last_dot { + memcopy(Some(&mut dest), &mut destlen, &src[l..]); + } + if destlen == dest.len() { + dest + } else { + // sometimes the path are shorter than MAXSTOREPATHLEN + dest[..destlen].to_vec() + } +} + +const MAXENCODE: usize = 4096 * 4; +fn hash_encode(src: &[u8]) -> Vec<u8> { + let dired = &mut [0; MAXENCODE]; + let lowered = &mut [0; MAXENCODE]; + let auxed = &mut [0; MAXENCODE]; + let baselen = (src.len() - 5) * 3; + if baselen >= MAXENCODE { + panic!("path_encode::hash_encore: string too long: {}", baselen) + }; + let dirlen = encode_dir(Some(&mut dired[..]), src); + let sha = { + let mut hasher = Sha1::new(); + hasher.input(&dired[..dirlen]); + let mut hash = vec![0; 20]; + hasher.result(&mut hash); + hash + }; + let lowerlen = lower_encode(Some(&mut lowered[..]), &dired[..dirlen][5..]); + let auxlen = aux_encode(Some(&mut auxed[..]), &lowered[..lowerlen]); + hash_mangle(&auxed[..auxlen], &sha) +} + +pub fn path_encode(path: &[u8]) -> Vec<u8> { + let newlen = if path.len() <= MAXSTOREPATHLEN { + basic_encode(None, path) + } else { + MAXSTOREPATHLEN + 1 + }; + if newlen <= MAXSTOREPATHLEN { + if newlen == path.len() { + path.to_vec() + } else { + let mut res = vec![0; newlen]; + basic_encode(Some(&mut res), path); + res + } + } else { + hash_encode(&path) + } +}