hg-core: add path_encode
authorAntoine Cezar <antoine.cezar@octobus.net>
Tue, 15 Sep 2020 16:45:27 +0200
changeset 45539 aebc976fd7d5
parent 45538 2d5dfc8fed55
child 45540 f2de24c2b1f6
hg-core: add path_encode Differential Revision: https://phab.mercurial-scm.org/D9049
rust/hg-core/src/revlog.rs
rust/hg-core/src/revlog/path_encode.rs
--- a/rust/hg-core/src/revlog.rs	Tue Sep 15 10:52:28 2020 +0200
+++ b/rust/hg-core/src/revlog.rs	Tue Sep 15 16:45:27 2020 +0200
@@ -7,6 +7,7 @@
 
 pub mod node;
 pub mod nodemap;
+pub mod path_encode;
 pub use node::{Node, NodeError, NodePrefix, NodePrefixRef};
 pub mod changelog;
 pub mod index;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/revlog/path_encode.rs	Tue Sep 15 16:45:27 2020 +0200
@@ -0,0 +1,653 @@
+use crypto::digest::Digest;
+use crypto::sha1::Sha1;
+
+#[derive(PartialEq, Debug)]
+#[allow(non_camel_case_types)]
+enum path_state {
+    START, /* first byte of a path component */
+    A,     /* "AUX" */
+    AU,
+    THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
+    C,     /* "CON" or "COMn" */
+    CO,
+    COMLPT, /* "COM" or "LPT" */
+    COMLPTn,
+    L,
+    LP,
+    N,
+    NU,
+    P, /* "PRN" */
+    PR,
+    LDOT, /* leading '.' */
+    DOT,  /* '.' in a non-leading position */
+    H,    /* ".h" */
+    HGDI, /* ".hg", ".d", or ".i" */
+    SPACE,
+    DEFAULT, /* byte of a path component after the first */
+}
+
+/* state machine for dir-encoding */
+#[allow(non_camel_case_types)]
+enum dir_state {
+    DDOT,
+    DH,
+    DHGDI,
+    DDEFAULT,
+}
+
+fn inset(bitset: &[u32; 8], c: u8) -> bool {
+    bitset[(c as usize) >> 5] & (1 << (c & 31)) != 0
+}
+
+fn charcopy(dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) {
+    if let Some(slice) = dest {
+        slice[*destlen] = c
+    }
+    *destlen += 1
+}
+
+fn memcopy(dest: Option<&mut [u8]>, destlen: &mut usize, src: &[u8]) {
+    if let Some(slice) = dest {
+        slice[*destlen..*destlen + src.len()].copy_from_slice(src)
+    }
+    *destlen += src.len();
+}
+
+fn rewrap_option<'a, 'b: 'a>(
+    x: &'a mut Option<&'b mut [u8]>,
+) -> Option<&'a mut [u8]> {
+    match x {
+        None => None,
+        Some(y) => Some(y),
+    }
+}
+
+fn hexencode<'a>(mut dest: Option<&'a mut [u8]>, destlen: &mut usize, c: u8) {
+    let hexdigit = b"0123456789abcdef";
+    charcopy(
+        rewrap_option(&mut dest),
+        destlen,
+        hexdigit[(c as usize) >> 4],
+    );
+    charcopy(dest, destlen, hexdigit[(c as usize) & 15]);
+}
+
+/* 3-byte escape: tilde followed by two hex digits */
+fn escape3(mut dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) {
+    charcopy(rewrap_option(&mut dest), destlen, b'~');
+    hexencode(dest, destlen, c);
+}
+
+fn encode_dir(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize {
+    let mut state = dir_state::DDEFAULT;
+    let mut i = 0;
+    let mut destlen = 0;
+
+    while i < src.len() {
+        match state {
+            dir_state::DDOT => match src[i] {
+                b'd' | b'i' => {
+                    state = dir_state::DHGDI;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'h' => {
+                    state = dir_state::DH;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                _ => {
+                    state = dir_state::DDEFAULT;
+                }
+            },
+            dir_state::DH => {
+                if src[i] == b'g' {
+                    state = dir_state::DHGDI;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                } else {
+                    state = dir_state::DDEFAULT;
+                }
+            }
+            dir_state::DHGDI => {
+                if src[i] == b'/' {
+                    memcopy(rewrap_option(&mut dest), &mut destlen, b".hg");
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                state = dir_state::DDEFAULT;
+            }
+            dir_state::DDEFAULT => {
+                if src[i] == b'.' {
+                    state = dir_state::DDOT
+                }
+                charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                i += 1;
+            }
+        }
+    }
+    destlen
+}
+
+fn _encode(
+    twobytes: &[u32; 8],
+    onebyte: &[u32; 8],
+    mut dest: Option<&mut [u8]>,
+    src: &[u8],
+    encodedir: bool,
+) -> usize {
+    let mut state = path_state::START;
+    let mut i = 0;
+    let mut destlen = 0;
+    let len = src.len();
+
+    while i < len {
+        match state {
+            path_state::START => match src[i] {
+                b'/' => {
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'.' => {
+                    state = path_state::LDOT;
+                    escape3(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b' ' => {
+                    state = path_state::DEFAULT;
+                    escape3(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'a' => {
+                    state = path_state::A;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'c' => {
+                    state = path_state::C;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'l' => {
+                    state = path_state::L;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'n' => {
+                    state = path_state::N;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'p' => {
+                    state = path_state::P;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                _ => {
+                    state = path_state::DEFAULT;
+                }
+            },
+            path_state::A => {
+                if src[i] == b'u' {
+                    state = path_state::AU;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::AU => {
+                if src[i] == b'x' {
+                    state = path_state::THIRD;
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::THIRD => {
+                state = path_state::DEFAULT;
+                match src[i] {
+                    b'.' | b'/' | b'\0' => escape3(
+                        rewrap_option(&mut dest),
+                        &mut destlen,
+                        src[i - 1],
+                    ),
+                    _ => i -= 1,
+                }
+            }
+            path_state::C => {
+                if src[i] == b'o' {
+                    state = path_state::CO;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::CO => {
+                if src[i] == b'm' {
+                    state = path_state::COMLPT;
+                    i += 1;
+                } else if src[i] == b'n' {
+                    state = path_state::THIRD;
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::COMLPT => {
+                if src[i] >= b'1' && src[i] <= b'9' {
+                    state = path_state::COMLPTn;
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                    charcopy(
+                        rewrap_option(&mut dest),
+                        &mut destlen,
+                        src[i - 1],
+                    );
+                }
+            }
+            path_state::COMLPTn => {
+                state = path_state::DEFAULT;
+                match src[i] {
+                    b'.' | b'/' | b'\0' => {
+                        escape3(
+                            rewrap_option(&mut dest),
+                            &mut destlen,
+                            src[i - 2],
+                        );
+                        charcopy(
+                            rewrap_option(&mut dest),
+                            &mut destlen,
+                            src[i - 1],
+                        );
+                    }
+                    _ => {
+                        memcopy(
+                            rewrap_option(&mut dest),
+                            &mut destlen,
+                            &src[i - 2..i],
+                        );
+                    }
+                }
+            }
+            path_state::L => {
+                if src[i] == b'p' {
+                    state = path_state::LP;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::LP => {
+                if src[i] == b't' {
+                    state = path_state::COMLPT;
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::N => {
+                if src[i] == b'u' {
+                    state = path_state::NU;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::NU => {
+                if src[i] == b'l' {
+                    state = path_state::THIRD;
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::P => {
+                if src[i] == b'r' {
+                    state = path_state::PR;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::PR => {
+                if src[i] == b'n' {
+                    state = path_state::THIRD;
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::LDOT => match src[i] {
+                b'd' | b'i' => {
+                    state = path_state::HGDI;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'h' => {
+                    state = path_state::H;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                _ => {
+                    state = path_state::DEFAULT;
+                }
+            },
+            path_state::DOT => match src[i] {
+                b'/' | b'\0' => {
+                    state = path_state::START;
+                    memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e");
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'd' | b'i' => {
+                    state = path_state::HGDI;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, b'.');
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                b'h' => {
+                    state = path_state::H;
+                    memcopy(rewrap_option(&mut dest), &mut destlen, b".h");
+                    i += 1;
+                }
+                _ => {
+                    state = path_state::DEFAULT;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, b'.');
+                }
+            },
+            path_state::H => {
+                if src[i] == b'g' {
+                    state = path_state::HGDI;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::HGDI => {
+                if src[i] == b'/' {
+                    state = path_state::START;
+                    if encodedir {
+                        memcopy(
+                            rewrap_option(&mut dest),
+                            &mut destlen,
+                            b".hg",
+                        );
+                    }
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1
+                } else {
+                    state = path_state::DEFAULT;
+                }
+            }
+            path_state::SPACE => match src[i] {
+                b'/' | b'\0' => {
+                    state = path_state::START;
+                    memcopy(rewrap_option(&mut dest), &mut destlen, b"~20");
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                _ => {
+                    state = path_state::DEFAULT;
+                    charcopy(rewrap_option(&mut dest), &mut destlen, b' ');
+                }
+            },
+            path_state::DEFAULT => {
+                while i != len && inset(onebyte, src[i]) {
+                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
+                    i += 1;
+                }
+                if i == len {
+                    break;
+                }
+                match src[i] {
+                    b'.' => {
+                        state = path_state::DOT;
+                        i += 1
+                    }
+                    b' ' => {
+                        state = path_state::SPACE;
+                        i += 1
+                    }
+                    b'/' => {
+                        state = path_state::START;
+                        charcopy(rewrap_option(&mut dest), &mut destlen, b'/');
+                        i += 1;
+                    }
+                    _ => {
+                        if inset(onebyte, src[i]) {
+                            loop {
+                                charcopy(
+                                    rewrap_option(&mut dest),
+                                    &mut destlen,
+                                    src[i],
+                                );
+                                i += 1;
+                                if !(i < len && inset(onebyte, src[i])) {
+                                    break;
+                                }
+                            }
+                        } else if inset(twobytes, src[i]) {
+                            let c = src[i];
+                            i += 1;
+                            charcopy(
+                                rewrap_option(&mut dest),
+                                &mut destlen,
+                                b'_',
+                            );
+                            charcopy(
+                                rewrap_option(&mut dest),
+                                &mut destlen,
+                                if c == b'_' { b'_' } else { c + 32 },
+                            );
+                        } else {
+                            escape3(
+                                rewrap_option(&mut dest),
+                                &mut destlen,
+                                src[i],
+                            );
+                            i += 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    match state {
+        path_state::START => (),
+        path_state::A => (),
+        path_state::AU => (),
+        path_state::THIRD => {
+            escape3(rewrap_option(&mut dest), &mut destlen, src[i - 1])
+        }
+        path_state::C => (),
+        path_state::CO => (),
+        path_state::COMLPT => {
+            charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1])
+        }
+        path_state::COMLPTn => {
+            escape3(rewrap_option(&mut dest), &mut destlen, src[i - 2]);
+            charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1]);
+        }
+        path_state::L => (),
+        path_state::LP => (),
+        path_state::N => (),
+        path_state::NU => (),
+        path_state::P => (),
+        path_state::PR => (),
+        path_state::LDOT => (),
+        path_state::DOT => {
+            memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e");
+        }
+        path_state::H => (),
+        path_state::HGDI => (),
+        path_state::SPACE => {
+            memcopy(rewrap_option(&mut dest), &mut destlen, b"~20");
+        }
+        path_state::DEFAULT => (),
+    };
+    destlen
+}
+
+fn basic_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize {
+    let twobytes: [u32; 8] = [0, 0, 0x87ff_fffe, 0, 0, 0, 0, 0];
+    let onebyte: [u32; 8] =
+        [1, 0x2bff_3bfa, 0x6800_0001, 0x2fff_ffff, 0, 0, 0, 0];
+    _encode(&twobytes, &onebyte, dest, src, true)
+}
+
+const MAXSTOREPATHLEN: usize = 120;
+
+fn lower_encode(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize {
+    let onebyte: [u32; 8] =
+        [1, 0x2bff_fbfb, 0xe800_0001, 0x2fff_ffff, 0, 0, 0, 0];
+    let lower: [u32; 8] = [0, 0, 0x07ff_fffe, 0, 0, 0, 0, 0];
+    let mut destlen = 0;
+    for c in src {
+        if inset(&onebyte, *c) {
+            charcopy(rewrap_option(&mut dest), &mut destlen, *c)
+        } else if inset(&lower, *c) {
+            charcopy(rewrap_option(&mut dest), &mut destlen, *c + 32)
+        } else {
+            escape3(rewrap_option(&mut dest), &mut destlen, *c)
+        }
+    }
+    destlen
+}
+
+fn aux_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize {
+    let twobytes = [0; 8];
+    let onebyte: [u32; 8] = [!0, 0xffff_3ffe, !0, !0, !0, !0, !0, !0];
+    _encode(&twobytes, &onebyte, dest, src, false)
+}
+
+fn hash_mangle(src: &[u8], sha: &[u8]) -> Vec<u8> {
+    let dirprefixlen = 8;
+    let maxshortdirslen = 68;
+    let mut destlen = 0;
+
+    let last_slash = src.iter().rposition(|b| *b == b'/');
+    let last_dot: Option<usize> = {
+        let s = last_slash.unwrap_or(0);
+        src[s..]
+            .iter()
+            .rposition(|b| *b == b'.')
+            .and_then(|i| Some(i + s))
+    };
+
+    let mut dest = vec![0; MAXSTOREPATHLEN];
+    memcopy(Some(&mut dest), &mut destlen, b"dh/");
+
+    {
+        let mut first = true;
+        for slice in src[..last_slash.unwrap_or_else(|| src.len())]
+            .split(|b| *b == b'/')
+        {
+            let slice = &slice[..std::cmp::min(slice.len(), dirprefixlen)];
+            if destlen + (slice.len() + if first { 0 } else { 1 })
+                > maxshortdirslen + 3
+            {
+                break;
+            } else {
+                if !first {
+                    charcopy(Some(&mut dest), &mut destlen, b'/')
+                };
+                memcopy(Some(&mut dest), &mut destlen, slice);
+                if dest[destlen - 1] == b'.' || dest[destlen - 1] == b' ' {
+                    dest[destlen - 1] = b'_'
+                }
+            }
+            first = false;
+        }
+        if !first {
+            charcopy(Some(&mut dest), &mut destlen, b'/');
+        }
+    }
+
+    let used = destlen + 40 + {
+        if let Some(l) = last_dot {
+            src.len() - l
+        } else {
+            0
+        }
+    };
+
+    if MAXSTOREPATHLEN > used {
+        let slop = MAXSTOREPATHLEN - used;
+        let basenamelen = match last_slash {
+            Some(l) => src.len() - l - 1,
+            None => src.len(),
+        };
+        let basenamelen = std::cmp::min(basenamelen, slop);
+        if basenamelen > 0 {
+            let start = match last_slash {
+                Some(l) => l + 1,
+                None => 0,
+            };
+            memcopy(
+                Some(&mut dest),
+                &mut destlen,
+                &src[start..][..basenamelen],
+            )
+        }
+    }
+    for c in sha {
+        hexencode(Some(&mut dest), &mut destlen, *c);
+    }
+    if let Some(l) = last_dot {
+        memcopy(Some(&mut dest), &mut destlen, &src[l..]);
+    }
+    if destlen == dest.len() {
+        dest
+    } else {
+        // sometimes the path are shorter than MAXSTOREPATHLEN
+        dest[..destlen].to_vec()
+    }
+}
+
+const MAXENCODE: usize = 4096 * 4;
+fn hash_encode(src: &[u8]) -> Vec<u8> {
+    let dired = &mut [0; MAXENCODE];
+    let lowered = &mut [0; MAXENCODE];
+    let auxed = &mut [0; MAXENCODE];
+    let baselen = (src.len() - 5) * 3;
+    if baselen >= MAXENCODE {
+        panic!("path_encode::hash_encore: string too long: {}", baselen)
+    };
+    let dirlen = encode_dir(Some(&mut dired[..]), src);
+    let sha = {
+        let mut hasher = Sha1::new();
+        hasher.input(&dired[..dirlen]);
+        let mut hash = vec![0; 20];
+        hasher.result(&mut hash);
+        hash
+    };
+    let lowerlen = lower_encode(Some(&mut lowered[..]), &dired[..dirlen][5..]);
+    let auxlen = aux_encode(Some(&mut auxed[..]), &lowered[..lowerlen]);
+    hash_mangle(&auxed[..auxlen], &sha)
+}
+
+pub fn path_encode(path: &[u8]) -> Vec<u8> {
+    let newlen = if path.len() <= MAXSTOREPATHLEN {
+        basic_encode(None, path)
+    } else {
+        MAXSTOREPATHLEN + 1
+    };
+    if newlen <= MAXSTOREPATHLEN {
+        if newlen == path.len() {
+            path.to_vec()
+        } else {
+            let mut res = vec![0; newlen];
+            basic_encode(Some(&mut res), path);
+            res
+        }
+    } else {
+        hash_encode(&path)
+    }
+}