view rust/hg-core/src/revlog/path_encode.rs @ 50159:96d31efd21f7

rhg: reduce verbosity in path_encode by using a trait for writing Hopefully this makes the code easier to read and understand and shorter overall. It also lets us later tweak the type we use as a [Sink], without having to change the encoding functions, including using two different types for size measurement and for the actual serialization.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Thu, 16 Feb 2023 18:29:52 +0000
parents 0d2ec486d95c
children 5ce53ff6133a
line wrap: on
line source

use sha1::{Digest, Sha1};

#[derive(PartialEq, Debug)]
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum path_state {
    START, /* first byte of a path component */
    A,     /* "AUX" */
    AU,
    THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
    C,     /* "CON" or "COMn" */
    CO,
    COMLPT, /* "COM" or "LPT" */
    COMLPTn,
    L,
    LP,
    N,
    NU,
    P, /* "PRN" */
    PR,
    LDOT, /* leading '.' */
    DOT,  /* '.' in a non-leading position */
    H,    /* ".h" */
    HGDI, /* ".hg", ".d", or ".i" */
    SPACE,
    DEFAULT, /* byte of a path component after the first */
}

/* state machine for dir-encoding */
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum dir_state {
    DDOT,
    DH,
    DHGDI,
    DDEFAULT,
}

trait Sink {
    fn write_byte(&mut self, c: u8);
    fn write_bytes(&mut self, c: &[u8]);
}

fn inset(bitset: &[u32; 8], c: u8) -> bool {
    bitset[(c as usize) >> 5] & (1 << (c & 31)) != 0
}

struct Dest<'a> {
    dest: Option<&'a mut [u8]>,
    pub len: usize,
}

impl<'a> Dest<'a> {
    pub fn create(buf: &'a mut [u8]) -> Dest<'a> {
        Dest {
            dest: Some(buf),
            len: 0,
        }
    }

    pub fn create_measure() -> Dest<'a> {
        Dest { dest: None, len: 0 }
    }
}

fn rewrap_option<'a, 'b: 'a>(
    x: &'a mut Option<&'b mut [u8]>,
) -> Option<&'a mut [u8]> {
    match x {
        None => None,
        Some(y) => Some(y),
    }
}

impl<'a> Sink for Dest<'a> {
    fn write_byte(&mut self, c: u8) {
        if let Some(slice) = rewrap_option(&mut self.dest) {
            slice[self.len] = c
        }
        self.len += 1
    }

    fn write_bytes(&mut self, src: &[u8]) {
        if let Some(slice) = rewrap_option(&mut self.dest) {
            slice[self.len..self.len + src.len()].copy_from_slice(src)
        }
        self.len += src.len();
    }
}

fn hexencode(dest: &mut impl Sink, c: u8) {
    let hexdigit = b"0123456789abcdef";
    dest.write_byte(hexdigit[(c as usize) >> 4]);
    dest.write_byte(hexdigit[(c as usize) & 15]);
}

/* 3-byte escape: tilde followed by two hex digits */
fn escape3(dest: &mut impl Sink, c: u8) {
    dest.write_byte(b'~');
    hexencode(dest, c);
}

fn encode_dir(dest: &mut impl Sink, src: &[u8]) {
    let mut state = dir_state::DDEFAULT;
    let mut i = 0;

    while i < src.len() {
        match state {
            dir_state::DDOT => match src[i] {
                b'd' | b'i' => {
                    state = dir_state::DHGDI;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'h' => {
                    state = dir_state::DH;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                _ => {
                    state = dir_state::DDEFAULT;
                }
            },
            dir_state::DH => {
                if src[i] == b'g' {
                    state = dir_state::DHGDI;
                    dest.write_byte(src[i]);
                    i += 1;
                } else {
                    state = dir_state::DDEFAULT;
                }
            }
            dir_state::DHGDI => {
                if src[i] == b'/' {
                    dest.write_bytes(b".hg");
                    dest.write_byte(src[i]);
                    i += 1;
                }
                state = dir_state::DDEFAULT;
            }
            dir_state::DDEFAULT => {
                if src[i] == b'.' {
                    state = dir_state::DDOT
                }
                dest.write_byte(src[i]);
                i += 1;
            }
        }
    }
}

fn _encode(
    twobytes: &[u32; 8],
    onebyte: &[u32; 8],
    dest: &mut impl Sink,
    src: &[u8],
    encodedir: bool,
) {
    let mut state = path_state::START;
    let mut i = 0;
    let len = src.len();

    while i < len {
        match state {
            path_state::START => match src[i] {
                b'/' => {
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'.' => {
                    state = path_state::LDOT;
                    escape3(dest, src[i]);
                    i += 1;
                }
                b' ' => {
                    state = path_state::DEFAULT;
                    escape3(dest, src[i]);
                    i += 1;
                }
                b'a' => {
                    state = path_state::A;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'c' => {
                    state = path_state::C;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'l' => {
                    state = path_state::L;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'n' => {
                    state = path_state::N;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'p' => {
                    state = path_state::P;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                }
            },
            path_state::A => {
                if src[i] == b'u' {
                    state = path_state::AU;
                    dest.write_byte(src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::AU => {
                if src[i] == b'x' {
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::THIRD => {
                state = path_state::DEFAULT;
                match src[i] {
                    b'.' | b'/' | b'\0' => escape3(dest, src[i - 1]),
                    _ => i -= 1,
                }
            }
            path_state::C => {
                if src[i] == b'o' {
                    state = path_state::CO;
                    dest.write_byte(src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::CO => {
                if src[i] == b'm' {
                    state = path_state::COMLPT;
                    i += 1;
                } else if src[i] == b'n' {
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::COMLPT => {
                if src[i] >= b'1' && src[i] <= b'9' {
                    state = path_state::COMLPTn;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                    dest.write_byte(src[i - 1]);
                }
            }
            path_state::COMLPTn => {
                state = path_state::DEFAULT;
                match src[i] {
                    b'.' | b'/' | b'\0' => {
                        escape3(dest, src[i - 2]);
                        dest.write_byte(src[i - 1]);
                    }
                    _ => {
                        dest.write_bytes(&src[i - 2..i]);
                    }
                }
            }
            path_state::L => {
                if src[i] == b'p' {
                    state = path_state::LP;
                    dest.write_byte(src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::LP => {
                if src[i] == b't' {
                    state = path_state::COMLPT;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::N => {
                if src[i] == b'u' {
                    state = path_state::NU;
                    dest.write_byte(src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::NU => {
                if src[i] == b'l' {
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::P => {
                if src[i] == b'r' {
                    state = path_state::PR;
                    dest.write_byte(src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::PR => {
                if src[i] == b'n' {
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::LDOT => match src[i] {
                b'd' | b'i' => {
                    state = path_state::HGDI;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'h' => {
                    state = path_state::H;
                    dest.write_byte(src[i]);
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                }
            },
            path_state::DOT => match src[i] {
                b'/' | b'\0' => {
                    state = path_state::START;
                    dest.write_bytes(b"~2e");
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'd' | b'i' => {
                    state = path_state::HGDI;
                    dest.write_byte(b'.');
                    dest.write_byte(src[i]);
                    i += 1;
                }
                b'h' => {
                    state = path_state::H;
                    dest.write_bytes(b".h");
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                    dest.write_byte(b'.');
                }
            },
            path_state::H => {
                if src[i] == b'g' {
                    state = path_state::HGDI;
                    dest.write_byte(src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::HGDI => {
                if src[i] == b'/' {
                    state = path_state::START;
                    if encodedir {
                        dest.write_bytes(b".hg");
                    }
                    dest.write_byte(src[i]);
                    i += 1
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::SPACE => match src[i] {
                b'/' | b'\0' => {
                    state = path_state::START;
                    dest.write_bytes(b"~20");
                    dest.write_byte(src[i]);
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                    dest.write_byte(b' ');
                }
            },
            path_state::DEFAULT => {
                while i != len && inset(onebyte, src[i]) {
                    dest.write_byte(src[i]);
                    i += 1;
                }
                if i == len {
                    break;
                }
                match src[i] {
                    b'.' => {
                        state = path_state::DOT;
                        i += 1
                    }
                    b' ' => {
                        state = path_state::SPACE;
                        i += 1
                    }
                    b'/' => {
                        state = path_state::START;
                        dest.write_byte(b'/');
                        i += 1;
                    }
                    _ => {
                        if inset(onebyte, src[i]) {
                            loop {
                                dest.write_byte(src[i]);
                                i += 1;
                                if !(i < len && inset(onebyte, src[i])) {
                                    break;
                                }
                            }
                        } else if inset(twobytes, src[i]) {
                            let c = src[i];
                            i += 1;
                            dest.write_byte(b'_');
                            dest.write_byte(if c == b'_' {
                                b'_'
                            } else {
                                c + 32
                            });
                        } else {
                            escape3(dest, src[i]);
                            i += 1;
                        }
                    }
                }
            }
        }
    }
    match state {
        path_state::START => (),
        path_state::A => (),
        path_state::AU => (),
        path_state::THIRD => escape3(dest, src[i - 1]),
        path_state::C => (),
        path_state::CO => (),
        path_state::COMLPT => dest.write_byte(src[i - 1]),
        path_state::COMLPTn => {
            escape3(dest, src[i - 2]);
            dest.write_byte(src[i - 1]);
        }
        path_state::L => (),
        path_state::LP => (),
        path_state::N => (),
        path_state::NU => (),
        path_state::P => (),
        path_state::PR => (),
        path_state::LDOT => (),
        path_state::DOT => {
            dest.write_bytes(b"~2e");
        }
        path_state::H => (),
        path_state::HGDI => (),
        path_state::SPACE => {
            dest.write_bytes(b"~20");
        }
        path_state::DEFAULT => (),
    }
}

fn basic_encode(dest: &mut impl Sink, src: &[u8]) {
    let twobytes: [u32; 8] = [0, 0, 0x87ff_fffe, 0, 0, 0, 0, 0];
    let onebyte: [u32; 8] =
        [1, 0x2bff_3bfa, 0x6800_0001, 0x2fff_ffff, 0, 0, 0, 0];
    _encode(&twobytes, &onebyte, dest, src, true)
}

const MAXSTOREPATHLEN: usize = 120;

fn lower_encode(dest: &mut impl Sink, src: &[u8]) {
    let onebyte: [u32; 8] =
        [1, 0x2bff_fbfb, 0xe800_0001, 0x2fff_ffff, 0, 0, 0, 0];
    let lower: [u32; 8] = [0, 0, 0x07ff_fffe, 0, 0, 0, 0, 0];
    for c in src {
        if inset(&onebyte, *c) {
            dest.write_byte(*c)
        } else if inset(&lower, *c) {
            dest.write_byte(*c + 32)
        } else {
            escape3(dest, *c)
        }
    }
}

fn aux_encode(dest: &mut impl Sink, src: &[u8]) {
    let twobytes = [0; 8];
    let onebyte: [u32; 8] = [!0, 0xffff_3ffe, !0, !0, !0, !0, !0, !0];
    _encode(&twobytes, &onebyte, dest, src, false)
}

fn hash_mangle(src: &[u8], sha: &[u8]) -> Vec<u8> {
    let dirprefixlen = 8;
    let maxshortdirslen = 68;

    let last_slash = src.iter().rposition(|b| *b == b'/');
    let last_dot: Option<usize> = {
        let s = last_slash.unwrap_or(0);
        src[s..].iter().rposition(|b| *b == b'.').map(|i| i + s)
    };

    let mut dest_vec = vec![0; MAXSTOREPATHLEN];
    let mut dest = Dest::create(&mut dest_vec);
    dest.write_bytes(b"dh/");

    if let Some(last_slash) = last_slash {
        for slice in src[..last_slash].split(|b| *b == b'/') {
            let slice = &slice[..std::cmp::min(slice.len(), dirprefixlen)];
            if dest.len + slice.len() > maxshortdirslen + 3 {
                break;
            } else {
                dest.write_bytes(slice);
            }
            dest.write_byte(b'/');
        }
    }

    let used = dest.len + 40 + {
        if let Some(l) = last_dot {
            src.len() - l
        } else {
            0
        }
    };

    if MAXSTOREPATHLEN > used {
        let slop = MAXSTOREPATHLEN - used;
        let basenamelen = match last_slash {
            Some(l) => src.len() - l - 1,
            None => src.len(),
        };
        let basenamelen = std::cmp::min(basenamelen, slop);
        if basenamelen > 0 {
            let start = match last_slash {
                Some(l) => l + 1,
                None => 0,
            };
            dest.write_bytes(&src[start..][..basenamelen])
        }
    }
    for c in sha {
        hexencode(&mut dest, *c);
    }
    if let Some(l) = last_dot {
        dest.write_bytes(&src[l..]);
    }
    let destlen = dest.len;
    if destlen == dest_vec.len() {
        dest_vec
    } else {
        // sometimes the path are shorter than MAXSTOREPATHLEN
        dest_vec[..destlen].to_vec()
    }
}

const MAXENCODE: usize = 4096 * 4;
fn hash_encode(src: &[u8]) -> Vec<u8> {
    let dired = &mut [0; MAXENCODE];
    let mut dired_dest = Dest::create(dired);
    let lowered = &mut [0; MAXENCODE];
    let mut lowered_dest = Dest::create(lowered);
    let auxed = &mut [0; MAXENCODE];
    let mut auxed_dest = Dest::create(auxed);
    let baselen = (src.len() - 5) * 3;
    if baselen >= MAXENCODE {
        panic!("path_encode::hash_encore: string too long: {}", baselen)
    };
    encode_dir(&mut dired_dest, src);
    let dirlen = dired_dest.len;
    let sha = Sha1::digest(&dired[..dirlen]);
    lower_encode(&mut lowered_dest, &dired[..dirlen][5..]);
    let lowerlen = lowered_dest.len;
    aux_encode(&mut auxed_dest, &lowered[..lowerlen]);
    let auxlen = auxed_dest.len;
    hash_mangle(&auxed[..auxlen], &sha)
}

pub fn path_encode(path: &[u8]) -> Vec<u8> {
    let newlen = if path.len() <= MAXSTOREPATHLEN {
        let mut measure = Dest::create_measure();
        basic_encode(&mut measure, path);
        measure.len
    } else {
        MAXSTOREPATHLEN + 1
    };
    if newlen <= MAXSTOREPATHLEN {
        if newlen == path.len() {
            path.to_vec()
        } else {
            let mut res = vec![0; newlen];
            let mut dest = Dest::create(&mut res);
            basic_encode(&mut dest, path);
            assert!(dest.len == newlen);
            res
        }
    } else {
        hash_encode(path)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::hg_path::HgPathBuf;

    #[test]
    fn test_long_filename_at_root() {
        let input = b"data/ABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJ.i";
        let expected = b"dh/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij.i708243a2237a7afae259ea3545a72a2ef11c247b.i";
        let res = path_encode(input);
        assert_eq!(
            HgPathBuf::from_bytes(&res),
            HgPathBuf::from_bytes(expected)
        );
    }
}