Mercurial > hg

use byteorder::{BigEndian, ByteOrder};

/// A chunk of data to insert, delete or replace in a patch
///
/// A chunk is:
/// - an insertion when `!data.is_empty() && start == end`
/// - an deletion when `data.is_empty() && start < end`
/// - a replacement when `!data.is_empty() && start < end`
/// - not doing anything when `data.is_empty() && start == end`
#[derive(Debug, Clone)]
struct Chunk<'a> {
    /// The start position of the chunk of data to replace
    start: u32,
    /// The end position of the chunk of data to replace (open end interval)
    end: u32,
    /// The data replacing the chunk
    data: &'a [u8],
}

impl Chunk<'_> {
    /// Adjusted start of the chunk to replace.
    ///
    /// The offset, taking into account the growth/shrinkage of data
    /// induced by previously applied chunks.
    fn start_offset_by(&self, offset: i32) -> u32 {
        let start = self.start as i32 + offset;
        assert!(start >= 0, "negative chunk start should never happen");
        start as u32
    }

    /// Adjusted end of the chunk to replace.
    ///
    /// The offset, taking into account the growth/shrinkage of data
    /// induced by previously applied chunks.
    fn end_offset_by(&self, offset: i32) -> u32 {
        self.start_offset_by(offset) + self.data.len() as u32
    }

    /// Length of the replaced chunk.
    fn replaced_len(&self) -> u32 {
        self.end - self.start
    }

    /// Length difference between the replacing data and the replaced data.
    fn len_diff(&self) -> i32 {
        self.data.len() as i32 - self.replaced_len() as i32
    }
}

/// The delta between two revisions data.
#[derive(Debug, Clone)]
pub struct PatchList<'a> {
    /// A collection of chunks to apply.
    ///
    /// Those chunks are:
    /// - ordered from the left-most replacement to the right-most replacement
    /// - non-overlapping, meaning that two chucks can not change the same
    ///   chunk of the patched data
    chunks: Vec<Chunk<'a>>,
}

impl<'a> PatchList<'a> {
    /// Create a `PatchList` from bytes.
    pub fn new(data: &'a [u8]) -> Self {
        let mut chunks = vec![];
        let mut data = data;
        while !data.is_empty() {
            let start = BigEndian::read_u32(&data[0..]);
            let end = BigEndian::read_u32(&data[4..]);
            let len = BigEndian::read_u32(&data[8..]);
            assert!(start <= end);
            chunks.push(Chunk {
                start,
                end,
                data: &data[12..12 + (len as usize)],
            });
            data = &data[12 + (len as usize)..];
        }
        PatchList { chunks }
    }

    /// Return the final length of data after patching
    /// given its initial length .
    fn size(&self, initial_size: i32) -> i32 {
        self.chunks
            .iter()
            .fold(initial_size, |acc, chunk| acc + chunk.len_diff())
    }

    /// Apply the patch to some data.
    pub fn apply(&self, initial: &[u8]) -> Vec<u8> {
        let mut last: usize = 0;
        let mut vec =
            Vec::with_capacity(self.size(initial.len() as i32) as usize);
        for Chunk { start, end, data } in self.chunks.iter() {
            vec.extend(&initial[last..(*start as usize)]);
            vec.extend(data.iter());
            last = *end as usize;
        }
        vec.extend(&initial[last..]);
        vec
    }

    /// Combine two patch lists into a single patch list.
    ///
    /// Applying consecutive patches can lead to waste of time and memory
    /// as the changes introduced by one patch can be overridden by the next.
    /// Combining patches optimizes the whole patching sequence.
    fn combine(&mut self, other: &mut Self) -> Self {
        let mut chunks = vec![];

        // Keep track of each growth/shrinkage resulting from applying a chunk
        // in order to adjust the start/end of subsequent chunks.
        let mut offset = 0i32;

        // Keep track of the chunk of self.chunks to process.
        let mut pos = 0;

        // For each chunk of `other`, chunks of `self` are processed
        // until they start after the end of the current chunk.
        for Chunk { start, end, data } in other.chunks.iter() {
            // Add chunks of `self` that start before this chunk of `other`
            // without overlap.
            while pos < self.chunks.len()
                && self.chunks[pos].end_offset_by(offset) <= *start
            {
                let first = self.chunks[pos].clone();
                offset += first.len_diff();
                chunks.push(first);
                pos += 1;
            }

            // The current chunk of `self` starts before this chunk of `other`
            // with overlap.
            // The left-most part of data is added as an insertion chunk.
            // The right-most part data is kept in the chunk.
            if pos < self.chunks.len()
                && self.chunks[pos].start_offset_by(offset) < *start
            {
                let first = &mut self.chunks[pos];

                let (data_left, data_right) = first.data.split_at(
                    (*start - first.start_offset_by(offset)) as usize,
                );
                let left = Chunk {
                    start: first.start,
                    end: first.start,
                    data: data_left,
                };

                first.data = data_right;

                offset += left.len_diff();

                chunks.push(left);

                // There is no index incrementation because the right-most part
                // needs further examination.
            }

            // At this point remaining chunks of `self` starts after
            // the current chunk of `other`.

            // `start_offset` will be used to adjust the start of the current
            // chunk of `other`.
            // Offset tracking continues with `end_offset` to adjust the end
            // of the current chunk of `other`.
            let mut next_offset = offset;

            // Discard the chunks of `self` that are totally overridden
            // by the current chunk of `other`
            while pos < self.chunks.len()
                && self.chunks[pos].end_offset_by(next_offset) <= *end
            {
                let first = &self.chunks[pos];
                next_offset += first.len_diff();
                pos += 1;
            }

            // Truncate the left-most part of chunk of `self` that overlaps
            // the current chunk of `other`.
            if pos < self.chunks.len()
                && self.chunks[pos].start_offset_by(next_offset) < *end
            {
                let first = &mut self.chunks[pos];

                let how_much_to_discard =
                    *end - first.start_offset_by(next_offset);

                first.data = &first.data[(how_much_to_discard as usize)..];

                next_offset += how_much_to_discard as i32;
            }

            // Add the chunk of `other` with adjusted position.
            chunks.push(Chunk {
                start: (*start as i32 - offset) as u32,
                end: (*end as i32 - next_offset) as u32,
                data,
            });

            // Go back to normal offset tracking for the next `o` chunk
            offset = next_offset;
        }

        // Add remaining chunks of `self`.
        for elt in &self.chunks[pos..] {
            chunks.push(elt.clone());
        }
        PatchList { chunks }
    }
}

/// Combine a list of patch list into a single patch optimized patch list.
pub fn fold_patch_lists<'a>(lists: &[PatchList<'a>]) -> PatchList<'a> {
    if lists.len() <= 1 {
        if lists.is_empty() {
            PatchList { chunks: vec![] }
        } else {
            lists[0].clone()
        }
    } else {
        let (left, right) = lists.split_at(lists.len() / 2);
        let mut left_res = fold_patch_lists(left);
        let mut right_res = fold_patch_lists(right);
        left_res.combine(&mut right_res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    struct PatchDataBuilder {
        data: Vec<u8>,
    }

    impl PatchDataBuilder {
        pub fn new() -> Self {
            Self { data: vec![] }
        }

        pub fn replace(
            &mut self,
            start: usize,
            end: usize,
            data: &[u8],
        ) -> &mut Self {
            assert!(start <= end);
            self.data.extend(&(start as i32).to_be_bytes());
            self.data.extend(&(end as i32).to_be_bytes());
            self.data.extend(&(data.len() as i32).to_be_bytes());
            self.data.extend(data.iter());
            self
        }

        pub fn get(&mut self) -> &[u8] {
            &self.data
        }
    }

    #[test]
    fn test_ends_before() {
        let data = vec![0u8, 0u8, 0u8];
        let mut patch1_data = PatchDataBuilder::new();
        patch1_data.replace(0, 1, &[1, 2]);
        let mut patch1 = PatchList::new(patch1_data.get());

        let mut patch2_data = PatchDataBuilder::new();
        patch2_data.replace(2, 4, &[3, 4]);
        let mut patch2 = PatchList::new(patch2_data.get());

        let patch = patch1.combine(&mut patch2);

        let result = patch.apply(&data);

        assert_eq!(result, vec![1u8, 2, 3, 4]);
    }

    #[test]
    fn test_starts_after() {
        let data = vec![0u8, 0u8, 0u8];
        let mut patch1_data = PatchDataBuilder::new();
        patch1_data.replace(2, 3, &[3]);
        let mut patch1 = PatchList::new(patch1_data.get());

        let mut patch2_data = PatchDataBuilder::new();
        patch2_data.replace(1, 2, &[1, 2]);
        let mut patch2 = PatchList::new(patch2_data.get());

        let patch = patch1.combine(&mut patch2);

        let result = patch.apply(&data);

        assert_eq!(result, vec![0u8, 1, 2, 3]);
    }

    #[test]
    fn test_overridden() {
        let data = vec![0u8, 0, 0];
        let mut patch1_data = PatchDataBuilder::new();
        patch1_data.replace(1, 2, &[3, 4]);
        let mut patch1 = PatchList::new(patch1_data.get());

        let mut patch2_data = PatchDataBuilder::new();
        patch2_data.replace(1, 4, &[1, 2, 3]);
        let mut patch2 = PatchList::new(patch2_data.get());

        let patch = patch1.combine(&mut patch2);

        let result = patch.apply(&data);

        assert_eq!(result, vec![0u8, 1, 2, 3]);
    }

    #[test]
    fn test_right_most_part_is_overridden() {
        let data = vec![0u8, 0, 0];
        let mut patch1_data = PatchDataBuilder::new();
        patch1_data.replace(0, 1, &[1, 3]);
        let mut patch1 = PatchList::new(patch1_data.get());

        let mut patch2_data = PatchDataBuilder::new();
        patch2_data.replace(1, 4, &[2, 3, 4]);
        let mut patch2 = PatchList::new(patch2_data.get());

        let patch = patch1.combine(&mut patch2);

        let result = patch.apply(&data);

        assert_eq!(result, vec![1u8, 2, 3, 4]);
    }

    #[test]
    fn test_left_most_part_is_overridden() {
        let data = vec![0u8, 0, 0];
        let mut patch1_data = PatchDataBuilder::new();
        patch1_data.replace(1, 3, &[1, 3, 4]);
        let mut patch1 = PatchList::new(patch1_data.get());

        let mut patch2_data = PatchDataBuilder::new();
        patch2_data.replace(0, 2, &[1, 2]);
        let mut patch2 = PatchList::new(patch2_data.get());

        let patch = patch1.combine(&mut patch2);

        let result = patch.apply(&data);

        assert_eq!(result, vec![1u8, 2, 3, 4]);
    }

    #[test]
    fn test_mid_is_overridden() {
        let data = vec![0u8, 0, 0];
        let mut patch1_data = PatchDataBuilder::new();
        patch1_data.replace(0, 3, &[1, 3, 3, 4]);
        let mut patch1 = PatchList::new(patch1_data.get());

        let mut patch2_data = PatchDataBuilder::new();
        patch2_data.replace(1, 3, &[2, 3]);
        let mut patch2 = PatchList::new(patch2_data.get());

        let patch = patch1.combine(&mut patch2);

        let result = patch.apply(&data);

        assert_eq!(result, vec![1u8, 2, 3, 4]);
    }
}
author	Arseniy Alekseyev <aalekseyev@janestreet.com>
date	Fri, 12 Apr 2024 14:21:14 +0100
parents	b68b19104d16
children	e01e84e5e426