view rust/hg-core/src/copy_tracing.rs @ 46058:12192fdbf3ac

copies-rust: move the parent token to an enum We carry around information about which parent of a revision is being dealt with. So far this was a `usize`, but as we are about to pass it around to more functions it seems like a good idea to start cleaning this up and use a proper enum. Differential Revision: https://phab.mercurial-scm.org/D9420
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Fri, 20 Nov 2020 14:03:40 +0100
parents e0313b0a6f7e
children dacb771f6dd2
line wrap: on
line source

use crate::utils::hg_path::HgPath;
use crate::utils::hg_path::HgPathBuf;
use crate::Revision;

use im_rc::ordmap::DiffItem;
use im_rc::ordmap::OrdMap;

use std::cmp::Ordering;
use std::collections::HashMap;
use std::convert::TryInto;

pub type PathCopies = HashMap<HgPathBuf, HgPathBuf>;

/// A copy source annotated with the revision that recorded it.
///
/// The revision acts as a "timestamp": it is what the merge logic compares
/// (through ancestry checks) when two mappings disagree about a destination.
#[derive(Clone, Debug, PartialEq)]
struct TimeStampedPathCopy {
    /// revision at which the copy information was added
    rev: Revision,
    /// the copy source (set to `None` when the associated key was deleted)
    path: Option<HgPathBuf>,
}

/// Maps a copy destination to its copy source (plus a "timestamp" for the
/// operation).
type TimeStampedPathCopies = OrdMap<HgPathBuf, TimeStampedPathCopy>;

/// Holds, in order: the first parent, the second parent and the relevant
/// file actions of a revision.
pub type RevInfo<'a> = (Revision, Revision, ChangedFiles<'a>);

/// Represents the files affected by a changeset.
///
/// This holds a subset of mercurial.metadata.ChangingFiles as we do not need
/// all the data categories tracked by it.
pub struct ChangedFiles<'a> {
    /// number of entries described by `index`
    nb_items: u32,
    /// fixed-size index entries, one per file
    index: &'a [u8],
    /// concatenated filenames, sliced using the end offsets stored in `index`
    data: &'a [u8],
}

/// Represents the active changes that affect the copy tracing.
enum Action<'a> {
    /// The parent → children edge is removing a file
    ///
    /// (actually, this could be the edge from the other parent, but it does
    /// not matter)
    Removed(&'a HgPath),
    /// The parent → children edge introduces copy information between (dest,
    /// source)
    Copied(&'a HgPath, &'a HgPath),
}

/// This expresses the possible "special" cases we can get in a merge.
///
/// See mercurial/metadata.py for details on these values.
#[derive(PartialEq)]
enum MergeCase {
    /// Merged: file had history on both sides that needed to be merged
    Merged,
    /// Salvaged: file was a candidate for deletion, but survived the merge
    Salvaged,
    /// Normal: not one of the two cases above
    Normal,
}

/// One decoded index entry: (flags, filename, copy source filename), in the
/// order produced by `ChangedFiles::entry`.
type FileChange<'a> = (u8, &'a HgPath, &'a HgPath);

/// Empty byte slice used for both fields of an empty `ChangedFiles`.
const EMPTY: &[u8] = b"";
/// Mask selecting the copy-related bits of an entry's flags byte.
const COPY_MASK: u8 = 3;
/// Value of the copy bits matched when walking the first-parent edge.
const P1_COPY: u8 = 2;
/// Value of the copy bits matched when walking the second-parent edge.
const P2_COPY: u8 = 3;
/// Mask selecting the action-related bits of an entry's flags byte.
const ACTION_MASK: u8 = 28;
/// Value of the action bits reported as `Action::Removed`.
const REMOVED: u8 = 12;
/// Value of the action bits reported as `MergeCase::Merged`.
const MERGED: u8 = 8;
/// Value of the action bits reported as `MergeCase::Salvaged`.
const SALVAGED: u8 = 16;

impl<'a> ChangedFiles<'a> {
    /// Offset of the first index entry; the bytes before it hold the number
    /// of entries as a big-endian `u32`.
    const INDEX_START: usize = 4;
    /// Size in bytes of one index entry: one flags byte followed by two
    /// big-endian `u32` values.
    const ENTRY_SIZE: u32 = 9;
    /// Offset, inside an index entry, of the end offset of its filename.
    const FILENAME_START: u32 = 1;
    /// Offset, inside an index entry, of the index of its copy source.
    const COPY_SOURCE_START: u32 = 5;

    /// Build a `ChangedFiles` view over the encoded `data`.
    ///
    /// `data` starts with the number of entries (4 bytes, big-endian),
    /// followed by that many fixed-size index entries, followed by the
    /// concatenated filenames. Nothing is copied: the returned value borrows
    /// from `data`.
    ///
    /// A payload declaring zero entries is valid and yields an empty
    /// instance.
    ///
    /// # Panics
    ///
    /// Panics if `data` is too short to hold the header, the index or the
    /// filenames it declares, or if the declared index size does not fit in
    /// the integer types used to address it.
    pub fn new(data: &'a [u8]) -> Self {
        assert!(
            data.len() >= 4,
            "data size ({}) is too small to contain the header (4)",
            data.len()
        );
        let nb_items_raw: [u8; 4] = (&data[0..=3])
            .try_into()
            .expect("failed to turn 4 bytes into 4 bytes");
        let nb_items = u32::from_be_bytes(nb_items_raw);

        // Checking the multiplication once here guarantees that every
        // `idx * ENTRY_SIZE` computed by the accessors below (always with
        // `idx < nb_items`) cannot wrap around either.
        let index_size = nb_items
            .checked_mul(Self::ENTRY_SIZE)
            .expect("index size does not fit in 32 bits")
            as usize;
        let index_end = Self::INDEX_START
            .checked_add(index_size)
            .expect("index end does not fit in the address space");

        assert!(
            data.len() >= index_end,
            "data size ({}) is too small to fit the index_data ({})",
            data.len(),
            index_end
        );

        let ret = ChangedFiles {
            nb_items,
            index: &data[Self::INDEX_START..index_end],
            data: &data[index_end..],
        };
        // Without any entry there is no "last filename" to validate (and
        // `nb_items - 1` would underflow).
        if nb_items > 0 {
            let max_data = ret.filename_end(nb_items - 1) as usize;
            assert!(
                ret.data.len() >= max_data,
                "data size ({}) is too small to fit all data ({})",
                data.len(),
                index_end + max_data
            );
        }
        ret
    }

    /// Build an instance that contains no entry at all.
    pub fn new_empty() -> Self {
        ChangedFiles {
            nb_items: 0,
            index: EMPTY,
            data: EMPTY,
        }
    }

    /// internal function to return an individual entry at a given index
    ///
    /// The result is `(flags, filename, copy source filename)`. The copy
    /// source is looked up unconditionally through the copy index stored in
    /// the entry.
    ///
    /// Panics if `idx` is not smaller than the number of entries.
    fn entry(&'a self, idx: u32) -> FileChange<'a> {
        assert!(
            idx < self.nb_items,
            "index for entry is higher that the number of file {} >= {}",
            idx,
            self.nb_items
        );
        let flags = self.flags(idx);
        let filename = self.filename(idx);
        let copy_source = self.filename(self.copy_idx(idx));
        (flags, filename, copy_source)
    }

    /// internal function to return the filename of the entry at a given index
    fn filename(&self, idx: u32) -> &HgPath {
        // A filename starts where the previous one ends; the first one
        // starts at the beginning of the data section.
        let start = match idx {
            0 => 0,
            _ => self.filename_end(idx - 1) as usize,
        };
        let end = self.filename_end(idx) as usize;
        HgPath::new(&self.data[start..end])
    }

    /// internal function to return the flag field of the entry at a given
    /// index
    fn flags(&self, idx: u32) -> u8 {
        let idx = idx as usize;
        self.index[idx * (Self::ENTRY_SIZE as usize)]
    }

    /// internal function to read the big-endian `u32` stored at `offset`
    /// (in bytes) in the index
    fn index_u32(&self, offset: u32) -> u32 {
        let start = offset as usize;
        let raw: [u8; 4] = (&self.index[start..start + 4])
            .try_into()
            .expect("failed to turn 4 bytes into 4 bytes");
        u32::from_be_bytes(raw)
    }

    /// internal function to return the end of a filename part at a given index
    fn filename_end(&self, idx: u32) -> u32 {
        self.index_u32((idx * Self::ENTRY_SIZE) + Self::FILENAME_START)
    }

    /// internal function to return index of the copy source of the entry at a
    /// given index
    fn copy_idx(&self, idx: u32) -> u32 {
        self.index_u32((idx * Self::ENTRY_SIZE) + Self::COPY_SOURCE_START)
    }

    /// Return an iterator over all the `Action` in this instance.
    ///
    /// `parent` selects which parent's copy records are reported.
    fn iter_actions(&self, parent: Parent) -> ActionsIterator {
        ActionsIterator {
            changes: self,
            parent,
            current: 0,
        }
    }

    /// return the MergeCase value associated with a filename
    ///
    /// Files that are not listed at all are reported as `MergeCase::Normal`.
    ///
    /// The lookup is a binary search: it assumes the entries are sorted by
    /// filename in ascending order (the encoder lives outside this file —
    /// confirm against it if the format changes).
    fn get_merge_case(&self, path: &HgPath) -> MergeCase {
        // Search the half-open range `low_part..high_part`.
        let mut low_part = 0;
        let mut high_part = self.nb_items;

        while low_part < high_part {
            let cursor = low_part + (high_part - low_part) / 2;
            let (flags, filename, _source) = self.entry(cursor);
            match path.cmp(filename) {
                // `path` sorts before the probed entry: keep the lower half
                Ordering::Less => high_part = cursor,
                // `path` sorts after the probed entry: keep the upper half
                Ordering::Greater => low_part = cursor + 1,
                Ordering::Equal => {
                    return match flags & ACTION_MASK {
                        MERGED => MergeCase::Merged,
                        SALVAGED => MergeCase::Salvaged,
                        _ => MergeCase::Normal,
                    };
                }
            }
        }
        MergeCase::Normal
    }
}

/// Answers "is X an ancestor of Y" queries quickly.
///
/// Trivial queries are answered directly; every other query is forwarded to
/// the wrapped callback once, and its answer is remembered for later calls.
#[derive(Debug)]
struct AncestorOracle<'a, A: Fn(Revision, Revision) -> bool> {
    /// callback computing the actual ancestry relation
    inner: &'a A,
    /// answers already obtained from `inner`, keyed by `(anc, desc)`
    pairs: HashMap<(Revision, Revision), bool>,
}

impl<'a, A: Fn(Revision, Revision) -> bool> AncestorOracle<'a, A> {
    /// Wrap `func` with an initially empty cache.
    fn new(func: &'a A) -> Self {
        AncestorOracle {
            inner: func,
            pairs: HashMap::new(),
        }
    }

    /// returns `true` if `anc` is an ancestor of `desc`, `false` otherwise
    fn is_ancestor(&mut self, anc: Revision, desc: Revision) -> bool {
        // A higher revision is never reported as an ancestor of a lower one.
        if anc > desc {
            return false;
        }
        // A revision is always reported as an ancestor of itself.
        if anc == desc {
            return true;
        }
        // Copy the callback reference out first so the closure below does
        // not need to borrow `self` while the cache is mutably borrowed.
        let ask = self.inner;
        *self
            .pairs
            .entry((anc, desc))
            .or_insert_with(|| ask(anc, desc))
    }
}

/// Iterator over the `Action`s recorded in a `ChangedFiles`, as seen from
/// one of the two parents.
struct ActionsIterator<'a> {
    /// the entries being walked
    changes: &'a ChangedFiles<'a>,
    /// the parent whose copy records are reported
    parent: Parent,
    /// index of the next entry to inspect
    current: u32,
}

impl<'a> Iterator for ActionsIterator<'a> {
    type Item = Action<'a>;

    /// Advance to the next entry that is either a removal or a copy recorded
    /// for the selected parent; every other entry is skipped.
    fn next(&mut self) -> Option<Action<'a>> {
        let wanted_copy = match self.parent {
            Parent::FirstParent => P1_COPY,
            Parent::SecondParent => P2_COPY,
        };
        loop {
            if self.current >= self.changes.nb_items {
                break None;
            }
            let (flags, file, source) = self.changes.entry(self.current);
            self.current += 1;
            // Removals are reported whichever parent is selected.
            if flags & ACTION_MASK == REMOVED {
                break Some(Action::Removed(file));
            }
            if flags & COPY_MASK == wanted_copy {
                break Some(Action::Copied(file, source));
            }
        }
    }
}

/// A small struct whose purpose is to ensure the lifetime of the bytes
/// referenced by a ChangedFiles.
///
/// It is passed to the RevInfoMaker callback, which can assign any necessary
/// content to the `data` attribute. The copy tracing code is responsible for
/// keeping the DataHolder alive at least as long as the ChangedFiles object.
pub struct DataHolder<D> {
    /// The RevInfoMaker callback should assign the data referenced by the
    /// ChangedFiles struct it returns to this attribute. The DataHolder
    /// lifetime will be at least as long as the ChangedFiles one.
    pub data: Option<D>,
}

/// Callback returning the `RevInfo` of a revision.
///
/// The returned `RevInfo` may borrow from the `DataHolder` it is given, as
/// both share the `'r` lifetime.
pub type RevInfoMaker<'a, D> =
    Box<dyn for<'r> Fn(Revision, &'r mut DataHolder<D>) -> RevInfo<'r> + 'a>;

/// Enum used to carry information about which parent → child edge is
/// currently being processed.
#[derive(Copy, Clone, Debug)]
enum Parent {
    /// The `p1(x) → x` edge
    FirstParent,
    /// The `p2(x) → x` edge
    SecondParent,
}

/// Same as mercurial.copies._combine_changeset_copies, but in Rust.
///
/// Arguments are:
///
/// revs: all revisions to be considered
/// children: a {parent ? [childrens]} mapping
/// target_rev: the final revision we are combining copies to
/// rev_info(rev): callback to get revision information:
///   * first parent
///   * second parent
///   * ChangedFiles
/// isancestors(low_rev, high_rev): callback to check if a revision is an
///                                 ancestor of another
pub fn combine_changeset_copies<A: Fn(Revision, Revision) -> bool, D>(
    revs: Vec<Revision>,
    children: HashMap<Revision, Vec<Revision>>,
    target_rev: Revision,
    rev_info: RevInfoMaker<D>,
    is_ancestor: &A,
) -> PathCopies {
    let mut all_copies = HashMap::new();
    let mut oracle = AncestorOracle::new(is_ancestor);

    for rev in revs {
        // Retrieve data computed in a previous iteration
        let copies = all_copies.remove(&rev);
        let copies = match copies {
            Some(c) => c,
            None => TimeStampedPathCopies::default(), // root of the walked set
        };

        let current_children = match children.get(&rev) {
            Some(c) => c,
            None => panic!("inconsistent `revs` and `children`"),
        };

        for child in current_children {
            // We will chain the copies information accumulated for `rev` with
            // the individual copies information for each of its children.
            // Creating a new PathCopies for each `rev` → `children` vertex.
            let mut d: DataHolder<D> = DataHolder { data: None };
            let (p1, p2, changes) = rev_info(*child, &mut d);

            let parent = if rev == p1 {
                Parent::FirstParent
            } else {
                assert_eq!(rev, p2);
                Parent::SecondParent
            };
            let mut new_copies = copies.clone();

            for action in changes.iter_actions(parent) {
                match action {
                    Action::Copied(dest, source) => {
                        let entry;
                        if let Some(v) = copies.get(source) {
                            entry = match &v.path {
                                Some(path) => Some((*(path)).to_owned()),
                                None => Some(source.to_owned()),
                            }
                        } else {
                            entry = Some(source.to_owned());
                        }
                        // Each new entry is introduced by the children, we
                        // record this information as we will need it to take
                        // the right decision when merging conflicting copy
                        // information. See merge_copies_dict for details.
                        let ttpc = TimeStampedPathCopy {
                            rev: *child,
                            path: entry,
                        };
                        new_copies.insert(dest.to_owned(), ttpc);
                    }
                    Action::Removed(f) => {
                        // We must drop copy information for removed file.
                        //
                        // We need to explicitly record them as dropped to
                        // propagate this information when merging two
                        // TimeStampedPathCopies object.
                        if new_copies.contains_key(f.as_ref()) {
                            let ttpc = TimeStampedPathCopy {
                                rev: *child,
                                path: None,
                            };
                            new_copies.insert(f.to_owned(), ttpc);
                        }
                    }
                }
            }

            // Merge has two parents needs to combines their copy information.
            //
            // If the vertex from the other parent was already processed, we
            // will have a value for the child ready to be used. We need to
            // grab it and combine it with the one we already
            // computed. If not we can simply store the newly
            // computed data. The processing happening at
            // the time of the second parent will take care of combining the
            // two TimeStampedPathCopies instance.
            match all_copies.remove(child) {
                None => {
                    all_copies.insert(child, new_copies);
                }
                Some(other_copies) => {
                    let (minor, major) = match parent {
                        Parent::FirstParent => (other_copies, new_copies),
                        Parent::SecondParent => (new_copies, other_copies),
                    };
                    let merged_copies =
                        merge_copies_dict(minor, major, &changes, &mut oracle);
                    all_copies.insert(child, merged_copies);
                }
            };
        }
    }

    // Drop internal information (like the timestamp) and return the final
    // mapping.
    let tt_result = all_copies
        .remove(&target_rev)
        .expect("target revision was not processed");
    let mut result = PathCopies::default();
    for (dest, tt_source) in tt_result {
        if let Some(path) = tt_source.path {
            result.insert(dest, path);
        }
    }
    result
}

/// Merge two copy mappings together, `minor` and `major`.
///
/// In case of conflict, the value from `major` is picked, except in a few
/// special cases. See the inline documentation for details.
///
/// `changes` provides the merge case of each conflicting destination and
/// `oracle` answers the ancestry questions between the revisions that
/// recorded the conflicting entries.
// The decision chain below deliberately keeps distinct branches that lead to
// the same pick, so that each case can carry its own explanation.
#[allow(clippy::if_same_then_else)]
fn merge_copies_dict<A: Fn(Revision, Revision) -> bool>(
    minor: TimeStampedPathCopies,
    major: TimeStampedPathCopies,
    changes: &ChangedFiles,
    oracle: &mut AncestorOracle<A>,
) -> TimeStampedPathCopies {
    // Nothing to arbitrate when one side is empty.
    if minor.is_empty() {
        return major;
    } else if major.is_empty() {
        return minor;
    }
    // Entries that would have to be written into `minor` (respectively
    // `major`) to turn it into the merged result.
    let mut override_minor = Vec::new();
    let mut override_major = Vec::new();

    let mut to_major = |k: &HgPathBuf, v: &TimeStampedPathCopy| {
        override_major.push((k.clone(), v.clone()))
    };
    let mut to_minor = |k: &HgPathBuf, v: &TimeStampedPathCopy| {
        override_minor.push((k.clone(), v.clone()))
    };

    // The diff function leverages the detection of identical subparts when
    // minor and major have some common ancestors. This makes it very fast in
    // most cases.
    //
    // In cases where the two maps are vastly different in size, the current
    // approach is still slowish because the iteration will go over all the
    // "exclusive" content of the larger one. This situation can be frequent
    // when the subgraph of revisions we are processing has a lot of roots,
    // each root adding its own fully new map to the mix (and likely a small
    // map, if the path from the root to the "main path" is small).
    //
    // We could do better by detecting such situations and processing them
    // differently.
    for d in minor.diff(&major) {
        match d {
            // entry queued for `minor` without arbitration
            DiffItem::Add(k, v) => to_minor(k, v),
            // entry queued for `major` without arbitration
            DiffItem::Remove(k, v) => to_major(k, v),
            // both sides have a different entry for the key: arbitrate
            DiffItem::Update { old, new } => {
                let (dest, src_major) = new;
                let (_, src_minor) = old;
                // Picking a side means overriding the *other* side with it.
                let mut pick_minor = || (to_major(dest, src_minor));
                let mut pick_major = || (to_minor(dest, src_major));
                if src_major.path == src_minor.path {
                    // we have the same value, but from a different source;
                    if src_major.rev == src_minor.rev {
                        // If the two entries are identical, there is no need
                        // to do anything (but diff should not have yielded
                        // them)
                        unreachable!();
                    } else if oracle.is_ancestor(src_major.rev, src_minor.rev)
                    {
                        pick_minor();
                    } else {
                        pick_major();
                    }
                } else if src_major.rev == src_minor.rev {
                    // We cannot get copy information for both p1 and p2 in the
                    // same rev. So this is the same value.
                    unreachable!();
                } else {
                    let action = changes.get_merge_case(&dest);
                    if src_major.path.is_none()
                        && action == MergeCase::Salvaged
                    {
                        // If the file is "deleted" in the major side but was
                        // salvaged by the merge, we keep the minor side alive
                        pick_minor();
                    } else if src_minor.path.is_none()
                        && action == MergeCase::Salvaged
                    {
                        // If the file is "deleted" in the minor side but was
                        // salvaged by the merge, unconditionally preserve the
                        // major side.
                        pick_major();
                    } else if action == MergeCase::Merged {
                        // If the file was actively merged, copy information
                        // from each side might conflict.  The major side will
                        // win such a conflict.
                        pick_major();
                    } else if oracle.is_ancestor(src_major.rev, src_minor.rev)
                    {
                        // If the minor side is strictly newer than the major
                        // side, it should be kept.
                        pick_minor();
                    } else if src_major.path.is_some() {
                        // without any special case, the "major" value wins
                        // over the "minor" one.
                        pick_major();
                    } else if oracle.is_ancestor(src_minor.rev, src_major.rev)
                    {
                        // the "minor" rev is an ancestor of the "major" one,
                        // so the (different) major value should overwrite it
                        pick_major();
                    } else {
                        // The major version is None (so the file was deleted
                        // on that branch) and that branch is independent
                        // (neither minor nor major is an ancestor of the
                        // other one). We preserve the new information about
                        // the new file.
                        pick_minor();
                    }
                }
            }
        };
    }

    // Reuse one of the two input maps as the result: when one side needs no
    // override it is returned as is, otherwise the side needing the fewest
    // overrides is patched.
    let updates;
    let mut result;
    if override_major.is_empty() {
        result = major
    } else if override_minor.is_empty() {
        result = minor
    } else {
        if override_minor.len() < override_major.len() {
            updates = override_minor;
            result = minor;
        } else {
            updates = override_major;
            result = major;
        }
        for (k, v) in updates {
            result.insert(k, v);
        }
    }
    result
}