copies-rust: extract generic map merge logic from merge_copies_dict
authorSimon Sapin <simon.sapin@octobus.net>
Wed, 23 Dec 2020 11:48:16 +0100
changeset 46586 435d9fc72646
parent 46585 60b2b7ecf9cb
child 46587 cb4b0b0c6de4
copies-rust: extract generic map merge logic from merge_copies_dict This deduplicates the copy-tracing-specific logic Differential Revision: https://phab.mercurial-scm.org/D9682
rust/hg-core/src/copy_tracing.rs
rust/hg-core/src/utils.rs
--- a/rust/hg-core/src/copy_tracing.rs	Mon Dec 21 12:34:59 2020 +0100
+++ b/rust/hg-core/src/copy_tracing.rs	Wed Dec 23 11:48:16 2020 +0100
@@ -3,7 +3,6 @@
 use crate::Revision;
 use crate::NULL_REVISION;
 
-use im_rc::ordmap::DiffItem;
 use im_rc::ordmap::Entry;
 use im_rc::ordmap::OrdMap;
 use im_rc::OrdSet;
@@ -624,210 +623,40 @@
 fn merge_copies_dict(
     path_map: &TwoWayPathMap,
     current_merge: Revision,
-    mut minor: InternalPathCopies,
-    mut major: InternalPathCopies,
+    minor: InternalPathCopies,
+    major: InternalPathCopies,
     changes: &ChangedFiles,
 ) -> InternalPathCopies {
-    // This closure exist as temporary help while multiple developper are
-    // actively working on this code. Feel free to re-inline it once this
-    // code is more settled.
-    let cmp_value =
-        |dest: &PathToken, src_minor: &CopySource, src_major: &CopySource| {
-            compare_value(
-                path_map,
+    use crate::utils::{ordmap_union_with_merge, MergeResult};
+
+    ordmap_union_with_merge(minor, major, |dest, src_minor, src_major| {
+        let (pick, overwrite) = compare_value(
+            path_map,
+            current_merge,
+            changes,
+            dest,
+            src_minor,
+            src_major,
+        );
+        if overwrite {
+            let (winner, loser) = match pick {
+                MergePick::Major | MergePick::Any => (src_major, src_minor),
+                MergePick::Minor => (src_minor, src_major),
+            };
+            MergeResult::UseNewValue(CopySource::new_from_merge(
                 current_merge,
-                changes,
-                dest,
-                src_minor,
-                src_major,
-            )
-        };
-    if minor.is_empty() {
-        major
-    } else if major.is_empty() {
-        minor
-    } else if minor.len() * 2 < major.len() {
-        // Lets says we are merging two InternalPathCopies instance A and B.
-        //
-        // If A contains N items, the merge result will never contains more
-        // than N values differents than the one in A
-        //
-        // If B contains M items, with M > N, the merge result will always
-        // result in a minimum of M - N value differents than the on in
-        // A
-        //
-        // As a result, if N < (M-N), we know that simply iterating over A will
-        // yield less difference than iterating over the difference
-        // between A and B.
-        //
-        // This help performance a lot in case were a tiny
-        // InternalPathCopies is merged with a much larger one.
-        for (dest, src_minor) in minor {
-            let src_major = major.get(&dest);
-            match src_major {
-                None => {
-                    major.insert(dest, src_minor);
-                }
-                Some(src_major) => {
-                    let (pick, overwrite) =
-                        cmp_value(&dest, &src_minor, src_major);
-                    if overwrite {
-                        let src = match pick {
-                            MergePick::Major => CopySource::new_from_merge(
-                                current_merge,
-                                src_major,
-                                &src_minor,
-                            ),
-                            MergePick::Minor => CopySource::new_from_merge(
-                                current_merge,
-                                &src_minor,
-                                src_major,
-                            ),
-                            MergePick::Any => CopySource::new_from_merge(
-                                current_merge,
-                                src_major,
-                                &src_minor,
-                            ),
-                        };
-                        major.insert(dest, src);
-                    } else {
-                        match pick {
-                            MergePick::Any | MergePick::Major => None,
-                            MergePick::Minor => major.insert(dest, src_minor),
-                        };
-                    }
-                }
-            };
-        }
-        major
-    } else if major.len() * 2 < minor.len() {
-        // This use the same rational than the previous block.
-        // (Check previous block documentation for details.)
-        for (dest, src_major) in major {
-            let src_minor = minor.get(&dest);
-            match src_minor {
-                None => {
-                    minor.insert(dest, src_major);
+                winner,
+                loser,
+            ))
+        } else {
+            match pick {
+                MergePick::Any | MergePick::Major => {
+                    MergeResult::UseRightValue
                 }
-                Some(src_minor) => {
-                    let (pick, overwrite) =
-                        cmp_value(&dest, src_minor, &src_major);
-                    if overwrite {
-                        let src = match pick {
-                            MergePick::Major => CopySource::new_from_merge(
-                                current_merge,
-                                &src_major,
-                                src_minor,
-                            ),
-                            MergePick::Minor => CopySource::new_from_merge(
-                                current_merge,
-                                src_minor,
-                                &src_major,
-                            ),
-                            MergePick::Any => CopySource::new_from_merge(
-                                current_merge,
-                                &src_major,
-                                src_minor,
-                            ),
-                        };
-                        minor.insert(dest, src);
-                    } else {
-                        match pick {
-                            MergePick::Any | MergePick::Minor => None,
-                            MergePick::Major => minor.insert(dest, src_major),
-                        };
-                    }
-                }
-            };
-        }
-        minor
-    } else {
-        let mut override_minor = Vec::new();
-        let mut override_major = Vec::new();
-
-        let mut to_major = |k: &PathToken, v: &CopySource| {
-            override_major.push((k.clone(), v.clone()))
-        };
-        let mut to_minor = |k: &PathToken, v: &CopySource| {
-            override_minor.push((k.clone(), v.clone()))
-        };
-
-        // The diff function leverage detection of the identical subpart if
-        // minor and major has some common ancestors. This make it very
-        // fast is most case.
-        //
-        // In case where the two map are vastly different in size, the current
-        // approach is still slowish because the iteration will iterate over
-        // all the "exclusive" content of the larger on. This situation can be
-        // frequent when the subgraph of revision we are processing has a lot
-        // of roots. Each roots adding they own fully new map to the mix (and
-        // likely a small map, if the path from the root to the "main path" is
-        // small.
-        //
-        // We could do better by detecting such situation and processing them
-        // differently.
-        for d in minor.diff(&major) {
-            match d {
-                DiffItem::Add(k, v) => to_minor(k, v),
-                DiffItem::Remove(k, v) => to_major(k, v),
-                DiffItem::Update { old, new } => {
-                    let (dest, src_major) = new;
-                    let (_, src_minor) = old;
-                    let (pick, overwrite) =
-                        cmp_value(dest, src_minor, src_major);
-                    if overwrite {
-                        let src = match pick {
-                            MergePick::Major => CopySource::new_from_merge(
-                                current_merge,
-                                src_major,
-                                src_minor,
-                            ),
-                            MergePick::Minor => CopySource::new_from_merge(
-                                current_merge,
-                                src_minor,
-                                src_major,
-                            ),
-                            MergePick::Any => CopySource::new_from_merge(
-                                current_merge,
-                                src_major,
-                                src_minor,
-                            ),
-                        };
-                        to_minor(dest, &src);
-                        to_major(dest, &src);
-                    } else {
-                        match pick {
-                            MergePick::Major => to_minor(dest, src_major),
-                            MergePick::Minor => to_major(dest, src_minor),
-                            // If the two entry are identical, no need to do
-                            // anything (but diff should not have yield them)
-                            MergePick::Any => unreachable!(),
-                        }
-                    }
-                }
-            };
-        }
-
-        let updates;
-        let mut result;
-        if override_major.is_empty() {
-            result = major
-        } else if override_minor.is_empty() {
-            result = minor
-        } else {
-            if override_minor.len() < override_major.len() {
-                updates = override_minor;
-                result = minor;
-            } else {
-                updates = override_major;
-                result = major;
-            }
-            for (k, v) in updates {
-                result.insert(k, v);
+                MergePick::Minor => MergeResult::UseLeftValue,
             }
         }
-        result
-    }
+    })
 }
 
 /// represent the side that should prevail when merging two
--- a/rust/hg-core/src/utils.rs	Mon Dec 21 12:34:59 2020 +0100
+++ b/rust/hg-core/src/utils.rs	Wed Dec 23 11:48:16 2020 +0100
@@ -9,6 +9,8 @@
 
 use crate::errors::{HgError, IoErrorContext};
 use crate::utils::hg_path::HgPath;
+use im_rc::ordmap::DiffItem;
+use im_rc::ordmap::OrdMap;
 use std::{io::Write, ops::Deref};
 
 pub mod files;
@@ -199,3 +201,151 @@
         context: IoErrorContext::CurrentExe,
     })
 }
+
+pub(crate) enum MergeResult<V> {
+    UseLeftValue,
+    UseRightValue,
+    UseNewValue(V),
+}
+
+/// Return the union of the two given maps,
+/// calling `merge(key, left_value, right_value)` to resolve keys that exist in
+/// both.
+///
+/// CC https://github.com/bodil/im-rs/issues/166
+pub(crate) fn ordmap_union_with_merge<K, V>(
+    left: OrdMap<K, V>,
+    right: OrdMap<K, V>,
+    mut merge: impl FnMut(&K, &V, &V) -> MergeResult<V>,
+) -> OrdMap<K, V>
+where
+    K: Clone + Ord,
+    V: Clone + PartialEq,
+{
+    if left.ptr_eq(&right) {
+        // One of the two maps is an unmodified clone of the other
+        left
+    } else if left.len() / 2 > right.len() {
+        // When two maps have different sizes,
+        // their size difference is a lower bound on
+        // how many keys of the larger map are not also in the smaller map.
+        // This in turn is a lower bound on the number of differences in
+        // `OrdMap::diff` and the "amount of work" that would be done
+        // by `ordmap_union_with_merge_by_diff`.
+        //
+        // Here `left` is more than twice the size of `right`,
+        // so the number of differences is more than the total size of
+        // `right`. Therefore an algorithm based on iterating `right`
+        // is more efficient.
+        //
+        // This helps a lot when a tiny (or empty) map is merged
+        // with a large one.
+        ordmap_union_with_merge_by_iter(left, right, merge)
+    } else if left.len() < right.len() / 2 {
+        // Same as above but with `left` and `right` swapped
+        ordmap_union_with_merge_by_iter(right, left, |key, a, b| {
+            // Also swapped in `merge` arguments:
+            match merge(key, b, a) {
+                MergeResult::UseNewValue(v) => MergeResult::UseNewValue(v),
+                // … and swap back in `merge` result:
+                MergeResult::UseLeftValue => MergeResult::UseRightValue,
+                MergeResult::UseRightValue => MergeResult::UseLeftValue,
+            }
+        })
+    } else {
+        // For maps of similar size, use the algorithm based on `OrdMap::diff`
+        ordmap_union_with_merge_by_diff(left, right, merge)
+    }
+}
+
+/// Efficient if `right` is much smaller than `left`
+fn ordmap_union_with_merge_by_iter<K, V>(
+    mut left: OrdMap<K, V>,
+    right: OrdMap<K, V>,
+    mut merge: impl FnMut(&K, &V, &V) -> MergeResult<V>,
+) -> OrdMap<K, V>
+where
+    K: Clone + Ord,
+    V: Clone,
+{
+    for (key, right_value) in right {
+        match left.get(&key) {
+            None => {
+                left.insert(key, right_value);
+            }
+            Some(left_value) => match merge(&key, left_value, &right_value) {
+                MergeResult::UseLeftValue => {}
+                MergeResult::UseRightValue => {
+                    left.insert(key, right_value);
+                }
+                MergeResult::UseNewValue(new_value) => {
+                    left.insert(key, new_value);
+                }
+            },
+        }
+    }
+    left
+}
+
+/// Fallback when both maps are of similar size
+fn ordmap_union_with_merge_by_diff<K, V>(
+    mut left: OrdMap<K, V>,
+    mut right: OrdMap<K, V>,
+    mut merge: impl FnMut(&K, &V, &V) -> MergeResult<V>,
+) -> OrdMap<K, V>
+where
+    K: Clone + Ord,
+    V: Clone + PartialEq,
+{
+    // (key, value) pairs that would need to be inserted in either map
+    // in order to turn it into the union.
+    //
+    // TODO: if/when https://github.com/bodil/im-rs/pull/168 is accepted,
+    // change these from `Vec<(K, V)>` to `Vec<(&K, Cow<V>)>`
+    // with `left_updates` only borrowing from `right` and `right_updates` from
+    // `left`, and with `Cow::Owned` used for `MergeResult::UseNewValue`.
+    //
+    // This would allow moving all `.clone()` calls to after we’ve decided
+    // which of `right_updates` or `left_updates` to use
+    // (value ones becoming `Cow::into_owned`),
+    // and avoid making clones we don’t end up using.
+    let mut left_updates = Vec::new();
+    let mut right_updates = Vec::new();
+
+    for difference in left.diff(&right) {
+        match difference {
+            DiffItem::Add(key, value) => {
+                left_updates.push((key.clone(), value.clone()))
+            }
+            DiffItem::Remove(key, value) => {
+                right_updates.push((key.clone(), value.clone()))
+            }
+            DiffItem::Update {
+                old: (key, left_value),
+                new: (_, right_value),
+            } => match merge(key, left_value, right_value) {
+                MergeResult::UseLeftValue => {
+                    right_updates.push((key.clone(), left_value.clone()))
+                }
+                MergeResult::UseRightValue => {
+                    left_updates.push((key.clone(), right_value.clone()))
+                }
+                MergeResult::UseNewValue(new_value) => {
+                    left_updates.push((key.clone(), new_value.clone()));
+                    right_updates.push((key.clone(), new_value))
+                }
+            },
+        }
+    }
+    if left_updates.len() < right_updates.len() {
+        for (key, value) in left_updates {
+            left.insert(key, value);
+        }
+        left
+    } else {
+        for (key, value) in right_updates {
+            right.insert(key, value);
+        }
+        right
+    }
+}