changeset 42738:8041a1b45163

rust-discovery: takefullsample() core implementation take_full_sample() browses the undecided set in both directions: from its roots as well as from its heads. Following what's done on the Python side, we alter update_sample() signature to take a closure returning an iterator: either ParentsIterator or an iterator over the children found in `children_cache`. These constructs should probably be split off in a separate module. This is a first concrete example where a more abstract graph notion (probably a trait) would be useful, as this is nothing but an operation on the reversed DAG. A similar motivation in the context of the discovery process would be to replace the call to dagops::range in `add_missing_revisions()` with a simple iteration over descendents, again an operation on the reversed graph. Differential Revision: https://phab.mercurial-scm.org/D6424
author Georges Racinet <georges.racinet@octobus.net>
date Fri, 17 May 2019 01:56:57 +0200
parents 388622cbc911
children 1c4b5689bef5
files rust/hg-core/src/discovery.rs rust/hg-cpython/src/discovery.rs
diffstat 2 files changed, 235 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/rust/hg-core/src/discovery.rs	Fri May 17 01:56:56 2019 +0200
+++ b/rust/hg-core/src/discovery.rs	Fri May 17 01:56:57 2019 +0200
@@ -17,6 +17,7 @@
 use super::{Graph, GraphError, Revision, NULL_REVISION};
 use crate::ancestors::MissingAncestors;
 use crate::dagops;
+use std::cmp::{max, min};
 use std::collections::{HashMap, HashSet, VecDeque};
 
 type Rng = self::rand_pcg::Pcg32;
@@ -26,8 +27,10 @@
     graph: G, // plays the role of self._repo
     common: MissingAncestors<G>,
     undecided: Option<HashSet<Revision>>,
+    children_cache: Option<HashMap<Revision, Vec<Revision>>>,
     missing: HashSet<Revision>,
     rng: Rng,
+    respect_size: bool,
 }
 
 pub struct DiscoveryStats {
@@ -49,13 +52,16 @@
 /// - `sample`: a sample to update
 /// - `parentfn`: a callable to resolve parents for a revision
 /// - `quicksamplesize`: optional target size of the sample
-fn update_sample(
+fn update_sample<I>(
     revs: Option<&HashSet<Revision>>,
     heads: impl IntoIterator<Item = Revision>,
     sample: &mut HashSet<Revision>,
-    parentsfn: impl Fn(Revision) -> Result<[Revision; 2], GraphError>,
+    parentsfn: impl Fn(Revision) -> Result<I, GraphError>,
     quicksamplesize: Option<usize>,
-) -> Result<(), GraphError> {
+) -> Result<(), GraphError>
+where
+    I: Iterator<Item = Revision>,
+{
     let mut distances: HashMap<Revision, u32> = HashMap::new();
     let mut visit: VecDeque<Revision> = heads.into_iter().collect();
     let mut factor: u32 = 1;
@@ -83,10 +89,7 @@
                 }
             }
         }
-        for &p in &parentsfn(current)? {
-            if p == NULL_REVISION {
-                continue;
-            }
+        for p in parentsfn(current)? {
             if let Some(revs) = revs {
                 if !revs.contains(&p) {
                     continue;
@@ -99,6 +102,39 @@
     Ok(())
 }
 
+struct ParentsIterator {
+    parents: [Revision; 2],
+    cur: usize,
+}
+
+impl ParentsIterator {
+    fn graph_parents(
+        graph: &impl Graph,
+        r: Revision,
+    ) -> Result<ParentsIterator, GraphError> {
+        Ok(ParentsIterator {
+            parents: graph.parents(r)?,
+            cur: 0,
+        })
+    }
+}
+
+impl Iterator for ParentsIterator {
+    type Item = Revision;
+
+    fn next(&mut self) -> Option<Revision> {
+        if self.cur > 1 {
+            return None;
+        }
+        let rev = self.parents[self.cur];
+        self.cur += 1;
+        if rev == NULL_REVISION {
+            return self.next();
+        }
+        Some(rev)
+    }
+}
+
 impl<G: Graph + Clone> PartialDiscovery<G> {
     /// Create a PartialDiscovery object, with the intent
     /// of comparing our `::<target_heads>` revset to the contents of another
@@ -110,24 +146,36 @@
     /// If we want to make the signature more flexible,
     /// we'll have to make it a type argument of `PartialDiscovery` or a trait
     /// object since we'll keep it in the meanwhile
-    pub fn new(graph: G, target_heads: Vec<Revision>) -> Self {
+    ///
+    /// The `respect_size` boolean controls how the sampling methods
+    /// will interpret the size argument requested by the caller. If it's
+    /// `false`, they are allowed to produce a sample whose size is more
+    /// appropriate to the situation (typically bigger).
+    pub fn new(
+        graph: G,
+        target_heads: Vec<Revision>,
+        respect_size: bool,
+    ) -> Self {
         let mut seed: [u8; 16] = [0; 16];
         thread_rng().fill_bytes(&mut seed);
-        Self::new_with_seed(graph, target_heads, seed)
+        Self::new_with_seed(graph, target_heads, seed, respect_size)
     }
 
     pub fn new_with_seed(
         graph: G,
         target_heads: Vec<Revision>,
         seed: [u8; 16],
+        respect_size: bool,
     ) -> Self {
         PartialDiscovery {
             undecided: None,
+            children_cache: None,
             target_heads: Some(target_heads),
             graph: graph.clone(),
             common: MissingAncestors::new(graph, vec![]),
             missing: HashSet::new(),
             rng: Rng::from_seed(seed),
+            respect_size: respect_size,
         }
     }
 
@@ -228,6 +276,22 @@
         Ok(())
     }
 
+    fn ensure_children_cache(&mut self) -> Result<(), GraphError> {
+        if self.children_cache.is_some() {
+            return Ok(());
+        }
+        self.ensure_undecided()?;
+
+        let mut children: HashMap<Revision, Vec<Revision>> = HashMap::new();
+        for &rev in self.undecided.as_ref().unwrap() {
+            for p in ParentsIterator::graph_parents(&self.graph, rev)? {
+                children.entry(p).or_insert_with(|| Vec::new()).push(rev);
+            }
+        }
+        self.children_cache = Some(children);
+        Ok(())
+    }
+
     /// Provide statistics about the current state of the discovery process
     pub fn stats(&self) -> DiscoveryStats {
         DiscoveryStats {
@@ -255,11 +319,114 @@
             None,
             headrevs,
             &mut sample,
-            |r| self.graph.parents(r),
+            |r| ParentsIterator::graph_parents(&self.graph, r),
             Some(size),
         )?;
         Ok(sample.into_iter().collect())
     }
+
+    /// Extract a sample from `self.undecided`, going from its heads and roots.
+    ///
+    /// The `size` parameter is used to avoid useless computations if
+    /// it turns out to be bigger than the whole set of undecided Revisions.
+    ///
+    /// The sample is taken by using `update_sample` from the heads, then
+    /// from the roots, working on the reverse DAG,
+    /// expressed by `self.children_cache`.
+    ///
+    /// No effort is being made to complete or limit the sample to `size`
+    /// but this method returns another interesting size that it derives
+    /// from its knowledge of the structure of the various sets, leaving
+    /// to the caller the decision to use it or not.
+    fn bidirectional_sample(
+        &mut self,
+        size: usize,
+    ) -> Result<(HashSet<Revision>, usize), GraphError> {
+        self.ensure_undecided()?;
+        {
+            // we don't want to compute children_cache before this
+            // but doing it after extracting self.undecided takes a mutable
+            // ref to self while a shareable one is still active.
+            let undecided = self.undecided.as_ref().unwrap();
+            if undecided.len() <= size {
+                return Ok((undecided.clone(), size));
+            }
+        }
+
+        self.ensure_children_cache()?;
+        let revs = self.undecided.as_ref().unwrap();
+        let mut sample: HashSet<Revision> = revs.clone();
+
+        // it's possible that leveraging the children cache would be more
+        // efficient here
+        dagops::retain_heads(&self.graph, &mut sample)?;
+        let revsheads = sample.clone(); // was again heads(revs) in python
+
+        // update from heads
+        update_sample(
+            Some(revs),
+            revsheads.iter().cloned(),
+            &mut sample,
+            |r| ParentsIterator::graph_parents(&self.graph, r),
+            None,
+        )?;
+
+        // update from roots
+        let revroots: HashSet<Revision> =
+            dagops::roots(&self.graph, revs)?.into_iter().collect();
+        let prescribed_size = max(size, min(revroots.len(), revsheads.len()));
+
+        let children = self.children_cache.as_ref().unwrap();
+        let empty_vec: Vec<Revision> = Vec::new();
+        update_sample(
+            Some(revs),
+            revroots,
+            &mut sample,
+            |r| Ok(children.get(&r).unwrap_or(&empty_vec).iter().cloned()),
+            None,
+        )?;
+        Ok((sample, prescribed_size))
+    }
+
+    /// Fill up sample up to the wished size with random undecided Revisions.
+    ///
+    /// This is intended to be used as a last resort completion if the
+    /// regular sampling algorithm returns too few elements.
+    fn random_complete_sample(
+        &mut self,
+        sample: &mut Vec<Revision>,
+        size: usize,
+    ) {
+        let sample_len = sample.len();
+        if size <= sample_len {
+            return;
+        }
+        let take_from: Vec<Revision> = self
+            .undecided
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|&r| !sample.contains(r))
+            .cloned()
+            .collect();
+        sample.extend(self.limit_sample(take_from, size - sample_len));
+    }
+
+    pub fn take_full_sample(
+        &mut self,
+        size: usize,
+    ) -> Result<Vec<Revision>, GraphError> {
+        let (sample_set, prescribed_size) = self.bidirectional_sample(size)?;
+        let size = if self.respect_size {
+            size
+        } else {
+            prescribed_size
+        };
+        let mut sample =
+            self.limit_sample(sample_set.into_iter().collect(), size);
+        self.random_complete_sample(&mut sample, size);
+        Ok(sample)
+    }
 }
 
 #[cfg(test)]
@@ -275,6 +442,7 @@
             SampleGraph,
             vec![10, 11, 12, 13],
             [0; 16],
+            true,
         )
     }
 
@@ -282,7 +450,7 @@
     ///
     /// To avoid actual randomness in tests, we give it a fixed random seed.
     fn disco12() -> PartialDiscovery<SampleGraph> {
-        PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16])
+        PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16], true)
     }
 
     fn sorted_undecided(
@@ -390,4 +558,58 @@
         assert_eq!(sample_vec, vec![4, 9, 12]);
         Ok(())
     }
+
+    #[test]
+    fn test_children_cache() -> Result<(), GraphError> {
+        let mut disco = full_disco();
+        disco.ensure_children_cache()?;
+
+        let cache = disco.children_cache.unwrap();
+        assert_eq!(cache.get(&2).cloned(), Some(vec![4]));
+        assert_eq!(cache.get(&10).cloned(), None);
+
+        let mut children_4 = cache.get(&4).cloned().unwrap();
+        children_4.sort();
+        assert_eq!(children_4, vec![5, 6, 7]);
+
+        let mut children_7 = cache.get(&7).cloned().unwrap();
+        children_7.sort();
+        assert_eq!(children_7, vec![9, 11]);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_complete_sample() {
+        let mut disco = full_disco();
+        let undecided: HashSet<Revision> =
+            [4, 7, 9, 2, 3].iter().cloned().collect();
+        disco.undecided = Some(undecided);
+
+        let mut sample = vec![0];
+        disco.random_complete_sample(&mut sample, 3);
+        assert_eq!(sample.len(), 3);
+
+        let mut sample = vec![2, 4, 7];
+        disco.random_complete_sample(&mut sample, 1);
+        assert_eq!(sample.len(), 3);
+    }
+
+    #[test]
+    fn test_bidirectional_sample() -> Result<(), GraphError> {
+        let mut disco = full_disco();
+        disco.undecided = Some((0..=13).into_iter().collect());
+
+        let (sample_set, size) = disco.bidirectional_sample(7)?;
+        assert_eq!(size, 7);
+        let mut sample: Vec<Revision> = sample_set.into_iter().collect();
+        sample.sort();
+        // our DAG is a bit too small for the results to be really interesting
+        // at least it shows that
+        // - we went both ways
+        // - we didn't take all Revisions (6 is not in the sample)
+        assert_eq!(sample, vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]);
+        Ok(())
+    }
+
 }
--- a/rust/hg-cpython/src/discovery.rs	Fri May 17 01:56:56 2019 +0200
+++ b/rust/hg-cpython/src/discovery.rs	Fri May 17 01:56:57 2019 +0200
@@ -36,7 +36,7 @@
         _cls,
         repo: PyObject,
         targetheads: PyObject,
-        _respectsize: bool
+        respectsize: bool
     ) -> PyResult<PartialDiscovery> {
         let index = repo.getattr(py, "changelog")?.getattr(py, "index")?;
         Self::create_instance(
@@ -44,6 +44,7 @@
             RefCell::new(Box::new(CorePartialDiscovery::new(
                 Index::new(py, index)?,
                 rev_pyiter_collect(py, &targetheads)?,
+                respectsize
             )))
         )
     }