changeset 42754:388622cbc911

rust-discovery: core implementation for take_quick_sample() This makes in particular `rand` no longer a testing dependency. We keep a seedable random generator on the `PartialDiscovery` object itself, to avoid lengthy initialization. In take_quick_sample() itself, we had to avoid keeping the reference to `self.undecided` to cope with the mutable reference introduced by the the call to `limit_sample`, but it's still manageable without resorting to inner mutability. Sampling being prone to be improved in the mid-term future, testing is minimal, amounting to checking which code path got executed. Differential Revision: https://phab.mercurial-scm.org/D6423
author Georges Racinet <georges.racinet@octobus.net>
date Fri, 17 May 2019 01:56:56 +0200
parents b6f3f704a561
children 8041a1b45163
files rust/hg-core/Cargo.toml rust/hg-core/src/discovery.rs
diffstat 2 files changed, 189 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/rust/hg-core/Cargo.toml	Wed Jun 12 14:31:41 2019 +0100
+++ b/rust/hg-core/Cargo.toml	Fri May 17 01:56:56 2019 +0200
@@ -8,12 +8,10 @@
 [lib]
 name = "hg"
 
-[dev-dependencies]
-rand = "*"
-rand_pcg = "*"
-
 [dependencies]
 byteorder = "1.3.1"
 lazy_static = "1.3.0"
 memchr = "2.2.0"
+rand = "> 0.6.4"
+rand_pcg = "> 0.1.0"
 regex = "^1.1"
--- a/rust/hg-core/src/discovery.rs	Wed Jun 12 14:31:41 2019 +0100
+++ b/rust/hg-core/src/discovery.rs	Fri May 17 01:56:56 2019 +0200
@@ -10,10 +10,16 @@
 //! This is a Rust counterpart to the `partialdiscovery` class of
 //! `mercurial.setdiscovery`
 
-use super::{Graph, GraphError, Revision};
+extern crate rand;
+extern crate rand_pcg;
+use self::rand::seq::SliceRandom;
+use self::rand::{thread_rng, RngCore, SeedableRng};
+use super::{Graph, GraphError, Revision, NULL_REVISION};
 use crate::ancestors::MissingAncestors;
 use crate::dagops;
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet, VecDeque};
+
+type Rng = self::rand_pcg::Pcg32;
 
 pub struct PartialDiscovery<G: Graph + Clone> {
     target_heads: Option<Vec<Revision>>,
@@ -21,12 +27,78 @@
     common: MissingAncestors<G>,
     undecided: Option<HashSet<Revision>>,
     missing: HashSet<Revision>,
+    rng: Rng,
 }
 
 pub struct DiscoveryStats {
     pub undecided: Option<usize>,
 }
 
+/// Update an existing sample to match the expected size
+///
+/// The sample is updated with revisions exponentially distant from each
+/// element of `heads`.
+///
+/// If a target size is specified, the sampling will stop once this size is
+/// reached. Otherwise sampling will happen until roots of the <revs> set are
+/// reached.
+///
+/// - `revs`: set of revs we want to discover (if None, `assume` the whole dag
+///   represented by `parentfn`
+/// - `heads`: set of DAG head revs
+/// - `sample`: a sample to update
+/// - `parentfn`: a callable to resolve parents for a revision
+/// - `quicksamplesize`: optional target size of the sample
+fn update_sample(
+    revs: Option<&HashSet<Revision>>,
+    heads: impl IntoIterator<Item = Revision>,
+    sample: &mut HashSet<Revision>,
+    parentsfn: impl Fn(Revision) -> Result<[Revision; 2], GraphError>,
+    quicksamplesize: Option<usize>,
+) -> Result<(), GraphError> {
+    let mut distances: HashMap<Revision, u32> = HashMap::new();
+    let mut visit: VecDeque<Revision> = heads.into_iter().collect();
+    let mut factor: u32 = 1;
+    let mut seen: HashSet<Revision> = HashSet::new();
+    loop {
+        let current = match visit.pop_front() {
+            None => {
+                break;
+            }
+            Some(r) => r,
+        };
+        if !seen.insert(current) {
+            continue;
+        }
+
+        let d = *distances.entry(current).or_insert(1);
+        if d > factor {
+            factor *= 2;
+        }
+        if d == factor {
+            sample.insert(current);
+            if let Some(sz) = quicksamplesize {
+                if sample.len() >= sz {
+                    return Ok(());
+                }
+            }
+        }
+        for &p in &parentsfn(current)? {
+            if p == NULL_REVISION {
+                continue;
+            }
+            if let Some(revs) = revs {
+                if !revs.contains(&p) {
+                    continue;
+                }
+            }
+            distances.entry(p).or_insert(d + 1);
+            visit.push_back(p);
+        }
+    }
+    Ok(())
+}
+
 impl<G: Graph + Clone> PartialDiscovery<G> {
     /// Create a PartialDiscovery object, with the intent
     /// of comparing our `::<target_heads>` revset to the contents of another
@@ -39,15 +111,47 @@
     /// we'll have to make it a type argument of `PartialDiscovery` or a trait
     /// object since we'll keep it in the meanwhile
     pub fn new(graph: G, target_heads: Vec<Revision>) -> Self {
+        let mut seed: [u8; 16] = [0; 16];
+        thread_rng().fill_bytes(&mut seed);
+        Self::new_with_seed(graph, target_heads, seed)
+    }
+
+    pub fn new_with_seed(
+        graph: G,
+        target_heads: Vec<Revision>,
+        seed: [u8; 16],
+    ) -> Self {
         PartialDiscovery {
             undecided: None,
             target_heads: Some(target_heads),
             graph: graph.clone(),
             common: MissingAncestors::new(graph, vec![]),
             missing: HashSet::new(),
+            rng: Rng::from_seed(seed),
         }
     }
 
+    /// Extract at most `size` random elements from sample and return them
+    /// as a vector
+    fn limit_sample(
+        &mut self,
+        mut sample: Vec<Revision>,
+        size: usize,
+    ) -> Vec<Revision> {
+        let sample_len = sample.len();
+        if sample_len <= size {
+            return sample;
+        }
+        let rng = &mut self.rng;
+        let dropped_size = sample_len - size;
+        let limited_slice = if size < dropped_size {
+            sample.partial_shuffle(rng, size).0
+        } else {
+            sample.partial_shuffle(rng, dropped_size).1
+        };
+        limited_slice.to_owned()
+    }
+
     /// Register revisions known as being common
     pub fn add_common_revisions(
         &mut self,
@@ -130,6 +234,32 @@
             undecided: self.undecided.as_ref().map(|s| s.len()),
         }
     }
+
+    pub fn take_quick_sample(
+        &mut self,
+        headrevs: impl IntoIterator<Item = Revision>,
+        size: usize,
+    ) -> Result<Vec<Revision>, GraphError> {
+        self.ensure_undecided()?;
+        let mut sample = {
+            let undecided = self.undecided.as_ref().unwrap();
+            if undecided.len() <= size {
+                return Ok(undecided.iter().cloned().collect());
+            }
+            dagops::heads(&self.graph, undecided.iter())?
+        };
+        if sample.len() >= size {
+            return Ok(self.limit_sample(sample.into_iter().collect(), size));
+        }
+        update_sample(
+            None,
+            headrevs,
+            &mut sample,
+            |r| self.graph.parents(r),
+            Some(size),
+        )?;
+        Ok(sample.into_iter().collect())
+    }
 }
 
 #[cfg(test)]
@@ -138,8 +268,21 @@
     use crate::testing::SampleGraph;
 
     /// A PartialDiscovery as for pushing all the heads of `SampleGraph`
+    ///
+    /// To avoid actual randomness in tests, we give it a fixed random seed.
     fn full_disco() -> PartialDiscovery<SampleGraph> {
-        PartialDiscovery::new(SampleGraph, vec![10, 11, 12, 13])
+        PartialDiscovery::new_with_seed(
+            SampleGraph,
+            vec![10, 11, 12, 13],
+            [0; 16],
+        )
+    }
+
+    /// A PartialDiscovery as for pushing the 12 head of `SampleGraph`
+    ///
+    /// To avoid actual randomness in tests, we give it a fixed random seed.
+    fn disco12() -> PartialDiscovery<SampleGraph> {
+        PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16])
     }
 
     fn sorted_undecided(
@@ -206,4 +349,45 @@
         assert_eq!(sorted_common_heads(&disco)?, vec![5, 11, 12]);
         Ok(())
     }
+
+    #[test]
+    fn test_limit_sample_no_need_to() {
+        let sample = vec![1, 2, 3, 4];
+        assert_eq!(full_disco().limit_sample(sample, 10), vec![1, 2, 3, 4]);
+    }
+
+    #[test]
+    fn test_limit_sample_less_than_half() {
+        assert_eq!(full_disco().limit_sample((1..6).collect(), 2), vec![4, 2]);
+    }
+
+    #[test]
+    fn test_limit_sample_more_than_half() {
+        assert_eq!(full_disco().limit_sample((1..4).collect(), 2), vec![3, 2]);
+    }
+
+    #[test]
+    fn test_quick_sample_enough_undecided_heads() -> Result<(), GraphError> {
+        let mut disco = full_disco();
+        disco.undecided = Some((1..=13).collect());
+
+        let mut sample_vec = disco.take_quick_sample(vec![], 4)?;
+        sample_vec.sort();
+        assert_eq!(sample_vec, vec![10, 11, 12, 13]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_quick_sample_climbing_from_12() -> Result<(), GraphError> {
+        let mut disco = disco12();
+        disco.ensure_undecided()?;
+
+        let mut sample_vec = disco.take_quick_sample(vec![12], 4)?;
+        sample_vec.sort();
+        // r12's only parent is r9, whose unique grand-parent through the
+        // diamond shape is r4. This ends there because the distance from r4
+        // to the root is only 3.
+        assert_eq!(sample_vec, vec![4, 9, 12]);
+        Ok(())
+    }
 }