Mercurial > hg
changeset 42738:8041a1b45163
rust-discovery: takefullsample() core implementation
take_full_sample() browses the undecided set in both directions: from
its roots as well as from its heads.
Following what's done on the Python side, we alter update_sample()
signature to take a closure returning an iterator: either ParentsIterator
or an iterator over the children found in `children_cache`. These constructs
should probably be split off in a separate module.
This is a first concrete example where a more abstract graph notion (probably
a trait) would be useful, as this is nothing but an operation on the reversed
DAG.
A similar motivation in the context of the discovery
process would be to replace the call to dagops::range in
`add_missing_revisions()` with a simple iteration over descendents, again an
operation on the reversed graph.
Differential Revision: https://phab.mercurial-scm.org/D6424
author | Georges Racinet <georges.racinet@octobus.net> |
---|---|
date | Fri, 17 May 2019 01:56:57 +0200 |
parents | 388622cbc911 |
children | 1c4b5689bef5 |
files | rust/hg-core/src/discovery.rs rust/hg-cpython/src/discovery.rs |
diffstat | 2 files changed, 235 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/discovery.rs Fri May 17 01:56:56 2019 +0200 +++ b/rust/hg-core/src/discovery.rs Fri May 17 01:56:57 2019 +0200 @@ -17,6 +17,7 @@ use super::{Graph, GraphError, Revision, NULL_REVISION}; use crate::ancestors::MissingAncestors; use crate::dagops; +use std::cmp::{max, min}; use std::collections::{HashMap, HashSet, VecDeque}; type Rng = self::rand_pcg::Pcg32; @@ -26,8 +27,10 @@ graph: G, // plays the role of self._repo common: MissingAncestors<G>, undecided: Option<HashSet<Revision>>, + children_cache: Option<HashMap<Revision, Vec<Revision>>>, missing: HashSet<Revision>, rng: Rng, + respect_size: bool, } pub struct DiscoveryStats { @@ -49,13 +52,16 @@ /// - `sample`: a sample to update /// - `parentfn`: a callable to resolve parents for a revision /// - `quicksamplesize`: optional target size of the sample -fn update_sample( +fn update_sample<I>( revs: Option<&HashSet<Revision>>, heads: impl IntoIterator<Item = Revision>, sample: &mut HashSet<Revision>, - parentsfn: impl Fn(Revision) -> Result<[Revision; 2], GraphError>, + parentsfn: impl Fn(Revision) -> Result<I, GraphError>, quicksamplesize: Option<usize>, -) -> Result<(), GraphError> { +) -> Result<(), GraphError> +where + I: Iterator<Item = Revision>, +{ let mut distances: HashMap<Revision, u32> = HashMap::new(); let mut visit: VecDeque<Revision> = heads.into_iter().collect(); let mut factor: u32 = 1; @@ -83,10 +89,7 @@ } } } - for &p in &parentsfn(current)? { - if p == NULL_REVISION { - continue; - } + for p in parentsfn(current)? { if let Some(revs) = revs { if !revs.contains(&p) { continue; @@ -99,6 +102,39 @@ Ok(()) } +struct ParentsIterator { + parents: [Revision; 2], + cur: usize, +} + +impl ParentsIterator { + fn graph_parents( + graph: &impl Graph, + r: Revision, + ) -> Result<ParentsIterator, GraphError> { + Ok(ParentsIterator { + parents: graph.parents(r)?, + cur: 0, + }) + } +} + +impl Iterator for ParentsIterator { + type Item = Revision; + + fn next(&mut self) -> Option<Revision> { + if self.cur > 1 { + return None; + } + let rev = self.parents[self.cur]; + self.cur += 1; + if rev == NULL_REVISION { + return self.next(); + } + Some(rev) + } +} + impl<G: Graph + Clone> PartialDiscovery<G> { /// Create a PartialDiscovery object, with the intent /// of comparing our `::<target_heads>` revset to the contents of another @@ -110,24 +146,36 @@ /// If we want to make the signature more flexible, /// we'll have to make it a type argument of `PartialDiscovery` or a trait /// object since we'll keep it in the meanwhile - pub fn new(graph: G, target_heads: Vec<Revision>) -> Self { + /// + /// The `respect_size` boolean controls how the sampling methods + /// will interpret the size argument requested by the caller. If it's + /// `false`, they are allowed to produce a sample whose size is more + /// appropriate to the situation (typically bigger). + pub fn new( + graph: G, + target_heads: Vec<Revision>, + respect_size: bool, + ) -> Self { let mut seed: [u8; 16] = [0; 16]; thread_rng().fill_bytes(&mut seed); - Self::new_with_seed(graph, target_heads, seed) + Self::new_with_seed(graph, target_heads, seed, respect_size) } pub fn new_with_seed( graph: G, target_heads: Vec<Revision>, seed: [u8; 16], + respect_size: bool, ) -> Self { PartialDiscovery { undecided: None, + children_cache: None, target_heads: Some(target_heads), graph: graph.clone(), common: MissingAncestors::new(graph, vec![]), missing: HashSet::new(), rng: Rng::from_seed(seed), + respect_size: respect_size, } } @@ -228,6 +276,22 @@ Ok(()) } + fn ensure_children_cache(&mut self) -> Result<(), GraphError> { + if self.children_cache.is_some() { + return Ok(()); + } + self.ensure_undecided()?; + + let mut children: HashMap<Revision, Vec<Revision>> = HashMap::new(); + for &rev in self.undecided.as_ref().unwrap() { + for p in ParentsIterator::graph_parents(&self.graph, rev)? { + children.entry(p).or_insert_with(|| Vec::new()).push(rev); + } + } + self.children_cache = Some(children); + Ok(()) + } + /// Provide statistics about the current state of the discovery process pub fn stats(&self) -> DiscoveryStats { DiscoveryStats { @@ -255,11 +319,114 @@ None, headrevs, &mut sample, - |r| self.graph.parents(r), + |r| ParentsIterator::graph_parents(&self.graph, r), Some(size), )?; Ok(sample.into_iter().collect()) } + + /// Extract a sample from `self.undecided`, going from its heads and roots. + /// + /// The `size` parameter is used to avoid useless computations if + /// it turns out to be bigger than the whole set of undecided Revisions. + /// + /// The sample is taken by using `update_sample` from the heads, then + /// from the roots, working on the reverse DAG, + /// expressed by `self.children_cache`. + /// + /// No effort is being made to complete or limit the sample to `size` + /// but this method returns another interesting size that it derives + /// from its knowledge of the structure of the various sets, leaving + /// to the caller the decision to use it or not. + fn bidirectional_sample( + &mut self, + size: usize, + ) -> Result<(HashSet<Revision>, usize), GraphError> { + self.ensure_undecided()?; + { + // we don't want to compute children_cache before this + // but doing it after extracting self.undecided takes a mutable + // ref to self while a shareable one is still active. + let undecided = self.undecided.as_ref().unwrap(); + if undecided.len() <= size { + return Ok((undecided.clone(), size)); + } + } + + self.ensure_children_cache()?; + let revs = self.undecided.as_ref().unwrap(); + let mut sample: HashSet<Revision> = revs.clone(); + + // it's possible that leveraging the children cache would be more + // efficient here + dagops::retain_heads(&self.graph, &mut sample)?; + let revsheads = sample.clone(); // was again heads(revs) in python + + // update from heads + update_sample( + Some(revs), + revsheads.iter().cloned(), + &mut sample, + |r| ParentsIterator::graph_parents(&self.graph, r), + None, + )?; + + // update from roots + let revroots: HashSet<Revision> = + dagops::roots(&self.graph, revs)?.into_iter().collect(); + let prescribed_size = max(size, min(revroots.len(), revsheads.len())); + + let children = self.children_cache.as_ref().unwrap(); + let empty_vec: Vec<Revision> = Vec::new(); + update_sample( + Some(revs), + revroots, + &mut sample, + |r| Ok(children.get(&r).unwrap_or(&empty_vec).iter().cloned()), + None, + )?; + Ok((sample, prescribed_size)) + } + + /// Fill up sample up to the wished size with random undecided Revisions. + /// + /// This is intended to be used as a last resort completion if the + /// regular sampling algorithm returns too few elements. + fn random_complete_sample( + &mut self, + sample: &mut Vec<Revision>, + size: usize, + ) { + let sample_len = sample.len(); + if size <= sample_len { + return; + } + let take_from: Vec<Revision> = self + .undecided + .as_ref() + .unwrap() + .iter() + .filter(|&r| !sample.contains(r)) + .cloned() + .collect(); + sample.extend(self.limit_sample(take_from, size - sample_len)); + } + + pub fn take_full_sample( + &mut self, + size: usize, + ) -> Result<Vec<Revision>, GraphError> { + let (sample_set, prescribed_size) = self.bidirectional_sample(size)?; + let size = if self.respect_size { + size + } else { + prescribed_size + }; + let mut sample = + self.limit_sample(sample_set.into_iter().collect(), size); + self.random_complete_sample(&mut sample, size); + Ok(sample) + } } #[cfg(test)] @@ -275,6 +442,7 @@ SampleGraph, vec![10, 11, 12, 13], [0; 16], + true, ) } @@ -282,7 +450,7 @@ /// /// To avoid actual randomness in tests, we give it a fixed random seed. fn disco12() -> PartialDiscovery<SampleGraph> { - PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16]) + PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16], true) } fn sorted_undecided( @@ -390,4 +558,58 @@ assert_eq!(sample_vec, vec![4, 9, 12]); Ok(()) } + + #[test] + fn test_children_cache() -> Result<(), GraphError> { + let mut disco = full_disco(); + disco.ensure_children_cache()?; + + let cache = disco.children_cache.unwrap(); + assert_eq!(cache.get(&2).cloned(), Some(vec![4])); + assert_eq!(cache.get(&10).cloned(), None); + + let mut children_4 = cache.get(&4).cloned().unwrap(); + children_4.sort(); + assert_eq!(children_4, vec![5, 6, 7]); + + let mut children_7 = cache.get(&7).cloned().unwrap(); + children_7.sort(); + assert_eq!(children_7, vec![9, 11]); + + Ok(()) + } + + #[test] + fn test_complete_sample() { + let mut disco = full_disco(); + let undecided: HashSet<Revision> = + [4, 7, 9, 2, 3].iter().cloned().collect(); + disco.undecided = Some(undecided); + + let mut sample = vec![0]; + disco.random_complete_sample(&mut sample, 3); + assert_eq!(sample.len(), 3); + + let mut sample = vec![2, 4, 7]; + disco.random_complete_sample(&mut sample, 1); + assert_eq!(sample.len(), 3); + } + + #[test] + fn test_bidirectional_sample() -> Result<(), GraphError> { + let mut disco = full_disco(); + disco.undecided = Some((0..=13).into_iter().collect()); + + let (sample_set, size) = disco.bidirectional_sample(7)?; + assert_eq!(size, 7); + let mut sample: Vec<Revision> = sample_set.into_iter().collect(); + sample.sort(); + // our DAG is a bit too small for the results to be really interesting + // at least it shows that + // - we went both ways + // - we didn't take all Revisions (6 is not in the sample) + assert_eq!(sample, vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]); + Ok(()) + } + }
--- a/rust/hg-cpython/src/discovery.rs Fri May 17 01:56:56 2019 +0200 +++ b/rust/hg-cpython/src/discovery.rs Fri May 17 01:56:57 2019 +0200 @@ -36,7 +36,7 @@ _cls, repo: PyObject, targetheads: PyObject, - _respectsize: bool + respectsize: bool ) -> PyResult<PartialDiscovery> { let index = repo.getattr(py, "changelog")?.getattr(py, "index")?; Self::create_instance( @@ -44,6 +44,7 @@ RefCell::new(Box::new(CorePartialDiscovery::new( Index::new(py, index)?, rev_pyiter_collect(py, &targetheads)?, + respectsize ))) ) }