Mercurial > hg-stable
view rust/hg-cpython/src/revlog.rs @ 51225:297fa956b6c4
rust-index: optim note for post-scaffolding removal
author | Georges Racinet <georges.racinet@octobus.net> |
---|---|
date | Wed, 27 Sep 2023 11:34:52 +0200 |
parents | 7434747343ab |
children | 002b49905aac |
line wrap: on
line source
// revlog.rs // // Copyright 2019-2020 Georges Racinet <georges.racinet@octobus.net> // // This software may be used and distributed according to the terms of the // GNU General Public License version 2 or any later version. use crate::{ cindex, utils::{node_from_py_bytes, node_from_py_object}, PyRevision, }; use cpython::{ buffer::{Element, PyBuffer}, exc::{IndexError, ValueError}, ObjectProtocol, PyBool, PyBytes, PyClone, PyDict, PyErr, PyInt, PyModule, PyObject, PyResult, PyString, PyTuple, Python, PythonObject, ToPyObject, }; use hg::{ index::IndexHeader, index::{RevisionDataParams, COMPRESSION_MODE_INLINE}, nodemap::{Block, NodeMapError, NodeTree}, revlog::{nodemap::NodeMap, NodePrefix, RevlogIndex}, BaseRevision, Revision, UncheckedRevision, }; use std::cell::RefCell; /// Return a Struct implementing the Graph trait pub(crate) fn pyindex_to_graph( py: Python, index: PyObject, ) -> PyResult<cindex::Index> { match index.extract::<MixedIndex>(py) { Ok(midx) => Ok(midx.clone_cindex(py)), Err(_) => cindex::Index::new(py, index), } } py_class!(pub class MixedIndex |py| { data cindex: RefCell<cindex::Index>; data index: RefCell<hg::index::Index>; data nt: RefCell<Option<NodeTree>>; data docket: RefCell<Option<PyObject>>; // Holds a reference to the mmap'ed persistent nodemap data data nodemap_mmap: RefCell<Option<PyBuffer>>; // Holds a reference to the mmap'ed persistent index data data index_mmap: RefCell<Option<PyBuffer>>; def __new__( _cls, cindex: PyObject, data: PyObject, default_header: u32, ) -> PyResult<MixedIndex> { Self::new(py, cindex, data, default_header) } /// Compatibility layer used for Python consumers needing access to the C index /// /// Only use case so far is `scmutil.shortesthexnodeidprefix`, /// that may need to build a custom `nodetree`, based on a specified revset. /// With a Rust implementation of the nodemap, we will be able to get rid of /// this, by exposing our own standalone nodemap class, /// ready to accept `MixedIndex`. def get_cindex(&self) -> PyResult<PyObject> { Ok(self.cindex(py).borrow().inner().clone_ref(py)) } // Index API involving nodemap, as defined in mercurial/pure/parsers.py /// Return Revision if found, raises a bare `error.RevlogError` /// in case of ambiguity, same as C version does def get_rev(&self, node: PyBytes) -> PyResult<Option<PyRevision>> { let opt = self.get_nodetree(py)?.borrow(); let nt = opt.as_ref().unwrap(); let idx = &*self.cindex(py).borrow(); let ridx = &*self.index(py).borrow(); let node = node_from_py_bytes(py, &node)?; let rust_rev = nt.find_bin(ridx, node.into()).map_err(|e| nodemap_error(py, e))?; let c_rev = nt.find_bin(idx, node.into()).map_err(|e| nodemap_error(py, e))?; assert_eq!(rust_rev, c_rev); Ok(rust_rev.map(Into::into)) } /// same as `get_rev()` but raises a bare `error.RevlogError` if node /// is not found. /// /// No need to repeat `node` in the exception, `mercurial/revlog.py` /// will catch and rewrap with it def rev(&self, node: PyBytes) -> PyResult<PyRevision> { self.get_rev(py, node)?.ok_or_else(|| revlog_error(py)) } /// return True if the node exist in the index def has_node(&self, node: PyBytes) -> PyResult<bool> { // TODO OPTIM we could avoid a needless conversion here, // to do when scaffolding for pure Rust switch is removed, // as `get_rev()` currently does the necessary assertions self.get_rev(py, node).map(|opt| opt.is_some()) } /// find length of shortest hex nodeid of a binary ID def shortest(&self, node: PyBytes) -> PyResult<usize> { let opt = self.get_nodetree(py)?.borrow(); let nt = opt.as_ref().unwrap(); let idx = &*self.cindex(py).borrow(); match nt.unique_prefix_len_node(idx, &node_from_py_bytes(py, &node)?) { Ok(Some(l)) => Ok(l), Ok(None) => Err(revlog_error(py)), Err(e) => Err(nodemap_error(py, e)), } } def partialmatch(&self, node: PyObject) -> PyResult<Option<PyBytes>> { let opt = self.get_nodetree(py)?.borrow(); let nt = opt.as_ref().unwrap(); let idx = &*self.cindex(py).borrow(); let node_as_string = if cfg!(feature = "python3-sys") { node.cast_as::<PyString>(py)?.to_string(py)?.to_string() } else { let node = node.extract::<PyBytes>(py)?; String::from_utf8_lossy(node.data(py)).to_string() }; let prefix = NodePrefix::from_hex(&node_as_string) .map_err(|_| PyErr::new::<ValueError, _>( py, format!("Invalid node or prefix '{}'", node_as_string)) )?; nt.find_bin(idx, prefix) // TODO make an inner API returning the node directly .map(|opt| opt.map( |rev| PyBytes::new(py, idx.node(rev).unwrap().as_bytes()))) .map_err(|e| nodemap_error(py, e)) } /// append an index entry def append(&self, tup: PyTuple) -> PyResult<PyObject> { if tup.len(py) < 8 { // this is better than the panic promised by tup.get_item() return Err( PyErr::new::<IndexError, _>(py, "tuple index out of range")) } let node_bytes = tup.get_item(py, 7).extract(py)?; let node = node_from_py_object(py, &node_bytes)?; let rev = self.len(py)? as BaseRevision; let mut idx = self.cindex(py).borrow_mut(); // This is ok since we will just add the revision to the index let rev = Revision(rev); idx.append(py, tup.clone_ref(py))?; self.index(py) .borrow_mut() .append(py_tuple_to_revision_data_params(py, tup)?) .unwrap(); self.get_nodetree(py)?.borrow_mut().as_mut().unwrap() .insert(&*idx, &node, rev) .map_err(|e| nodemap_error(py, e))?; Ok(py.None()) } def __delitem__(&self, key: PyObject) -> PyResult<()> { // __delitem__ is both for `del idx[r]` and `del idx[r1:r2]` self.cindex(py).borrow().inner().del_item(py, &key)?; let start = key.getattr(py, "start")?; let start = UncheckedRevision(start.extract(py)?); let start = self.index(py) .borrow() .check_revision(start) .ok_or_else(|| { nodemap_error(py, NodeMapError::RevisionNotInIndex(start)) })?; self.index(py).borrow_mut().remove(start).unwrap(); let mut opt = self.get_nodetree(py)?.borrow_mut(); let nt = opt.as_mut().unwrap(); nt.invalidate_all(); self.fill_nodemap(py, nt)?; Ok(()) } // // Reforwarded C index API // // index_methods (tp_methods). Same ordering as in revlog.c /// return the gca set of the given revs def ancestors(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "ancestors", args, kw) } /// return the heads of the common ancestors of the given revs def commonancestorsheads(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "commonancestorsheads", args, kw) } /// Clear the index caches and inner py_class data. /// It is Python's responsibility to call `update_nodemap_data` again. def clearcaches(&self, *args, **kw) -> PyResult<PyObject> { self.nt(py).borrow_mut().take(); self.docket(py).borrow_mut().take(); self.nodemap_mmap(py).borrow_mut().take(); self.index(py).borrow_mut().clear_caches(); self.call_cindex(py, "clearcaches", args, kw) } /// return the raw binary string representing a revision def entry_binary(&self, *args, **kw) -> PyResult<PyObject> { let rindex = self.index(py).borrow(); let rev = UncheckedRevision(args.get_item(py, 0).extract(py)?); let rust_bytes = rindex.check_revision(rev).and_then( |r| rindex.entry_binary(r)) .ok_or_else(|| rev_not_in_index(py, rev))?; let rust_res = PyBytes::new(py, rust_bytes).into_object(); let c_res = self.call_cindex(py, "entry_binary", args, kw)?; assert_py_eq(py, "entry_binary", &rust_res, &c_res)?; Ok(rust_res) } /// return a binary packed version of the header def pack_header(&self, *args, **kw) -> PyResult<PyObject> { let rindex = self.index(py).borrow(); let packed = rindex.pack_header(args.get_item(py, 0).extract(py)?); let rust_res = PyBytes::new(py, &packed).into_object(); let c_res = self.call_cindex(py, "pack_header", args, kw)?; assert_py_eq(py, "pack_header", &rust_res, &c_res)?; Ok(rust_res) } /// get an index entry def get(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "get", args, kw) } /// compute phases def computephasesmapsets(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "computephasesmapsets", args, kw) } /// reachableroots def reachableroots2(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "reachableroots2", args, kw) } /// get head revisions def headrevs(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "headrevs", args, kw) } /// get filtered head revisions def headrevsfiltered(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "headrevsfiltered", args, kw) } /// True if the object is a snapshot def issnapshot(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "issnapshot", args, kw) } /// Gather snapshot data in a cache dict def findsnapshots(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "findsnapshots", args, kw) } /// determine revisions with deltas to reconstruct fulltext def deltachain(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "deltachain", args, kw) } /// slice planned chunk read to reach a density threshold def slicechunktodensity(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "slicechunktodensity", args, kw) } /// stats for the index def stats(&self, *args, **kw) -> PyResult<PyObject> { self.call_cindex(py, "stats", args, kw) } // index_sequence_methods and index_mapping_methods. // // Since we call back through the high level Python API, // there's no point making a distinction between index_get // and index_getitem. def __len__(&self) -> PyResult<usize> { self.len(py) } def __getitem__(&self, key: PyObject) -> PyResult<PyObject> { // this conversion seems needless, but that's actually because // `index_getitem` does not handle conversion from PyLong, // which expressions such as [e for e in index] internally use. // Note that we don't seem to have a direct way to call // PySequence_GetItem (does the job), which would possibly be better // for performance let key = match key.extract::<i32>(py) { Ok(rev) => rev.to_py_object(py).into_object(), Err(_) => key, }; self.cindex(py).borrow().inner().get_item(py, key) } def __contains__(&self, item: PyObject) -> PyResult<bool> { // ObjectProtocol does not seem to provide contains(), so // this is an equivalent implementation of the index_contains() // defined in revlog.c let cindex = self.cindex(py).borrow(); match item.extract::<i32>(py) { Ok(rev) => { Ok(rev >= -1 && rev < self.len(py)? as BaseRevision) } Err(_) => { cindex.inner().call_method( py, "has_node", PyTuple::new(py, &[item]), None)? .extract(py) } } } def nodemap_data_all(&self) -> PyResult<PyBytes> { self.inner_nodemap_data_all(py) } def nodemap_data_incremental(&self) -> PyResult<PyObject> { self.inner_nodemap_data_incremental(py) } def update_nodemap_data( &self, docket: PyObject, nm_data: PyObject ) -> PyResult<PyObject> { self.inner_update_nodemap_data(py, docket, nm_data) } @property def entry_size(&self) -> PyResult<PyInt> { self.cindex(py).borrow().inner().getattr(py, "entry_size")?.extract::<PyInt>(py) } @property def rust_ext_compat(&self) -> PyResult<PyInt> { self.cindex(py).borrow().inner().getattr(py, "rust_ext_compat")?.extract::<PyInt>(py) } }); /// Take a (potentially) mmap'ed buffer, and return the underlying Python /// buffer along with the Rust slice into said buffer. We need to keep the /// Python buffer around, otherwise we'd get a dangling pointer once the buffer /// is freed from Python's side. /// /// # Safety /// /// The caller must make sure that the buffer is kept around for at least as /// long as the slice. #[deny(unsafe_op_in_unsafe_fn)] unsafe fn mmap_keeparound( py: Python, data: PyObject, ) -> PyResult<( PyBuffer, Box<dyn std::ops::Deref<Target = [u8]> + Send + 'static>, )> { let buf = PyBuffer::get(py, &data)?; let len = buf.item_count(); // Build a slice from the mmap'ed buffer data let cbuf = buf.buf_ptr(); let bytes = if std::mem::size_of::<u8>() == buf.item_size() && buf.is_c_contiguous() && u8::is_compatible_format(buf.format()) { unsafe { std::slice::from_raw_parts(cbuf as *const u8, len) } } else { return Err(PyErr::new::<ValueError, _>( py, "Nodemap data buffer has an invalid memory representation" .to_string(), )); }; Ok((buf, Box::new(bytes))) } fn py_tuple_to_revision_data_params( py: Python, tuple: PyTuple, ) -> PyResult<RevisionDataParams> { if tuple.len(py) < 8 { // this is better than the panic promised by tup.get_item() return Err(PyErr::new::<IndexError, _>( py, "tuple index out of range", )); } let offset_or_flags: u64 = tuple.get_item(py, 0).extract(py)?; let node_id = tuple .get_item(py, 7) .extract::<PyBytes>(py)? .data(py) .try_into() .unwrap(); let flags = (offset_or_flags & 0xFFFF) as u16; let data_offset = offset_or_flags >> 16; Ok(RevisionDataParams { flags, data_offset, data_compressed_length: tuple.get_item(py, 1).extract(py)?, data_uncompressed_length: tuple.get_item(py, 2).extract(py)?, data_delta_base: tuple.get_item(py, 3).extract(py)?, link_rev: tuple.get_item(py, 4).extract(py)?, parent_rev_1: tuple.get_item(py, 5).extract(py)?, parent_rev_2: tuple.get_item(py, 6).extract(py)?, node_id, _sidedata_offset: 0, _sidedata_compressed_length: 0, data_compression_mode: COMPRESSION_MODE_INLINE, _sidedata_compression_mode: COMPRESSION_MODE_INLINE, _rank: -1, }) } impl MixedIndex { fn new( py: Python, cindex: PyObject, data: PyObject, header: u32, ) -> PyResult<MixedIndex> { // Safety: we keep the buffer around inside the class as `index_mmap` let (buf, bytes) = unsafe { mmap_keeparound(py, data)? }; Self::create_instance( py, RefCell::new(cindex::Index::new(py, cindex)?), RefCell::new( hg::index::Index::new( bytes, IndexHeader::parse(&header.to_be_bytes()) .expect("default header is broken") .unwrap(), ) .unwrap(), ), RefCell::new(None), RefCell::new(None), RefCell::new(None), RefCell::new(Some(buf)), ) } fn len(&self, py: Python) -> PyResult<usize> { let rust_index_len = self.index(py).borrow().len(); let cindex_len = self.cindex(py).borrow().inner().len(py)?; assert_eq!(rust_index_len, cindex_len); Ok(cindex_len) } /// This is scaffolding at this point, but it could also become /// a way to start a persistent nodemap or perform a /// vacuum / repack operation fn fill_nodemap( &self, py: Python, nt: &mut NodeTree, ) -> PyResult<PyObject> { let index = self.cindex(py).borrow(); for r in 0..self.len(py)? { let rev = Revision(r as BaseRevision); // in this case node() won't ever return None nt.insert(&*index, index.node(rev).unwrap(), rev) .map_err(|e| nodemap_error(py, e))? } Ok(py.None()) } fn get_nodetree<'a>( &'a self, py: Python<'a>, ) -> PyResult<&'a RefCell<Option<NodeTree>>> { if self.nt(py).borrow().is_none() { let readonly = Box::<Vec<_>>::default(); let mut nt = NodeTree::load_bytes(readonly, 0); self.fill_nodemap(py, &mut nt)?; self.nt(py).borrow_mut().replace(nt); } Ok(self.nt(py)) } /// forward a method call to the underlying C index fn call_cindex( &self, py: Python, name: &str, args: &PyTuple, kwargs: Option<&PyDict>, ) -> PyResult<PyObject> { self.cindex(py) .borrow() .inner() .call_method(py, name, args, kwargs) } pub fn clone_cindex(&self, py: Python) -> cindex::Index { self.cindex(py).borrow().clone_ref(py) } /// Returns the full nodemap bytes to be written as-is to disk fn inner_nodemap_data_all(&self, py: Python) -> PyResult<PyBytes> { let nodemap = self.get_nodetree(py)?.borrow_mut().take().unwrap(); let (readonly, bytes) = nodemap.into_readonly_and_added_bytes(); // If there's anything readonly, we need to build the data again from // scratch let bytes = if readonly.len() > 0 { let mut nt = NodeTree::load_bytes(Box::<Vec<_>>::default(), 0); self.fill_nodemap(py, &mut nt)?; let (readonly, bytes) = nt.into_readonly_and_added_bytes(); assert_eq!(readonly.len(), 0); bytes } else { bytes }; let bytes = PyBytes::new(py, &bytes); Ok(bytes) } /// Returns the last saved docket along with the size of any changed data /// (in number of blocks), and said data as bytes. fn inner_nodemap_data_incremental( &self, py: Python, ) -> PyResult<PyObject> { let docket = self.docket(py).borrow(); let docket = match docket.as_ref() { Some(d) => d, None => return Ok(py.None()), }; let node_tree = self.get_nodetree(py)?.borrow_mut().take().unwrap(); let masked_blocks = node_tree.masked_readonly_blocks(); let (_, data) = node_tree.into_readonly_and_added_bytes(); let changed = masked_blocks * std::mem::size_of::<Block>(); Ok((docket, changed, PyBytes::new(py, &data)) .to_py_object(py) .into_object()) } /// Update the nodemap from the new (mmaped) data. /// The docket is kept as a reference for later incremental calls. fn inner_update_nodemap_data( &self, py: Python, docket: PyObject, nm_data: PyObject, ) -> PyResult<PyObject> { // Safety: we keep the buffer around inside the class as `nodemap_mmap` let (buf, bytes) = unsafe { mmap_keeparound(py, nm_data)? }; let len = buf.item_count(); self.nodemap_mmap(py).borrow_mut().replace(buf); let mut nt = NodeTree::load_bytes(bytes, len); let data_tip = docket .getattr(py, "tip_rev")? .extract::<BaseRevision>(py)? .into(); self.docket(py).borrow_mut().replace(docket.clone_ref(py)); let idx = self.cindex(py).borrow(); let data_tip = idx.check_revision(data_tip).ok_or_else(|| { nodemap_error(py, NodeMapError::RevisionNotInIndex(data_tip)) })?; let current_tip = idx.len(); for r in (data_tip.0 + 1)..current_tip as BaseRevision { let rev = Revision(r); // in this case node() won't ever return None nt.insert(&*idx, idx.node(rev).unwrap(), rev) .map_err(|e| nodemap_error(py, e))? } *self.nt(py).borrow_mut() = Some(nt); Ok(py.None()) } } fn revlog_error(py: Python) -> PyErr { match py .import("mercurial.error") .and_then(|m| m.get(py, "RevlogError")) { Err(e) => e, Ok(cls) => PyErr::from_instance( py, cls.call(py, (py.None(),), None).ok().into_py_object(py), ), } } fn nodemap_rev_not_in_index(py: Python, rev: UncheckedRevision) -> PyErr { PyErr::new::<ValueError, _>( py, format!( "Inconsistency: Revision {} found in nodemap \ is not in revlog index", rev ), ) } fn rev_not_in_index(py: Python, rev: UncheckedRevision) -> PyErr { PyErr::new::<ValueError, _>( py, format!("revlog index out of range: {}", rev), ) } /// Standard treatment of NodeMapError fn nodemap_error(py: Python, err: NodeMapError) -> PyErr { match err { NodeMapError::MultipleResults => revlog_error(py), NodeMapError::RevisionNotInIndex(r) => nodemap_rev_not_in_index(py, r), } } fn assert_py_eq( py: Python, method: &str, rust: &PyObject, c: &PyObject, ) -> PyResult<()> { let locals = PyDict::new(py); locals.set_item(py, "rust".into_py_object(py).into_object(), rust)?; locals.set_item(py, "c".into_py_object(py).into_object(), c)?; let is_eq: PyBool = py.eval("rust == c", None, Some(&locals))?.extract(py)?; assert!( is_eq.is_true(), "{} results differ. Rust: {:?} C: {:?}", method, rust, c ); Ok(()) } /// Create the module, with __package__ given from parent pub fn init_module(py: Python, package: &str) -> PyResult<PyModule> { let dotted_name = &format!("{}.revlog", package); let m = PyModule::new(py, dotted_name)?; m.add(py, "__package__", package)?; m.add(py, "__doc__", "RevLog - Rust implementations")?; m.add_class::<MixedIndex>(py)?; let sys = PyModule::import(py, "sys")?; let sys_modules: PyDict = sys.get(py, "modules")?.extract(py)?; sys_modules.set_item(py, dotted_name, &m)?; Ok(m) }