Mercurial > hg
changeset 47234:616b8f412676
revlogv2: introduce a very basic docket file
This is the first step toward using a docket file in revlogv2. Right now the
docket is very basic and only stores the version number (which is -also- stored
in the index file…) and the other files have fixed names. This new
implementation breaks transactionality… but there are no tests checking
transactionality for revlogv2… So I take this as an opportunity to start small.
There is no usage of revlogv2 outside of tests anyway.
The docket keeps the `.i` naming used by the index of previous versions to
preserve a unique entry point. We could decide to use a different name and look
it up first, or to fully rework this in a future "store" version. However, that
does not seem necessary right now.
We will re-introduce transactionality (and associated testing…) in later
changesets.
A long list of TODOs has been added to the relevant comment.
Differential Revision: https://phab.mercurial-scm.org/D10624
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Mon, 03 May 2021 12:34:11 +0200 |
parents | bcafcd779d2e |
children | 6b1eae313b2f |
files | mercurial/changelog.py mercurial/configitems.py mercurial/revlog.py mercurial/revlogutils/constants.py mercurial/revlogutils/docket.py mercurial/store.py |
diffstat | 6 files changed, 132 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/changelog.py Mon May 17 15:05:24 2021 +0200 +++ b/mercurial/changelog.py Mon May 03 12:34:11 2021 +0200 @@ -445,6 +445,8 @@ def delayupdate(self, tr): """delay visibility of index updates to other readers""" + if self._docket is not None: + return if not self._delayed: if len(self) == 0:
--- a/mercurial/configitems.py Mon May 17 15:05:24 2021 +0200 +++ b/mercurial/configitems.py Mon May 03 12:34:11 2021 +0200 @@ -1150,14 +1150,27 @@ ) # "out of experimental" todo list. # -# * to grow a docket file to at least store the last offset of the data -# file when rewriting sidedata. -# * need a way of dealing with garbage data if we allow rewriting -# *existing* sidedata. +# * stop storing version information in the index (it is already in the docket) +# * properly hide uncommitted content to other process +# * expose transaction content hooks during pre-commit validation +# * include management of a persistent nodemap in the main docket +# * enforce a "no-truncate" policy for mmap safety +# - for censoring operation +# - for stripping operation +# - for rollback operation +# * store the data size in the docket to simplify sidedata rewrite. +# * track garbage data to evemtually allow rewriting -existing- sidedata. # * Exchange-wise, we will also need to do something more efficient than # keeping references to the affected revlogs, especially memory-wise when # rewriting sidedata. -# * Also... compress the sidedata? (this should be coming very soon) +# * sidedata compression +# * introduce a proper solution to reduce the number of filelog related files. +# * Improvement to consider +# - track compression mode in the index entris instead of the chunks +# - split the data offset and flag field (the 2 bytes save are mostly trouble) +# - keep track of uncompressed -chunk- size (to preallocate memory better) +# - keep track of chain base or size (probably not that useful anymore) +# - store data and sidedata in different files coreconfigitem( b'experimental', b'revlogv2',
--- a/mercurial/revlog.py Mon May 17 15:05:24 2021 +0200 +++ b/mercurial/revlog.py Mon May 03 12:34:11 2021 +0200 @@ -75,6 +75,7 @@ ) from .revlogutils import ( deltas as deltautil, + docket as docketutil, flagutil, nodemap as nodemaputil, revlogv0, @@ -317,6 +318,7 @@ self.radix = radix + self._docket_file = None self._indexfile = None self._datafile = None self._nodemap_file = None @@ -344,6 +346,7 @@ self._maxchainlen = None self._deltabothparents = True self.index = None + self._docket = None self._nodemap_docket = None # Mapping of partial identifiers to full nodes. self._pcache = {} @@ -505,8 +508,23 @@ self._generaldelta = features[b'generaldelta'](self._format_flags) self.hassidedata = features[b'sidedata'] - index_data = entry_data - self._indexfile = entry_point + if not features[b'docket']: + self._indexfile = entry_point + index_data = entry_data + else: + self._docket_file = entry_point + if self._initempty: + self._docket = docketutil.default_docket(self, header) + else: + self._docket = docketutil.parse_docket(self, entry_data) + self._indexfile = self._docket.index_filepath() + index_data = self._get_data(self._indexfile, mmapindexthreshold) + self._inline = False + # generaldelta implied by version 2 revlogs. + self._generaldelta = True + # the logic for persistent nodemap will be dealt with within the + # main docket, so disable it for now. + self._nodemap_file = None if self.postfix is None or self.postfix == b'a': self._datafile = b'%s.d' % self.radix @@ -2053,6 +2071,8 @@ self._writinghandles = (ifh, dfh) try: yield + if self._docket is not None: + self._docket.write(transaction) finally: self._writinghandles = None finally: @@ -3126,9 +3146,7 @@ def rewrite_sidedata(self, transaction, helpers, startrev, endrev): if not self.hassidedata: return - # inline are not yet supported because they suffer from an issue when - # rewriting them (since it's not an append-only operation). - # See issue6485. 
+ # revlog formats with sidedata support does not support inline assert not self._inline if not helpers[1] and not helpers[2]: # Nothing to generate or remove
--- a/mercurial/revlogutils/constants.py Mon May 17 15:05:24 2021 +0200 +++ b/mercurial/revlogutils/constants.py Mon May 03 12:34:11 2021 +0200 @@ -133,20 +133,22 @@ b'inline': _no, b'generaldelta': _no, b'sidedata': False, + b'docket': False, }, REVLOGV1: { b'inline': _from_flag(FLAG_INLINE_DATA), b'generaldelta': _from_flag(FLAG_GENERALDELTA), b'sidedata': False, + b'docket': False, }, REVLOGV2: { - # There is a bug in the transaction handling when going from an - # inline revlog to a separate index and data file. Turn it off until - # it's fixed, since v2 revlogs sometimes get rewritten on exchange. - # See issue6485 + # The point of inline-revlog is to reduce the number of files used in + # the store. Using a docket defeat this purpose. So we needs other + # means to reduce the number of files for revlogv2. b'inline': _no, b'generaldelta': _yes, b'sidedata': True, + b'docket': True, }, }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/revlogutils/docket.py Mon May 03 12:34:11 2021 +0200 @@ -0,0 +1,80 @@ +# docket - code related to revlog "docket" +# +# Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net> +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +### Revlog docket file +# +# The revlog is stored on disk using multiple files: +# +# * a small docket file, containing metadata and a pointer, +# +# * an index file, containing fixed width information about revisions, +# +# * a data file, containing variable width data for these revisions, + +from __future__ import absolute_import + +import struct + +from . import ( + constants, +) + +# Docket format +# +# * 4 bytes: revlog version +# | This is mandatory as docket must be compatible with the previous +# | revlog index header. +S_HEADER = struct.Struct(constants.INDEX_HEADER.format) + + +class RevlogDocket(object): + """metadata associated with revlog""" + + def __init__(self, revlog, version_header=None): + self._version_header = version_header + self._dirty = False + self._radix = revlog.radix + self._path = revlog._docket_file + self._opener = revlog.opener + + def index_filepath(self): + """file path to the current index file associated to this docket""" + # very simplistic version at first + return b"%s.idx" % self._radix + + def write(self, transaction): + """write the modification of disk if any + + This make the new content visible to all process""" + if self._dirty: + transaction.addbackup(self._path, location=b'store') + with self._opener(self._path, mode=b'w', atomictemp=True) as f: + f.write(self._serialize()) + self._dirty = False + + def _serialize(self): + return S_HEADER.pack(self._version_header) + + +def default_docket(revlog, version_header): + """given a revlog version a new docket object for the given revlog""" + if (version_header & 0xFFFF) != 
constants.REVLOGV2: + return None + docket = RevlogDocket(revlog, version_header=version_header) + docket._dirty = True + return docket + + +def parse_docket(revlog, data): + """given some docket data return a docket object for the given revlog""" + header = S_HEADER.unpack(data[: S_HEADER.size]) + (version_header,) = header + docket = RevlogDocket( + revlog, + version_header=version_header, + ) + return docket
--- a/mercurial/store.py Mon May 17 15:05:24 2021 +0200 +++ b/mercurial/store.py Mon May 03 12:34:11 2021 +0200 @@ -389,7 +389,7 @@ ] REVLOG_FILES_MAIN_EXT = (b'.i', b'i.tmpcensored') -REVLOG_FILES_OTHER_EXT = (b'.d', b'.n', b'.nd', b'd.tmpcensored') +REVLOG_FILES_OTHER_EXT = (b'.idx', b'.d', b'.n', b'.nd', b'd.tmpcensored') # files that are "volatile" and might change between listing and streaming # # note: the ".nd" file are nodemap data and won't "change" but they might be @@ -397,7 +397,7 @@ REVLOG_FILES_VOLATILE_EXT = (b'.n', b'.nd') # some exception to the above matching -EXCLUDED = re.compile(b'.*undo\.[^/]+\.nd?$') +EXCLUDED = re.compile(b'.*undo\.[^/]+\.(nd?|i)$') def is_revlog(f, kind, st): @@ -407,7 +407,7 @@ def revlog_type(f): - if f.endswith(REVLOG_FILES_MAIN_EXT): + if f.endswith(REVLOG_FILES_MAIN_EXT) and EXCLUDED.match(f) is None: return FILEFLAGS_REVLOG_MAIN elif f.endswith(REVLOG_FILES_OTHER_EXT) and EXCLUDED.match(f) is None: t = FILETYPE_FILELOG_OTHER