revlogv2: introduce a very basic docket file
This is the first step toward using a docket file in revlogv2. Right now the
docket is very basic and only stores the version number (which is -also- stored
in the index file…) and the other files have fixed names. This new
implementation breaks transactionality… but there are no tests checking
transactionality for revlogv2… So I take this as an opportunity to start small.
There is no usage of revlogv2 outside of tests anyway.
The docket keeps the `.i` naming used by the index of previous versions to
preserve a unique entry point. We could decide to use a different name and look
it up first, or to fully rework this in a future "store" version. However that
does not seem necessary right now.
We will re-introduce transactionality (and associated testing…) in later
changesets.
A long list of TODOs has been added to the relevant comment.
Differential Revision: https://phab.mercurial-scm.org/D10624
--- a/mercurial/changelog.py Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/changelog.py Mon May 03 12:34:11 2021 +0200
@@ -445,6 +445,8 @@
def delayupdate(self, tr):
"""delay visibility of index updates to other readers"""
+ if self._docket is not None:
+ return
if not self._delayed:
if len(self) == 0:
--- a/mercurial/configitems.py Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/configitems.py Mon May 03 12:34:11 2021 +0200
@@ -1150,14 +1150,27 @@
)
# "out of experimental" todo list.
#
-# * to grow a docket file to at least store the last offset of the data
-# file when rewriting sidedata.
-# * need a way of dealing with garbage data if we allow rewriting
-# *existing* sidedata.
+# * stop storing version information in the index (it is already in the docket)
+# * properly hide uncommitted content from other processes
+# * expose transaction content hooks during pre-commit validation
+# * include management of a persistent nodemap in the main docket
+# * enforce a "no-truncate" policy for mmap safety
+# - for censoring operation
+# - for stripping operation
+# - for rollback operation
+# * store the data size in the docket to simplify sidedata rewrite.
+# * track garbage data to eventually allow rewriting -existing- sidedata.
# * Exchange-wise, we will also need to do something more efficient than
# keeping references to the affected revlogs, especially memory-wise when
# rewriting sidedata.
-# * Also... compress the sidedata? (this should be coming very soon)
+# * sidedata compression
+# * introduce a proper solution to reduce the number of filelog related files.
+# * Improvements to consider
+# - track compression mode in the index entries instead of the chunks
+# - split the data offset and flag field (the 2 bytes saved are mostly trouble)
+# - keep track of uncompressed -chunk- size (to preallocate memory better)
+# - keep track of chain base or size (probably not that useful anymore)
+# - store data and sidedata in different files
coreconfigitem(
b'experimental',
b'revlogv2',
--- a/mercurial/revlog.py Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/revlog.py Mon May 03 12:34:11 2021 +0200
@@ -75,6 +75,7 @@
)
from .revlogutils import (
deltas as deltautil,
+ docket as docketutil,
flagutil,
nodemap as nodemaputil,
revlogv0,
@@ -317,6 +318,7 @@
self.radix = radix
+ self._docket_file = None
self._indexfile = None
self._datafile = None
self._nodemap_file = None
@@ -344,6 +346,7 @@
self._maxchainlen = None
self._deltabothparents = True
self.index = None
+ self._docket = None
self._nodemap_docket = None
# Mapping of partial identifiers to full nodes.
self._pcache = {}
@@ -505,8 +508,23 @@
self._generaldelta = features[b'generaldelta'](self._format_flags)
self.hassidedata = features[b'sidedata']
- index_data = entry_data
- self._indexfile = entry_point
+ if not features[b'docket']:
+ self._indexfile = entry_point
+ index_data = entry_data
+ else:
+ self._docket_file = entry_point
+ if self._initempty:
+ self._docket = docketutil.default_docket(self, header)
+ else:
+ self._docket = docketutil.parse_docket(self, entry_data)
+ self._indexfile = self._docket.index_filepath()
+ index_data = self._get_data(self._indexfile, mmapindexthreshold)
+ self._inline = False
+ # generaldelta implied by version 2 revlogs.
+ self._generaldelta = True
+ # the logic for persistent nodemap will be dealt with within the
+ # main docket, so disable it for now.
+ self._nodemap_file = None
if self.postfix is None or self.postfix == b'a':
self._datafile = b'%s.d' % self.radix
@@ -2053,6 +2071,8 @@
self._writinghandles = (ifh, dfh)
try:
yield
+ if self._docket is not None:
+ self._docket.write(transaction)
finally:
self._writinghandles = None
finally:
@@ -3126,9 +3146,7 @@
def rewrite_sidedata(self, transaction, helpers, startrev, endrev):
if not self.hassidedata:
return
- # inline are not yet supported because they suffer from an issue when
- # rewriting them (since it's not an append-only operation).
- # See issue6485.
+ # revlog formats with sidedata support do not support inline
assert not self._inline
if not helpers[1] and not helpers[2]:
# Nothing to generate or remove
--- a/mercurial/revlogutils/constants.py Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/revlogutils/constants.py Mon May 03 12:34:11 2021 +0200
@@ -133,20 +133,22 @@
b'inline': _no,
b'generaldelta': _no,
b'sidedata': False,
+ b'docket': False,
},
REVLOGV1: {
b'inline': _from_flag(FLAG_INLINE_DATA),
b'generaldelta': _from_flag(FLAG_GENERALDELTA),
b'sidedata': False,
+ b'docket': False,
},
REVLOGV2: {
- # There is a bug in the transaction handling when going from an
- # inline revlog to a separate index and data file. Turn it off until
- # it's fixed, since v2 revlogs sometimes get rewritten on exchange.
- # See issue6485
+ # The point of inline-revlog is to reduce the number of files used in
+ # the store. Using a docket defeats this purpose. So we need other
+ # means to reduce the number of files for revlogv2.
b'inline': _no,
b'generaldelta': _yes,
b'sidedata': True,
+ b'docket': True,
},
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/revlogutils/docket.py Mon May 03 12:34:11 2021 +0200
@@ -0,0 +1,80 @@
+# docket - code related to revlog "docket"
+#
+# Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+### Revlog docket file
+#
+# The revlog is stored on disk using multiple files:
+#
+# * a small docket file, containing metadata and a pointer,
+#
+# * an index file, containing fixed width information about revisions,
+#
+# * a data file, containing variable width data for these revisions,
+
+from __future__ import absolute_import
+
+import struct
+
+from . import (
+ constants,
+)
+
+# Docket format
+#
+# * 4 bytes: revlog version
+# | This is mandatory as docket must be compatible with the previous
+# | revlog index header.
+S_HEADER = struct.Struct(constants.INDEX_HEADER.format)
+
+
+class RevlogDocket(object):
+ """metadata associated with revlog"""
+
+ def __init__(self, revlog, version_header=None):
+ self._version_header = version_header
+ self._dirty = False
+ self._radix = revlog.radix
+ self._path = revlog._docket_file
+ self._opener = revlog.opener
+
+ def index_filepath(self):
+ """file path to the current index file associated to this docket"""
+ # very simplistic version at first
+ return b"%s.idx" % self._radix
+
+ def write(self, transaction):
+ """write the modifications to disk if any
+
+ This makes the new content visible to all processes"""
+ if self._dirty:
+ transaction.addbackup(self._path, location=b'store')
+ with self._opener(self._path, mode=b'w', atomictemp=True) as f:
+ f.write(self._serialize())
+ self._dirty = False
+
+ def _serialize(self):
+ return S_HEADER.pack(self._version_header)
+
+
+def default_docket(revlog, version_header):
+ """given a revlog version, return a new docket object for the given revlog"""
+ if (version_header & 0xFFFF) != constants.REVLOGV2:
+ return None
+ docket = RevlogDocket(revlog, version_header=version_header)
+ docket._dirty = True
+ return docket
+
+
+def parse_docket(revlog, data):
+ """given some docket data return a docket object for the given revlog"""
+ header = S_HEADER.unpack(data[: S_HEADER.size])
+ (version_header,) = header
+ docket = RevlogDocket(
+ revlog,
+ version_header=version_header,
+ )
+ return docket
--- a/mercurial/store.py Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/store.py Mon May 03 12:34:11 2021 +0200
@@ -389,7 +389,7 @@
]
REVLOG_FILES_MAIN_EXT = (b'.i', b'i.tmpcensored')
-REVLOG_FILES_OTHER_EXT = (b'.d', b'.n', b'.nd', b'd.tmpcensored')
+REVLOG_FILES_OTHER_EXT = (b'.idx', b'.d', b'.n', b'.nd', b'd.tmpcensored')
# files that are "volatile" and might change between listing and streaming
#
# note: the ".nd" file are nodemap data and won't "change" but they might be
@@ -397,7 +397,7 @@
REVLOG_FILES_VOLATILE_EXT = (b'.n', b'.nd')
# some exception to the above matching
-EXCLUDED = re.compile(b'.*undo\.[^/]+\.nd?$')
+EXCLUDED = re.compile(b'.*undo\.[^/]+\.(nd?|i)$')
def is_revlog(f, kind, st):
@@ -407,7 +407,7 @@
def revlog_type(f):
- if f.endswith(REVLOG_FILES_MAIN_EXT):
+ if f.endswith(REVLOG_FILES_MAIN_EXT) and EXCLUDED.match(f) is None:
return FILEFLAGS_REVLOG_MAIN
elif f.endswith(REVLOG_FILES_OTHER_EXT) and EXCLUDED.match(f) is None:
t = FILETYPE_FILELOG_OTHER