revlog: store sidedata in their own file
This makes sidedata manipulation simpler and results in more compact data when
traversing either data or sidedata.
Differential Revision: https://phab.mercurial-scm.org/D10787
--- a/mercurial/configitems.py Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/configitems.py Fri May 28 23:41:17 2021 +0200
@@ -1162,13 +1162,13 @@
# rewriting sidedata.
# * introduce a proper solution to reduce the number of filelog related files.
# * use caching for reading sidedata (similar to what we do for data).
+# * no longer set offset=0 if sidedata_size=0 (simplify cutoff computation).
# * Improvement to consider
# - avoid compression header in chunk using the default compression?
# - forbid "inline" compression mode entirely?
# - split the data offset and flag field (the 2 bytes save are mostly trouble)
# - keep track of uncompressed -chunk- size (to preallocate memory better)
# - keep track of chain base or size (probably not that useful anymore)
-# - store data and sidedata in different files
coreconfigitem(
b'experimental',
b'revlogv2',
--- a/mercurial/revlog.py Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/revlog.py Fri May 28 23:41:17 2021 +0200
@@ -1,4 +1,5 @@
# revlog.py - storage back-end for mercurial
+# coding: utf8
#
# Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
#
@@ -260,6 +261,11 @@
b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
)
+FILE_TOO_SHORT_MSG = _(
+ b'cannot read from revlog %s;'
+ b' expected %d bytes from offset %d, data size is %d'
+)
+
class revlog(object):
"""
@@ -401,6 +407,7 @@
self._docket_file = None
self._indexfile = None
self._datafile = None
+ self._sidedatafile = None
self._nodemap_file = None
self.postfix = postfix
self._trypending = trypending
@@ -445,7 +452,7 @@
# custom flags.
self._flagprocessors = dict(flagutil.flagprocessors)
- # 2-tuple of file handles being used for active writing.
+ # 3-tuple of file handles being used for active writing.
self._writinghandles = None
# prevent nesting of addgroup
self._adding_group = None
@@ -634,6 +641,7 @@
if self._docket is not None:
self._datafile = self._docket.data_filepath()
+ self._sidedatafile = self._docket.sidedata_filepath()
elif self.postfix is None:
self._datafile = b'%s.d' % self.radix
else:
@@ -803,9 +811,14 @@
with func() as fp:
yield fp
+ @contextlib.contextmanager
def _sidedatareadfp(self):
"""file object suitable to read sidedata"""
- return self._datareadfp()
+ if self._writinghandles:
+ yield self._writinghandles[2]
+ else:
+ with self.opener(self._sidedatafile) as fp:
+ yield fp
def tiprev(self):
return len(self.index) - 1
@@ -909,6 +922,23 @@
def start(self, rev):
return int(self.index[rev][0] >> 16)
+ def sidedata_cut_off(self, rev):
+ sd_cut_off = self.index[rev][8]
+ if sd_cut_off != 0:
+ return sd_cut_off
+ # This is some annoying dance, because entries without sidedata
+ # currently use 0 as their ofsset. (instead of previous-offset +
+ # previous-size)
+ #
+ # We should reconsider this sidedata → 0 sidata_offset policy.
+ # In the meantime, we need this.
+ while 0 <= rev:
+ e = self.index[rev]
+ if e[9] != 0:
+ return e[8] + e[9]
+ rev -= 1
+ return 0
+
def flags(self, rev):
return self.index[rev][0] & 0xFFFF
@@ -2074,11 +2104,19 @@
# XXX this need caching, as we do for data
with self._sidedatareadfp() as sdf:
- sdf.seek(sidedata_offset)
+ if self._docket.sidedata_end < sidedata_offset + sidedata_size:
+ filename = self._sidedatafile
+ end = self._docket.sidedata_end
+ offset = sidedata_offset
+ length = sidedata_size
+ m = FILE_TOO_SHORT_MSG % (filename, length, offset, end)
+ raise error.RevlogError(m)
+
+ sdf.seek(sidedata_offset, os.SEEK_SET)
comp_segment = sdf.read(sidedata_size)
if len(comp_segment) < sidedata_size:
- filename = self._datafile
+ filename = self._sidedatafile
length = sidedata_size
offset = sidedata_offset
got = len(comp_segment)
@@ -2215,7 +2253,7 @@
if existing_handles:
# switched from inline to conventional reopen the index
ifh = self.__index_write_fp()
- self._writinghandles = (ifh, new_dfh)
+ self._writinghandles = (ifh, new_dfh, None)
new_dfh = None
finally:
if new_dfh is not None:
@@ -2233,7 +2271,7 @@
if self._writinghandles is not None:
yield
else:
- ifh = dfh = None
+ ifh = dfh = sdfh = None
try:
r = len(self)
# opening the data file.
@@ -2253,6 +2291,17 @@
raise
dfh = self._datafp(b"w+")
transaction.add(self._datafile, dsize)
+ if self._sidedatafile is not None:
+ try:
+ sdfh = self.opener(self._sidedatafile, mode=b"r+")
+ dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
+ except IOError as inst:
+ if inst.errno != errno.ENOENT:
+ raise
+ sdfh = self.opener(self._sidedatafile, mode=b"w+")
+ transaction.add(
+ self._sidedatafile, self._docket.sidedata_end
+ )
# opening the index file.
isize = r * self.index.entry_size
@@ -2262,7 +2311,7 @@
else:
transaction.add(self._indexfile, isize)
# exposing all file handle for writing.
- self._writinghandles = (ifh, dfh)
+ self._writinghandles = (ifh, dfh, sdfh)
yield
if self._docket is not None:
self._write_docket(transaction)
@@ -2270,6 +2319,8 @@
self._writinghandles = None
if dfh is not None:
dfh.close()
+ if sdfh is not None:
+ dfh.close()
# closing the index file last to avoid exposing referent to
# potential unflushed data content.
if ifh is not None:
@@ -2513,7 +2564,8 @@
offset = self._get_data_offset(prev)
if self._concurrencychecker:
- ifh, dfh = self._writinghandles
+ ifh, dfh, sdfh = self._writinghandles
+ # XXX no checking for the sidedata file
if self._inline:
# offset is "as if" it were in the .d file, so we need to add on
# the size of the entry metadata.
@@ -2570,7 +2622,7 @@
if sidedata and self.hassidedata:
sidedata_compression_mode = COMP_MODE_PLAIN
serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
- sidedata_offset = offset + deltainfo.deltalen
+ sidedata_offset = self._docket.sidedata_end
h, comp_sidedata = self.compress(serialized_sidedata)
if (
h != b'u'
@@ -2622,6 +2674,7 @@
link,
offset,
serialized_sidedata,
+ sidedata_offset,
)
rawtext = btext[0]
@@ -2648,7 +2701,9 @@
else:
return self._docket.data_end
- def _writeentry(self, transaction, entry, data, link, offset, sidedata):
+ def _writeentry(
+ self, transaction, entry, data, link, offset, sidedata, sidedata_offset
+ ):
# Files opened in a+ mode have inconsistent behavior on various
# platforms. Windows requires that a file positioning call be made
# when the file handle transitions between reads and writes. See
@@ -2664,7 +2719,7 @@
if self._writinghandles is None:
msg = b'adding revision outside `revlog._writing` context'
raise error.ProgrammingError(msg)
- ifh, dfh = self._writinghandles
+ ifh, dfh, sdfh = self._writinghandles
if self._docket is None:
ifh.seek(0, os.SEEK_END)
else:
@@ -2674,16 +2729,20 @@
dfh.seek(0, os.SEEK_END)
else:
dfh.seek(self._docket.data_end, os.SEEK_SET)
+ if sdfh:
+ sdfh.seek(self._docket.sidedata_end, os.SEEK_SET)
curr = len(self) - 1
if not self._inline:
transaction.add(self._datafile, offset)
+ if self._sidedatafile:
+ transaction.add(self._sidedatafile, sidedata_offset)
transaction.add(self._indexfile, curr * len(entry))
if data[0]:
dfh.write(data[0])
dfh.write(data[1])
if sidedata:
- dfh.write(sidedata)
+ sdfh.write(sidedata)
ifh.write(entry)
else:
offset += curr * self.index.entry_size
@@ -2691,12 +2750,12 @@
ifh.write(entry)
ifh.write(data[0])
ifh.write(data[1])
- if sidedata:
- ifh.write(sidedata)
+ assert not sidedata
self._enforceinlinesize(transaction)
if self._docket is not None:
self._docket.index_end = self._writinghandles[0].tell()
self._docket.data_end = self._writinghandles[1].tell()
+ self._docket.sidedata_end = self._writinghandles[2].tell()
nodemaputil.setup_persistent_nodemap(transaction, self)
@@ -2866,12 +2925,17 @@
else:
end = data_end + (rev * self.index.entry_size)
+ if self._sidedatafile:
+ sidedata_end = self.sidedata_cut_off(rev)
+ transaction.add(self._sidedatafile, sidedata_end)
+
transaction.add(self._indexfile, end)
if self._docket is not None:
# XXX we could, leverage the docket while stripping. However it is
# not powerfull enough at the time of this comment
self._docket.index_end = end
self._docket.data_end = data_end
+ self._docket.sidedata_end = sidedata_end
self._docket.write(transaction, stripping=True)
# then reset internal state in memory to forget those revisions
@@ -3398,13 +3462,10 @@
new_entries = []
# append the new sidedata
with self._writing(transaction):
- ifh, dfh = self._writinghandles
- if self._docket is not None:
- dfh.seek(self._docket.data_end, os.SEEK_SET)
- else:
- dfh.seek(0, os.SEEK_END)
-
- current_offset = dfh.tell()
+ ifh, dfh, sdfh = self._writinghandles
+ dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
+
+ current_offset = sdfh.tell()
for rev in range(startrev, endrev + 1):
entry = self.index[rev]
new_sidedata, flags = sidedatautil.run_sidedata_helpers(
@@ -3455,12 +3516,11 @@
)
# the sidedata computation might have move the file cursors around
- dfh.seek(current_offset, os.SEEK_SET)
- dfh.write(serialized_sidedata)
+ sdfh.seek(current_offset, os.SEEK_SET)
+ sdfh.write(serialized_sidedata)
new_entries.append(entry_update)
current_offset += len(serialized_sidedata)
- if self._docket is not None:
- self._docket.data_end = dfh.tell()
+ self._docket.sidedata_end = sdfh.tell()
# rewrite the new index entries
ifh.seek(startrev * self.index.entry_size)
--- a/mercurial/revlogutils/docket.py Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/revlogutils/docket.py Fri May 28 23:41:17 2021 +0200
@@ -90,12 +90,15 @@
# | revlog index header.
# * 1 bytes: size of index uuid
# * 1 bytes: size of data uuid
+# * 1 bytes: size of sizedata uuid
# * 8 bytes: size of index-data
# * 8 bytes: pending size of index-data
# * 8 bytes: size of data
+# * 8 bytes: size of sidedata
# * 8 bytes: pending size of data
+# * 8 bytes: pending size of sidedata
# * 1 bytes: default compression header
-S_HEADER = struct.Struct(constants.INDEX_HEADER_FMT + b'BBLLLLc')
+S_HEADER = struct.Struct(constants.INDEX_HEADER_FMT + b'BBBLLLLLLc')
class RevlogDocket(object):
@@ -108,10 +111,13 @@
version_header=None,
index_uuid=None,
data_uuid=None,
+ sidedata_uuid=None,
index_end=0,
pending_index_end=0,
data_end=0,
pending_data_end=0,
+ sidedata_end=0,
+ pending_sidedata_end=0,
default_compression_header=None,
):
self._version_header = version_header
@@ -122,19 +128,25 @@
self._opener = revlog.opener
self._index_uuid = index_uuid
self._data_uuid = data_uuid
+ self._sidedata_uuid = sidedata_uuid
# thes asserts should be True as long as we have a single index filename
assert index_end <= pending_index_end
assert data_end <= pending_data_end
+ assert sidedata_end <= pending_sidedata_end
self._initial_index_end = index_end
self._pending_index_end = pending_index_end
self._initial_data_end = data_end
self._pending_data_end = pending_data_end
+ self._initial_sidedata_end = sidedata_end
+ self._pending_sidedata_end = pending_sidedata_end
if use_pending:
self._index_end = self._pending_index_end
self._data_end = self._pending_data_end
+ self._sidedata_end = self._pending_sidedata_end
else:
self._index_end = self._initial_index_end
self._data_end = self._initial_data_end
+ self._sidedata_end = self._initial_sidedata_end
self.default_compression_header = default_compression_header
def index_filepath(self):
@@ -151,6 +163,13 @@
self._data_uuid = make_uid()
return b"%s-%s.dat" % (self._radix, self._data_uuid)
+ def sidedata_filepath(self):
+ """file path to the current sidedata file associated to this docket"""
+ # very simplistic version at first
+ if self._sidedata_uuid is None:
+ self._sidedata_uuid = make_uid()
+ return b"%s-%s.sda" % (self._radix, self._sidedata_uuid)
+
@property
def index_end(self):
return self._index_end
@@ -171,6 +190,16 @@
self._data_end = new_size
self._dirty = True
+ @property
+ def sidedata_end(self):
+ return self._sidedata_end
+
+ @sidedata_end.setter
+ def sidedata_end(self, new_size):
+ if new_size != self._sidedata_end:
+ self._sidedata_end = new_size
+ self._dirty = True
+
def write(self, transaction, pending=False, stripping=False):
"""write the modification of disk if any
@@ -196,26 +225,33 @@
if pending:
official_index_end = self._initial_index_end
official_data_end = self._initial_data_end
+ official_sidedata_end = self._initial_sidedata_end
else:
official_index_end = self._index_end
official_data_end = self._data_end
+ official_sidedata_end = self._sidedata_end
# this assert should be True as long as we have a single index filename
assert official_data_end <= self._data_end
+ assert official_sidedata_end <= self._sidedata_end
data = (
self._version_header,
len(self._index_uuid),
len(self._data_uuid),
+ len(self._sidedata_uuid),
official_index_end,
self._index_end,
official_data_end,
self._data_end,
+ official_sidedata_end,
+ self._sidedata_end,
self.default_compression_header,
)
s = []
s.append(S_HEADER.pack(*data))
s.append(self._index_uuid)
s.append(self._data_uuid)
+ s.append(self._sidedata_uuid)
return b''.join(s)
@@ -262,6 +298,9 @@
data_uuid_size = next(iheader)
data_uuid = get_data(data_uuid_size)
+ sidedata_uuid_size = next(iheader)
+ sidedata_uuid = get_data(sidedata_uuid_size)
+
index_size = next(iheader)
pending_index_size = next(iheader)
@@ -270,6 +309,10 @@
pending_data_size = next(iheader)
+ sidedata_size = next(iheader)
+
+ pending_sidedata_size = next(iheader)
+
default_compression_header = next(iheader)
docket = RevlogDocket(
@@ -278,10 +321,13 @@
version_header=version_header,
index_uuid=index_uuid,
data_uuid=data_uuid,
+ sidedata_uuid=sidedata_uuid,
index_end=index_size,
pending_index_end=pending_index_size,
data_end=data_size,
pending_data_end=pending_data_size,
+ sidedata_end=sidedata_size,
+ pending_sidedata_end=pending_sidedata_size,
default_compression_header=default_compression_header,
)
return docket
--- a/mercurial/store.py Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/store.py Fri May 28 23:41:17 2021 +0200
@@ -395,6 +395,7 @@
b'.dat',
b'.n',
b'.nd',
+ b'.sda',
b'd.tmpcensored',
)
# files that are "volatile" and might change between listing and streaming
--- a/tests/test-revlog-v2.t Fri May 28 23:41:12 2021 +0200
+++ b/tests/test-revlog-v2.t Fri May 28 23:41:17 2021 +0200
@@ -86,9 +86,11 @@
- a data file
$ ls .hg/store/00changelog* .hg/store/00manifest*
- .hg/store/00changelog-6b8ab34b.dat
- .hg/store/00changelog-88698448.idx
+ .hg/store/00changelog-1335303a.sda
+ .hg/store/00changelog-6b8ab34b.idx
+ .hg/store/00changelog-b875dfc5.dat
.hg/store/00changelog.i
- .hg/store/00manifest-1335303a.dat
- .hg/store/00manifest-b875dfc5.idx
+ .hg/store/00manifest-05a21d65.idx
+ .hg/store/00manifest-43c37dde.dat
+ .hg/store/00manifest-e2c9362a.sda
.hg/store/00manifest.i