revlog: store sidedata in their own file
authorPierre-Yves David <pierre-yves.david@octobus.net>
Fri, 28 May 2021 23:41:17 +0200
changeset 47395 e6292eb33384
parent 47394 bcf92bdc2bca
child 47396 65b86f516ba2
revlog: store sidedata in their own file This makes sidedata manipulation simpler and results in more compact data when traversing either data or sidedata. Differential Revision: https://phab.mercurial-scm.org/D10787
mercurial/configitems.py
mercurial/revlog.py
mercurial/revlogutils/docket.py
mercurial/store.py
tests/test-revlog-v2.t
--- a/mercurial/configitems.py	Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/configitems.py	Fri May 28 23:41:17 2021 +0200
@@ -1162,13 +1162,13 @@
 #   rewriting sidedata.
 # * introduce a proper solution to reduce the number of filelog related files.
 # * use caching for reading sidedata (similar to what we do for data).
+# * no longer set offset=0 if sidedata_size=0 (simplify cutoff computation).
 # * Improvement to consider
 #   - avoid compression header in chunk using the default compression?
 #   - forbid "inline" compression mode entirely?
 #   - split the data offset and flag field (the 2 bytes save are mostly trouble)
 #   - keep track of uncompressed -chunk- size (to preallocate memory better)
 #   - keep track of chain base or size (probably not that useful anymore)
-#   - store data and sidedata in different files
 coreconfigitem(
     b'experimental',
     b'revlogv2',
--- a/mercurial/revlog.py	Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/revlog.py	Fri May 28 23:41:17 2021 +0200
@@ -1,4 +1,5 @@
 # revlog.py - storage back-end for mercurial
+# coding: utf8
 #
 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
 #
@@ -260,6 +261,11 @@
     b'partial read of revlog %s; expected %d bytes from offset %d, got %d'
 )
 
+FILE_TOO_SHORT_MSG = _(
+    b'cannot read from revlog %s;'
+    b'  expected %d bytes from offset %d, data size is %d'
+)
+
 
 class revlog(object):
     """
@@ -401,6 +407,7 @@
         self._docket_file = None
         self._indexfile = None
         self._datafile = None
+        self._sidedatafile = None
         self._nodemap_file = None
         self.postfix = postfix
         self._trypending = trypending
@@ -445,7 +452,7 @@
         # custom flags.
         self._flagprocessors = dict(flagutil.flagprocessors)
 
-        # 2-tuple of file handles being used for active writing.
+        # 3-tuple of file handles being used for active writing.
         self._writinghandles = None
         # prevent nesting of addgroup
         self._adding_group = None
@@ -634,6 +641,7 @@
 
         if self._docket is not None:
             self._datafile = self._docket.data_filepath()
+            self._sidedatafile = self._docket.sidedata_filepath()
         elif self.postfix is None:
             self._datafile = b'%s.d' % self.radix
         else:
@@ -803,9 +811,14 @@
             with func() as fp:
                 yield fp
 
+    @contextlib.contextmanager
     def _sidedatareadfp(self):
         """file object suitable to read sidedata"""
-        return self._datareadfp()
+        if self._writinghandles:
+            yield self._writinghandles[2]
+        else:
+            with self.opener(self._sidedatafile) as fp:
+                yield fp
 
     def tiprev(self):
         return len(self.index) - 1
@@ -909,6 +922,23 @@
     def start(self, rev):
         return int(self.index[rev][0] >> 16)
 
+    def sidedata_cut_off(self, rev):
+        sd_cut_off = self.index[rev][8]
+        if sd_cut_off != 0:
+            return sd_cut_off
+        # This is some annoying dance, because entries without sidedata
+        # currently use 0 as their ofsset. (instead of previous-offset +
+        # previous-size)
+        #
+        # We should reconsider this sidedata → 0 sidata_offset policy.
+        # In the meantime, we need this.
+        while 0 <= rev:
+            e = self.index[rev]
+            if e[9] != 0:
+                return e[8] + e[9]
+            rev -= 1
+        return 0
+
     def flags(self, rev):
         return self.index[rev][0] & 0xFFFF
 
@@ -2074,11 +2104,19 @@
 
         # XXX this need caching, as we do for data
         with self._sidedatareadfp() as sdf:
-            sdf.seek(sidedata_offset)
+            if self._docket.sidedata_end < sidedata_offset + sidedata_size:
+                filename = self._sidedatafile
+                end = self._docket.sidedata_end
+                offset = sidedata_offset
+                length = sidedata_size
+                m = FILE_TOO_SHORT_MSG % (filename, length, offset, end)
+                raise error.RevlogError(m)
+
+            sdf.seek(sidedata_offset, os.SEEK_SET)
             comp_segment = sdf.read(sidedata_size)
 
             if len(comp_segment) < sidedata_size:
-                filename = self._datafile
+                filename = self._sidedatafile
                 length = sidedata_size
                 offset = sidedata_offset
                 got = len(comp_segment)
@@ -2215,7 +2253,7 @@
             if existing_handles:
                 # switched from inline to conventional reopen the index
                 ifh = self.__index_write_fp()
-                self._writinghandles = (ifh, new_dfh)
+                self._writinghandles = (ifh, new_dfh, None)
                 new_dfh = None
         finally:
             if new_dfh is not None:
@@ -2233,7 +2271,7 @@
         if self._writinghandles is not None:
             yield
         else:
-            ifh = dfh = None
+            ifh = dfh = sdfh = None
             try:
                 r = len(self)
                 # opening the data file.
@@ -2253,6 +2291,17 @@
                             raise
                         dfh = self._datafp(b"w+")
                     transaction.add(self._datafile, dsize)
+                if self._sidedatafile is not None:
+                    try:
+                        sdfh = self.opener(self._sidedatafile, mode=b"r+")
+                        dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
+                    except IOError as inst:
+                        if inst.errno != errno.ENOENT:
+                            raise
+                        sdfh = self.opener(self._sidedatafile, mode=b"w+")
+                    transaction.add(
+                        self._sidedatafile, self._docket.sidedata_end
+                    )
 
                 # opening the index file.
                 isize = r * self.index.entry_size
@@ -2262,7 +2311,7 @@
                 else:
                     transaction.add(self._indexfile, isize)
                 # exposing all file handle for writing.
-                self._writinghandles = (ifh, dfh)
+                self._writinghandles = (ifh, dfh, sdfh)
                 yield
                 if self._docket is not None:
                     self._write_docket(transaction)
@@ -2270,6 +2319,8 @@
                 self._writinghandles = None
                 if dfh is not None:
                     dfh.close()
+                if sdfh is not None:
+                    dfh.close()
                 # closing the index file last to avoid exposing referent to
                 # potential unflushed data content.
                 if ifh is not None:
@@ -2513,7 +2564,8 @@
         offset = self._get_data_offset(prev)
 
         if self._concurrencychecker:
-            ifh, dfh = self._writinghandles
+            ifh, dfh, sdfh = self._writinghandles
+            # XXX no checking for the sidedata file
             if self._inline:
                 # offset is "as if" it were in the .d file, so we need to add on
                 # the size of the entry metadata.
@@ -2570,7 +2622,7 @@
         if sidedata and self.hassidedata:
             sidedata_compression_mode = COMP_MODE_PLAIN
             serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
-            sidedata_offset = offset + deltainfo.deltalen
+            sidedata_offset = self._docket.sidedata_end
             h, comp_sidedata = self.compress(serialized_sidedata)
             if (
                 h != b'u'
@@ -2622,6 +2674,7 @@
             link,
             offset,
             serialized_sidedata,
+            sidedata_offset,
         )
 
         rawtext = btext[0]
@@ -2648,7 +2701,9 @@
         else:
             return self._docket.data_end
 
-    def _writeentry(self, transaction, entry, data, link, offset, sidedata):
+    def _writeentry(
+        self, transaction, entry, data, link, offset, sidedata, sidedata_offset
+    ):
         # Files opened in a+ mode have inconsistent behavior on various
         # platforms. Windows requires that a file positioning call be made
         # when the file handle transitions between reads and writes. See
@@ -2664,7 +2719,7 @@
         if self._writinghandles is None:
             msg = b'adding revision outside `revlog._writing` context'
             raise error.ProgrammingError(msg)
-        ifh, dfh = self._writinghandles
+        ifh, dfh, sdfh = self._writinghandles
         if self._docket is None:
             ifh.seek(0, os.SEEK_END)
         else:
@@ -2674,16 +2729,20 @@
                 dfh.seek(0, os.SEEK_END)
             else:
                 dfh.seek(self._docket.data_end, os.SEEK_SET)
+        if sdfh:
+            sdfh.seek(self._docket.sidedata_end, os.SEEK_SET)
 
         curr = len(self) - 1
         if not self._inline:
             transaction.add(self._datafile, offset)
+            if self._sidedatafile:
+                transaction.add(self._sidedatafile, sidedata_offset)
             transaction.add(self._indexfile, curr * len(entry))
             if data[0]:
                 dfh.write(data[0])
             dfh.write(data[1])
             if sidedata:
-                dfh.write(sidedata)
+                sdfh.write(sidedata)
             ifh.write(entry)
         else:
             offset += curr * self.index.entry_size
@@ -2691,12 +2750,12 @@
             ifh.write(entry)
             ifh.write(data[0])
             ifh.write(data[1])
-            if sidedata:
-                ifh.write(sidedata)
+            assert not sidedata
             self._enforceinlinesize(transaction)
         if self._docket is not None:
             self._docket.index_end = self._writinghandles[0].tell()
             self._docket.data_end = self._writinghandles[1].tell()
+            self._docket.sidedata_end = self._writinghandles[2].tell()
 
         nodemaputil.setup_persistent_nodemap(transaction, self)
 
@@ -2866,12 +2925,17 @@
         else:
             end = data_end + (rev * self.index.entry_size)
 
+        if self._sidedatafile:
+            sidedata_end = self.sidedata_cut_off(rev)
+            transaction.add(self._sidedatafile, sidedata_end)
+
         transaction.add(self._indexfile, end)
         if self._docket is not None:
             # XXX we could, leverage the docket while stripping. However it is
             # not powerfull enough at the time of this comment
             self._docket.index_end = end
             self._docket.data_end = data_end
+            self._docket.sidedata_end = sidedata_end
             self._docket.write(transaction, stripping=True)
 
         # then reset internal state in memory to forget those revisions
@@ -3398,13 +3462,10 @@
         new_entries = []
         # append the new sidedata
         with self._writing(transaction):
-            ifh, dfh = self._writinghandles
-            if self._docket is not None:
-                dfh.seek(self._docket.data_end, os.SEEK_SET)
-            else:
-                dfh.seek(0, os.SEEK_END)
-
-            current_offset = dfh.tell()
+            ifh, dfh, sdfh = self._writinghandles
+            dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
+
+            current_offset = sdfh.tell()
             for rev in range(startrev, endrev + 1):
                 entry = self.index[rev]
                 new_sidedata, flags = sidedatautil.run_sidedata_helpers(
@@ -3455,12 +3516,11 @@
                 )
 
                 # the sidedata computation might have move the file cursors around
-                dfh.seek(current_offset, os.SEEK_SET)
-                dfh.write(serialized_sidedata)
+                sdfh.seek(current_offset, os.SEEK_SET)
+                sdfh.write(serialized_sidedata)
                 new_entries.append(entry_update)
                 current_offset += len(serialized_sidedata)
-                if self._docket is not None:
-                    self._docket.data_end = dfh.tell()
+                self._docket.sidedata_end = sdfh.tell()
 
             # rewrite the new index entries
             ifh.seek(startrev * self.index.entry_size)
--- a/mercurial/revlogutils/docket.py	Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/revlogutils/docket.py	Fri May 28 23:41:17 2021 +0200
@@ -90,12 +90,15 @@
 #          |   revlog index header.
 # * 1 bytes: size of index uuid
 # * 1 bytes: size of data uuid
+# * 1 bytes: size of sizedata uuid
 # * 8 bytes: size of index-data
 # * 8 bytes: pending size of index-data
 # * 8 bytes: size of data
+# * 8 bytes: size of sidedata
 # * 8 bytes: pending size of data
+# * 8 bytes: pending size of sidedata
 # * 1 bytes: default compression header
-S_HEADER = struct.Struct(constants.INDEX_HEADER_FMT + b'BBLLLLc')
+S_HEADER = struct.Struct(constants.INDEX_HEADER_FMT + b'BBBLLLLLLc')
 
 
 class RevlogDocket(object):
@@ -108,10 +111,13 @@
         version_header=None,
         index_uuid=None,
         data_uuid=None,
+        sidedata_uuid=None,
         index_end=0,
         pending_index_end=0,
         data_end=0,
         pending_data_end=0,
+        sidedata_end=0,
+        pending_sidedata_end=0,
         default_compression_header=None,
     ):
         self._version_header = version_header
@@ -122,19 +128,25 @@
         self._opener = revlog.opener
         self._index_uuid = index_uuid
         self._data_uuid = data_uuid
+        self._sidedata_uuid = sidedata_uuid
         # thes asserts should be True as long as we have a single index filename
         assert index_end <= pending_index_end
         assert data_end <= pending_data_end
+        assert sidedata_end <= pending_sidedata_end
         self._initial_index_end = index_end
         self._pending_index_end = pending_index_end
         self._initial_data_end = data_end
         self._pending_data_end = pending_data_end
+        self._initial_sidedata_end = sidedata_end
+        self._pending_sidedata_end = pending_sidedata_end
         if use_pending:
             self._index_end = self._pending_index_end
             self._data_end = self._pending_data_end
+            self._sidedata_end = self._pending_sidedata_end
         else:
             self._index_end = self._initial_index_end
             self._data_end = self._initial_data_end
+            self._sidedata_end = self._initial_sidedata_end
         self.default_compression_header = default_compression_header
 
     def index_filepath(self):
@@ -151,6 +163,13 @@
             self._data_uuid = make_uid()
         return b"%s-%s.dat" % (self._radix, self._data_uuid)
 
+    def sidedata_filepath(self):
+        """file path to the current sidedata file associated to this docket"""
+        # very simplistic version at first
+        if self._sidedata_uuid is None:
+            self._sidedata_uuid = make_uid()
+        return b"%s-%s.sda" % (self._radix, self._sidedata_uuid)
+
     @property
     def index_end(self):
         return self._index_end
@@ -171,6 +190,16 @@
             self._data_end = new_size
             self._dirty = True
 
+    @property
+    def sidedata_end(self):
+        return self._sidedata_end
+
+    @sidedata_end.setter
+    def sidedata_end(self, new_size):
+        if new_size != self._sidedata_end:
+            self._sidedata_end = new_size
+            self._dirty = True
+
     def write(self, transaction, pending=False, stripping=False):
         """write the modification of disk if any
 
@@ -196,26 +225,33 @@
         if pending:
             official_index_end = self._initial_index_end
             official_data_end = self._initial_data_end
+            official_sidedata_end = self._initial_sidedata_end
         else:
             official_index_end = self._index_end
             official_data_end = self._data_end
+            official_sidedata_end = self._sidedata_end
 
         # this assert should be True as long as we have a single index filename
         assert official_data_end <= self._data_end
+        assert official_sidedata_end <= self._sidedata_end
         data = (
             self._version_header,
             len(self._index_uuid),
             len(self._data_uuid),
+            len(self._sidedata_uuid),
             official_index_end,
             self._index_end,
             official_data_end,
             self._data_end,
+            official_sidedata_end,
+            self._sidedata_end,
             self.default_compression_header,
         )
         s = []
         s.append(S_HEADER.pack(*data))
         s.append(self._index_uuid)
         s.append(self._data_uuid)
+        s.append(self._sidedata_uuid)
         return b''.join(s)
 
 
@@ -262,6 +298,9 @@
     data_uuid_size = next(iheader)
     data_uuid = get_data(data_uuid_size)
 
+    sidedata_uuid_size = next(iheader)
+    sidedata_uuid = get_data(sidedata_uuid_size)
+
     index_size = next(iheader)
 
     pending_index_size = next(iheader)
@@ -270,6 +309,10 @@
 
     pending_data_size = next(iheader)
 
+    sidedata_size = next(iheader)
+
+    pending_sidedata_size = next(iheader)
+
     default_compression_header = next(iheader)
 
     docket = RevlogDocket(
@@ -278,10 +321,13 @@
         version_header=version_header,
         index_uuid=index_uuid,
         data_uuid=data_uuid,
+        sidedata_uuid=sidedata_uuid,
         index_end=index_size,
         pending_index_end=pending_index_size,
         data_end=data_size,
         pending_data_end=pending_data_size,
+        sidedata_end=sidedata_size,
+        pending_sidedata_end=pending_sidedata_size,
         default_compression_header=default_compression_header,
     )
     return docket
--- a/mercurial/store.py	Fri May 28 23:41:12 2021 +0200
+++ b/mercurial/store.py	Fri May 28 23:41:17 2021 +0200
@@ -395,6 +395,7 @@
     b'.dat',
     b'.n',
     b'.nd',
+    b'.sda',
     b'd.tmpcensored',
 )
 # files that are "volatile" and might change between listing and streaming
--- a/tests/test-revlog-v2.t	Fri May 28 23:41:12 2021 +0200
+++ b/tests/test-revlog-v2.t	Fri May 28 23:41:17 2021 +0200
@@ -86,9 +86,11 @@
 - a data file
 
   $ ls .hg/store/00changelog* .hg/store/00manifest*
-  .hg/store/00changelog-6b8ab34b.dat
-  .hg/store/00changelog-88698448.idx
+  .hg/store/00changelog-1335303a.sda
+  .hg/store/00changelog-6b8ab34b.idx
+  .hg/store/00changelog-b875dfc5.dat
   .hg/store/00changelog.i
-  .hg/store/00manifest-1335303a.dat
-  .hg/store/00manifest-b875dfc5.idx
+  .hg/store/00manifest-05a21d65.idx
+  .hg/store/00manifest-43c37dde.dat
+  .hg/store/00manifest-e2c9362a.sda
   .hg/store/00manifest.i