revlogv2: also keep track of the size of the "data" file
author Pierre-Yves David <pierre-yves.david@octobus.net>
Mon, 03 May 2021 12:35:35 +0200
changeset 47242 4abd474a10af
parent 47241 2219853a1503
child 47243 3b04cf976c67
revlogv2: also keep track of the size of the "data" file

This is useful to make sure we always start writing at the right location,
without extra effort.

Differential Revision: https://phab.mercurial-scm.org/D10632
mercurial/configitems.py
mercurial/revlog.py
mercurial/revlogutils/docket.py
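
Every revlog.py hunk below applies the same rule when positioning the "data"
file handle for an append: without a docket, fall back to the physical end of
the file; with a docket, seek to the recorded `data_end`. A minimal sketch of
that rule, with illustrative names rather than the actual revlog API:

    import os

    def seek_for_append(dfh, docket):
        """Position the "data" file handle before appending new bytes."""
        if docket is None:
            # no docket (older revlog formats): trust the physical file size
            dfh.seek(0, os.SEEK_END)
        else:
            # docket-backed revlog: trust the size recorded in the docket
            dfh.seek(docket.data_end, os.SEEK_SET)
        return dfh.tell()

After each write the docket is refreshed from the handle
(`self._docket.data_end = dfh.tell()`), so the recorded size always reflects
what was actually written.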
--- a/mercurial/configitems.py	Mon May 03 12:35:25 2021 +0200
+++ b/mercurial/configitems.py	Mon May 03 12:35:35 2021 +0200
@@ -1156,7 +1156,6 @@
 #      - for stripping operation
 #      - for rollback operation
 # * proper streaming (race free) of the docket file
-# * store the data size in the docket to simplify sidedata rewrite.
 # * track garbage data to eventually allow rewriting -existing- sidedata.
 # * Exchange-wise, we will also need to do something more efficient than
 #   keeping references to the affected revlogs, especially memory-wise when
--- a/mercurial/revlog.py	Mon May 03 12:35:25 2021 +0200
+++ b/mercurial/revlog.py	Mon May 03 12:35:35 2021 +0200
@@ -2088,7 +2088,10 @@
             if not self._inline:
                 try:
                     dfh = self._datafp(b"r+")
-                    dfh.seek(0, os.SEEK_END)
+                    if self._docket is None:
+                        dfh.seek(0, os.SEEK_END)
+                    else:
+                        dfh.seek(self._docket.data_end, os.SEEK_SET)
                 except IOError as inst:
                     if inst.errno != errno.ENOENT:
                         raise
@@ -2455,16 +2458,10 @@
         to `n - 1`'s sidedata being written after `n`'s data.
 
         TODO cache this in a docket file before getting out of experimental."""
-        if self._format_version != REVLOGV2:
+        if self._docket is None:
             return self.end(prev)
-
-        offset = 0
-        for rev, entry in enumerate(self.index):
-            sidedata_end = entry[8] + entry[9]
-            # Sidedata for a previous rev has potentially been written after
-            # this rev's end, so take the max.
-            offset = max(self.end(rev), offset, sidedata_end)
-        return offset
+        else:
+            return self._docket.data_end
 
     def _writeentry(self, transaction, entry, data, link, offset, sidedata):
         # Files opened in a+ mode have inconsistent behavior on various
@@ -2488,7 +2485,10 @@
         else:
             ifh.seek(self._docket.index_end, os.SEEK_SET)
         if dfh:
-            dfh.seek(0, os.SEEK_END)
+            if self._docket is None:
+                dfh.seek(0, os.SEEK_END)
+            else:
+                dfh.seek(self._docket.data_end, os.SEEK_SET)
 
         curr = len(self) - 1
         if not self._inline:
@@ -2511,6 +2511,7 @@
             self._enforceinlinesize(transaction)
         if self._docket is not None:
             self._docket.index_end = self._writinghandles[0].tell()
+            self._docket.data_end = self._writinghandles[1].tell()
 
         nodemaputil.setup_persistent_nodemap(transaction, self)
 
@@ -2673,18 +2674,19 @@
             return
 
         # first truncate the files on disk
-        end = self.start(rev)
+        data_end = self.start(rev)
         if not self._inline:
-            transaction.add(self._datafile, end)
+            transaction.add(self._datafile, data_end)
             end = rev * self.index.entry_size
         else:
-            end += rev * self.index.entry_size
+            end = data_end + (rev * self.index.entry_size)
 
         transaction.add(self._indexfile, end)
         if self._docket is not None:
             # XXX we could leverage the docket while stripping. However it is
             # not powerful enough at the time of this comment
             self._docket.index_end = end
+            self._docket.data_end = data_end
             self._docket.write(transaction, stripping=True)
 
         # then reset internal state in memory to forget those revisions
@@ -3210,7 +3212,11 @@
         # append the new sidedata
         with self._writing(transaction):
             ifh, dfh = self._writinghandles
-            dfh.seek(0, os.SEEK_END)
+            if self._docket is not None:
+                dfh.seek(self._docket.data_end, os.SEEK_SET)
+            else:
+                dfh.seek(0, os.SEEK_END)
+
             current_offset = dfh.tell()
             for rev in range(startrev, endrev + 1):
                 entry = self.index[rev]
@@ -3242,6 +3248,8 @@
                 dfh.write(serialized_sidedata)
                 new_entries.append(entry)
                 current_offset += len(serialized_sidedata)
+                if self._docket is not None:
+                    self._docket.data_end = dfh.tell()
 
             # rewrite the new index entries
             ifh.seek(startrev * self.index.entry_size)
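
The second revlog.py change above trades a full index scan for a single docket
read when computing where the "data" file logically ends. A standalone sketch
of both strategies, assuming only that each index entry exposes its sidedata
offset and size at positions 8 and 9 (as the removed loop did) and that
`end_of_rev(rev)` behaves like `self.end(rev)`:

    def data_end_by_scanning(index, end_of_rev):
        """O(n) fallback, equivalent to the loop removed above."""
        offset = 0
        for rev, entry in enumerate(index):
            sidedata_end = entry[8] + entry[9]
            # sidedata for an earlier rev may have been written after this
            # rev's own data, so keep the maximum of all candidates
            offset = max(end_of_rev(rev), offset, sidedata_end)
        return offset

    def data_end_from_docket(docket):
        """O(1) lookup once the docket tracks the data file size."""
        return docket.data_end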
--- a/mercurial/revlogutils/docket.py	Mon May 03 12:35:25 2021 +0200
+++ b/mercurial/revlogutils/docket.py	Mon May 03 12:35:35 2021 +0200
@@ -32,9 +32,11 @@
 # * 4 bytes: revlog version
 #          |   This is mandatory as docket must be compatible with the previous
 #          |   revlog index header.
-# * 8 bytes: size of index data
-# * 8 bytes: pending size of index data
-S_HEADER = struct.Struct(constants.INDEX_HEADER.format + 'LL')
+# * 8 bytes: size of index-data
+# * 8 bytes: pending size of index-data
+# * 8 bytes: size of data
+# * 8 bytes: pending size of data
+S_HEADER = struct.Struct(constants.INDEX_HEADER.format + 'LLLL')
 
 
 class RevlogDocket(object):
@@ -47,6 +49,8 @@
         version_header=None,
         index_end=0,
         pending_index_end=0,
+        data_end=0,
+        pending_data_end=0,
     ):
         self._version_header = version_header
         self._read_only = bool(use_pending)
@@ -54,14 +58,19 @@
         self._radix = revlog.radix
         self._path = revlog._docket_file
         self._opener = revlog.opener
-        # this assert should be True as long as we have a single index filename
+        # these asserts should be True as long as we have a single index filename
         assert index_end <= pending_index_end
+        assert data_end <= pending_data_end
         self._initial_index_end = index_end
         self._pending_index_end = pending_index_end
+        self._initial_data_end = data_end
+        self._pending_data_end = pending_data_end
         if use_pending:
             self._index_end = self._pending_index_end
+            self._data_end = self._pending_data_end
         else:
             self._index_end = self._initial_index_end
+            self._data_end = self._initial_data_end
 
     def index_filepath(self):
         """file path to the current index file associated to this docket"""
@@ -78,6 +87,16 @@
             self._index_end = new_size
             self._dirty = True
 
+    @property
+    def data_end(self):
+        return self._data_end
+
+    @data_end.setter
+    def data_end(self, new_size):
+        if new_size != self._data_end:
+            self._data_end = new_size
+            self._dirty = True
+
     def write(self, transaction, pending=False, stripping=False):
         """write the modification of disk if any
 
@@ -102,15 +121,19 @@
     def _serialize(self, pending=False):
         if pending:
             official_index_end = self._initial_index_end
+            official_data_end = self._initial_data_end
         else:
             official_index_end = self._index_end
+            official_data_end = self._data_end
 
         # this assert should be True as long as we have a single index filename
-        assert official_index_end <= self._index_end
+        assert official_data_end <= self._data_end
         data = (
             self._version_header,
             official_index_end,
             self._index_end,
+            official_data_end,
+            self._data_end,
         )
         return S_HEADER.pack(*data)
 
@@ -127,12 +150,18 @@
 def parse_docket(revlog, data, use_pending=False):
     """given some docket data return a docket object for the given revlog"""
     header = S_HEADER.unpack(data[: S_HEADER.size])
-    version_header, index_size, pending_index_size = header
+    version_header = header[0]
+    index_size = header[1]
+    pending_index_size = header[2]
+    data_size = header[3]
+    pending_data_size = header[4]
     docket = RevlogDocket(
         revlog,
         use_pending=use_pending,
         version_header=version_header,
         index_end=index_size,
         pending_index_end=pending_index_size,
+        data_end=data_size,
+        pending_data_end=pending_data_size,
     )
     return docket
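
A note on field order in the extended header: each pair stores the "official"
(committed) size first and the pending size second, matching `_serialize()`
and `parse_docket()` above. A self-contained round-trip sketch, assuming
`constants.INDEX_HEADER` packs a single big-endian 4-byte version field
(">I"); the function names are illustrative:

    import struct

    # ">I" for the version word, then the four size fields added by this change
    S_HEADER = struct.Struct('>I' + 'LLLL')

    def serialize(version_header, index_size, pending_index_size,
                  data_size, pending_data_size):
        return S_HEADER.pack(version_header, index_size, pending_index_size,
                             data_size, pending_data_size)

    def parse(raw):
        header = S_HEADER.unpack(raw[:S_HEADER.size])
        return {
            'version_header': header[0],
            'index_size': header[1],
            'pending_index_size': header[2],
            'data_size': header[3],
            'pending_data_size': header[4],
        }

    blob = serialize(2, 192, 192, 4096, 4096)
    assert parse(blob)['data_size'] == 4096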