revlog: create a iteration of a _InnerRevlog object within the revlog
authorPierre-Yves David <pierre-yves.david@octobus.net>
Tue, 17 Oct 2023 06:02:33 +0200
changeset 51086 c3748f38dcd0
parent 51085 118c99c6092b
child 51087 df50a1592e0c
revlog: create a iteration of a _InnerRevlog object within the revlog The goal of this object is to isolate a sub-API that can be implemented by a compiled object (e.g. Rust). So the boundary of this object will be arbitrary depending of what can we easily implemented in the Compiled code. For now, we start simple, and move the code that manage the IO objects in the inner object. More will come in the coming changesets. Note: the object definition could live in the different module to thin the `revlog.py` file, however there are other better candidate for extraction first and I have enought patch stacked on top of the this one for the split in this patch not to be worth it. So I leave this to future me.
mercurial/changelog.py
mercurial/revlog.py
--- a/mercurial/changelog.py	Tue Oct 17 05:17:02 2023 +0200
+++ b/mercurial/changelog.py	Tue Oct 17 06:02:33 2023 +0200
@@ -459,8 +459,9 @@
                 self.opener = _delayopener(
                     self._realopener, self._indexfile, self._delaybuf
                 )
-            self._segmentfile.opener = self.opener
-            self._segmentfile_sidedata.opener = self.opener
+            self._inner.opener = self.opener
+            self._inner._segmentfile.opener = self.opener
+            self._inner._segmentfile_sidedata.opener = self.opener
         self._delayed = True
         tr.addpending(b'cl-%i' % id(self), self._writepending)
         tr.addfinalize(b'cl-%i' % id(self), self._finalize)
@@ -469,8 +470,9 @@
         """finalize index updates"""
         self._delayed = False
         self.opener = self._realopener
-        self._segmentfile.opener = self.opener
-        self._segmentfile_sidedata.opener = self.opener
+        self._inner.opener = self.opener
+        self._inner._segmentfile.opener = self.opener
+        self._inner._segmentfile_sidedata.opener = self.opener
         # move redirected index data back into place
         if self._docket is not None:
             self._write_docket(tr)
@@ -510,8 +512,9 @@
             self._delaybuf = None
             self._divert = True
             self.opener = _divertopener(self._realopener, self._indexfile)
-            self._segmentfile.opener = self.opener
-            self._segmentfile_sidedata.opener = self.opener
+            self._inner.opener = self.opener
+            self._inner._segmentfile.opener = self.opener
+            self._inner._segmentfile_sidedata.opener = self.opener
 
         if self._divert:
             return True
--- a/mercurial/revlog.py	Tue Oct 17 05:17:02 2023 +0200
+++ b/mercurial/revlog.py	Tue Oct 17 06:02:33 2023 +0200
@@ -337,6 +337,178 @@
     lazy_delta_base = attr.ib(default=False)
 
 
+class _InnerRevlog:
+    """An inner layer of the revlog object
+
+    That layer exist to be able to delegate some operation to Rust, its
+    boundaries are arbitrary and based on what we can delegate to Rust.
+    """
+
+    def __init__(
+        self,
+        opener,
+        index,
+        index_file,
+        data_file,
+        sidedata_file,
+        inline,
+        data_config,
+        chunk_cache,
+    ):
+        self.opener = opener
+        self.index = index
+
+        self.index_file = index_file
+        self.data_file = data_file
+        self.sidedata_file = sidedata_file
+        self.inline = inline
+        self.data_config = data_config
+
+        # index
+
+        # 3-tuple of file handles being used for active writing.
+        self._writinghandles = None
+
+        self._segmentfile = randomaccessfile.randomaccessfile(
+            self.opener,
+            (self.index_file if self.inline else self.data_file),
+            self.data_config.chunk_cache_size,
+            chunk_cache,
+        )
+        self._segmentfile_sidedata = randomaccessfile.randomaccessfile(
+            self.opener,
+            self.sidedata_file,
+            self.data_config.chunk_cache_size,
+        )
+
+    # Derived from index values.
+
+    def start(self, rev):
+        """the offset of the data chunk for this revision"""
+        return int(self.index[rev][0] >> 16)
+
+    def length(self, rev):
+        """the length of the data chunk for this revision"""
+        return self.index[rev][1]
+
+    def end(self, rev):
+        """the end of the data chunk for this revision"""
+        return self.start(rev) + self.length(rev)
+
+    @contextlib.contextmanager
+    def reading(self):
+        """Context manager that keeps data and sidedata files open for reading"""
+        if len(self.index) == 0:
+            yield  # nothing to be read
+        else:
+            with self._segmentfile.reading():
+                with self._segmentfile_sidedata.reading():
+                    yield
+
+    @property
+    def is_writing(self):
+        """True is a writing context is open"""
+        return self._writinghandles is not None
+
+    @contextlib.contextmanager
+    def writing(self, transaction, data_end=None, sidedata_end=None):
+        """Open the revlog files for writing
+
+        Add content to a revlog should be done within such context.
+        """
+        if self.is_writing:
+            yield
+        else:
+            ifh = dfh = sdfh = None
+            try:
+                r = len(self.index)
+                # opening the data file.
+                dsize = 0
+                if r:
+                    dsize = self.end(r - 1)
+                dfh = None
+                if not self.inline:
+                    try:
+                        dfh = self.opener(self.data_file, mode=b"r+")
+                        if data_end is None:
+                            dfh.seek(0, os.SEEK_END)
+                        else:
+                            dfh.seek(data_end, os.SEEK_SET)
+                    except FileNotFoundError:
+                        dfh = self.opener(self.data_file, mode=b"w+")
+                    transaction.add(self.data_file, dsize)
+                if self.sidedata_file is not None:
+                    assert sidedata_end is not None
+                    # revlog-v2 does not inline, help Pytype
+                    assert dfh is not None
+                    try:
+                        sdfh = self.opener(self.sidedata_file, mode=b"r+")
+                        dfh.seek(sidedata_end, os.SEEK_SET)
+                    except FileNotFoundError:
+                        sdfh = self.opener(self.sidedata_file, mode=b"w+")
+                    transaction.add(self.sidedata_file, sidedata_end)
+
+                # opening the index file.
+                isize = r * self.index.entry_size
+                ifh = self.__index_write_fp()
+                if self.inline:
+                    transaction.add(self.index_file, dsize + isize)
+                else:
+                    transaction.add(self.index_file, isize)
+                # exposing all file handle for writing.
+                self._writinghandles = (ifh, dfh, sdfh)
+                self._segmentfile.writing_handle = ifh if self.inline else dfh
+                self._segmentfile_sidedata.writing_handle = sdfh
+                yield
+            finally:
+                self._writinghandles = None
+                self._segmentfile.writing_handle = None
+                self._segmentfile_sidedata.writing_handle = None
+                if dfh is not None:
+                    dfh.close()
+                if sdfh is not None:
+                    sdfh.close()
+                # closing the index file last to avoid exposing referent to
+                # potential unflushed data content.
+                if ifh is not None:
+                    ifh.close()
+
+    def __index_write_fp(self, index_end=None):
+        """internal method to open the index file for writing
+
+        You should not use this directly and use `_writing` instead
+        """
+        try:
+            f = self.opener(
+                self.index_file,
+                mode=b"r+",
+                checkambig=self.data_config.check_ambig,
+            )
+            if index_end is None:
+                f.seek(0, os.SEEK_END)
+            else:
+                f.seek(index_end, os.SEEK_SET)
+            return f
+        except FileNotFoundError:
+            return self.opener(
+                self.index_file,
+                mode=b"w+",
+                checkambig=self.data_config.check_ambig,
+            )
+
+    def __index_new_fp(self):
+        """internal method to create a new index file for writing
+
+        You should not use this unless you are upgrading from inline revlog
+        """
+        return self.opener(
+            self.index_file,
+            mode=b"w",
+            checkambig=self.data_config.check_ambig,
+            atomictemp=True,
+        )
+
+
 class revlog:
     """
     the underlying revision storage object
@@ -477,13 +649,11 @@
         # Make copy of flag processors so each revlog instance can support
         # custom flags.
         self._flagprocessors = dict(flagutil.flagprocessors)
-
-        # 3-tuple of file handles being used for active writing.
-        self._writinghandles = None
         # prevent nesting of addgroup
         self._adding_group = None
 
-        self._loadindex()
+        chunk_cache = self._loadindex()
+        self._load_inner(chunk_cache)
 
         self._concurrencychecker = concurrencychecker
 
@@ -1007,22 +1177,25 @@
                 _(b"index %s is corrupted") % self.display_id
             )
         self.index = index
-        self._segmentfile = randomaccessfile.randomaccessfile(
-            self.opener,
-            (self._indexfile if self._inline else self._datafile),
-            self.data_config.chunk_cache_size,
-            chunkcache,
-        )
-        self._segmentfile_sidedata = randomaccessfile.randomaccessfile(
-            self.opener,
-            self._sidedatafile,
-            self.data_config.chunk_cache_size,
-        )
         # revnum -> (chain-length, sum-delta-length)
         self._chaininfocache = util.lrucachedict(500)
         # revlog header -> revlog compressor
         self._decompressors = {}
 
+        return chunkcache
+
+    def _load_inner(self, chunk_cache):
+        self._inner = _InnerRevlog(
+            opener=self.opener,
+            index=self.index,
+            index_file=self._indexfile,
+            data_file=self._datafile,
+            sidedata_file=self._sidedatafile,
+            inline=self._inline,
+            data_config=self.data_config,
+            chunk_cache=chunk_cache,
+        )
+
     def get_revlog(self):
         """simple function to mirror API of other not-really-revlog API"""
         return self
@@ -1073,35 +1246,6 @@
         c = self._get_decompressor(t)
         return c.decompress
 
-    def __index_write_fp(self):
-        # You should not use this directly and use `_writing` instead
-        try:
-            f = self.opener(
-                self._indexfile,
-                mode=b"r+",
-                checkambig=self.data_config.check_ambig,
-            )
-            if self._docket is None:
-                f.seek(0, os.SEEK_END)
-            else:
-                f.seek(self._docket.index_end, os.SEEK_SET)
-            return f
-        except FileNotFoundError:
-            return self.opener(
-                self._indexfile,
-                mode=b"w+",
-                checkambig=self.data_config.check_ambig,
-            )
-
-    def __index_new_fp(self):
-        # You should not use this unless you are upgrading from inline revlog
-        return self.opener(
-            self._indexfile,
-            mode=b"w",
-            checkambig=self.data_config.check_ambig,
-            atomictemp=True,
-        )
-
     def _datafp(self, mode=b'r'):
         """file object for the revlog's data file"""
         return self.opener(self._datafile, mode=mode)
@@ -1160,8 +1304,8 @@
         """Clear in-memory caches"""
         self._revisioncache = None
         self._chainbasecache.clear()
-        self._segmentfile.clear_cache()
-        self._segmentfile_sidedata.clear_cache()
+        self._inner._segmentfile.clear_cache()
+        self._inner._segmentfile_sidedata.clear_cache()
         self._pcache = {}
         self._nodemap_docket = None
         self.index.clearcaches()
@@ -2041,7 +2185,7 @@
             end += (endrev + 1) * self.index.entry_size
         length = end - start
 
-        return start, self._segmentfile.read_chunk(start, length)
+        return start, self._inner._segmentfile.read_chunk(start, length)
 
     def _chunk(self, rev):
         """Obtain a single decompressed chunk for a revision.
@@ -2318,7 +2462,7 @@
             m = FILE_TOO_SHORT_MSG % (filename, length, offset, end)
             raise error.RevlogError(m)
 
-        comp_segment = self._segmentfile_sidedata.read_chunk(
+        comp_segment = self._inner._segmentfile_sidedata.read_chunk(
             sidedata_offset, sidedata_size
         )
 
@@ -2423,15 +2567,15 @@
         tr.add(self._datafile, 0)
 
         existing_handles = False
-        if self._writinghandles is not None:
+        if self._inner._writinghandles is not None:
             existing_handles = True
-            fp = self._writinghandles[0]
+            fp = self._inner._writinghandles[0]
             fp.flush()
             fp.close()
             # We can't use the cached file handle after close(). So prevent
             # its usage.
-            self._writinghandles = None
-            self._segmentfile.writing_handle = None
+            self._inner._writinghandles = None
+            self._inner._segmentfile.writing_handle = None
             # No need to deal with sidedata writing handle as it is only
             # relevant with revlog-v2 which is never inline, not reaching
             # this code
@@ -2451,11 +2595,13 @@
                 maybe_self = weak_self()
                 if maybe_self is not None:
                     maybe_self._indexfile = old_index_file_path
+                    maybe_self._inner.index_file = maybe_self._indexfile
 
             def abort_callback(tr):
                 maybe_self = weak_self()
                 if maybe_self is not None:
                     maybe_self._indexfile = old_index_file_path
+                    maybe_self._inner.index_file = old_index_file_path
 
             tr.registertmp(new_index_file_path)
             if self.target[1] is not None:
@@ -2475,9 +2621,11 @@
 
             if side_write:
                 self._indexfile = new_index_file_path
-            with self.__index_new_fp() as fp:
+                self._inner.index_file = self._indexfile
+            with self._inner._InnerRevlog__index_new_fp() as fp:
                 self._format_flags &= ~FLAG_INLINE_DATA
                 self._inline = False
+                self._inner.inline = False
                 for i in self:
                     e = self.index.entry_binary(i)
                     if i == 0 and self._docket is None:
@@ -2492,7 +2640,7 @@
                 # index when we exit the context manager
 
             nodemaputil.setup_persistent_nodemap(tr, self)
-            self._segmentfile = randomaccessfile.randomaccessfile(
+            self._inner._segmentfile = randomaccessfile.randomaccessfile(
                 self.opener,
                 self._datafile,
                 self.data_config.chunk_cache_size,
@@ -2500,9 +2648,14 @@
 
             if existing_handles:
                 # switched from inline to conventional reopen the index
-                ifh = self.__index_write_fp()
-                self._writinghandles = (ifh, new_dfh, None)
-                self._segmentfile.writing_handle = new_dfh
+                index_end = None
+                if self._docket is not None:
+                    index_end = self._docket.index_end
+                ifh = self._inner._InnerRevlog__index_write_fp(
+                    index_end=index_end
+                )
+                self._inner._writinghandles = (ifh, new_dfh, None)
+                self._inner._segmentfile.writing_handle = new_dfh
                 new_dfh = None
                 # No need to deal with sidedata writing handle as it is only
                 # relevant with revlog-v2 which is never inline, not reaching
@@ -2516,13 +2669,8 @@
 
     @contextlib.contextmanager
     def reading(self):
-        """Context manager that keeps data and sidedata files open for reading"""
-        if len(self.index) == 0:
-            yield  # nothing to be read
-        else:
-            with self._segmentfile.reading():
-                with self._segmentfile_sidedata.reading():
-                    yield
+        with self._inner.reading():
+            yield
 
     @contextlib.contextmanager
     def _writing(self, transaction):
@@ -2530,65 +2678,22 @@
             msg = b'try to write in a `trypending` revlog: %s'
             msg %= self.display_id
             raise error.ProgrammingError(msg)
-        if self._writinghandles is not None:
+        if self._inner.is_writing:
             yield
         else:
-            ifh = dfh = sdfh = None
-            try:
-                r = len(self)
-                # opening the data file.
-                dsize = 0
-                if r:
-                    dsize = self.end(r - 1)
-                dfh = None
-                if not self._inline:
-                    try:
-                        dfh = self._datafp(b"r+")
-                        if self._docket is None:
-                            dfh.seek(0, os.SEEK_END)
-                        else:
-                            dfh.seek(self._docket.data_end, os.SEEK_SET)
-                    except FileNotFoundError:
-                        dfh = self._datafp(b"w+")
-                    transaction.add(self._datafile, dsize)
-                if self._sidedatafile is not None:
-                    # revlog-v2 does not inline, help Pytype
-                    assert dfh is not None
-                    try:
-                        sdfh = self.opener(self._sidedatafile, mode=b"r+")
-                        dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
-                    except FileNotFoundError:
-                        sdfh = self.opener(self._sidedatafile, mode=b"w+")
-                    transaction.add(
-                        self._sidedatafile, self._docket.sidedata_end
-                    )
-
-                # opening the index file.
-                isize = r * self.index.entry_size
-                ifh = self.__index_write_fp()
-                if self._inline:
-                    transaction.add(self._indexfile, dsize + isize)
-                else:
-                    transaction.add(self._indexfile, isize)
-                # exposing all file handle for writing.
-                self._writinghandles = (ifh, dfh, sdfh)
-                self._segmentfile.writing_handle = ifh if self._inline else dfh
-                self._segmentfile_sidedata.writing_handle = sdfh
+            data_end = None
+            sidedata_end = None
+            if self._docket is not None:
+                data_end = self._docket.data_end
+                sidedata_end = self._docket.sidedata_end
+            with self._inner.writing(
+                transaction,
+                data_end=data_end,
+                sidedata_end=sidedata_end,
+            ):
                 yield
                 if self._docket is not None:
                     self._write_docket(transaction)
-            finally:
-                self._writinghandles = None
-                self._segmentfile.writing_handle = None
-                self._segmentfile_sidedata.writing_handle = None
-                if dfh is not None:
-                    dfh.close()
-                if sdfh is not None:
-                    sdfh.close()
-                # closing the index file last to avoid exposing referent to
-                # potential unflushed data content.
-                if ifh is not None:
-                    ifh.close()
 
     def _write_docket(self, transaction):
         """write the current docket on disk
@@ -2811,7 +2916,7 @@
             raise error.RevlogError(
                 _(b"%s: attempt to add wdir revision") % self.display_id
             )
-        if self._writinghandles is None:
+        if self._inner._writinghandles is None:
             msg = b'adding revision outside `revlog._writing` context'
             raise error.ProgrammingError(msg)
 
@@ -2823,7 +2928,7 @@
         offset = self._get_data_offset(prev)
 
         if self._concurrencychecker:
-            ifh, dfh, sdfh = self._writinghandles
+            ifh, dfh, sdfh = self._inner._writinghandles
             # XXX no checking for the sidedata file
             if self._inline:
                 # offset is "as if" it were in the .d file, so we need to add on
@@ -3007,10 +3112,10 @@
         # Note: This is likely not necessary on Python 3. However, because
         # the file handle is reused for reads and may be seeked there, we need
         # to be careful before changing this.
-        if self._writinghandles is None:
+        if self._inner._writinghandles is None:
             msg = b'adding revision outside `revlog._writing` context'
             raise error.ProgrammingError(msg)
-        ifh, dfh, sdfh = self._writinghandles
+        ifh, dfh, sdfh = self._inner._writinghandles
         if self._docket is None:
             ifh.seek(0, os.SEEK_END)
         else:
@@ -3045,9 +3150,9 @@
             self._enforceinlinesize(transaction)
         if self._docket is not None:
             # revlog-v2 always has 3 writing handles, help Pytype
-            wh1 = self._writinghandles[0]
-            wh2 = self._writinghandles[1]
-            wh3 = self._writinghandles[2]
+            wh1 = self._inner._writinghandles[0]
+            wh2 = self._inner._writinghandles[1]
+            wh3 = self._inner._writinghandles[2]
             assert wh1 is not None
             assert wh2 is not None
             assert wh3 is not None
@@ -3259,8 +3364,8 @@
         # then reset internal state in memory to forget those revisions
         self._revisioncache = None
         self._chaininfocache = util.lrucachedict(500)
-        self._segmentfile.clear_cache()
-        self._segmentfile_sidedata.clear_cache()
+        self._inner._segmentfile.clear_cache()
+        self._inner._segmentfile_sidedata.clear_cache()
 
         del self.index[rev:-1]
 
@@ -3723,7 +3828,7 @@
         new_entries = []
         # append the new sidedata
         with self._writing(transaction):
-            ifh, dfh, sdfh = self._writinghandles
+            ifh, dfh, sdfh = self._inner._writinghandles
             dfh.seek(self._docket.sidedata_end, os.SEEK_SET)
 
             current_offset = sdfh.tell()