diff hgext/fastannotate/revmap.py @ 39210:1ddb296e0dee

fastannotate: initial import from Facebook's hg-experimental I made as few changes as I could to get the tests to pass, but this was a bit involved due to some churn in the blame code since someone last gave fastannotate any TLC. There's still follow-up work here to rip out support for old versions of hg and to integrate the protocol with modern standards. Some performance numbers (all on my 2016 MacBook Pro with a 2.6Ghz i7): Mercurial mercurial/manifest.py traditional blame time: real 1.050 secs (user 0.990+0.000 sys 0.060+0.000) build cache time: real 5.900 secs (user 5.720+0.000 sys 0.110+0.000) fastannotate time: real 0.120 secs (user 0.100+0.000 sys 0.020+0.000) Mercurial mercurial/localrepo.py traditional blame time: real 3.330 secs (user 3.220+0.000 sys 0.070+0.000) build cache time: real 30.610 secs (user 30.190+0.000 sys 0.230+0.000) fastannotate time: real 0.180 secs (user 0.160+0.000 sys 0.020+0.000) mozilla-central dom/ipc/ContentParent.cpp traditional blame time: real 7.640 secs (user 7.210+0.000 sys 0.380+0.000) build cache time: real 98.650 secs (user 97.000+0.000 sys 0.950+0.000) fastannotate time: real 1.580 secs (user 1.340+0.000 sys 0.240+0.000) mozilla-central dom/base/nsDocument.cpp traditional blame time: real 17.110 secs (user 16.490+0.000 sys 0.500+0.000) build cache time: real 399.750 secs (user 394.520+0.000 sys 2.610+0.000) fastannotate time: real 1.780 secs (user 1.530+0.000 sys 0.240+0.000) So building the cache is expensive (but might be faster with xdiff enabled), but the blame results are *way* faster. Differential Revision: https://phab.mercurial-scm.org/D3994
author Augie Fackler <augie@google.com>
date Mon, 30 Jul 2018 22:50:00 -0400
parents
children 1205ba8f11ac
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/fastannotate/revmap.py	Mon Jul 30 22:50:00 2018 -0400
@@ -0,0 +1,254 @@
+# Copyright 2016-present Facebook. All Rights Reserved.
+#
+# revmap: trivial hg hash - linelog rev bidirectional map
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import bisect
+import os
+import struct
+
+from mercurial.node import hex
+from mercurial import (
+    error as hgerror,
+    pycompat,
+)
+from . import error
+
+# the revmap file format is straightforward:
+#
+#    8 bytes: header
+#    1 byte : flag for linelog revision 1
+#    ? bytes: (optional) '\0'-terminated path string
+#             only exists if (flag & renameflag) != 0
+#   20 bytes: hg hash for linelog revision 1
+#    1 byte : flag for linelog revision 2
+#    ? bytes: (optional) '\0'-terminated path string
+#   20 bytes: hg hash for linelog revision 2
+#   ....
+#
+# the implementation is kinda stupid: __init__ loads the whole revmap.
+# no laziness. benchmark shows loading 10000 revisions is about 0.015
+# seconds, which looks enough for our use-case. if this implementation
+# becomes a bottleneck, we can change it to lazily read the file
+# from the end.
+
+# whether the changeset is in the side branch. i.e. not in the linear main
+# branch but only got referenced by lines in merge changesets.
+sidebranchflag = 1
+
+# whether the changeset changes the file path (ie. is a rename)
+renameflag = 2
+
+# len(mercurial.node.nullid)
+_hshlen = 20
+
+class revmap(object):
+    """trivial hg bin hash - linelog rev bidirectional map
+
+    also stores a flag (uint8) for each revision, and track renames.
+    """
+
+    HEADER = b'REVMAP1\0'
+
+    def __init__(self, path=None):
+        """create or load the revmap, optionally associate to a file
+
+        if path is None, the revmap is entirely in-memory. the caller is
+        responsible for locking. concurrent writes to a same file is unsafe.
+        the caller needs to make sure one file is associated to at most one
+        revmap object at a time."""
+        self.path = path
+        self._rev2hsh = [None]
+        self._rev2flag = [None]
+        self._hsh2rev = {}
+        # since rename does not happen frequently, do not store path for every
+        # revision. self._renamerevs can be used for bisecting.
+        self._renamerevs = [0]
+        self._renamepaths = ['']
+        self._lastmaxrev = -1
+        if path:
+            if os.path.exists(path):
+                self._load()
+            else:
+                # write the header so "append" can do incremental updates
+                self.flush()
+
+    def copyfrom(self, rhs):
+        """copy the map data from another revmap. do not affect self.path"""
+        self._rev2hsh = rhs._rev2hsh[:]
+        self._rev2flag = rhs._rev2flag[:]
+        self._hsh2rev = rhs._hsh2rev.copy()
+        self._renamerevs = rhs._renamerevs[:]
+        self._renamepaths = rhs._renamepaths[:]
+        self._lastmaxrev = -1
+
+    @property
+    def maxrev(self):
+        """return max linelog revision number"""
+        return len(self._rev2hsh) - 1
+
+    def append(self, hsh, sidebranch=False, path=None, flush=False):
+        """add a binary hg hash and return the mapped linelog revision.
+        if flush is True, incrementally update the file.
+        """
+        if hsh in self._hsh2rev:
+            raise error.CorruptedFileError('%r is in revmap already' % hex(hsh))
+        if len(hsh) != _hshlen:
+            raise hgerror.ProgrammingError('hsh must be %d-char long' % _hshlen)
+        idx = len(self._rev2hsh)
+        flag = 0
+        if sidebranch:
+            flag |= sidebranchflag
+        if path is not None and path != self._renamepaths[-1]:
+            flag |= renameflag
+            self._renamerevs.append(idx)
+            self._renamepaths.append(path)
+        self._rev2hsh.append(hsh)
+        self._rev2flag.append(flag)
+        self._hsh2rev[hsh] = idx
+        if flush:
+            self.flush()
+        return idx
+
+    def rev2hsh(self, rev):
+        """convert linelog revision to hg hash. return None if not found."""
+        if rev > self.maxrev or rev < 0:
+            return None
+        return self._rev2hsh[rev]
+
+    def rev2flag(self, rev):
+        """get the flag (uint8) for a given linelog revision.
+        return None if revision does not exist.
+        """
+        if rev > self.maxrev or rev < 0:
+            return None
+        return self._rev2flag[rev]
+
+    def rev2path(self, rev):
+        """get the path for a given linelog revision.
+        return None if revision does not exist.
+        """
+        if rev > self.maxrev or rev < 0:
+            return None
+        idx = bisect.bisect_right(self._renamerevs, rev) - 1
+        return self._renamepaths[idx]
+
+    def hsh2rev(self, hsh):
+        """convert hg hash to linelog revision. return None if not found."""
+        return self._hsh2rev.get(hsh)
+
+    def clear(self, flush=False):
+        """make the map empty. if flush is True, write to disk"""
+        # rev 0 is reserved, real rev starts from 1
+        self._rev2hsh = [None]
+        self._rev2flag = [None]
+        self._hsh2rev = {}
+        self._rev2path = ['']
+        self._lastmaxrev = -1
+        if flush:
+            self.flush()
+
+    def flush(self):
+        """write the state down to the file"""
+        if not self.path:
+            return
+        if self._lastmaxrev == -1: # write the entire file
+            with open(self.path, 'wb') as f:
+                f.write(self.HEADER)
+                for i in pycompat.xrange(1, len(self._rev2hsh)):
+                    self._writerev(i, f)
+        else: # append incrementally
+            with open(self.path, 'ab') as f:
+                for i in pycompat.xrange(self._lastmaxrev + 1,
+                                         len(self._rev2hsh)):
+                    self._writerev(i, f)
+        self._lastmaxrev = self.maxrev
+
+    def _load(self):
+        """load state from file"""
+        if not self.path:
+            return
+        # use local variables in a loop. CPython uses LOAD_FAST for them,
+        # which is faster than both LOAD_CONST and LOAD_GLOBAL.
+        flaglen = 1
+        hshlen = _hshlen
+        with open(self.path, 'rb') as f:
+            if f.read(len(self.HEADER)) != self.HEADER:
+                raise error.CorruptedFileError()
+            self.clear(flush=False)
+            while True:
+                buf = f.read(flaglen)
+                if not buf:
+                    break
+                flag = ord(buf)
+                rev = len(self._rev2hsh)
+                if flag & renameflag:
+                    path = self._readcstr(f)
+                    self._renamerevs.append(rev)
+                    self._renamepaths.append(path)
+                hsh = f.read(hshlen)
+                if len(hsh) != hshlen:
+                    raise error.CorruptedFileError()
+                self._hsh2rev[hsh] = rev
+                self._rev2flag.append(flag)
+                self._rev2hsh.append(hsh)
+        self._lastmaxrev = self.maxrev
+
+    def _writerev(self, rev, f):
+        """append a revision data to file"""
+        flag = self._rev2flag[rev]
+        hsh = self._rev2hsh[rev]
+        f.write(struct.pack('B', flag))
+        if flag & renameflag:
+            path = self.rev2path(rev)
+            if path is None:
+                raise error.CorruptedFileError('cannot find path for %s' % rev)
+            f.write(path + '\0')
+        f.write(hsh)
+
+    @staticmethod
+    def _readcstr(f):
+        """read a C-language-like '\0'-terminated string"""
+        buf = ''
+        while True:
+            ch = f.read(1)
+            if not ch: # unexpected eof
+                raise error.CorruptedFileError()
+            if ch == '\0':
+                break
+            buf += ch
+        return buf
+
+    def __contains__(self, f):
+        """(fctx or (node, path)) -> bool.
+        test if (node, path) is in the map, and is not in a side branch.
+        f can be either a tuple of (node, path), or a fctx.
+        """
+        if isinstance(f, tuple): # f: (node, path)
+            hsh, path = f
+        else: # f: fctx
+            hsh, path = f.node(), f.path()
+        rev = self.hsh2rev(hsh)
+        if rev is None:
+            return False
+        if path is not None and path != self.rev2path(rev):
+            return False
+        return (self.rev2flag(rev) & sidebranchflag) == 0
+
+def getlastnode(path):
+    """return the last hash in a revmap, without loading its full content.
+    this is equivalent to `m = revmap(path); m.rev2hsh(m.maxrev)`, but faster.
+    """
+    hsh = None
+    try:
+        with open(path, 'rb') as f:
+            f.seek(-_hshlen, 2)
+            if f.tell() > len(revmap.HEADER):
+                hsh = f.read(_hshlen)
+    except IOError:
+        pass
+    return hsh