diff hgext/git/gitlog.py @ 44477:ad718271a9eb

git: skeleton of a new extension to _directly_ operate on git repos This is based in part on work I did years ago in hgit, but it's mostly new code since I'm using pygit2 instead of dulwich and the hg storage interfaces have improved. Some cleanup of old hgit code by Pulkit, which I greatly appreciate. test-git-interop.t does not cover a whole lot of cases, but it passes. It includes status, diff, making a new commit, and `hg annotate` working on the git repository. This is _not_ (yet) production quality code: this is an experiment. Known technical debt lurking in this implementation: * Writing bookmarks just totally ignores transactions. * The way progress is threaded down into the gitstore is awful. * Ideally we'd find a way to incrementally reindex DAGs. I'm not sure how to do that efficiently, so we might need a "known only fast-forwards" mode on the DAG indexer for use on `hg commit` and friends. * We don't even _try_ to do anything reasonable for `hg pull` or `hg push`. * Mercurial needs an interface for the changelog type. Tests currently require git 2.24 as far as I'm aware: `git status` has some changed output that I didn't try and handle in a compatible way. This patch has produced some interesting cleanups, most recently on the manifest type. I expect continuing down this road will produce other meritorious cleanups throughout our code. Differential Revision: https://phab.mercurial-scm.org/D6734
author Augie Fackler <augie@google.com>
date Tue, 11 Feb 2020 00:44:59 -0500
parents
children 6d953b3fc2bd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/git/gitlog.py	Tue Feb 11 00:44:59 2020 -0500
@@ -0,0 +1,463 @@
+from __future__ import absolute_import
+
+import pygit2
+
+from mercurial.i18n import _
+
+from mercurial import (
+    ancestor,
+    changelog as hgchangelog,
+    dagop,
+    encoding,
+    error,
+    manifest,
+    node as nodemod,
+    pycompat,
+)
+from mercurial.interfaces import (
+    repository,
+    util as interfaceutil,
+)
+from mercurial.utils import stringutil
+from . import (
+    gitutil,
+    index,
+    manifest as gitmanifest,
+)
+
+
+class baselog(object):  # revlog.revlog):
+    """Common implementations between changelog and manifestlog.
+
+    Holds a git repository object and a database handle, and answers the
+    basic rev <-> node questions from the database's ``changelog`` table.
+    """
+
+    def __init__(self, gr, db):
+        # gr: the git repository object.
+        # db: handle exposing execute() with '?' placeholders
+        #     (presumably a sqlite3 connection -- confirm in the index module).
+        self.gitrepo = gr
+        self._db = db
+
+    def __len__(self):
+        """Return the number of rows in the ``changelog`` table."""
+        return int(
+            self._db.execute('SELECT COUNT(*) FROM changelog').fetchone()[0]
+        )
+
+    def rev(self, n):
+        """Return the revision number recorded for node ``n``.
+
+        The null node maps to -1 without touching the database.
+
+        Raises error.LookupError if ``n`` has no row in ``changelog``.
+        """
+        if n == nodemod.nullid:
+            return -1
+        t = self._db.execute(
+            'SELECT rev FROM changelog WHERE node = ?', (gitutil.togitnode(n),)
+        ).fetchone()
+        if t is None:
+            # The message is used verbatim (no arguments are supplied for
+            # it here), so it must not contain a format directive.
+            raise error.LookupError(n, b'00changelog.i', _(b'no node'))
+        return t[0]
+
+    def node(self, r):
+        """Return the binary node recorded for revision number ``r``.
+
+        The null revision maps to the null node without touching the
+        database.
+
+        Raises error.LookupError if ``r`` has no row in ``changelog``.
+        """
+        if r == nodemod.nullrev:
+            return nodemod.nullid
+        t = self._db.execute(
+            'SELECT node FROM changelog WHERE rev = ?', (r,)
+        ).fetchone()
+        if t is None:
+            raise error.LookupError(r, b'00changelog.i', _(b'no node'))
+        return nodemod.bin(t[0])
+
+    def hasnode(self, n):
+        """Return True if node ``n`` has a row in the ``changelog`` table."""
+        # Convert the node the same way rev() does so that both methods
+        # look the row up by the same key.
+        t = self._db.execute(
+            'SELECT node FROM changelog WHERE node = ?',
+            (gitutil.togitnode(n),),
+        ).fetchone()
+        return t is not None
+
+
+class baselogindex(object):
+    """Minimal revlog-index lookalike backed by a log object.
+
+    Everything is delegated to the wrapped log's rev(), node(),
+    parentrevs() and __len__().
+    """
+
+    def __init__(self, log):
+        self._log = log
+
+    def has_node(self, n):
+        """Return True if the wrapped log knows a real revision for ``n``.
+
+        Unknown nodes yield False. A node whose revision is -1 (the
+        wrapped log's answer for the null node) also yields False.
+        """
+        try:
+            return self._log.rev(n) != -1
+        except error.LookupError:
+            # rev() signals an unknown node by raising, not by returning
+            # -1, so translate that into the boolean answer.
+            return False
+
+    def __len__(self):
+        """Return the length of the wrapped log."""
+        return len(self._log)
+
+    def __getitem__(self, idx):
+        """Return a revlog-style index tuple for revision ``idx``.
+
+        Only the parent revisions and the node are real data; the other
+        slots are filled with ``idx`` (or -1 for the delta base).
+        """
+        p1rev, p2rev = self._log.parentrevs(idx)
+        # TODO: it's messy that the index leaks so far out of the
+        # storage layer that we have to implement things like reading
+        # this raw tuple, which exposes revlog internals.
+        return (
+            # Pretend offset is just the index, since we don't really care.
+            idx,
+            # Same with lengths
+            idx,  # length
+            idx,  # rawsize
+            -1,  # delta base
+            idx,  # linkrev TODO is this right?
+            p1rev,
+            p2rev,
+            self._log.node(idx),
+        )
+
+
+# TODO: an interface for the changelog type?
+class changelog(baselog):
+    """Changelog view over a git repository and its database index.
+
+    Revision numbers, heads and per-commit changed files are read from
+    the database (``self._db``); commit metadata and parent links are
+    read from the git repository object (``self.gitrepo``).
+    """
+
+    def __contains__(self, rev):
+        """Return True if node() can resolve revision number ``rev``."""
+        try:
+            self.node(rev)
+            return True
+        except error.LookupError:
+            return False
+
+    @property
+    def filteredrevs(self):
+        """Always an empty set: nothing is hidden."""
+        # TODO: we should probably add a refs/hg/ namespace for hidden
+        # heads etc, but that's an idea for later.
+        return set()
+
+    @property
+    def index(self):
+        """A fresh baselogindex wrapping this changelog."""
+        return baselogindex(self)
+
+    @property
+    def nodemap(self):
+        """A new dict mapping every binary node to its revision number.
+
+        Built from the whole ``changelog`` table on each access; the
+        null node is included and maps to the null revision.
+        """
+        r = {
+            nodemod.bin(v[0]): v[1]
+            for v in self._db.execute('SELECT node, rev FROM changelog')
+        }
+        r[nodemod.nullid] = nodemod.nullrev
+        return r
+
+    def tip(self):
+        """Return the node with the highest rev, or the null node if empty."""
+        t = self._db.execute(
+            'SELECT node FROM changelog ORDER BY rev DESC LIMIT 1'
+        ).fetchone()
+        if t:
+            return nodemod.bin(t[0])
+        return nodemod.nullid
+
+    def revs(self, start=0, stop=None):
+        """Iterate revision numbers in [start, stop], ascending.
+
+        ``stop`` defaults to the revision number of the tip.
+        """
+        if stop is None:
+            # tip() returns a node; the query compares revision numbers,
+            # so translate it first (-1 for an empty changelog).
+            stop = self.rev(self.tip())
+        t = self._db.execute(
+            'SELECT rev FROM changelog '
+            'WHERE rev >= ? AND rev <= ? '
+            'ORDER BY REV ASC',
+            (start, stop),
+        )
+        return (int(r[0]) for r in t)
+
+    def _partialmatch(self, id):
+        """Resolve the hex prefix ``id`` to a binary node.
+
+        Returns the single matching node (the null node takes part in
+        matching), or None when nothing matches.
+
+        Raises error.WdirUnsupported if ``id`` is a prefix of the working
+        directory pseudo-node, and error.AmbiguousPrefixLookupError if
+        more than one node matches.
+        """
+        if nodemod.wdirhex.startswith(id):
+            raise error.WdirUnsupported
+        candidates = [
+            nodemod.bin(x[0])
+            for x in self._db.execute(
+                # Bind the pattern the same way shortest() does.
+                'SELECT node FROM changelog WHERE node LIKE ?',
+                (pycompat.sysstr(id + b'%'),),
+            )
+        ]
+        if nodemod.nullhex.startswith(id):
+            candidates.append(nodemod.nullid)
+        if len(candidates) > 1:
+            raise error.AmbiguousPrefixLookupError(
+                id, b'00changelog.i', _(b'ambiguous identifier')
+            )
+        if candidates:
+            return candidates[0]
+        return None
+
+    def flags(self, rev):
+        """Return 0 for every revision."""
+        return 0
+
+    def shortest(self, node, minlength=1):
+        """Return the shortest hex prefix of ``node`` matching one row.
+
+        Prefixes are tried from ``minlength`` characters upward; the full
+        hex node is returned if no prefix matches exactly one row.
+        """
+        nodehex = nodemod.hex(node)
+        for attempt in pycompat.xrange(minlength, len(nodehex) + 1):
+            candidate = nodehex[:attempt]
+            matches = int(
+                self._db.execute(
+                    'SELECT COUNT(*) FROM changelog WHERE node LIKE ?',
+                    # Test the candidate prefix, not the full node:
+                    # the full node would trivially match a single row.
+                    (pycompat.sysstr(candidate + b'%'),),
+                ).fetchone()[0]
+            )
+            if matches == 1:
+                return candidate
+        return nodehex
+
+    def headrevs(self, revs=None):
+        """Return the sorted head revision numbers.
+
+        Heads are the changelog rows whose node also appears in the
+        ``heads`` table. If ``revs`` is non-empty, only the members of
+        ``revs`` that are heads are returned.
+        """
+        realheads = [
+            int(x[0])
+            for x in self._db.execute(
+                'SELECT rev FROM changelog '
+                'INNER JOIN heads ON changelog.node = heads.node'
+            )
+        ]
+        if revs:
+            # Set membership keeps the filter linear in len(revs).
+            headset = set(realheads)
+            return sorted([r for r in revs if r in headset])
+        return sorted(realheads)
+
+    def changelogrevision(self, nodeorrev):
+        """Build a changelog revision object for a node or revision number.
+
+        The null node yields an entry with only an empty ``extra``. Any
+        other commit gets its user, date and description from the git
+        commit and its file lists from the ``changedfiles`` table.
+        """
+        # Ensure we have a node id
+        if isinstance(nodeorrev, int):
+            n = self.node(nodeorrev)
+        else:
+            n = nodeorrev
+        # handle looking up nullid
+        if n == nodemod.nullid:
+            return hgchangelog._changelogrevision(extra={})
+        hn = gitutil.togitnode(n)
+        # We've got a real commit!
+        files = [
+            r[0]
+            for r in self._db.execute(
+                'SELECT filename FROM changedfiles '
+                'WHERE node = ? and filenode != ?',
+                (hn, gitutil.nullgit),
+            )
+        ]
+        # Removed files are the rows excluded above, so compare against
+        # the very same null marker.
+        filesremoved = [
+            r[0]
+            for r in self._db.execute(
+                'SELECT filename FROM changedfiles '
+                'WHERE node = ? and filenode = ?',
+                (hn, gitutil.nullgit),
+            )
+        ]
+        c = self.gitrepo[hn]
+        return hgchangelog._changelogrevision(
+            manifest=n,  # pretend manifest the same as the commit node
+            user=b'%s <%s>'
+            % (c.author.name.encode('utf8'), c.author.email.encode('utf8')),
+            # NOTE(review): presumably converts a minutes-east git offset
+            # to a seconds-west offset; add() applies the inverse. Confirm.
+            date=(c.author.time, -c.author.offset * 60),
+            files=files,
+            # TODO filesadded in the index
+            filesremoved=filesremoved,
+            description=c.message.encode('utf8'),
+            # TODO do we want to handle extra? how?
+            extra={b'branch': b'default'},
+        )
+
+    def ancestors(self, revs, stoprev=0, inclusive=False):
+        """Return a lazy ancestor collection for ``revs``.
+
+        Raises IndexError if any requested revision is beyond the tip.
+        """
+        revs = list(revs)
+        tip = self.rev(self.tip())
+        for r in revs:
+            if r > tip:
+                raise IndexError(b'Invalid rev %r' % r)
+        return ancestor.lazyancestors(
+            self.parentrevs, revs, stoprev=stoprev, inclusive=inclusive
+        )
+
+    # Cleanup opportunity: this is *identical* to the revlog.py version
+    def descendants(self, revs):
+        """Return the descendants of ``revs`` as computed by dagop."""
+        return dagop.descendantrevs(revs, self.revs, self.parentrevs)
+
+    def reachableroots(self, minroot, heads, roots, includepath=False):
+        """Delegate to the pure-Python reachable-roots implementation."""
+        return dagop._reachablerootspure(
+            self.parentrevs, minroot, roots, heads, includepath
+        )
+
+    # Cleanup opportunity: this is *identical* to the revlog.py version
+    def isancestor(self, a, b):
+        """Return True if node ``a`` is an ancestor of node ``b``."""
+        a, b = self.rev(a), self.rev(b)
+        return self.isancestorrev(a, b)
+
+    # Cleanup opportunity: this is *identical* to the revlog.py version
+    def isancestorrev(self, a, b):
+        """Return True if revision ``a`` is an ancestor of revision ``b``.
+
+        The null revision is an ancestor of everything and a revision is
+        its own ancestor; a higher revision number is never treated as an
+        ancestor of a lower one.
+        """
+        if a == nodemod.nullrev:
+            return True
+        elif a == b:
+            return True
+        elif a > b:
+            return False
+        return bool(self.reachableroots(a, [b], [a], includepath=False))
+
+    def parentrevs(self, rev):
+        """Return ``(p1rev, p2rev)`` for ``rev``; missing parents are nullrev.
+
+        Raises error.Abort for commits with more than two parents.
+        """
+        n = self.node(rev)
+        hn = gitutil.togitnode(n)
+        c = self.gitrepo[hn]
+        p1 = p2 = nodemod.nullrev
+        if c.parents:
+            p1 = self.rev(c.parents[0].id.raw)
+            if len(c.parents) > 2:
+                raise error.Abort(b'TODO octopus merge handling')
+            if len(c.parents) == 2:
+                # The second parent is parents[1]; reusing parents[0]
+                # would make every merge look like p1 == p2.
+                p2 = self.rev(c.parents[1].id.raw)
+        return p1, p2
+
+    # Private method is used at least by the tags code.
+    _uncheckedparentrevs = parentrevs
+
+    def commonancestorsheads(self, a, b):
+        """Return the nodes heading the common ancestors of nodes a and b."""
+        # TODO the revlog version of this has a C path, so we probably
+        # need to optimize this...
+        a, b = self.rev(a), self.rev(b)
+        return [
+            self.node(n)
+            for n in ancestor.commonancestorsheads(self.parentrevs, a, b)
+        ]
+
+    def branchinfo(self, rev):
+        """Git doesn't do named branches, so just put everything on default."""
+        return b'default', False
+
+    def delayupdate(self, tr):
+        """Do nothing; present for interface compatibility."""
+        # TODO: I think we can elide this because we're just dropping
+        # an object in the git repo?
+        pass
+
+    def add(
+        self,
+        manifest,
+        files,
+        desc,
+        transaction,
+        p1,
+        p2,
+        user,
+        date=None,
+        extra=None,
+        p1copies=None,
+        p2copies=None,
+        filesadded=None,
+        filesremoved=None,
+    ):
+        """Create a git commit and return its binary id.
+
+        ``manifest`` names the tree to commit, ``p1``/``p2`` the parents
+        (null or falsy parents are dropped), ``user`` supplies both the
+        author and committer signature, and ``date`` is a required
+        ``(timestamp, tz)`` pair. ``files``, ``transaction``, ``extra``,
+        the copy maps and the added/removed lists are accepted but not
+        used here.
+
+        After the commit is created, an internal ref is pointed at it
+        and the database index is rebuilt so the commit becomes visible.
+        """
+        parents = []
+        # Convert each parent only once it is known to be a real node, so
+        # a falsy p2 is never handed to the conversion helper.
+        if p1 != nodemod.nullid:
+            parents.append(gitutil.togitnode(p1))
+        if p2 and p2 != nodemod.nullid:
+            parents.append(gitutil.togitnode(p2))
+        assert date is not None
+        timestamp, tz = date
+        sig = pygit2.Signature(
+            encoding.unifromlocal(stringutil.person(user)),
+            encoding.unifromlocal(stringutil.email(user)),
+            timestamp,
+            -(tz // 60),
+        )
+        oid = self.gitrepo.create_commit(
+            None, sig, sig, desc, gitutil.togitnode(manifest), parents
+        )
+        # Set up an internal reference to force the commit into the
+        # changelog. Hypothetically, we could even use this refs/hg/
+        # namespace to allow for anonymous heads on git repos, which
+        # would be neat.
+        self.gitrepo.references.create(
+            'refs/hg/internal/latest-commit', oid, force=True
+        )
+        # Reindex now to pick up changes. We omit the progress
+        # callback because this will be very quick.
+        index._index_repo(self.gitrepo, self._db)
+        return oid.raw
+
+
+class manifestlog(baselog):
+    """Manifest access keyed by commit node, backed by git trees."""
+
+    def __getitem__(self, node):
+        """Return the root manifest context for ``node``."""
+        return self.get(b'', node)
+
+    def get(self, relpath, node):
+        """Return the manifest context for directory ``relpath`` at ``node``.
+
+        The null node yields an in-memory tree manifest context; any
+        other node yields a git tree manifest context for the tree found
+        by walking ``relpath`` one '/'-separated component at a time from
+        the commit's root tree.
+        """
+        if node == nodemod.nullid:
+            # TODO: this should almost certainly be a memgittreemanifestctx
+            return manifest.memtreemanifestctx(self, relpath)
+        tree = self.gitrepo[gitutil.togitnode(node)].tree
+        # An empty relpath means the root tree: nothing to descend into.
+        components = relpath.split(b'/') if relpath else ()
+        for component in components:
+            entry = tree[component]
+            tree = self.gitrepo[entry.id]
+        return gitmanifest.gittreemanifestctx(self.gitrepo, tree)
+
+
+@interfaceutil.implementer(repository.ifilestorage)
+class filelog(baselog):
+    """File storage for one path, backed by git blobs and the database.
+
+    Blob contents come from the git repository object; revision and
+    parent information comes from the ``changedfiles`` table joined with
+    ``changelog``.
+    """
+
+    def __init__(self, gr, db, path):
+        super(filelog, self).__init__(gr, db)
+        assert isinstance(path, bytes)
+        self.path = path
+
+    def read(self, node):
+        """Return the data for ``node``; the null node reads as empty."""
+        if node == nodemod.nullid:
+            return b''
+        return self.gitrepo[gitutil.togitnode(node)].data
+
+    def lookup(self, node):
+        """Resolve a 20-byte or 40-character node to a binary node.
+
+        Anything of another length is treated as a revision number,
+        which is not supported yet and trips an assertion.
+
+        Raises error.LookupError if the object is not in the repository.
+        """
+        if len(node) not in (20, 40):
+            node = int(node)
+        if isinstance(node, int):
+            assert False, b'todo revnums for nodes'
+        if len(node) == 40:
+            node = nodemod.bin(node)
+        hnode = gitutil.togitnode(node)
+        if hnode in self.gitrepo:
+            return node
+        # Same argument order as the changelog lookups: what was looked
+        # up first, then where, then the message.
+        raise error.LookupError(node, self.path, _(b'no match found'))
+
+    def cmp(self, node, text):
+        """Returns True if text is different than content at `node`."""
+        return self.read(node) != text
+
+    def add(self, text, meta, transaction, link, p1=None, p2=None):
+        """Store ``text`` as a git blob and return the blob's binary id.
+
+        ``meta`` must be empty; ``transaction``, ``link`` and the parents
+        are accepted but not used here.
+        """
+        assert not meta  # Should we even try to handle this?
+        return self.gitrepo.create_blob(text).raw
+
+    def __iter__(self):
+        """Yield the changelog revs in which this path has a non-null node."""
+        for clrev in self._db.execute(
+            '''
+SELECT rev FROM changelog
+INNER JOIN changedfiles ON changelog.node = changedfiles.node
+WHERE changedfiles.filename = ? AND changedfiles.filenode != ?
+        ''',
+            (pycompat.fsdecode(self.path), gitutil.nullgit),
+        ):
+            yield clrev[0]
+
+    def linkrev(self, fr):
+        """Return ``fr`` unchanged: file revs here are changelog revs."""
+        return fr
+
+    def rev(self, node):
+        """Return the changelog rev of a commit recording ``node`` for this path.
+
+        Raises error.LookupError if no such row exists.
+        """
+        row = self._db.execute(
+            '''
+SELECT rev FROM changelog
+INNER JOIN changedfiles ON changelog.node = changedfiles.node
+WHERE changedfiles.filename = ? AND changedfiles.filenode = ?''',
+            (pycompat.fsdecode(self.path), gitutil.togitnode(node)),
+        ).fetchone()
+        if row is None:
+            raise error.LookupError(node, self.path, _(b'no such node'))
+        return int(row[0])
+
+    def node(self, rev):
+        """Return the binary file node recorded for this path at ``rev``.
+
+        Raises IndexError if the commit at ``rev`` has no row for this
+        path.
+        """
+        maybe = self._db.execute(
+            '''SELECT filenode FROM changedfiles
+INNER JOIN changelog ON changelog.node = changedfiles.node
+WHERE changelog.rev = ? AND filename = ?
+''',
+            (rev, pycompat.fsdecode(self.path)),
+        ).fetchone()
+        if maybe is None:
+            raise IndexError('gitlog %r out of range %d' % (self.path, rev))
+        return nodemod.bin(maybe[0])
+
+    def parents(self, node):
+        """Return the two binary parent file nodes of ``node``.
+
+        If a stored parent is missing, the filelog data for the owning
+        commit is filled in first and the lookup is retried.
+
+        Raises error.LookupError if this path has no row for ``node``.
+        """
+        gn = gitutil.togitnode(node)
+        gp = pycompat.fsdecode(self.path)
+        row = self._db.execute(
+            '''SELECT p1filenode, p2filenode FROM changedfiles
+WHERE filenode = ? AND filename = ?
+''',
+            (gn, gp),
+        ).fetchone()
+        if row is None:
+            # Without this check, iterating the missing row would fail
+            # with an unrelated TypeError.
+            raise error.LookupError(node, self.path, _(b'no such node'))
+        ps = []
+        for p in row:
+            if p is None:
+                commit = self._db.execute(
+                    "SELECT node FROM changedfiles "
+                    "WHERE filenode = ? AND filename = ?",
+                    (gn, gp),
+                ).fetchone()[0]
+                # This filelog is missing some data. Build the
+                # filelog, then recurse (which will always find data).
+                if pycompat.ispy3:
+                    commit = commit.decode('ascii')
+                index.fill_in_filelog(self.gitrepo, self._db, commit, gp, gn)
+                return self.parents(node)
+            else:
+                ps.append(nodemod.bin(p))
+        return ps
+
+    def renamed(self, node):
+        """Return False: renames and copies are not tracked yet."""
+        # TODO: renames/copies
+        return False