hgext/git/index.py
changeset 44477 ad718271a9eb
child 44484 ec54b3d2af0b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/git/index.py	Tue Feb 11 00:44:59 2020 -0500
@@ -0,0 +1,346 @@
+from __future__ import absolute_import
+
+import collections
+import os
+import sqlite3
+
+import pygit2
+
+from mercurial.i18n import _
+
+from mercurial import (
+    encoding,
+    error,
+    node as nodemod,
+    pycompat,
+)
+
+from . import gitutil
+
+
+_CURRENT_SCHEMA_VERSION = 1
+_SCHEMA = (
+    """
+CREATE TABLE refs (
+  -- node and name are unique together. There may be more than one name for
+  -- a given node, and there may be no name at all for a given node (in the
+  -- case of an anonymous hg head).
+  node TEXT NOT NULL,
+  name TEXT
+);
+
+-- The "possible heads" of the repository, which we use to figure out
+-- if we need to re-walk the changelog.
+CREATE TABLE possible_heads (
+  node TEXT NOT NULL
+);
+
+-- The topological heads of the changelog, which hg depends on.
+CREATE TABLE heads (
+  node TEXT NOT NULL
+);
+
+-- A total ordering of the changelog
+CREATE TABLE changelog (
+  rev INTEGER NOT NULL PRIMARY KEY,
+  node TEXT NOT NULL,
+  p1 TEXT,
+  p2 TEXT
+);
+
+CREATE UNIQUE INDEX changelog_node_idx ON changelog(node);
+CREATE UNIQUE INDEX changelog_node_rev_idx ON changelog(rev, node);
+
+-- Changed files for each commit, which lets us dynamically build
+-- filelogs.
+CREATE TABLE changedfiles (
+  node TEXT NOT NULL,
+  filename TEXT NOT NULL,
+  -- 40 zeroes for deletions
+  filenode TEXT NOT NULL,
+  -- to handle filelog parentage:
+  p1node TEXT,
+  p1filenode TEXT,
+  p2node TEXT,
+  p2filenode TEXT
+);
+
+CREATE INDEX changedfiles_nodes_idx
+  ON changedfiles(node);
+
+PRAGMA user_version=%d
+"""
+    % _CURRENT_SCHEMA_VERSION
+)
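+
+# Illustrative example (not executed here; ``somenode`` is a placeholder for a
+# hexlified commit id): given a connection ``db`` as returned by _createdb()
+# below, the changelog table answers parent queries roughly like
+#
+#   rev, p1, p2 = db.execute(
+#       'SELECT rev, p1, p2 FROM changelog WHERE node = ?', (somenode,)
+#   ).fetchone()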
+
+
+def _createdb(path):
+    db = sqlite3.connect(encoding.strfromlocal(path))
+    db.text_factory = bytes
+
+    res = db.execute('PRAGMA user_version').fetchone()[0]
+
+    # New database.
+    if res == 0:
+        for statement in _SCHEMA.split(';'):
+            db.execute(statement.strip())
+
+        db.commit()
+
+    elif res == _CURRENT_SCHEMA_VERSION:
+        pass
+
+    else:
+        raise error.Abort(_(b'sqlite database has unrecognized version'))
+
+    db.execute('PRAGMA journal_mode=WAL')
+
+    return db
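+
+# Note on the version check above: PRAGMA user_version is an integer stored in
+# the sqlite file header and defaults to 0, so a brand-new cache triggers
+# schema creation while an existing one is reused as long as its recorded
+# version matches _CURRENT_SCHEMA_VERSION. For example, right after creation
+# db.execute('PRAGMA user_version').fetchone() returns (1,).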
+
+
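+# With GIT_SORT_REVERSE the walk yields commits oldest-first, so a commit's
+# parents are always visited (and therefore numbered) before the commit
+# itself; the revision numbering in _index_repo() below relies on that.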
+_OUR_ORDER = (
+    pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_REVERSE
+)
+
+_DIFF_FLAGS = 1 << 21  # GIT_DIFF_FORCE_BINARY, which isn't exposed by pygit2
+
+
+def _find_nearest_ancestor_introducing_node(
+    db, gitrepo, file_path, walk_start, filenode
+):
+    """Find the nearest ancestor that introduces a file node.
+
+    Args:
+      db: a handle to our sqlite database.
+      gitrepo: A pygit2.Repository instance.
+      file_path: the path of a file in the repo
+      walk_start: a pygit2.Oid that is a commit where we should start walking
+                  for our nearest ancestor.
+      filenode: the hexlified node id of the file we are looking for.
+
+    Returns:
+      The hexlified commit ID of the nearest ancestor that introduced the
+      given file node.
+    """
+    assert isinstance(file_path, str), 'file_path must be str, got %r' % type(
+        file_path
+    )
+    assert isinstance(filenode, str), 'filenode must be str, got %r' % type(
+        filenode
+    )
+    parent_options = {
+        row[0].decode('ascii')
+        for row in db.execute(
+            'SELECT node FROM changedfiles '
+            'WHERE filename = ? AND filenode = ?',
+            (file_path, filenode),
+        )
+    }
+    inner_walker = gitrepo.walk(walk_start, _OUR_ORDER)
+    for w in inner_walker:
+        if w.id.hex in parent_options:
+            return w.id.hex
+    raise error.ProgrammingError(
+        'Unable to find introducing commit for %s node %s from %s'
+        % (file_path, filenode, walk_start)
+    )
+
+
+def fill_in_filelog(gitrepo, db, startcommit, path, startfilenode):
+    """Given a starting commit and path, fill in a filelog's parent pointers.
+
+    Args:
+      gitrepo: a pygit2.Repository
+      db: a handle to our sqlite database
+      startcommit: a hexlified node id for the commit to start at
+      path: the path of the file whose parent pointers we should fill in.
+      startfilenode: the hexlified node id of the file at startcommit
+
+    TODO: make startfilenode optional
+    """
+    assert isinstance(
+        startcommit, str
+    ), 'startcommit must be str, got %r' % type(startcommit)
+    assert isinstance(
+        startfilenode, str
+    ), 'startfilenode must be str, got %r' % type(startfilenode)
+    visit = collections.deque([(startcommit, startfilenode)])
+    while visit:
+        cnode, filenode = visit.popleft()
+        commit = gitrepo[cnode]
+        parents = []
+        for parent in commit.parents:
+            t = parent.tree
+            for comp in path.split('/'):
+                try:
+                    t = gitrepo[t[comp].id]
+                except KeyError:
+                    break
+            else:
+                introducer = _find_nearest_ancestor_introducing_node(
+                    db, gitrepo, path, parent.id, t.id.hex
+                )
+                parents.append((introducer, t.id.hex))
+        p1node = p1fnode = p2node = p2fnode = gitutil.nullgit
+        for par, parfnode in parents:
+            found = int(
+                db.execute(
+                    'SELECT COUNT(*) FROM changedfiles WHERE '
+                    'node = ? AND filename = ? AND filenode = ? AND '
+                    'p1node NOT NULL',
+                    (par, path, parfnode),
+                ).fetchone()[0]
+            )
+            if found == 0:
+                assert par is not None
+                visit.append((par, parfnode))
+        if parents:
+            p1node, p1fnode = parents[0]
+        if len(parents) == 2:
+            p2node, p2fnode = parents[1]
+        if len(parents) > 2:
+            raise error.ProgrammingError(
+                b"git support can't handle octopus merges"
+            )
+        db.execute(
+            'UPDATE changedfiles SET '
+            'p1node = ?, p1filenode = ?, p2node = ?, p2filenode = ? '
+            'WHERE node = ? AND filename = ? AND filenode = ?',
+            (p1node, p1fnode, p2node, p2fnode, commit.id.hex, path, filenode),
+        )
+    db.commit()
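+
+# A minimal usage sketch (hypothetical caller; the names are placeholders):
+# _index_repo() below leaves the parent columns as NULLs, and they are meant
+# to be demand-computed later with something like
+#
+#   fill_in_filelog(gitrepo, db, hexcommit, 'path/to/file', hexfilenode)
+#
+# where hexcommit and hexfilenode are hexlified str ids, as asserted above.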
+
+
+def _index_repo(gitrepo, db, progress_factory=lambda *args, **kwargs: None):
+    # Identify all references so we can tell the walker to visit all of them.
+    all_refs = gitrepo.listall_references()
+    possible_heads = set()
+    prog = progress_factory(b'refs')
+    for pos, ref in enumerate(all_refs):
+        if prog is not None:
+            prog.update(pos)
+        if not (
+            ref.startswith('refs/heads/')  # local branch
+            or ref.startswith('refs/tags/')  # tag
+            or ref.startswith('refs/remotes/')  # remote branch
+            or ref.startswith('refs/hg/')  # from this extension
+        ):
+            continue
+        try:
+            start = gitrepo.lookup_reference(ref).peel(pygit2.GIT_OBJ_COMMIT)
+        except ValueError:
+            # No commit to be found, so we don't care for hg's purposes.
+            continue
+        possible_heads.add(start.id)
+    # Optimization: if the list of heads hasn't changed, don't
+    # reindex the changelog. This doesn't matter on small
+    # repositories, but on even moderately deep histories (e.g. cpython)
+    # this is a very important performance win.
+    #
+    # TODO: we should figure out how to incrementally index history
+    # (preferably by detecting rewinds!) so that we don't have to do a
+    # full changelog walk every time a new commit is created.
+    cache_heads = {
+        x[0].decode('ascii')
+        for x in db.execute('SELECT node FROM possible_heads')
+    }
+    walker = None
+    cur_cache_heads = {h.hex for h in possible_heads}
+    if cur_cache_heads == cache_heads:
+        return
+    for start in possible_heads:
+        if walker is None:
+            walker = gitrepo.walk(start, _OUR_ORDER)
+        else:
+            walker.push(start)
+
+    # Empty out the existing changelog. Even for large-ish histories
+    # we can do the top-level "walk all the commits" dance very
+    # quickly as long as we don't need to figure out the changed files
+    # list.
+    db.execute('DELETE FROM changelog')
+    if prog is not None:
+        prog.complete()
+    prog = progress_factory(b'commits')
+    # This walker is sure to visit all the revisions in history, but
+    # only once.
+    for pos, commit in enumerate(walker):
+        if prog is not None:
+            prog.update(pos)
+        p1 = p2 = nodemod.nullhex
+        if len(commit.parents) > 2:
+            raise error.ProgrammingError(
+                (
+                    b"git support can't handle octopus merges, "
+                    b"found a commit with %d parents :("
+                )
+                % len(commit.parents)
+            )
+        if commit.parents:
+            p1 = commit.parents[0].id.hex
+        if len(commit.parents) == 2:
+            p2 = commit.parents[1].id.hex
+        db.execute(
+            'INSERT INTO changelog (rev, node, p1, p2) VALUES(?, ?, ?, ?)',
+            (pos, commit.id.hex, p1, p2),
+        )
+
+        num_changedfiles = db.execute(
+            "SELECT COUNT(*) from changedfiles WHERE node = ?",
+            (commit.id.hex,),
+        ).fetchone()[0]
+        if not num_changedfiles:
+            # I *think* we only need to check p1 for changed files
+            # (and therefore linkrevs), because any node that would
+            # actually have this commit as a linkrev would be
+            # completely new in this rev.
+            p1 = commit.parents[0].id.hex if commit.parents else None
+            if p1 is not None:
+                patchgen = gitrepo.diff(p1, commit.id.hex, flags=_DIFF_FLAGS)
+            else:
+                patchgen = commit.tree.diff_to_tree(
+                    swap=True, flags=_DIFF_FLAGS
+                )
+            new_files = (p.delta.new_file for p in patchgen)
+            files = {
+                nf.path: nf.id.hex
+                for nf in new_files
+                if nf.id.raw != nodemod.nullid
+            }
+            for p, n in files.items():
+                # We intentionally set NULLs for any file parentage
+                # information so it'll get demand-computed later. We
+                # used to do it right here, and it was _very_ slow.
+                db.execute(
+                    'INSERT INTO changedfiles ('
+                    'node, filename, filenode, p1node, p1filenode, p2node, '
+                    'p2filenode) VALUES(?, ?, ?, ?, ?, ?, ?)',
+                    (commit.id.hex, p, n, None, None, None, None),
+                )
+    db.execute('DELETE FROM heads')
+    db.execute('DELETE FROM possible_heads')
+    for hid in possible_heads:
+        h = hid.hex
+        db.execute('INSERT INTO possible_heads (node) VALUES(?)', (h,))
+        haschild = db.execute(
+            'SELECT COUNT(*) FROM changelog WHERE p1 = ? OR p2 = ?', (h, h)
+        ).fetchone()[0]
+        if not haschild:
+            db.execute('INSERT INTO heads (node) VALUES(?)', (h,))
+
+    db.commit()
+    if prog is not None:
+        prog.complete()
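+
+# Illustrative sketch of a compatible progress_factory (assumes a Mercurial
+# ``ui`` object is available to the caller):
+#
+#   def progress_factory(topic):
+#       return ui.makeprogress(topic)
+#
+# _index_repo() only calls .update(pos) and .complete() on the returned object
+# (and tolerates None), so any object with those two methods will do.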
+
+
+def get_index(gitrepo, progress_factory=lambda *args, **kwargs: None):
+    cachepath = os.path.join(
+        pycompat.fsencode(gitrepo.path), b'..', b'.hg', b'cache'
+    )
+    if not os.path.exists(cachepath):
+        os.makedirs(cachepath)
+    dbpath = os.path.join(cachepath, b'git-commits.sqlite')
+    db = _createdb(dbpath)
+    # TODO check against gitrepo heads before doing a full index
+    # TODO thread a ui.progress call into this layer
+    _index_repo(gitrepo, db, progress_factory)
+    return db
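+
+# Minimal usage sketch (the repository path is a placeholder): open the git
+# repository that lives alongside the .hg directory and build or refresh the
+# cached index.
+#
+#   import pygit2
+#   gitrepo = pygit2.Repository('/path/to/repo/.git')
+#   db = get_index(gitrepo)
+#   heads = [row[0] for row in db.execute('SELECT node FROM heads')]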