diff mercurial/wireprotov2server.py @ 40178:46a40bce3ae0

wireprotov2: define and implement "filesdata" command

Previously, the only way to access file revision data was the
"filedata" command. That command is useful to have, but it only
allows resolving revision data for a single file. This meant that
clients needed to send one command for each tracked path they were
seeking data on. Furthermore, those commands would need to enumerate
the exact file nodes they wanted data for.

This approach meant that clients were sending a lot of data to
remotes in order to request file data. For example, if there were 1M
file revisions, we'd need at least 20,000,000 bytes just to encode
the file nodes (each node is a 20-byte binary hash)! Many clients on
the internet don't have that kind of upload capacity.

In order to limit the amount of data that clients must send, we'll
need more efficient ways to request repository data. This commit
defines and implements a new "filesdata" command. This command
allows the retrieval of data for multiple files by specifying
changeset revisions and optional file patterns. The command figures
out which file revisions are "relevant" and sends them in bulk.

The logic for choosing which file revisions to send when haveparents
is not set is overly simple and will over-send files. We will need
more smarts here eventually. (Specifically, the client will need to
tell the server which revisions it already knows about.) This work is
deferred until a later time.

Differential Revision: https://phab.mercurial-scm.org/D4981
author Gregory Szorc <gregory.szorc@gmail.com>
date Wed, 03 Oct 2018 12:54:39 -0700
parents 41e2633bcd00
children ed55a0077490
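
For orientation before the diff, here is a minimal sketch of the arguments a
client might send to the new "filesdata" command and the rough shape of the
response stream the server-side generator emits. The argument names and the
response maps are taken from the command definition in the diff below; the
concrete values (paths, nodes, counts) are placeholders chosen for
illustration and are not part of this change.

    # Hypothetical "filesdata" request arguments, mirroring the argument
    # schema registered via @wireprotocommand in the diff below. Node
    # values are raw 20-byte binary hashes in the real protocol; a dummy
    # value is used here.
    requestargs = {
        b'haveparents': True,
        b'fields': {b'parents', b'revision'},
        b'pathfilter': {
            b'include': [b'path:mercurial'],
        },
        b'revisions': [
            {
                b'type': b'changesetexplicit',
                b'nodes': [b'\x00' * 20],  # placeholder changeset node
            },
        ],
    }

    # The filesdata() generator below yields a stream of maps that the
    # wire protocol framing layer encodes for the client, roughly:
    #
    #   {b'totalpaths': 2, b'totalitems': 5}      # overall summary, emitted first
    #   {b'path': b'a.txt', b'totalitems': 3}     # per-path header
    #   ...one map per file revision from emitfilerevisions()...
    #   {b'path': b'dir/b.txt', b'totalitems': 2}
    #   ...more file revision maps...

When haveparents is set, the server restricts itself to file revisions whose
linkrev falls within the requested changesets; otherwise it walks each
requested changeset's manifest and sends every matching file node, which is
the over-sending behavior the commit message calls out.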
--- a/mercurial/wireprotov2server.py	Tue Oct 02 10:31:36 2018 -0700
+++ b/mercurial/wireprotov2server.py	Wed Oct 03 12:54:39 2018 -0700
@@ -6,6 +6,7 @@
 
 from __future__ import absolute_import
 
+import collections
 import contextlib
 import hashlib
 
@@ -18,6 +19,7 @@
     discovery,
     encoding,
     error,
+    match as matchmod,
     narrowspec,
     pycompat,
     wireprotoframing,
@@ -1004,6 +1006,29 @@
         for extra in followingdata:
             yield extra
 
+def makefilematcher(repo, pathfilter):
+    """Construct a matcher from a path filter dict."""
+
+    # Validate values.
+    if pathfilter:
+        for key in (b'include', b'exclude'):
+            for pattern in pathfilter.get(key, []):
+                if not pattern.startswith((b'path:', b'rootfilesin:')):
+                    raise error.WireprotoCommandError(
+                        '%s pattern must begin with `path:` or `rootfilesin:`; '
+                        'got %s', (key, pattern))
+
+    if pathfilter:
+        matcher = matchmod.match(repo.root, b'',
+                                 include=pathfilter.get(b'include', []),
+                                 exclude=pathfilter.get(b'exclude', []))
+    else:
+        matcher = matchmod.match(repo.root, b'')
+
+    # Requested patterns could include files not in the local store. So
+    # filter those out.
+    return matchmod.intersectmatchers(repo.narrowmatch(), matcher)
+
 @wireprotocommand(
     'filedata',
     args={
@@ -1033,6 +1058,10 @@
     # the cache key.
     cachekeyfn=makecommandcachekeyfn('filedata', 1, allargs=True))
 def filedata(repo, proto, haveparents, nodes, fields, path):
+    # TODO this API allows access to file revisions that are attached to
+    # secret changesets. filesdata does not have this problem. Maybe this
+    # API should be deleted?
+
     try:
         # Extensions may wish to access the protocol handler.
         store = getfilestore(repo, proto, path)
@@ -1058,6 +1087,117 @@
     for o in emitfilerevisions(revisions, fields):
         yield o
 
+def filesdatacapabilities(repo, proto):
+    batchsize = repo.ui.configint(
+        b'experimental', b'server.filesdata.recommended-batch-size')
+    return {
+        b'recommendedbatchsize': batchsize,
+    }
+
+@wireprotocommand(
+    'filesdata',
+    args={
+        'haveparents': {
+            'type': 'bool',
+            'default': lambda: False,
+            'example': True,
+        },
+        'fields': {
+            'type': 'set',
+            'default': set,
+            'example': {b'parents', b'revision'},
+            'validvalues': {b'firstchangeset', b'parents', b'revision'},
+        },
+        'pathfilter': {
+            'type': 'dict',
+            'default': lambda: None,
+            'example': {b'include': [b'path:tests']},
+        },
+        'revisions': {
+            'type': 'list',
+            'example': [{
+                b'type': b'changesetexplicit',
+                b'nodes': [b'abcdef...'],
+            }],
+        },
+    },
+    permission='pull',
+    # TODO censoring a file revision won't invalidate the cache.
+    # Figure out a way to take censoring into account when deriving
+    # the cache key.
+    cachekeyfn=makecommandcachekeyfn('filesdata', 1, allargs=True),
+    extracapabilitiesfn=filesdatacapabilities)
+def filesdata(repo, proto, haveparents, fields, pathfilter, revisions):
+    # TODO This should operate on a repo that exposes obsolete changesets. There
+    # is a race between a client making a push that obsoletes a changeset and
+    # another client fetching files data for that changeset. If a client has a
+    # changeset, it should probably be allowed to access files data for that
+    # changeset.
+
+    cl = repo.changelog
+    outgoing = resolvenodes(repo, revisions)
+    filematcher = makefilematcher(repo, pathfilter)
+
+    # Figure out what needs to be emitted.
+    changedpaths = set()
+    fnodes = collections.defaultdict(set)
+
+    for node in outgoing:
+        ctx = repo[node]
+        changedpaths.update(ctx.files())
+
+    changedpaths = sorted(p for p in changedpaths if filematcher(p))
+
+    # If ancestors are known, we send file revisions having a linkrev in the
+    # outgoing set of changeset revisions.
+    if haveparents:
+        outgoingclrevs = set(cl.rev(n) for n in outgoing)
+
+        for path in changedpaths:
+            try:
+                store = getfilestore(repo, proto, path)
+            except FileAccessError as e:
+                raise error.WireprotoCommandError(e.msg, e.args)
+
+            for rev in store:
+                linkrev = store.linkrev(rev)
+
+                if linkrev in outgoingclrevs:
+                    fnodes[path].add(store.node(rev))
+
+    # If ancestors aren't known, we walk the manifests and send all
+    # encountered file revisions.
+    else:
+        for node in outgoing:
+            mctx = repo[node].manifestctx()
+
+            for path, fnode in mctx.read().items():
+                if filematcher(path):
+                    fnodes[path].add(fnode)
+
+    yield {
+        b'totalpaths': len(fnodes),
+        b'totalitems': sum(len(v) for v in fnodes.values())
+    }
+
+    for path, filenodes in sorted(fnodes.items()):
+        try:
+            store = getfilestore(repo, proto, path)
+        except FileAccessError as e:
+            raise error.WireprotoCommandError(e.msg, e.args)
+
+        yield {
+            b'path': path,
+            b'totalitems': len(filenodes),
+        }
+
+        revisions = store.emitrevisions(filenodes,
+                                        revisiondata=b'revision' in fields,
+                                        assumehaveparentrevisions=haveparents)
+
+        for o in emitfilerevisions(revisions, fields):
+            yield o
+
 @wireprotocommand(
     'heads',
     args={