view tests/test-ancestor.py @ 40022:33eb670e2834

wireprotov2: define semantics for content redirects When I implemented the clonebundles feature and deployed it on hg.mozilla.org using Amazon S3 as a content server, server-side CPU and bandwidth usage dropped off a cliff and a ton of server scaling headaches went away pretty much the instant clients with support for clonebundles were rolled out to Firefox CI. An obvious takeaway from that experience was that offloading server load to scalable file servers - potentially backed by a CDN - is a really good idea. Another takeaway was that Mercurial's wire protocol wasn't in a good position to support data offload generally. In wire protocol version 1, there isn't a mechanism in the protocol to say "grab the data from over here instead." For HTTP, we could teach the client to follow HTTP redirects. Or we could invent a media type that encoded redirects inline. But for SSH, we were pretty much out of luck because that protocol wasn't very flexible. Wire protocol version 2 offers the opportunity to do something better. The recent generic server-side content caching layer in the wire protocol version 2 server demonstrated that it is possible to have drop-in caching of responses to command requests. This by itself adds tons of value and already makes the built-in server much more scalable. But I don't want to stop there. The existing server-side caching implementation has a big weakness: it requires the server to send data to the client. This means that the Mercurial server is potentially sending gigabytes of data to thousands of clients. This is problematic because compared to scaling static file servers, scaling dynamic servers is *hard*. A solution to this is to "offload" serving of content to something that isn't the Mercurial server. By offloading content serving, you turn the Mercurial server from a centralized monolithic service to a distributed mostly-indexing service. Assuming high rates of content offload, this should drastically reduce the total work performed by the Mercurial server, both in terms of CPU and data transfer. This will make Mercurial servers vastly easier to scale. This commit defines the semantics for "content redirects" in wire protocol version 2. Essentially: * Servers advertise the set of locations a response could be served from. * When making requests, clients advertise the set of locations they are willing to fetch content from. * Servers can then replace the inline response with one that says "get the response from over here instead." This feature - when fully implemented - will allow extending the server-side caching layer to facilitate such things as integrating your server-side cache with a scalable blob store (such as S3 or a CDN) and offloading most data transfer to that external service. This feature could also be leveraged for load balancing. e.g. requests could come into a central server and then get redirected to an available mirror depending on server availability or locality. There's tons of potential :) Differential Revision: https://phab.mercurial-scm.org/D4774
author Gregory Szorc <gregory.szorc@gmail.com>
date Wed, 26 Sep 2018 18:02:06 -0700
parents bdb177923291
children d097dd0afc19
line wrap: on
line source

from __future__ import absolute_import, print_function

import binascii
import getopt
import math
import os
import random
import sys
import time

from mercurial.node import nullrev
from mercurial import (
    ancestor,
    debugcommands,
    hg,
    pycompat,
    ui as uimod,
    util,
)

if pycompat.ispy3:
    long = int
    xrange = range

def buildgraph(rng, nodes=100, rootprob=0.05, mergeprob=0.2, prevprob=0.7):
    '''nodes: total number of nodes in the graph
    rootprob: probability that a new node (not 0) will be a root
    mergeprob: probability that, excluding a root a node will be a merge
    prevprob: probability that p1 will be the previous node

    return value is a graph represented as an adjacency list.
    '''
    graph = [None] * nodes
    for i in xrange(nodes):
        if i == 0 or rng.random() < rootprob:
            graph[i] = [nullrev]
        elif i == 1:
            graph[i] = [0]
        elif rng.random() < mergeprob:
            if i == 2 or rng.random() < prevprob:
                # p1 is prev
                p1 = i - 1
            else:
                p1 = rng.randrange(i - 1)
            p2 = rng.choice(list(range(0, p1)) + list(range(p1 + 1, i)))
            graph[i] = [p1, p2]
        elif rng.random() < prevprob:
            graph[i] = [i - 1]
        else:
            graph[i] = [rng.randrange(i - 1)]

    return graph

def buildancestorsets(graph):
    ancs = [None] * len(graph)
    for i in xrange(len(graph)):
        ancs[i] = {i}
        if graph[i] == [nullrev]:
            continue
        for p in graph[i]:
            ancs[i].update(ancs[p])
    return ancs

class naiveincrementalmissingancestors(object):
    def __init__(self, ancs, bases):
        self.ancs = ancs
        self.bases = set(bases)
    def addbases(self, newbases):
        self.bases.update(newbases)
    def removeancestorsfrom(self, revs):
        for base in self.bases:
            if base != nullrev:
                revs.difference_update(self.ancs[base])
        revs.discard(nullrev)
    def missingancestors(self, revs):
        res = set()
        for rev in revs:
            if rev != nullrev:
                res.update(self.ancs[rev])
        for base in self.bases:
            if base != nullrev:
                res.difference_update(self.ancs[base])
        return sorted(res)

def test_missingancestors(seed, rng):
    # empirically observed to take around 1 second
    graphcount = 100
    testcount = 10
    inccount = 10
    nerrs = [0]
    # the default mu and sigma give us a nice distribution of mostly
    # single-digit counts (including 0) with some higher ones
    def lognormrandom(mu, sigma):
        return int(math.floor(rng.lognormvariate(mu, sigma)))

    def samplerevs(nodes, mu=1.1, sigma=0.8):
        count = min(lognormrandom(mu, sigma), len(nodes))
        return rng.sample(nodes, count)

    def err(seed, graph, bases, seq, output, expected):
        if nerrs[0] == 0:
            print('seed:', hex(seed)[:-1], file=sys.stderr)
        if gerrs[0] == 0:
            print('graph:', graph, file=sys.stderr)
        print('* bases:', bases, file=sys.stderr)
        print('* seq: ', seq, file=sys.stderr)
        print('*  output:  ', output, file=sys.stderr)
        print('*  expected:', expected, file=sys.stderr)
        nerrs[0] += 1
        gerrs[0] += 1

    for g in xrange(graphcount):
        graph = buildgraph(rng)
        ancs = buildancestorsets(graph)
        gerrs = [0]
        for _ in xrange(testcount):
            # start from nullrev to include it as a possibility
            graphnodes = range(nullrev, len(graph))
            bases = samplerevs(graphnodes)

            # fast algorithm
            inc = ancestor.incrementalmissingancestors(graph.__getitem__, bases)
            # reference slow algorithm
            naiveinc = naiveincrementalmissingancestors(ancs, bases)
            seq = []
            revs = []
            for _ in xrange(inccount):
                if rng.random() < 0.2:
                    newbases = samplerevs(graphnodes)
                    seq.append(('addbases', newbases))
                    inc.addbases(newbases)
                    naiveinc.addbases(newbases)
                if rng.random() < 0.4:
                    # larger set so that there are more revs to remove from
                    revs = samplerevs(graphnodes, mu=1.5)
                    seq.append(('removeancestorsfrom', revs))
                    hrevs = set(revs)
                    rrevs = set(revs)
                    inc.removeancestorsfrom(hrevs)
                    naiveinc.removeancestorsfrom(rrevs)
                    if hrevs != rrevs:
                        err(seed, graph, bases, seq, sorted(hrevs),
                            sorted(rrevs))
                else:
                    revs = samplerevs(graphnodes)
                    seq.append(('missingancestors', revs))
                    h = inc.missingancestors(revs)
                    r = naiveinc.missingancestors(revs)
                    if h != r:
                        err(seed, graph, bases, seq, h, r)

# graph is a dict of child->parent adjacency lists for this graph:
# o  13
# |
# | o  12
# | |
# | | o    11
# | | |\
# | | | | o  10
# | | | | |
# | o---+ |  9
# | | | | |
# o | | | |  8
#  / / / /
# | | o |  7
# | | | |
# o---+ |  6
#  / / /
# | | o  5
# | |/
# | o  4
# | |
# o |  3
# | |
# | o  2
# |/
# o  1
# |
# o  0

graph = {0: [-1, -1], 1: [0, -1], 2: [1, -1], 3: [1, -1], 4: [2, -1],
         5: [4, -1], 6: [4, -1], 7: [4, -1], 8: [-1, -1], 9: [6, 7],
         10: [5, -1], 11: [3, 7], 12: [9, -1], 13: [8, -1]}

def genlazyancestors(revs, stoprev=0, inclusive=False):
    print(("%% lazy ancestor set for %s, stoprev = %s, inclusive = %s" %
           (revs, stoprev, inclusive)))
    return ancestor.lazyancestors(graph.get, revs, stoprev=stoprev,
                                  inclusive=inclusive)

def printlazyancestors(s, l):
    print('membership: %r' % [n for n in l if n in s])
    print('iteration:  %r' % list(s))

def test_lazyancestors():
    # Empty revs
    s = genlazyancestors([])
    printlazyancestors(s, [3, 0, -1])

    # Standard example
    s = genlazyancestors([11, 13])
    printlazyancestors(s, [11, 13, 7, 9, 8, 3, 6, 4, 1, -1, 0])

    # Standard with ancestry in the initial set (1 is ancestor of 3)
    s = genlazyancestors([1, 3])
    printlazyancestors(s, [1, -1, 0])

    # Including revs
    s = genlazyancestors([11, 13], inclusive=True)
    printlazyancestors(s, [11, 13, 7, 9, 8, 3, 6, 4, 1, -1, 0])

    # Test with stoprev
    s = genlazyancestors([11, 13], stoprev=6)
    printlazyancestors(s, [11, 13, 7, 9, 8, 3, 6, 4, 1, -1, 0])
    s = genlazyancestors([11, 13], stoprev=6, inclusive=True)
    printlazyancestors(s, [11, 13, 7, 9, 8, 3, 6, 4, 1, -1, 0])

    # Test with stoprev >= min(initrevs)
    s = genlazyancestors([11, 13], stoprev=11, inclusive=True)
    printlazyancestors(s, [11, 13, 7, 9, 8, 3, 6, 4, 1, -1, 0])
    s = genlazyancestors([11, 13], stoprev=12, inclusive=True)
    printlazyancestors(s, [11, 13, 7, 9, 8, 3, 6, 4, 1, -1, 0])

    # Contiguous chains: 5->4, 2->1 (where 1 is in seen set), 1->0
    s = genlazyancestors([10, 1], inclusive=True)
    printlazyancestors(s, [2, 10, 4, 5, -1, 0, 1])

# The C gca algorithm requires a real repo. These are textual descriptions of
# DAGs that have been known to be problematic, and, optionally, known pairs
# of revisions and their expected ancestor list.
dagtests = [
    (b'+2*2*2/*3/2', {}),
    (b'+3*3/*2*2/*4*4/*4/2*4/2*2', {}),
    (b'+2*2*/2*4*/4*/3*2/4', {(6, 7): [3, 5]}),
]
def test_gca():
    u = uimod.ui.load()
    for i, (dag, tests) in enumerate(dagtests):
        repo = hg.repository(u, b'gca%d' % i, create=1)
        cl = repo.changelog
        if not util.safehasattr(cl.index, 'ancestors'):
            # C version not available
            return

        debugcommands.debugbuilddag(u, repo, dag)
        # Compare the results of the Python and C versions. This does not
        # include choosing a winner when more than one gca exists -- we make
        # sure both return exactly the same set of gcas.
        # Also compare against expected results, if available.
        for a in cl:
            for b in cl:
                cgcas = sorted(cl.index.ancestors(a, b))
                pygcas = sorted(ancestor.ancestors(cl.parentrevs, a, b))
                expected = None
                if (a, b) in tests:
                    expected = tests[(a, b)]
                if cgcas != pygcas or (expected and cgcas != expected):
                    print("test_gca: for dag %s, gcas for %d, %d:"
                          % (dag, a, b))
                    print("  C returned:      %s" % cgcas)
                    print("  Python returned: %s" % pygcas)
                    if expected:
                        print("  expected:        %s" % expected)

def main():
    seed = None
    opts, args = getopt.getopt(sys.argv[1:], 's:', ['seed='])
    for o, a in opts:
        if o in ('-s', '--seed'):
            seed = long(a, base=0) # accepts base 10 or 16 strings

    if seed is None:
        try:
            seed = long(binascii.hexlify(os.urandom(16)), 16)
        except AttributeError:
            seed = long(time.time() * 1000)

    rng = random.Random(seed)
    test_missingancestors(seed, rng)
    test_lazyancestors()
    test_gca()

if __name__ == '__main__':
    main()