view tests/test-pathencode.py @ 48670:6d2ddea0721a stable

stream-clone: filter possible missing requirements using all supported one The `supportedformat` requirements is missing some important requirements and it seems better to filter out with all requirements we know, not just an "arbitrary" subset. The `supportedformat` set is lacking some important requirements (for example `revlog-compression-zstd`). This is getting fixed on default (for Mercurial 6.1) However, fixing that in 6.1 means the stream requirements sent over the wire will contains more items. And if we don't apply this fix on older version, they might end up complaining about lacking support for feature they actually support for years. This patch does not fix the deeper problem (advertised stream requirement lacking some of them), but focus on the trivial part : Lets use the full set of supported requirement for looking for unsupported ones. This patch should be simple to backport to older version of Mercurial and packager should be encouraged to do so. This is a graft of d9017df70135 from default. Differential Revision: https://phab.mercurial-scm.org/D12091
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Mon, 24 Jan 2022 11:49:06 +0100
parents 89a2afe31e82
children 6000f5b25c9b
line wrap: on
line source

# This is a randomized test that generates different pathnames every
# time it is invoked, and tests the encoding of those pathnames.
#
# It uses a simple probabilistic model to generate valid pathnames
# that have proven likely to expose bugs and divergent behavior in
# different encoding implementations.

from __future__ import absolute_import, print_function

import binascii
import collections
import itertools
import math
import os
import random
import sys
import time
from mercurial import (
    pycompat,
    store,
)

try:
    xrange
except NameError:
    xrange = range

validchars = set(map(pycompat.bytechr, range(0, 256)))
alphanum = range(ord('A'), ord('Z'))

for c in (b'\0', b'/'):
    validchars.remove(c)

winreserved = (
    b'aux con prn nul'.split()
    + [b'com%d' % i for i in xrange(1, 10)]
    + [b'lpt%d' % i for i in xrange(1, 10)]
)


def casecombinations(names):
    '''Build all case-diddled combinations of names.'''

    combos = set()

    for r in names:
        for i in xrange(len(r) + 1):
            for c in itertools.combinations(xrange(len(r)), i):
                d = r
                for j in c:
                    d = b''.join((d[:j], d[j : j + 1].upper(), d[j + 1 :]))
                combos.add(d)
    return sorted(combos)


def buildprobtable(fp, cmd='hg manifest tip'):
    """Construct and print a table of probabilities for path name
    components.  The numbers are percentages."""

    counts = collections.defaultdict(lambda: 0)
    for line in os.popen(cmd).read().splitlines():
        if line[-2:] in ('.i', '.d'):
            line = line[:-2]
        if line.startswith('data/'):
            line = line[5:]
        for c in line:
            counts[c] += 1
    for c in '\r/\n':
        counts.pop(c, None)
    t = sum(pycompat.itervalues(counts)) / 100.0
    fp.write('probtable = (')
    for i, (k, v) in enumerate(
        sorted(counts.items(), key=lambda x: x[1], reverse=True)
    ):
        if (i % 5) == 0:
            fp.write('\n    ')
        vt = v / t
        if vt < 0.0005:
            break
        fp.write('(%r, %.03f), ' % (k, vt))
    fp.write('\n    )\n')


# A table of character frequencies (as percentages), gleaned by
# looking at filelog names from a real-world, very large repo.

probtable = (
    (b't', 9.828),
    (b'e', 9.042),
    (b's', 8.011),
    (b'a', 6.801),
    (b'i', 6.618),
    (b'g', 5.053),
    (b'r', 5.030),
    (b'o', 4.887),
    (b'p', 4.363),
    (b'n', 4.258),
    (b'l', 3.830),
    (b'h', 3.693),
    (b'_', 3.659),
    (b'.', 3.377),
    (b'm', 3.194),
    (b'u', 2.364),
    (b'd', 2.296),
    (b'c', 2.163),
    (b'b', 1.739),
    (b'f', 1.625),
    (b'6', 0.666),
    (b'j', 0.610),
    (b'y', 0.554),
    (b'x', 0.487),
    (b'w', 0.477),
    (b'k', 0.476),
    (b'v', 0.473),
    (b'3', 0.336),
    (b'1', 0.335),
    (b'2', 0.326),
    (b'4', 0.310),
    (b'5', 0.305),
    (b'9', 0.302),
    (b'8', 0.300),
    (b'7', 0.299),
    (b'q', 0.298),
    (b'0', 0.250),
    (b'z', 0.223),
    (b'-', 0.118),
    (b'C', 0.095),
    (b'T', 0.087),
    (b'F', 0.085),
    (b'B', 0.077),
    (b'S', 0.076),
    (b'P', 0.076),
    (b'L', 0.059),
    (b'A', 0.058),
    (b'N', 0.051),
    (b'D', 0.049),
    (b'M', 0.046),
    (b'E', 0.039),
    (b'I', 0.035),
    (b'R', 0.035),
    (b'G', 0.028),
    (b'U', 0.026),
    (b'W', 0.025),
    (b'O', 0.017),
    (b'V', 0.015),
    (b'H', 0.013),
    (b'Q', 0.011),
    (b'J', 0.007),
    (b'K', 0.005),
    (b'+', 0.004),
    (b'X', 0.003),
    (b'Y', 0.001),
)

for c, _ in probtable:
    validchars.remove(c)
validchars = list(validchars)


def pickfrom(rng, table):
    c = 0
    r = rng.random() * sum(i[1] for i in table)
    for i, p in table:
        c += p
        if c >= r:
            return i


reservedcombos = casecombinations(winreserved)

# The first component of a name following a slash.

firsttable = (
    (lambda rng: pickfrom(rng, probtable), 90),
    (lambda rng: rng.choice(validchars), 5),
    (lambda rng: rng.choice(reservedcombos), 5),
)

# Components of a name following the first.

resttable = firsttable[:-1]

# Special suffixes.

internalsuffixcombos = casecombinations(b'.hg .i .d'.split())

# The last component of a path, before a slash or at the end of a name.

lasttable = resttable + (
    (lambda rng: b'', 95),
    (lambda rng: rng.choice(internalsuffixcombos), 5),
)


def makepart(rng, k):
    '''Construct a part of a pathname, without slashes.'''

    p = pickfrom(rng, firsttable)(rng)
    l = len(p)
    ps = [p]
    maxl = rng.randint(1, k)
    while l < maxl:
        p = pickfrom(rng, resttable)(rng)
        l += len(p)
        ps.append(p)
    ps.append(pickfrom(rng, lasttable)(rng))
    return b''.join(ps)


def makepath(rng, j, k):
    '''Construct a complete pathname.'''

    return (
        b'data/'
        + b'/'.join(makepart(rng, k) for _ in xrange(j))
        + rng.choice([b'.d', b'.i'])
    )


def genpath(rng, count):
    '''Generate random pathnames with gradually increasing lengths.'''

    mink, maxk = 1, 4096

    def steps():
        for i in xrange(count):
            yield mink + int(round(math.sqrt((maxk - mink) * float(i) / count)))

    for k in steps():
        x = rng.randint(1, k)
        y = rng.randint(1, k)
        yield makepath(rng, x, y)


def runtests(rng, seed, count):
    nerrs = 0
    for p in genpath(rng, count):
        h = store._pathencode(p)  # uses C implementation, if available
        r = store._hybridencode(p, True)  # reference implementation in Python
        if h != r:
            if nerrs == 0:
                print('seed:', hex(seed)[:-1], file=sys.stderr)
            print("\np: '%s'" % p.encode("string_escape"), file=sys.stderr)
            print("h: '%s'" % h.encode("string_escape"), file=sys.stderr)
            print("r: '%s'" % r.encode("string_escape"), file=sys.stderr)
            nerrs += 1
    return nerrs


def main():
    import getopt

    # Empirically observed to take about a second to run
    count = 100
    seed = None
    opts, args = getopt.getopt(
        sys.argv[1:], 'c:s:', ['build', 'count=', 'seed=']
    )
    for o, a in opts:
        if o in ('-c', '--count'):
            count = int(a)
        elif o in ('-s', '--seed'):
            seed = int(a, base=0)  # accepts base 10 or 16 strings
        elif o == '--build':
            buildprobtable(
                sys.stdout,
                'find .hg/store/data -type f && '
                'cat .hg/store/fncache 2>/dev/null',
            )
            sys.exit(0)

    if seed is None:
        try:
            seed = int(binascii.hexlify(os.urandom(16)), 16)
        except AttributeError:
            seed = int(time.time() * 1000)

    rng = random.Random(seed)
    if runtests(rng, seed, count):
        sys.exit(1)


if __name__ == '__main__':
    main()