view tests/test-pathencode.py @ 43455:6792da448437

hghave: disallow symlinks on Windows Symlinks on Windows require either a special priviledge, or enabling Developer Mode. It's probably the latter that is enabled on the new CI machine. But since Mercurial itself is saying no to symlinks on Windows, the tests for symlinks shouldn't be attempted. This should fix a lot of the noise in the py3 tests. Differential Revision: https://phab.mercurial-scm.org/D7233
author Matt Harbison <matt_harbison@yahoo.com>
date Tue, 05 Nov 2019 18:32:00 -0500
parents 74802979dd9d
children 89a2afe31e82
line wrap: on
line source

# This is a randomized test that generates different pathnames every
# time it is invoked, and tests the encoding of those pathnames.
#
# It uses a simple probabilistic model to generate valid pathnames
# that have proven likely to expose bugs and divergent behavior in
# different encoding implementations.

from __future__ import absolute_import, print_function

import binascii
import collections
import itertools
import math
import os
import random
import sys
import time
from mercurial import (
    pycompat,
    store,
)

try:
    xrange
except NameError:
    xrange = range

validchars = set(map(pycompat.bytechr, range(0, 256)))
alphanum = range(ord('A'), ord('Z'))

for c in (b'\0', b'/'):
    validchars.remove(c)

winreserved = (
    b'aux con prn nul'.split()
    + [b'com%d' % i for i in xrange(1, 10)]
    + [b'lpt%d' % i for i in xrange(1, 10)]
)


def casecombinations(names):
    '''Build all case-diddled combinations of names.'''

    combos = set()

    for r in names:
        for i in xrange(len(r) + 1):
            for c in itertools.combinations(xrange(len(r)), i):
                d = r
                for j in c:
                    d = b''.join((d[:j], d[j : j + 1].upper(), d[j + 1 :]))
                combos.add(d)
    return sorted(combos)


def buildprobtable(fp, cmd='hg manifest tip'):
    '''Construct and print a table of probabilities for path name
    components.  The numbers are percentages.'''

    counts = collections.defaultdict(lambda: 0)
    for line in os.popen(cmd).read().splitlines():
        if line[-2:] in ('.i', '.d'):
            line = line[:-2]
        if line.startswith('data/'):
            line = line[5:]
        for c in line:
            counts[c] += 1
    for c in '\r/\n':
        counts.pop(c, None)
    t = sum(pycompat.itervalues(counts)) / 100.0
    fp.write('probtable = (')
    for i, (k, v) in enumerate(
        sorted(counts.items(), key=lambda x: x[1], reverse=True)
    ):
        if (i % 5) == 0:
            fp.write('\n    ')
        vt = v / t
        if vt < 0.0005:
            break
        fp.write('(%r, %.03f), ' % (k, vt))
    fp.write('\n    )\n')


# A table of character frequencies (as percentages), gleaned by
# looking at filelog names from a real-world, very large repo.

probtable = (
    (b't', 9.828),
    (b'e', 9.042),
    (b's', 8.011),
    (b'a', 6.801),
    (b'i', 6.618),
    (b'g', 5.053),
    (b'r', 5.030),
    (b'o', 4.887),
    (b'p', 4.363),
    (b'n', 4.258),
    (b'l', 3.830),
    (b'h', 3.693),
    (b'_', 3.659),
    (b'.', 3.377),
    (b'm', 3.194),
    (b'u', 2.364),
    (b'd', 2.296),
    (b'c', 2.163),
    (b'b', 1.739),
    (b'f', 1.625),
    (b'6', 0.666),
    (b'j', 0.610),
    (b'y', 0.554),
    (b'x', 0.487),
    (b'w', 0.477),
    (b'k', 0.476),
    (b'v', 0.473),
    (b'3', 0.336),
    (b'1', 0.335),
    (b'2', 0.326),
    (b'4', 0.310),
    (b'5', 0.305),
    (b'9', 0.302),
    (b'8', 0.300),
    (b'7', 0.299),
    (b'q', 0.298),
    (b'0', 0.250),
    (b'z', 0.223),
    (b'-', 0.118),
    (b'C', 0.095),
    (b'T', 0.087),
    (b'F', 0.085),
    (b'B', 0.077),
    (b'S', 0.076),
    (b'P', 0.076),
    (b'L', 0.059),
    (b'A', 0.058),
    (b'N', 0.051),
    (b'D', 0.049),
    (b'M', 0.046),
    (b'E', 0.039),
    (b'I', 0.035),
    (b'R', 0.035),
    (b'G', 0.028),
    (b'U', 0.026),
    (b'W', 0.025),
    (b'O', 0.017),
    (b'V', 0.015),
    (b'H', 0.013),
    (b'Q', 0.011),
    (b'J', 0.007),
    (b'K', 0.005),
    (b'+', 0.004),
    (b'X', 0.003),
    (b'Y', 0.001),
)

for c, _ in probtable:
    validchars.remove(c)
validchars = list(validchars)


def pickfrom(rng, table):
    c = 0
    r = rng.random() * sum(i[1] for i in table)
    for i, p in table:
        c += p
        if c >= r:
            return i


reservedcombos = casecombinations(winreserved)

# The first component of a name following a slash.

firsttable = (
    (lambda rng: pickfrom(rng, probtable), 90),
    (lambda rng: rng.choice(validchars), 5),
    (lambda rng: rng.choice(reservedcombos), 5),
)

# Components of a name following the first.

resttable = firsttable[:-1]

# Special suffixes.

internalsuffixcombos = casecombinations(b'.hg .i .d'.split())

# The last component of a path, before a slash or at the end of a name.

lasttable = resttable + (
    (lambda rng: b'', 95),
    (lambda rng: rng.choice(internalsuffixcombos), 5),
)


def makepart(rng, k):
    '''Construct a part of a pathname, without slashes.'''

    p = pickfrom(rng, firsttable)(rng)
    l = len(p)
    ps = [p]
    maxl = rng.randint(1, k)
    while l < maxl:
        p = pickfrom(rng, resttable)(rng)
        l += len(p)
        ps.append(p)
    ps.append(pickfrom(rng, lasttable)(rng))
    return b''.join(ps)


def makepath(rng, j, k):
    '''Construct a complete pathname.'''

    return (
        b'data/'
        + b'/'.join(makepart(rng, k) for _ in xrange(j))
        + rng.choice([b'.d', b'.i'])
    )


def genpath(rng, count):
    '''Generate random pathnames with gradually increasing lengths.'''

    mink, maxk = 1, 4096

    def steps():
        for i in xrange(count):
            yield mink + int(round(math.sqrt((maxk - mink) * float(i) / count)))

    for k in steps():
        x = rng.randint(1, k)
        y = rng.randint(1, k)
        yield makepath(rng, x, y)


def runtests(rng, seed, count):
    nerrs = 0
    for p in genpath(rng, count):
        h = store._pathencode(p)  # uses C implementation, if available
        r = store._hybridencode(p, True)  # reference implementation in Python
        if h != r:
            if nerrs == 0:
                print('seed:', hex(seed)[:-1], file=sys.stderr)
            print("\np: '%s'" % p.encode("string_escape"), file=sys.stderr)
            print("h: '%s'" % h.encode("string_escape"), file=sys.stderr)
            print("r: '%s'" % r.encode("string_escape"), file=sys.stderr)
            nerrs += 1
    return nerrs


def main():
    import getopt

    # Empirically observed to take about a second to run
    count = 100
    seed = None
    opts, args = getopt.getopt(
        sys.argv[1:], 'c:s:', ['build', 'count=', 'seed=']
    )
    for o, a in opts:
        if o in ('-c', '--count'):
            count = int(a)
        elif o in ('-s', '--seed'):
            seed = int(a, base=0)  # accepts base 10 or 16 strings
        elif o == '--build':
            buildprobtable(
                sys.stdout,
                'find .hg/store/data -type f && '
                'cat .hg/store/fncache 2>/dev/null',
            )
            sys.exit(0)

    if seed is None:
        try:
            seed = int(binascii.hexlify(os.urandom(16)), 16)
        except AttributeError:
            seed = int(time.time() * 1000)

    rng = random.Random(seed)
    if runtests(rng, seed, count):
        sys.exit(1)


if __name__ == '__main__':
    main()