# HG changeset patch # User Bryan O'Sullivan # Date 1353002669 28800 # Node ID 736f1c09f321d9f067e2ec79c6f9d5a4a7b3a80f # Parent 8243dd66e0e3b94ac2c65ee2f2a5b87349937c3a tests: add a randomized test for pathencode This is a probabilistic test - it generates different test cases on every run, unless invoked from the command line with a specific seed. The default number of tests run should make the tests take about a second to complete on a semi-modern laptop. diff -r 8243dd66e0e3 -r 736f1c09f321 tests/test-pathencode.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-pathencode.py Thu Nov 15 10:04:29 2012 -0800 @@ -0,0 +1,190 @@ +# This is a randomized test that generates different pathnames every +# time it is invoked, and tests the encoding of those pathnames. +# +# It uses a simple probabilistic model to generate valid pathnames +# that have proven likely to expose bugs and divergent behaviour in +# different encoding implementations. + +from mercurial import parsers +from mercurial import store +import binascii, itertools, math, os, random, sys, time +from collections import defaultdict + +def hybridencode(path): + return store._hybridencode(path, True) + +validchars = set(map(chr, range(0, 256))) +alphanum = range(ord('A'), ord('Z')) + +for c in '\0/': + validchars.remove(c) + +winreserved = ('aux con prn nul'.split() + + ['com%d' % i for i in xrange(1, 10)] + + ['lpt%d' % i for i in xrange(1, 10)]) + +def casecombinations(names): + '''Build all case-diddled combinations of names.''' + + combos = set() + + for r in names: + for i in xrange(len(r) + 1): + for c in itertools.combinations(xrange(len(r)), i): + d = r + for j in c: + d = ''.join((d[:j], d[j].upper(), d[j + 1:])) + combos.add(d) + return sorted(combos) + +def buildprobtable(fp, cmd='hg manifest tip'): + '''Construct and print a table of probabilities for path name + components. The numbers are percentages.''' + + counts = defaultdict(lambda: 0) + for line in os.popen(cmd).read().splitlines(): + if line[-2:] in ('.i', '.d'): + line = line[:-2] + if line.startswith('data/'): + line = line[5:] + for c in line: + counts[c] += 1 + for c in '\r/\n': + counts.pop(c, None) + t = sum(counts.itervalues()) / 100.0 + fp.write('probtable = (') + for i, (k, v) in enumerate(sorted(counts.iteritems(), key=lambda x: x[1], + reverse=True)): + if (i % 5) == 0: + fp.write('\n ') + vt = v / t + if vt < 0.0005: + break + fp.write('(%r, %.03f), ' % (k, vt)) + fp.write('\n )\n') + +# A table of character frequencies (as percentages), gleaned by +# looking at filelog names from a real-world, very large repo. + +probtable = ( + ('t', 9.828), ('e', 9.042), ('s', 8.011), ('a', 6.801), ('i', 6.618), + ('g', 5.053), ('r', 5.030), ('o', 4.887), ('p', 4.363), ('n', 4.258), + ('l', 3.830), ('h', 3.693), ('_', 3.659), ('.', 3.377), ('m', 3.194), + ('u', 2.364), ('d', 2.296), ('c', 2.163), ('b', 1.739), ('f', 1.625), + ('6', 0.666), ('j', 0.610), ('y', 0.554), ('x', 0.487), ('w', 0.477), + ('k', 0.476), ('v', 0.473), ('3', 0.336), ('1', 0.335), ('2', 0.326), + ('4', 0.310), ('5', 0.305), ('9', 0.302), ('8', 0.300), ('7', 0.299), + ('q', 0.298), ('0', 0.250), ('z', 0.223), ('-', 0.118), ('C', 0.095), + ('T', 0.087), ('F', 0.085), ('B', 0.077), ('S', 0.076), ('P', 0.076), + ('L', 0.059), ('A', 0.058), ('N', 0.051), ('D', 0.049), ('M', 0.046), + ('E', 0.039), ('I', 0.035), ('R', 0.035), ('G', 0.028), ('U', 0.026), + ('W', 0.025), ('O', 0.017), ('V', 0.015), ('H', 0.013), ('Q', 0.011), + ('J', 0.007), ('K', 0.005), ('+', 0.004), ('X', 0.003), ('Y', 0.001), + ) + +for c, _ in probtable: + validchars.remove(c) +validchars = list(validchars) + +def pickfrom(rng, table): + c = 0 + r = rng.random() * sum(i[1] for i in table) + for i, p in table: + c += p + if c >= r: + return i + +reservedcombos = casecombinations(winreserved) + +# The first component of a name following a slash. + +firsttable = ( + (lambda rng: pickfrom(rng, probtable), 90), + (lambda rng: rng.choice(validchars), 5), + (lambda rng: rng.choice(reservedcombos), 5), + ) + +# Components of a name following the first. + +resttable = firsttable[:-1] + +# Special suffixes. + +internalsuffixcombos = casecombinations('.hg .i .d'.split()) + +# The last component of a path, before a slash or at the end of a name. + +lasttable = resttable + ( + (lambda rng: '', 95), + (lambda rng: rng.choice(internalsuffixcombos), 5), + ) + +def makepart(rng, k): + '''Construct a part of a pathname, without slashes.''' + + p = pickfrom(rng, firsttable)(rng) + l = len(p) + ps = [p] + while l <= k: + p = pickfrom(rng, resttable)(rng) + l += len(p) + ps.append(p) + ps.append(pickfrom(rng, lasttable)(rng)) + return ''.join(ps) + +def makepath(rng, j, k): + '''Construct a complete pathname.''' + + return ('data/' + '/'.join(makepart(rng, k) for _ in xrange(j)) + + rng.choice(['.d', '.i'])) + +def genpath(rng, count): + '''Generate random pathnames with gradually increasing lengths.''' + + mink, maxk = 1, 4096 + def steps(): + x, k = 0, mink + for i in xrange(count): + yield mink + int(round(math.sqrt((maxk - mink) * float(i) / count))) + for k in steps(): + x = rng.randint(1, k) + y = rng.randint(1, k) + yield makepath(rng, x, y) + +def runtests(rng, seed, count): + nerrs = 0 + for p in genpath(rng, count): + hybridencode(p) + return nerrs + +def main(): + import getopt + + # Empirically observed to take about a second to run + count = 100 + seed = None + opts, args = getopt.getopt(sys.argv[1:], 'c:s:', + ['build', 'count=', 'seed=']) + for o, a in opts: + if o in ('-c', '--count'): + count = int(a) + elif o in ('-s', '--seed'): + seed = long(a) + elif o == '--build': + buildprobtable(sys.stdout, + 'find .hg/store/data -type f && ' + 'cat .hg/store/fncache 2>/dev/null') + sys.exit(0) + + if seed is None: + try: + seed = long(binascii.hexlify(os.urandom(16)), 16) + except AttributeError: + seed = long(time.time() * 1000) + + rng = random.Random(seed) + if runtests(rng, seed, count): + sys.exit(1) + +if __name__ == '__main__' and sys.version_info[:2] >= (2, 6): + main()