Mercurial > hg
view tests/test-pathencode.py @ 49269:395f28064826
worker: avoid potential partial write of pickled data
Previously, the code wrote the pickled data using os.write(). However,
os.write() can write less bytes than passed to it. To trigger the problem, the
pickled data had to be larger than 2147479552 bytes on my system.
Instead, open a file object and pass it to pickle.dump(). This also has the
advantage that it doesn’t buffer the whole pickled data in memory.
Note that the opened file must be buffered because pickle doesn’t support
unbuffered streams because unbuffered streams’ write() method might write less
bytes than passed to it (like os.write()) but pickle.dump() relies on that all
bytes are written (see https://github.com/python/cpython/issues/93050).
The side effect of using a file object and a with statement is that wfd is
explicitly closed now while it seems like before it was implicitly closed by
process exit.
author | Manuel Jacob <me@manueljacob.de> |
---|---|
date | Sun, 22 May 2022 03:50:34 +0200 |
parents | 2cce2fa5bcf7 |
children | 56f98406831b |
line wrap: on
line source
# This is a randomized test that generates different pathnames every # time it is invoked, and tests the encoding of those pathnames. # # It uses a simple probabilistic model to generate valid pathnames # that have proven likely to expose bugs and divergent behavior in # different encoding implementations. import binascii import collections import itertools import math import os import random import sys import time from mercurial import ( pycompat, store, ) try: xrange except NameError: xrange = range validchars = set(map(pycompat.bytechr, range(0, 256))) alphanum = range(ord('A'), ord('Z')) for c in (b'\0', b'/'): validchars.remove(c) winreserved = ( b'aux con prn nul'.split() + [b'com%d' % i for i in xrange(1, 10)] + [b'lpt%d' % i for i in xrange(1, 10)] ) def casecombinations(names): '''Build all case-diddled combinations of names.''' combos = set() for r in names: for i in xrange(len(r) + 1): for c in itertools.combinations(xrange(len(r)), i): d = r for j in c: d = b''.join((d[:j], d[j : j + 1].upper(), d[j + 1 :])) combos.add(d) return sorted(combos) def buildprobtable(fp, cmd='hg manifest tip'): """Construct and print a table of probabilities for path name components. The numbers are percentages.""" counts = collections.defaultdict(lambda: 0) for line in os.popen(cmd).read().splitlines(): if line[-2:] in ('.i', '.d'): line = line[:-2] if line.startswith('data/'): line = line[5:] for c in line: counts[c] += 1 for c in '\r/\n': counts.pop(c, None) t = sum(counts.values()) / 100.0 fp.write('probtable = (') for i, (k, v) in enumerate( sorted(counts.items(), key=lambda x: x[1], reverse=True) ): if (i % 5) == 0: fp.write('\n ') vt = v / t if vt < 0.0005: break fp.write('(%r, %.03f), ' % (k, vt)) fp.write('\n )\n') # A table of character frequencies (as percentages), gleaned by # looking at filelog names from a real-world, very large repo. probtable = ( (b't', 9.828), (b'e', 9.042), (b's', 8.011), (b'a', 6.801), (b'i', 6.618), (b'g', 5.053), (b'r', 5.030), (b'o', 4.887), (b'p', 4.363), (b'n', 4.258), (b'l', 3.830), (b'h', 3.693), (b'_', 3.659), (b'.', 3.377), (b'm', 3.194), (b'u', 2.364), (b'd', 2.296), (b'c', 2.163), (b'b', 1.739), (b'f', 1.625), (b'6', 0.666), (b'j', 0.610), (b'y', 0.554), (b'x', 0.487), (b'w', 0.477), (b'k', 0.476), (b'v', 0.473), (b'3', 0.336), (b'1', 0.335), (b'2', 0.326), (b'4', 0.310), (b'5', 0.305), (b'9', 0.302), (b'8', 0.300), (b'7', 0.299), (b'q', 0.298), (b'0', 0.250), (b'z', 0.223), (b'-', 0.118), (b'C', 0.095), (b'T', 0.087), (b'F', 0.085), (b'B', 0.077), (b'S', 0.076), (b'P', 0.076), (b'L', 0.059), (b'A', 0.058), (b'N', 0.051), (b'D', 0.049), (b'M', 0.046), (b'E', 0.039), (b'I', 0.035), (b'R', 0.035), (b'G', 0.028), (b'U', 0.026), (b'W', 0.025), (b'O', 0.017), (b'V', 0.015), (b'H', 0.013), (b'Q', 0.011), (b'J', 0.007), (b'K', 0.005), (b'+', 0.004), (b'X', 0.003), (b'Y', 0.001), ) for c, _ in probtable: validchars.remove(c) validchars = list(validchars) def pickfrom(rng, table): c = 0 r = rng.random() * sum(i[1] for i in table) for i, p in table: c += p if c >= r: return i reservedcombos = casecombinations(winreserved) # The first component of a name following a slash. firsttable = ( (lambda rng: pickfrom(rng, probtable), 90), (lambda rng: rng.choice(validchars), 5), (lambda rng: rng.choice(reservedcombos), 5), ) # Components of a name following the first. resttable = firsttable[:-1] # Special suffixes. internalsuffixcombos = casecombinations(b'.hg .i .d'.split()) # The last component of a path, before a slash or at the end of a name. lasttable = resttable + ( (lambda rng: b'', 95), (lambda rng: rng.choice(internalsuffixcombos), 5), ) def makepart(rng, k): '''Construct a part of a pathname, without slashes.''' p = pickfrom(rng, firsttable)(rng) l = len(p) ps = [p] maxl = rng.randint(1, k) while l < maxl: p = pickfrom(rng, resttable)(rng) l += len(p) ps.append(p) ps.append(pickfrom(rng, lasttable)(rng)) return b''.join(ps) def makepath(rng, j, k): '''Construct a complete pathname.''' return ( b'data/' + b'/'.join(makepart(rng, k) for _ in xrange(j)) + rng.choice([b'.d', b'.i']) ) def genpath(rng, count): '''Generate random pathnames with gradually increasing lengths.''' mink, maxk = 1, 4096 def steps(): for i in xrange(count): yield mink + int(round(math.sqrt((maxk - mink) * float(i) / count))) for k in steps(): x = rng.randint(1, k) y = rng.randint(1, k) yield makepath(rng, x, y) def runtests(rng, seed, count): nerrs = 0 for p in genpath(rng, count): h = store._pathencode(p) # uses C implementation, if available r = store._hybridencode(p, True) # reference implementation in Python if h != r: if nerrs == 0: print('seed:', hex(seed)[:-1], file=sys.stderr) print("\np: '%s'" % p.encode("string_escape"), file=sys.stderr) print("h: '%s'" % h.encode("string_escape"), file=sys.stderr) print("r: '%s'" % r.encode("string_escape"), file=sys.stderr) nerrs += 1 return nerrs def main(): import getopt # Empirically observed to take about a second to run count = 100 seed = None opts, args = getopt.getopt( sys.argv[1:], 'c:s:', ['build', 'count=', 'seed='] ) for o, a in opts: if o in ('-c', '--count'): count = int(a) elif o in ('-s', '--seed'): seed = int(a, base=0) # accepts base 10 or 16 strings elif o == '--build': buildprobtable( sys.stdout, 'find .hg/store/data -type f && ' 'cat .hg/store/fncache 2>/dev/null', ) sys.exit(0) if seed is None: try: seed = int(binascii.hexlify(os.urandom(16)), 16) except AttributeError: seed = int(time.time() * 1000) rng = random.Random(seed) if runtests(rng, seed, count): sys.exit(1) if __name__ == '__main__': main()