Mercurial > hg
comparison tests/test-pathencode.py @ 17934:736f1c09f321
tests: add a randomized test for pathencode
This is a probabilistic test - it generates different test cases on every
run, unless invoked from the command line with a specific seed.
The default number of tests run should make the tests take about a
second to complete on a semi-modern laptop.
author | Bryan O'Sullivan <bryano@fb.com> |
---|---|
date | Thu, 15 Nov 2012 10:04:29 -0800 |
parents | |
children | 9c888b945b65 |
comparison
equal
deleted
inserted
replaced
17933:8243dd66e0e3 | 17934:736f1c09f321 |
---|---|
1 # This is a randomized test that generates different pathnames every | |
2 # time it is invoked, and tests the encoding of those pathnames. | |
3 # | |
4 # It uses a simple probabilistic model to generate valid pathnames | |
5 # that have proven likely to expose bugs and divergent behaviour in | |
6 # different encoding implementations. | |
7 | |
8 from mercurial import parsers | |
9 from mercurial import store | |
10 import binascii, itertools, math, os, random, sys, time | |
11 from collections import defaultdict | |
12 | |
13 def hybridencode(path): | |
14 return store._hybridencode(path, True) | |
15 | |
16 validchars = set(map(chr, range(0, 256))) | |
17 alphanum = range(ord('A'), ord('Z')) | |
18 | |
19 for c in '\0/': | |
20 validchars.remove(c) | |
21 | |
22 winreserved = ('aux con prn nul'.split() + | |
23 ['com%d' % i for i in xrange(1, 10)] + | |
24 ['lpt%d' % i for i in xrange(1, 10)]) | |
25 | |
26 def casecombinations(names): | |
27 '''Build all case-diddled combinations of names.''' | |
28 | |
29 combos = set() | |
30 | |
31 for r in names: | |
32 for i in xrange(len(r) + 1): | |
33 for c in itertools.combinations(xrange(len(r)), i): | |
34 d = r | |
35 for j in c: | |
36 d = ''.join((d[:j], d[j].upper(), d[j + 1:])) | |
37 combos.add(d) | |
38 return sorted(combos) | |
39 | |
40 def buildprobtable(fp, cmd='hg manifest tip'): | |
41 '''Construct and print a table of probabilities for path name | |
42 components. The numbers are percentages.''' | |
43 | |
44 counts = defaultdict(lambda: 0) | |
45 for line in os.popen(cmd).read().splitlines(): | |
46 if line[-2:] in ('.i', '.d'): | |
47 line = line[:-2] | |
48 if line.startswith('data/'): | |
49 line = line[5:] | |
50 for c in line: | |
51 counts[c] += 1 | |
52 for c in '\r/\n': | |
53 counts.pop(c, None) | |
54 t = sum(counts.itervalues()) / 100.0 | |
55 fp.write('probtable = (') | |
56 for i, (k, v) in enumerate(sorted(counts.iteritems(), key=lambda x: x[1], | |
57 reverse=True)): | |
58 if (i % 5) == 0: | |
59 fp.write('\n ') | |
60 vt = v / t | |
61 if vt < 0.0005: | |
62 break | |
63 fp.write('(%r, %.03f), ' % (k, vt)) | |
64 fp.write('\n )\n') | |
65 | |
66 # A table of character frequencies (as percentages), gleaned by | |
67 # looking at filelog names from a real-world, very large repo. | |
68 | |
69 probtable = ( | |
70 ('t', 9.828), ('e', 9.042), ('s', 8.011), ('a', 6.801), ('i', 6.618), | |
71 ('g', 5.053), ('r', 5.030), ('o', 4.887), ('p', 4.363), ('n', 4.258), | |
72 ('l', 3.830), ('h', 3.693), ('_', 3.659), ('.', 3.377), ('m', 3.194), | |
73 ('u', 2.364), ('d', 2.296), ('c', 2.163), ('b', 1.739), ('f', 1.625), | |
74 ('6', 0.666), ('j', 0.610), ('y', 0.554), ('x', 0.487), ('w', 0.477), | |
75 ('k', 0.476), ('v', 0.473), ('3', 0.336), ('1', 0.335), ('2', 0.326), | |
76 ('4', 0.310), ('5', 0.305), ('9', 0.302), ('8', 0.300), ('7', 0.299), | |
77 ('q', 0.298), ('0', 0.250), ('z', 0.223), ('-', 0.118), ('C', 0.095), | |
78 ('T', 0.087), ('F', 0.085), ('B', 0.077), ('S', 0.076), ('P', 0.076), | |
79 ('L', 0.059), ('A', 0.058), ('N', 0.051), ('D', 0.049), ('M', 0.046), | |
80 ('E', 0.039), ('I', 0.035), ('R', 0.035), ('G', 0.028), ('U', 0.026), | |
81 ('W', 0.025), ('O', 0.017), ('V', 0.015), ('H', 0.013), ('Q', 0.011), | |
82 ('J', 0.007), ('K', 0.005), ('+', 0.004), ('X', 0.003), ('Y', 0.001), | |
83 ) | |
84 | |
85 for c, _ in probtable: | |
86 validchars.remove(c) | |
87 validchars = list(validchars) | |
88 | |
89 def pickfrom(rng, table): | |
90 c = 0 | |
91 r = rng.random() * sum(i[1] for i in table) | |
92 for i, p in table: | |
93 c += p | |
94 if c >= r: | |
95 return i | |
96 | |
97 reservedcombos = casecombinations(winreserved) | |
98 | |
99 # The first component of a name following a slash. | |
100 | |
101 firsttable = ( | |
102 (lambda rng: pickfrom(rng, probtable), 90), | |
103 (lambda rng: rng.choice(validchars), 5), | |
104 (lambda rng: rng.choice(reservedcombos), 5), | |
105 ) | |
106 | |
107 # Components of a name following the first. | |
108 | |
109 resttable = firsttable[:-1] | |
110 | |
111 # Special suffixes. | |
112 | |
113 internalsuffixcombos = casecombinations('.hg .i .d'.split()) | |
114 | |
115 # The last component of a path, before a slash or at the end of a name. | |
116 | |
117 lasttable = resttable + ( | |
118 (lambda rng: '', 95), | |
119 (lambda rng: rng.choice(internalsuffixcombos), 5), | |
120 ) | |
121 | |
122 def makepart(rng, k): | |
123 '''Construct a part of a pathname, without slashes.''' | |
124 | |
125 p = pickfrom(rng, firsttable)(rng) | |
126 l = len(p) | |
127 ps = [p] | |
128 while l <= k: | |
129 p = pickfrom(rng, resttable)(rng) | |
130 l += len(p) | |
131 ps.append(p) | |
132 ps.append(pickfrom(rng, lasttable)(rng)) | |
133 return ''.join(ps) | |
134 | |
135 def makepath(rng, j, k): | |
136 '''Construct a complete pathname.''' | |
137 | |
138 return ('data/' + '/'.join(makepart(rng, k) for _ in xrange(j)) + | |
139 rng.choice(['.d', '.i'])) | |
140 | |
141 def genpath(rng, count): | |
142 '''Generate random pathnames with gradually increasing lengths.''' | |
143 | |
144 mink, maxk = 1, 4096 | |
145 def steps(): | |
146 x, k = 0, mink | |
147 for i in xrange(count): | |
148 yield mink + int(round(math.sqrt((maxk - mink) * float(i) / count))) | |
149 for k in steps(): | |
150 x = rng.randint(1, k) | |
151 y = rng.randint(1, k) | |
152 yield makepath(rng, x, y) | |
153 | |
154 def runtests(rng, seed, count): | |
155 nerrs = 0 | |
156 for p in genpath(rng, count): | |
157 hybridencode(p) | |
158 return nerrs | |
159 | |
160 def main(): | |
161 import getopt | |
162 | |
163 # Empirically observed to take about a second to run | |
164 count = 100 | |
165 seed = None | |
166 opts, args = getopt.getopt(sys.argv[1:], 'c:s:', | |
167 ['build', 'count=', 'seed=']) | |
168 for o, a in opts: | |
169 if o in ('-c', '--count'): | |
170 count = int(a) | |
171 elif o in ('-s', '--seed'): | |
172 seed = long(a) | |
173 elif o == '--build': | |
174 buildprobtable(sys.stdout, | |
175 'find .hg/store/data -type f && ' | |
176 'cat .hg/store/fncache 2>/dev/null') | |
177 sys.exit(0) | |
178 | |
179 if seed is None: | |
180 try: | |
181 seed = long(binascii.hexlify(os.urandom(16)), 16) | |
182 except AttributeError: | |
183 seed = long(time.time() * 1000) | |
184 | |
185 rng = random.Random(seed) | |
186 if runtests(rng, seed, count): | |
187 sys.exit(1) | |
188 | |
189 if __name__ == '__main__' and sys.version_info[:2] >= (2, 6): | |
190 main() |