comparison mercurial/pathencode.c @ 17616:9535a0dc41f2

store: implement fncache basic path encoding in C (This is not yet enabled; it will be turned on in a followup patch.) The path encoding performed by fncache is complex and (perhaps surprisingly) slow enough to negatively affect the overall performance of Mercurial. For a short path (< 120 bytes), the Python code can be reduced to a fairly tractable state machine that either determines that nothing needs to be done in a single pass, or performs the encoding in a second pass. For longer paths, we avoid the more complicated hashed encoding scheme for now, and fall back to Python. Raw performance: I measured in a repo containing 150,000 files in its tip manifest, with a median path name length of 57 bytes, and 95th percentile of 96 bytes. In this repo, the Python code takes 3.1 seconds to encode all path names, while the hybrid C-and-Python code (called from Python) takes 0.21 seconds, for a speedup of about 14. Across several other large repositories, I've measured the speedup from the C code at between 26x and 40x. For path names above 120 bytes where we must fall back to Python for hashed encoding, the speedup is about 1.7x. Thus absolute performance will depend strongly on the characteristics of a particular repository.
author Bryan O'Sullivan <bryano@fb.com>
date Tue, 18 Sep 2012 15:42:19 -0700
parents 318fb32b980e
children c6c7e466dd3a
comparison
equal deleted inserted replaced
17615:9e2dc0d292cd 17616:9535a0dc41f2
4 Copyright 2012 Facebook 4 Copyright 2012 Facebook
5 5
6 This software may be used and distributed according to the terms of 6 This software may be used and distributed according to the terms of
7 the GNU General Public License, incorporated herein by reference. 7 the GNU General Public License, incorporated herein by reference.
8 */ 8 */
9
10 /*
11 * An implementation of the name encoding scheme used by the fncache
12 * store. The common case is of a path < 120 bytes long, which is
13 * handled either in a single pass with no allocations or two passes
14 * with a single allocation. For longer paths, multiple passes are
15 * required.
16 */
9 17
10 #include <Python.h> 18 #include <Python.h>
11 #include <assert.h> 19 #include <assert.h>
12 #include <ctype.h> 20 #include <ctype.h>
13 #include <stdlib.h> 21 #include <stdlib.h>
14 #include <string.h> 22 #include <string.h>
15 23
16 #include "util.h" 24 #include "util.h"
25
26 /* state machine for the fast path */
27 enum path_state {
28 START, /* first byte of a path component */
29 A, /* "AUX" */
30 AU,
31 THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
32 C, /* "CON" or "COMn" */
33 CO,
34 COMLPT, /* "COM" or "LPT" */
35 COMLPTn,
36 L,
37 LP,
38 N,
39 NU,
40 P, /* "PRN" */
41 PR,
42 LDOT, /* leading '.' */
43 DOT, /* '.' in a non-leading position */
44 H, /* ".h" */
45 HGDI, /* ".hg", ".d", or ".i" */
46 SPACE,
47 DEFAULT, /* byte of a path component after the first */
48 };
17 49
18 /* state machine for dir-encoding */ 50 /* state machine for dir-encoding */
19 enum dir_state { 51 enum dir_state {
20 DDOT, 52 DDOT,
21 DH, 53 DH,
22 DHGDI, 54 DHGDI,
23 DDEFAULT, 55 DDEFAULT,
24 }; 56 };
25 57
58 static inline int isset(const uint32_t bitset[], char c)
59 {
60 return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31));
61 }
62
26 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize, 63 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
27 char c) 64 char c)
28 { 65 {
29 if (dest) { 66 if (dest) {
30 assert(*destlen < destsize); 67 assert(*destlen < destsize);
39 if (dest) { 76 if (dest) {
40 assert(*destlen + len < destsize); 77 assert(*destlen + len < destsize);
41 memcpy((void *)&dest[*destlen], src, len); 78 memcpy((void *)&dest[*destlen], src, len);
42 } 79 }
43 *destlen += len; 80 *destlen += len;
81 }
82
83 static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
84 uint8_t c)
85 {
86 static const char hexdigit[] = "0123456789abcdef";
87
88 charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
89 charcopy(dest, destlen, destsize, hexdigit[c & 15]);
90 }
91
92 /* 3-byte escape: tilde followed by two hex digits */
93 static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
94 char c)
95 {
96 charcopy(dest, destlen, destsize, '~');
97 hexencode(dest, destlen, destsize, c);
44 } 98 }
45 99
46 static Py_ssize_t _encodedir(char *dest, size_t destsize, 100 static Py_ssize_t _encodedir(char *dest, size_t destsize,
47 const char *src, Py_ssize_t len) 101 const char *src, Py_ssize_t len)
48 { 102 {
121 len + 1); 175 len + 1);
122 } 176 }
123 177
124 return newobj; 178 return newobj;
125 } 179 }
180
181 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
182 char *dest, Py_ssize_t destlen, size_t destsize,
183 const char *src, Py_ssize_t len,
184 int encodedir)
185 {
186 enum path_state state = START;
187 Py_ssize_t i = 0;
188
189 /*
190 * Python strings end with a zero byte, which we use as a
191 * terminal token as they are not valid inside path names.
192 */
193
194 while (i < len) {
195 switch (state) {
196 case START:
197 switch (src[i]) {
198 case '/':
199 charcopy(dest, &destlen, destsize, src[i++]);
200 break;
201 case '.':
202 state = LDOT;
203 escape3(dest, &destlen, destsize, src[i++]);
204 break;
205 case ' ':
206 state = DEFAULT;
207 escape3(dest, &destlen, destsize, src[i++]);
208 break;
209 case 'a':
210 state = A;
211 charcopy(dest, &destlen, destsize, src[i++]);
212 break;
213 case 'c':
214 state = C;
215 charcopy(dest, &destlen, destsize, src[i++]);
216 break;
217 case 'l':
218 state = L;
219 charcopy(dest, &destlen, destsize, src[i++]);
220 break;
221 case 'n':
222 state = N;
223 charcopy(dest, &destlen, destsize, src[i++]);
224 break;
225 case 'p':
226 state = P;
227 charcopy(dest, &destlen, destsize, src[i++]);
228 break;
229 default:
230 state = DEFAULT;
231 break;
232 }
233 break;
234 case A:
235 if (src[i] == 'u') {
236 state = AU;
237 charcopy(dest, &destlen, destsize, src[i++]);
238 }
239 else state = DEFAULT;
240 break;
241 case AU:
242 if (src[i] == 'x') {
243 state = THIRD;
244 i++;
245 }
246 else state = DEFAULT;
247 break;
248 case THIRD:
249 state = DEFAULT;
250 switch (src[i]) {
251 case '.':
252 case '/':
253 case '\0':
254 escape3(dest, &destlen, destsize, src[i - 1]);
255 break;
256 default:
257 i--;
258 break;
259 }
260 break;
261 case C:
262 if (src[i] == 'o') {
263 state = CO;
264 charcopy(dest, &destlen, destsize, src[i++]);
265 }
266 else state = DEFAULT;
267 break;
268 case CO:
269 if (src[i] == 'm') {
270 state = COMLPT;
271 i++;
272 }
273 else if (src[i] == 'n') {
274 state = THIRD;
275 i++;
276 }
277 else state = DEFAULT;
278 break;
279 case COMLPT:
280 switch (src[i]) {
281 case '1': case '2': case '3': case '4': case '5':
282 case '6': case '7': case '8': case '9':
283 state = COMLPTn;
284 i++;
285 break;
286 default:
287 state = DEFAULT;
288 charcopy(dest, &destlen, destsize, src[i - 1]);
289 break;
290 }
291 break;
292 case COMLPTn:
293 state = DEFAULT;
294 switch (src[i]) {
295 case '.':
296 case '/':
297 case '\0':
298 escape3(dest, &destlen, destsize, src[i - 2]);
299 charcopy(dest, &destlen, destsize, src[i - 1]);
300 break;
301 default:
302 memcopy(dest, &destlen, destsize,
303 &src[i - 2], 2);
304 break;
305 }
306 break;
307 case L:
308 if (src[i] == 'p') {
309 state = LP;
310 charcopy(dest, &destlen, destsize, src[i++]);
311 }
312 else state = DEFAULT;
313 break;
314 case LP:
315 if (src[i] == 't') {
316 state = COMLPT;
317 i++;
318 }
319 else state = DEFAULT;
320 break;
321 case N:
322 if (src[i] == 'u') {
323 state = NU;
324 charcopy(dest, &destlen, destsize, src[i++]);
325 }
326 else state = DEFAULT;
327 break;
328 case NU:
329 if (src[i] == 'l') {
330 state = THIRD;
331 i++;
332 }
333 else state = DEFAULT;
334 break;
335 case P:
336 if (src[i] == 'r') {
337 state = PR;
338 charcopy(dest, &destlen, destsize, src[i++]);
339 }
340 else state = DEFAULT;
341 break;
342 case PR:
343 if (src[i] == 'n') {
344 state = THIRD;
345 i++;
346 }
347 else state = DEFAULT;
348 break;
349 case LDOT:
350 switch (src[i]) {
351 case 'd':
352 case 'i':
353 state = HGDI;
354 charcopy(dest, &destlen, destsize, src[i++]);
355 break;
356 case 'h':
357 state = H;
358 charcopy(dest, &destlen, destsize, src[i++]);
359 break;
360 default:
361 state = DEFAULT;
362 break;
363 }
364 break;
365 case DOT:
366 switch (src[i]) {
367 case '/':
368 case '\0':
369 state = START;
370 memcopy(dest, &destlen, destsize, "~2e", 3);
371 charcopy(dest, &destlen, destsize, src[i++]);
372 break;
373 case 'd':
374 case 'i':
375 state = HGDI;
376 charcopy(dest, &destlen, destsize, '.');
377 charcopy(dest, &destlen, destsize, src[i++]);
378 break;
379 case 'h':
380 state = H;
381 memcopy(dest, &destlen, destsize, ".h", 2);
382 i++;
383 break;
384 default:
385 state = DEFAULT;
386 charcopy(dest, &destlen, destsize, '.');
387 break;
388 }
389 break;
390 case H:
391 if (src[i] == 'g') {
392 state = HGDI;
393 charcopy(dest, &destlen, destsize, src[i++]);
394 }
395 else state = DEFAULT;
396 break;
397 case HGDI:
398 if (src[i] == '/') {
399 state = START;
400 if (encodedir)
401 memcopy(dest, &destlen, destsize, ".hg",
402 3);
403 charcopy(dest, &destlen, destsize, src[i++]);
404 }
405 else state = DEFAULT;
406 break;
407 case SPACE:
408 switch (src[i]) {
409 case '/':
410 case '\0':
411 state = START;
412 memcopy(dest, &destlen, destsize, "~20", 3);
413 charcopy(dest, &destlen, destsize, src[i++]);
414 break;
415 default:
416 state = DEFAULT;
417 charcopy(dest, &destlen, destsize, ' ');
418 break;
419 }
420 break;
421 case DEFAULT:
422 while (isset(onebyte, src[i])) {
423 charcopy(dest, &destlen, destsize, src[i++]);
424 if (i == len)
425 goto done;
426 }
427 switch (src[i]) {
428 case '.':
429 state = DOT;
430 i++;
431 break;
432 case ' ':
433 state = SPACE;
434 i++;
435 break;
436 case '/':
437 state = START;
438 charcopy(dest, &destlen, destsize, '/');
439 i++;
440 break;
441 default:
442 if (isset(onebyte, src[i])) {
443 do {
444 charcopy(dest, &destlen,
445 destsize, src[i++]);
446 } while (i < len &&
447 isset(onebyte, src[i]));
448 }
449 else if (isset(twobytes, src[i])) {
450 char c = src[i++];
451 charcopy(dest, &destlen, destsize, '_');
452 charcopy(dest, &destlen, destsize,
453 c == '_' ? '_' : c + 32);
454 }
455 else
456 escape3(dest, &destlen, destsize,
457 src[i++]);
458 break;
459 }
460 break;
461 }
462 }
463 done:
464 return destlen;
465 }
466
467 static Py_ssize_t basicencode(char *dest, size_t destsize,
468 const char *src, Py_ssize_t len)
469 {
470 static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };
471
472 static const uint32_t onebyte[8] = {
473 1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
474 };
475
476 Py_ssize_t destlen = 0;
477
478 if (len < 5 || memcmp(src, "data/", 5) != 0) {
479 memcopy(dest, &destlen, destsize, src, len);
480 return destlen;
481 }
482
483 memcopy(dest, &destlen, destsize, "data/", 5);
484
485 return _encode(twobytes, onebyte, dest, destlen, destsize,
486 src + 5, len - 5, 1);
487 }
488
489 static const Py_ssize_t maxstorepathlen = 120;
490
491 /*
492 * We currently implement only basic encoding.
493 *
494 * If a name is too long to encode due to Windows path name limits,
495 * this function returns None.
496 */
497 PyObject *pathencode(PyObject *self, PyObject *args)
498 {
499 Py_ssize_t len, newlen;
500 PyObject *pathobj, *newobj;
501 char *path;
502
503 if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
504 return NULL;
505
506 if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
507 PyErr_SetString(PyExc_TypeError, "expected a string");
508 return NULL;
509 }
510
511 newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
512
513 if (newlen <= maxstorepathlen + 1) {
514 if (newlen == len + 1) {
515 Py_INCREF(pathobj);
516 return pathobj;
517 }
518
519 newobj = PyString_FromStringAndSize(NULL, newlen);
520
521 if (newobj) {
522 PyString_GET_SIZE(newobj)--;
523 basicencode(PyString_AS_STRING(newobj), newlen, path,
524 len + 1);
525 }
526 } else {
527 newobj = Py_None;
528 Py_INCREF(newobj);
529 }
530
531 return newobj;
532 }