comparison mercurial/cext/revlog.c @ 45936:0ce15a8c7b8b

revlog: store new index entries as binary For a pure-Python unbundle of the current NetBSD test repository, this results in a 10% peak RSS reduction. Using the C revlog index, it shows 25% peak RSS reduction. This is a direct result of avoiding at least 8 objects per new changeset or 200 Bytes+ on AMD64. Differential Revision: https://phab.mercurial-scm.org/D9162
author Joerg Sonnenberger <joerg@bec.de>
date Tue, 06 Oct 2020 03:25:15 +0200
parents 4404f129341e
children c581b9ee22b1
comparison
equal deleted inserted replaced
45935:eccbfa7e19c0 45936:0ce15a8c7b8b
80 PyObject_HEAD 80 PyObject_HEAD
81 /* Type-specific fields go here. */ 81 /* Type-specific fields go here. */
82 PyObject *data; /* raw bytes of index */ 82 PyObject *data; /* raw bytes of index */
83 Py_buffer buf; /* buffer of data */ 83 Py_buffer buf; /* buffer of data */
84 const char **offsets; /* populated on demand */ 84 const char **offsets; /* populated on demand */
85 Py_ssize_t raw_length; /* original number of elements */ 85 Py_ssize_t length; /* current on-disk number of elements */
86 Py_ssize_t length; /* current number of elements */ 86 unsigned new_length; /* number of added elements */
87 PyObject *added; /* populated on demand */ 87 unsigned added_length; /* space reserved for added elements */
88 char *added; /* populated on demand */
88 PyObject *headrevs; /* cache, invalidated on changes */ 89 PyObject *headrevs; /* cache, invalidated on changes */
89 PyObject *filteredrevs; /* filtered revs set */ 90 PyObject *filteredrevs; /* filtered revs set */
90 nodetree nt; /* base-16 trie */ 91 nodetree nt; /* base-16 trie */
91 int ntinitialized; /* 0 or 1 */ 92 int ntinitialized; /* 0 or 1 */
92 int ntrev; /* last rev scanned */ 93 int ntrev; /* last rev scanned */
95 int inlined; 96 int inlined;
96 }; 97 };
97 98
98 static Py_ssize_t index_length(const indexObject *self) 99 static Py_ssize_t index_length(const indexObject *self)
99 { 100 {
100 if (self->added == NULL) 101 return self->length + self->new_length;
101 return self->length;
102 return self->length + PyList_GET_SIZE(self->added);
103 } 102 }
104 103
105 static PyObject *nullentry = NULL; 104 static PyObject *nullentry = NULL;
106 static const char nullid[20] = {0}; 105 static const char nullid[20] = {0};
107 static const Py_ssize_t nullrev = -1; 106 static const Py_ssize_t nullrev = -1;
153 /* 152 /*
154 * Return a pointer to the beginning of a RevlogNG record. 153 * Return a pointer to the beginning of a RevlogNG record.
155 */ 154 */
156 static const char *index_deref(indexObject *self, Py_ssize_t pos) 155 static const char *index_deref(indexObject *self, Py_ssize_t pos)
157 { 156 {
157 if (pos >= self->length)
158 return self->added + (pos - self->length) * v1_hdrsize;
159
158 if (self->inlined && pos > 0) { 160 if (self->inlined && pos > 0) {
159 if (self->offsets == NULL) { 161 if (self->offsets == NULL) {
160 Py_ssize_t ret; 162 Py_ssize_t ret;
161 self->offsets = PyMem_Malloc(self->raw_length * 163 self->offsets =
162 sizeof(*self->offsets)); 164 PyMem_Malloc(self->length * sizeof(*self->offsets));
163 if (self->offsets == NULL) 165 if (self->offsets == NULL)
164 return (const char *)PyErr_NoMemory(); 166 return (const char *)PyErr_NoMemory();
165 ret = inline_scan(self, self->offsets); 167 ret = inline_scan(self, self->offsets);
166 if (ret == -1) { 168 if (ret == -1) {
167 return NULL; 169 return NULL;
180 * parent revision may be nullrev, but is guaranteed to be in valid range. 182 * parent revision may be nullrev, but is guaranteed to be in valid range.
181 */ 183 */
182 static inline int index_get_parents(indexObject *self, Py_ssize_t rev, int *ps, 184 static inline int index_get_parents(indexObject *self, Py_ssize_t rev, int *ps,
183 int maxrev) 185 int maxrev)
184 { 186 {
185 if (rev >= self->length) { 187 const char *data = index_deref(self, rev);
186 long tmp; 188
187 PyObject *tuple = 189 ps[0] = getbe32(data + 24);
188 PyList_GET_ITEM(self->added, rev - self->length); 190 ps[1] = getbe32(data + 28);
189 if (!pylong_to_long(PyTuple_GET_ITEM(tuple, 5), &tmp)) { 191
190 return -1;
191 }
192 ps[0] = (int)tmp;
193 if (!pylong_to_long(PyTuple_GET_ITEM(tuple, 6), &tmp)) {
194 return -1;
195 }
196 ps[1] = (int)tmp;
197 } else {
198 const char *data = index_deref(self, rev);
199 ps[0] = getbe32(data + 24);
200 ps[1] = getbe32(data + 28);
201 }
202 /* If index file is corrupted, ps[] may point to invalid revisions. So 192 /* If index file is corrupted, ps[] may point to invalid revisions. So
203 * there is a risk of buffer overflow to trust them unconditionally. */ 193 * there is a risk of buffer overflow to trust them unconditionally. */
204 if (ps[0] < -1 || ps[0] > maxrev || ps[1] < -1 || ps[1] > maxrev) { 194 if (ps[0] < -1 || ps[0] > maxrev || ps[1] < -1 || ps[1] > maxrev) {
205 PyErr_SetString(PyExc_ValueError, "parent out of range"); 195 PyErr_SetString(PyExc_ValueError, "parent out of range");
206 return -1; 196 return -1;
235 } 225 }
236 } 226 }
237 227
238 static inline int64_t index_get_start(indexObject *self, Py_ssize_t rev) 228 static inline int64_t index_get_start(indexObject *self, Py_ssize_t rev)
239 { 229 {
230 const char *data;
240 uint64_t offset; 231 uint64_t offset;
241 if (rev == nullrev) { 232
233 if (rev == nullrev)
242 return 0; 234 return 0;
243 } 235
244 if (rev >= self->length) { 236 data = index_deref(self, rev);
245 PyObject *tuple; 237 offset = getbe32(data + 4);
246 PyObject *pylong; 238 if (rev == 0) {
247 PY_LONG_LONG tmp; 239 /* mask out version number for the first entry */
248 tuple = PyList_GET_ITEM(self->added, rev - self->length); 240 offset &= 0xFFFF;
249 pylong = PyTuple_GET_ITEM(tuple, 0);
250 tmp = PyLong_AsLongLong(pylong);
251 if (tmp == -1 && PyErr_Occurred()) {
252 return -1;
253 }
254 if (tmp < 0) {
255 PyErr_Format(PyExc_OverflowError,
256 "revlog entry size out of bound (%lld)",
257 (long long)tmp);
258 return -1;
259 }
260 offset = (uint64_t)tmp;
261 } else { 241 } else {
262 const char *data = index_deref(self, rev); 242 uint32_t offset_high = getbe32(data);
263 offset = getbe32(data + 4); 243 offset |= ((uint64_t)offset_high) << 32;
264 if (rev == 0) {
265 /* mask out version number for the first entry */
266 offset &= 0xFFFF;
267 } else {
268 uint32_t offset_high = getbe32(data);
269 offset |= ((uint64_t)offset_high) << 32;
270 }
271 } 244 }
272 return (int64_t)(offset >> 16); 245 return (int64_t)(offset >> 16);
273 } 246 }
274 247
275 static inline int index_get_length(indexObject *self, Py_ssize_t rev) 248 static inline int index_get_length(indexObject *self, Py_ssize_t rev)
276 { 249 {
277 if (rev == nullrev) { 250 const char *data;
251 int tmp;
252
253 if (rev == nullrev)
278 return 0; 254 return 0;
279 } 255
280 if (rev >= self->length) { 256 data = index_deref(self, rev);
281 PyObject *tuple; 257
282 PyObject *pylong; 258 tmp = (int)getbe32(data + 8);
283 long ret; 259 if (tmp < 0) {
284 tuple = PyList_GET_ITEM(self->added, rev - self->length); 260 PyErr_Format(PyExc_OverflowError,
285 pylong = PyTuple_GET_ITEM(tuple, 1); 261 "revlog entry size out of bound (%d)", tmp);
286 ret = PyInt_AsLong(pylong); 262 return -1;
287 if (ret == -1 && PyErr_Occurred()) { 263 }
288 return -1; 264 return tmp;
289 }
290 if (ret < 0 || ret > (long)INT_MAX) {
291 PyErr_Format(PyExc_OverflowError,
292 "revlog entry size out of bound (%ld)",
293 ret);
294 return -1;
295 }
296 return (int)ret;
297 } else {
298 const char *data = index_deref(self, rev);
299 int tmp = (int)getbe32(data + 8);
300 if (tmp < 0) {
301 PyErr_Format(PyExc_OverflowError,
302 "revlog entry size out of bound (%d)",
303 tmp);
304 return -1;
305 }
306 return tmp;
307 }
308 } 265 }
309 266
310 /* 267 /*
311 * RevlogNG format (all in big endian, data may be inlined): 268 * RevlogNG format (all in big endian, data may be inlined):
312 * 6 bytes: offset 269 * 6 bytes: offset
335 if (pos < 0 || pos >= length) { 292 if (pos < 0 || pos >= length) {
336 PyErr_SetString(PyExc_IndexError, "revlog index out of range"); 293 PyErr_SetString(PyExc_IndexError, "revlog index out of range");
337 return NULL; 294 return NULL;
338 } 295 }
339 296
340 if (pos >= self->length) {
341 PyObject *obj;
342 obj = PyList_GET_ITEM(self->added, pos - self->length);
343 Py_INCREF(obj);
344 return obj;
345 }
346
347 data = index_deref(self, pos); 297 data = index_deref(self, pos);
348 if (data == NULL) 298 if (data == NULL)
349 return NULL; 299 return NULL;
350 300
351 offset_flags = getbe32(data + 4); 301 offset_flags = getbe32(data + 4);
352 if (pos == 0) /* mask out version number for the first entry */ 302 /*
303 * The first entry on-disk needs the version number masked out,
304 * but this doesn't apply if entries are added to an empty index.
305 */
306 if (self->length && pos == 0)
353 offset_flags &= 0xFFFF; 307 offset_flags &= 0xFFFF;
354 else { 308 else {
355 uint32_t offset_high = getbe32(data); 309 uint32_t offset_high = getbe32(data);
356 offset_flags |= ((uint64_t)offset_high) << 32; 310 offset_flags |= ((uint64_t)offset_high) << 32;
357 } 311 }
381 return nullid; 335 return nullid;
382 336
383 if (pos >= length) 337 if (pos >= length)
384 return NULL; 338 return NULL;
385 339
386 if (pos >= self->length) {
387 PyObject *tuple, *str;
388 tuple = PyList_GET_ITEM(self->added, pos - self->length);
389 str = PyTuple_GetItem(tuple, 7);
390 return str ? PyBytes_AS_STRING(str) : NULL;
391 }
392
393 data = index_deref(self, pos); 340 data = index_deref(self, pos);
394 return data ? data + 32 : NULL; 341 return data ? data + 32 : NULL;
395 } 342 }
396 343
397 /* 344 /*
421 return -1; 368 return -1;
422 } 369 }
423 370
424 static PyObject *index_append(indexObject *self, PyObject *obj) 371 static PyObject *index_append(indexObject *self, PyObject *obj)
425 { 372 {
426 char *node; 373 unsigned long offset_flags;
427 Py_ssize_t len; 374 int rev, comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
428 375 Py_ssize_t c_node_id_len;
429 if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != 8) { 376 const char *c_node_id;
377 char *data;
378
379 if (!PyArg_ParseTuple(obj, tuple_format, &offset_flags, &comp_len,
380 &uncomp_len, &base_rev, &link_rev, &parent_1,
381 &parent_2, &c_node_id, &c_node_id_len)) {
430 PyErr_SetString(PyExc_TypeError, "8-tuple required"); 382 PyErr_SetString(PyExc_TypeError, "8-tuple required");
431 return NULL; 383 return NULL;
432 } 384 }
433 385 if (c_node_id_len != 20 && c_node_id_len != 32) {
434 if (node_check(PyTuple_GET_ITEM(obj, 7), &node) == -1) 386 PyErr_SetString(PyExc_TypeError, "invalid node");
435 return NULL; 387 return NULL;
436 388 }
437 len = index_length(self); 389
438 390 if (self->new_length == self->added_length) {
439 if (self->added == NULL) { 391 size_t new_added_length =
440 self->added = PyList_New(0); 392 self->added_length ? self->added_length * 2 : 4096;
441 if (self->added == NULL) 393 void *new_added =
442 return NULL; 394 PyMem_Realloc(self->added, new_added_length * v1_hdrsize);
443 } 395 if (!new_added)
444 396 return PyErr_NoMemory();
445 if (PyList_Append(self->added, obj) == -1) 397 self->added = new_added;
446 return NULL; 398 self->added_length = new_added_length;
399 }
400 rev = self->length + self->new_length;
401 data = self->added + v1_hdrsize * self->new_length++;
402 putbe32(offset_flags >> 32, data);
403 putbe32(offset_flags & 0xffffffffU, data + 4);
404 putbe32(comp_len, data + 8);
405 putbe32(uncomp_len, data + 12);
406 putbe32(base_rev, data + 16);
407 putbe32(link_rev, data + 20);
408 putbe32(parent_1, data + 24);
409 putbe32(parent_2, data + 28);
410 memcpy(data + 32, c_node_id, c_node_id_len);
411 memset(data + 32 + c_node_id_len, 0, 32 - c_node_id_len);
447 412
448 if (self->ntinitialized) 413 if (self->ntinitialized)
449 nt_insert(&self->nt, node, (int)len); 414 nt_insert(&self->nt, c_node_id, rev);
450 415
451 Py_CLEAR(self->headrevs); 416 Py_CLEAR(self->headrevs);
452 Py_RETURN_NONE; 417 Py_RETURN_NONE;
453 } 418 }
454 419
471 goto bail; \ 436 goto bail; \
472 Py_CLEAR(s); \ 437 Py_CLEAR(s); \
473 Py_CLEAR(t); \ 438 Py_CLEAR(t); \
474 } while (0) 439 } while (0)
475 440
476 if (self->added) { 441 if (self->added_length)
477 Py_ssize_t len = PyList_GET_SIZE(self->added); 442 istat(new_length, "index entries added");
478 s = PyBytes_FromString("index entries added");
479 t = PyInt_FromSsize_t(len);
480 if (!s || !t)
481 goto bail;
482 if (PyDict_SetItem(obj, s, t) == -1)
483 goto bail;
484 Py_CLEAR(s);
485 Py_CLEAR(t);
486 }
487
488 if (self->raw_length != self->length)
489 istat(raw_length, "revs on disk");
490 istat(length, "revs in memory"); 443 istat(length, "revs in memory");
491 istat(ntlookups, "node trie lookups"); 444 istat(ntlookups, "node trie lookups");
492 istat(ntmisses, "node trie misses"); 445 istat(ntmisses, "node trie misses");
493 istat(ntrev, "node trie last rev scanned"); 446 istat(ntrev, "node trie last rev scanned");
494 if (self->ntinitialized) { 447 if (self->ntinitialized) {
996 static inline int index_baserev(indexObject *self, int rev) 949 static inline int index_baserev(indexObject *self, int rev)
997 { 950 {
998 const char *data; 951 const char *data;
999 int result; 952 int result;
1000 953
1001 if (rev >= self->length) { 954 data = index_deref(self, rev);
1002 PyObject *tuple = 955 if (data == NULL)
1003 PyList_GET_ITEM(self->added, rev - self->length); 956 return -2;
1004 long ret; 957 result = getbe32(data + 16);
1005 if (!pylong_to_long(PyTuple_GET_ITEM(tuple, 3), &ret)) { 958
1006 return -2;
1007 }
1008 result = (int)ret;
1009 } else {
1010 data = index_deref(self, rev);
1011 if (data == NULL) {
1012 return -2;
1013 }
1014
1015 result = getbe32(data + 16);
1016 }
1017 if (result > rev) { 959 if (result > rev) {
1018 PyErr_Format( 960 PyErr_Format(
1019 PyExc_ValueError, 961 PyExc_ValueError,
1020 "corrupted revlog, revision base above revision: %d, %d", 962 "corrupted revlog, revision base above revision: %d, %d",
1021 rev, result); 963 rev, result);
1852 }; 1794 };
1853 1795
1854 static int index_init_nt(indexObject *self) 1796 static int index_init_nt(indexObject *self)
1855 { 1797 {
1856 if (!self->ntinitialized) { 1798 if (!self->ntinitialized) {
1857 if (nt_init(&self->nt, self, (int)self->raw_length) == -1) { 1799 if (nt_init(&self->nt, self, (int)self->length) == -1) {
1858 nt_dealloc(&self->nt); 1800 nt_dealloc(&self->nt);
1859 return -1; 1801 return -1;
1860 } 1802 }
1861 if (nt_insert(&self->nt, nullid, -1) == -1) { 1803 if (nt_insert(&self->nt, nullid, -1) == -1) {
1862 nt_dealloc(&self->nt); 1804 nt_dealloc(&self->nt);
2477 /* 2419 /*
2478 * Invalidate any trie entries introduced by added revs. 2420 * Invalidate any trie entries introduced by added revs.
2479 */ 2421 */
2480 static void index_invalidate_added(indexObject *self, Py_ssize_t start) 2422 static void index_invalidate_added(indexObject *self, Py_ssize_t start)
2481 { 2423 {
2482 Py_ssize_t i, len = PyList_GET_SIZE(self->added); 2424 Py_ssize_t i, len;
2483 2425
2484 for (i = start; i < len; i++) { 2426 len = self->length + self->new_length;
2485 PyObject *tuple = PyList_GET_ITEM(self->added, i); 2427 i = start - self->length;
2486 PyObject *node = PyTuple_GET_ITEM(tuple, 7); 2428 if (i < 0)
2487 2429 return;
2488 nt_delete_node(&self->nt, PyBytes_AS_STRING(node)); 2430
2489 } 2431 for (i = start; i < len; i++)
2490 2432 nt_delete_node(&self->nt, index_deref(self, i) + 32);
2491 if (start == 0) 2433
2492 Py_CLEAR(self->added); 2434 self->new_length = start - self->length;
2493 } 2435 }
2494 2436
2495 /* 2437 /*
2496 * Delete a numeric range of revs, which must be at the end of the 2438 * Delete a numeric range of revs, which must be at the end of the
2497 * range. 2439 * range.
2545 if (node == NULL) 2487 if (node == NULL)
2546 return -1; 2488 return -1;
2547 2489
2548 nt_delete_node(&self->nt, node); 2490 nt_delete_node(&self->nt, node);
2549 } 2491 }
2550 if (self->added) 2492 if (self->new_length)
2551 index_invalidate_added(self, 0); 2493 index_invalidate_added(self, self->length);
2552 if (self->ntrev > start) 2494 if (self->ntrev > start)
2553 self->ntrev = (int)start; 2495 self->ntrev = (int)start;
2554 } else if (self->added) { 2496 } else if (self->new_length) {
2555 Py_CLEAR(self->added); 2497 self->new_length = 0;
2556 } 2498 }
2557 2499
2558 self->length = start; 2500 self->length = start;
2559 if (start < self->raw_length)
2560 self->raw_length = start;
2561 goto done; 2501 goto done;
2562 } 2502 }
2563 2503
2564 if (self->ntinitialized) { 2504 if (self->ntinitialized) {
2565 index_invalidate_added(self, start - self->length); 2505 index_invalidate_added(self, start);
2566 if (self->ntrev > start) 2506 if (self->ntrev > start)
2567 self->ntrev = (int)start; 2507 self->ntrev = (int)start;
2568 } 2508 } else {
2569 if (self->added) 2509 self->new_length = start - self->length;
2570 ret = PyList_SetSlice(self->added, start - self->length, 2510 }
2571 PyList_GET_SIZE(self->added), NULL);
2572 done: 2511 done:
2573 Py_CLEAR(self->headrevs); 2512 Py_CLEAR(self->headrevs);
2574 return ret; 2513 return ret;
2575 } 2514 }
2576 2515
2645 PyObject *data_obj, *inlined_obj; 2584 PyObject *data_obj, *inlined_obj;
2646 Py_ssize_t size; 2585 Py_ssize_t size;
2647 2586
2648 /* Initialize before argument-checking to avoid index_dealloc() crash. 2587 /* Initialize before argument-checking to avoid index_dealloc() crash.
2649 */ 2588 */
2650 self->raw_length = 0;
2651 self->added = NULL; 2589 self->added = NULL;
2590 self->new_length = 0;
2591 self->added_length = 0;
2652 self->data = NULL; 2592 self->data = NULL;
2653 memset(&self->buf, 0, sizeof(self->buf)); 2593 memset(&self->buf, 0, sizeof(self->buf));
2654 self->headrevs = NULL; 2594 self->headrevs = NULL;
2655 self->filteredrevs = Py_None; 2595 self->filteredrevs = Py_None;
2656 Py_INCREF(Py_None); 2596 Py_INCREF(Py_None);
2678 2618
2679 if (self->inlined) { 2619 if (self->inlined) {
2680 Py_ssize_t len = inline_scan(self, NULL); 2620 Py_ssize_t len = inline_scan(self, NULL);
2681 if (len == -1) 2621 if (len == -1)
2682 goto bail; 2622 goto bail;
2683 self->raw_length = len;
2684 self->length = len; 2623 self->length = len;
2685 } else { 2624 } else {
2686 if (size % v1_hdrsize) { 2625 if (size % v1_hdrsize) {
2687 PyErr_SetString(PyExc_ValueError, "corrupt index file"); 2626 PyErr_SetString(PyExc_ValueError, "corrupt index file");
2688 goto bail; 2627 goto bail;
2689 } 2628 }
2690 self->raw_length = size / v1_hdrsize; 2629 self->length = size / v1_hdrsize;
2691 self->length = self->raw_length;
2692 } 2630 }
2693 2631
2694 return 0; 2632 return 0;
2695 bail: 2633 bail:
2696 return -1; 2634 return -1;
2730 if (self->buf.buf) { 2668 if (self->buf.buf) {
2731 PyBuffer_Release(&self->buf); 2669 PyBuffer_Release(&self->buf);
2732 memset(&self->buf, 0, sizeof(self->buf)); 2670 memset(&self->buf, 0, sizeof(self->buf));
2733 } 2671 }
2734 Py_XDECREF(self->data); 2672 Py_XDECREF(self->data);
2735 Py_XDECREF(self->added); 2673 PyMem_Free(self->added);
2736 PyObject_Del(self); 2674 PyObject_Del(self);
2737 } 2675 }
2738 2676
2739 static PySequenceMethods index_sequence_methods = { 2677 static PySequenceMethods index_sequence_methods = {
2740 (lenfunc)index_length, /* sq_length */ 2678 (lenfunc)index_length, /* sq_length */