Mercurial > hg
view mercurial/pure/bdiff.py @ 10262:eb243551cbd8 stable
copies: speed up copy detection
On some large repos, copy detection could spend > 10min using
fctx.ancestor() to determine if file revisions were actually related.
Because ancestor must traverse history to the root to determine the
GCA, it was doing a lot more work than necessary. With this
replacement, same status -r a:b takes ~3 seconds.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Tue, 19 Jan 2010 22:20:05 -0600 |
parents | 46293a0c7e9f |
children | 25e572394f5c |
line wrap: on
line source
# bdiff.py - Python implementation of bdiff.c # # Copyright 2009 Matt Mackall <mpm@selenic.com> and others # # This software may be used and distributed according to the terms of the # GNU General Public License version 2, incorporated herein by reference. import struct, difflib def splitnewlines(text): '''like str.splitlines, but only split on newlines.''' lines = [l + '\n' for l in text.split('\n')] if lines: if lines[-1] == '\n': lines.pop() else: lines[-1] = lines[-1][:-1] return lines def _normalizeblocks(a, b, blocks): prev = None for curr in blocks: if prev is None: prev = curr continue shift = 0 a1, b1, l1 = prev a1end = a1 + l1 b1end = b1 + l1 a2, b2, l2 = curr a2end = a2 + l2 b2end = b2 + l2 if a1end == a2: while a1end+shift < a2end and a[a1end+shift] == b[b1end+shift]: shift += 1 elif b1end == b2: while b1end+shift < b2end and a[a1end+shift] == b[b1end+shift]: shift += 1 yield a1, b1, l1+shift prev = a2+shift, b2+shift, l2-shift yield prev def bdiff(a, b): a = str(a).splitlines(True) b = str(b).splitlines(True) if not a: s = "".join(b) return s and (struct.pack(">lll", 0, 0, len(s)) + s) bin = [] p = [0] for i in a: p.append(p[-1] + len(i)) d = difflib.SequenceMatcher(None, a, b).get_matching_blocks() d = _normalizeblocks(a, b, d) la = 0 lb = 0 for am, bm, size in d: s = "".join(b[lb:bm]) if am > la or s: bin.append(struct.pack(">lll", p[la], p[am], len(s)) + s) la = am + size lb = bm + size return "".join(bin) def blocks(a, b): an = splitnewlines(a) bn = splitnewlines(b) d = difflib.SequenceMatcher(None, an, bn).get_matching_blocks() d = _normalizeblocks(an, bn, d) return [(i, i + n, j, j + n) for (i, j, n) in d]