Mercurial > hg
view mercurial/pure/parsers.py @ 29322:66dbdd3cc2b9 stable
bdiff: extend matches across popular lines
For very large diffs that have large numbers of identical lines (JSON
dumps) that also have large blocks of identical text, bdiff could become
confused about which block matches which because it can only match
very limited regions. The result is very large diffs for small sets of edits.
The earlier recursion rebalancing fix made this behavior more frequent because
it's now more prone to match block 1 to block 2. One frequent user of
large JSON files reported being unable to pass the resulting diffs
through their code review system.
Prior to this change, bdiff would calculate the length of a match at
(i, j) as 1 + length found at (i-1, j-1). With a large number of popular
(ignored) lines, this often meant matches couldn't be extended
backwards at all and thus all matching regions were very small.
Disabling the popularity threshold is not an option because it brings
back quadratic behavior.
Instead, we extend a match backwards until we either find a previously
discovered match or we find a mismatching line. This successfully
bridges over any popular lines inside and before a matching region.
The larger regions then significantly reduce the probability of confusion.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Thu, 02 Jun 2016 17:09:06 -0500 |
parents | 86db5cb55d46 |
children | 255274719dc1 |
line wrap: on
line source
# parsers.py - Python implementation of parsers.c # # Copyright 2009 Matt Mackall <mpm@selenic.com> and others # # This software may be used and distributed according to the terms of the # GNU General Public License version 2 or any later version. from __future__ import absolute_import import struct import zlib from .node import nullid from . import pycompat stringio = pycompat.stringio _pack = struct.pack _unpack = struct.unpack _compress = zlib.compress _decompress = zlib.decompress # Some code below makes tuples directly because it's more convenient. However, # code outside this module should always use dirstatetuple. def dirstatetuple(*x): # x is a tuple return x def parse_index2(data, inline): def gettype(q): return int(q & 0xFFFF) def offset_type(offset, type): return long(long(offset) << 16 | type) indexformatng = ">Qiiiiii20s12x" s = struct.calcsize(indexformatng) index = [] cache = None off = 0 l = len(data) - s append = index.append if inline: cache = (0, data) while off <= l: e = _unpack(indexformatng, data[off:off + s]) append(e) if e[1] < 0: break off += e[1] + s else: while off <= l: e = _unpack(indexformatng, data[off:off + s]) append(e) off += s if off != len(data): raise ValueError('corrupt index file') if index: e = list(index[0]) type = gettype(e[0]) e[0] = offset_type(0, type) index[0] = tuple(e) # add the magic null revision at -1 index.append((0, 0, 0, -1, -1, -1, -1, nullid)) return index, cache def parse_dirstate(dmap, copymap, st): parents = [st[:20], st[20: 40]] # dereference fields so they will be local in loop format = ">cllll" e_size = struct.calcsize(format) pos1 = 40 l = len(st) # the inner loop while pos1 < l: pos2 = pos1 + e_size e = _unpack(">cllll", st[pos1:pos2]) # a literal here is faster pos1 = pos2 + e[4] f = st[pos2:pos1] if '\0' in f: f, c = f.split('\0') copymap[f] = c dmap[f] = e[:4] return parents def pack_dirstate(dmap, copymap, pl, now): now = int(now) cs = stringio() write = cs.write write("".join(pl)) for f, e in 
dmap.iteritems(): if e[0] == 'n' and e[3] == now: # The file was last modified "simultaneously" with the current # write to dirstate (i.e. within the same second for file- # systems with a granularity of 1 sec). This commonly happens # for at least a couple of files on 'update'. # The user could change the file without changing its size # within the same second. Invalidate the file's mtime in # dirstate, forcing future 'status' calls to compare the # contents of the file if the size is the same. This prevents # mistakenly treating such files as clean. e = dirstatetuple(e[0], e[1], e[2], -1) dmap[f] = e if f in copymap: f = "%s\0%s" % (f, copymap[f]) e = _pack(">cllll", e[0], e[1], e[2], e[3], len(f)) write(e) write(f) return cs.getvalue()