Improved binary diff from Christopher Li
author mpm@selenic.com
Fri, 27 May 2005 19:38:34 -0800
changeset 184 697f05bfe976
parent 183 767916673e16
child 185 db3aa85b9379
Improved binary diff from Christopher Li. This is more intelligent/efficient because it combines neighboring inserts, replaces, and deletes. It passes the test of converting the kernel repo, but doesn't appear to substantially affect compression or performance.
mercurial/mdiff.py
--- a/mercurial/mdiff.py	Fri May 27 13:30:35 2005 -0800
+++ b/mercurial/mdiff.py	Fri May 27 19:38:34 2005 -0800
@@ -19,28 +19,25 @@
 
 def sortdiff(a, b):
     la = lb = 0
-
+    lena = len(a)
+    lenb = len(b)
     while 1:
-        if la >= len(a) or lb >= len(b): break
-        if b[lb] < a[la]:
-            si = lb
-            while lb < len(b) and b[lb] < a[la] : lb += 1
-            yield "insert", la, la, si, lb
-        elif a[la] < b[lb]:
-            si = la
-            while la < len(a) and a[la] < b[lb]: la += 1
-            yield "delete", si, la, lb, lb
-        else:
+        am, bm, = la, lb
+        while lb < lenb and la < len and a[la] == b[lb] :
             la += 1
             lb += 1
-
-    if lb < len(b):
-        yield "insert", la, la, lb, len(b)
-
-    if la < len(a):
-        yield "delete", la, len(a), lb, lb
+        if la>am: yield (am, bm, la-am)
+        while lb < lenb and b[lb] < a[la]: lb += 1
+        if lb>=lenb: break
+        while la < lena and b[lb] > a[la]: la += 1
+        if la>=lena: break
+    yield (lena, lenb, 0)
 
 def diff(a, b, sorted=0):
+    if not a:
+        s = "".join(b)
+        return s and (struct.pack(">lll", 0, 0, len(s)) + s)
+
     bin = []
     p = [0]
     for i in a: p.append(p[-1] + len(i))
@@ -48,13 +45,16 @@
     if sorted:
         d = sortdiff(a, b)
     else:
-        d = difflib.SequenceMatcher(None, a, b).get_opcodes()
-
-    for o, m, n, s, t in d:
-        if o == 'equal': continue
-        s = "".join(b[s:t])
-        bin.append(struct.pack(">lll", p[m], p[n], len(s)) + s)
-
+        d = difflib.SequenceMatcher(None, a, b).get_matching_blocks()
+    la = 0
+    lb = 0
+    for am, bm, size in d:
+        s = "".join(b[lb:bm])
+        if am > la or s:
+            bin.append(struct.pack(">lll", p[la], p[am], len(s)) + s)
+        la = am + size
+        lb = bm + size
+    
     return "".join(bin)
 
 def patchtext(bin):