Add mdiff.patches to speed up applying thousands of patches to the manifest
authormpm@selenic.com
Sat, 14 May 2005 10:27:14 -0800
changeset 71 47c9a869adee
parent 70 ce080e8eccd7
child 72 4a6ab4d80dc4
child 103 33500fe7d56c
Add mdiff.patches to speed up applying thousands of patches to the manifest
mercurial/mdiff.py
mercurial/revlog.py
--- a/mercurial/mdiff.py	Sat May 14 10:13:49 2005 -0800
+++ b/mercurial/mdiff.py	Sat May 14 10:27:14 2005 -0800
@@ -1,6 +1,7 @@
 #!/usr/bin/python
-import difflib, struct
-from cStringIO import StringIO
+import difflib, struct, mmap
+
+devzero = file("/dev/zero")
 
 def unidiff(a, ad, b, bd, fn):
     if not a and not b: return ""
@@ -52,6 +53,74 @@
 
     return "".join(bin)
 
+# This attempts to apply a series of patches in time proportional to
+# the total size of the patches, rather than patches * len(text). This
+# means rather than shuffling strings around, we shuffle around
+# pointers to fragments with fragment lists.
+#
+# When the fragment lists get too long, we collapse them. To do this
+# efficiently, we do all our operations inside a buffer created by
+# mmap and simply use memmove. This avoids creating a bunch of large
+# temporary string buffers.
+
+def patches(a, bins):
+    if not bins: return a
+
+    plens = [len(x) for x in bins]
+    pl = sum(plens)
+    bl = len(a) + pl
+    tl = bl + bl + pl # enough for the patches and two working texts
+    b1, b2 = 0, bl
+    m = mmap.mmap(devzero.fileno(), tl, mmap.MAP_PRIVATE)
+
+    # load our original text
+    m.write(a)
+    frags = [(len(a), b1)]
+
+    # copy all the patches into our segment so we can memmove from them
+    pos = b2 + bl
+    m.seek(pos)
+    for p in bins: m.write(p)
+
+    def pull(dst, src, l): # pull l bytes from src
+        while l:
+            f = src.pop(0)
+            if f[0] > l: # do we need to split?
+                src.insert(0, (f[0] - l, f[1] + l))
+                dst.append((l, f[1]))
+                return
+            dst.append(f)
+            l -= f[0]
+
+    def collect(buf, list):
+        start = buf
+        for l, p in list:
+            m.move(buf, p, l)
+            buf += l
+        return (buf - start, start)
+
+    for plen in plens:
+        # if our list gets too long, execute it
+        if len(frags) > 128:
+            b2, b1 = b1, b2
+            frags = [collect(b1, frags)]
+
+        new = []
+        end = pos + plen
+        last = 0
+        while pos < end:
+            p1, p2, l = struct.unpack(">lll", m[pos:pos + 12])
+            pull(new, frags, p1 - last) # what didn't change
+            pull([], frags, p2 - p1)    # what got deleted
+            new.append((l, pos + 12))        # what got added
+            pos += l + 12
+            last = p2
+        frags = new + frags                    # what was left at the end
+
+    t = collect(b2, frags)
+
+    return m[t[1]:t[1] + t[0]]
+
 def patch(a, bin):
     last = pos = 0
     r = []
--- a/mercurial/revlog.py	Sat May 14 10:13:49 2005 -0800
+++ b/mercurial/revlog.py	Sat May 14 10:27:14 2005 -0800
@@ -41,7 +41,7 @@
             n = 0
             i = self.opener(self.indexfile).read()
             s = struct.calcsize(indexformat)
-            for f in range(0, len(i), s):
+            for f in xrange(0, len(i), s):
                 # offset, size, base, linkrev, p1, p2, nodeid
                 e = struct.unpack(indexformat, i[f:f + s])
                 self.nodemap[e[6]] = n
@@ -87,9 +87,6 @@
     def diff(self, a, b):
         return mdiff.textdiff(a, b)
 
-    def patch(self, text, patch):
-        return mdiff.patch(text, patch)
-
     def revision(self, node):
         if node == nullid: return ""
         if self.cache and self.cache[0] == node: return self.cache[2]
@@ -114,12 +111,14 @@
             last = self.length(base)
             text = decompress(data[:last])
 
+        bins = []
         for r in xrange(base + 1, rev + 1):
             s = self.length(r)
-            b = decompress(data[last:last + s])
-            text = self.patch(text, b)
+            bins.append(decompress(data[last:last + s]))
             last = last + s
 
+        text = mdiff.patches(text, bins)
+
         (p1, p2) = self.parents(node)
         if node != hash(text, p1, p2):
             raise "integrity check failed on %s:%d" % (self.datafile, rev)
@@ -301,14 +300,12 @@
 
         # helper to reconstruct intermediate versions
         def construct(text, base, rev):
-            for r in range(base + 1, rev + 1):
-                b = decompress(chunks[r])
-                text = self.patch(text, b)
-            return text
+            bins = [decompress(chunks[r]) for r in xrange(base + 1, rev + 1)]
+            return mdiff.patches(text, bins)
 
         # build deltas
         deltas = []
-        for d in range(0, len(revs) - 1):
+        for d in xrange(0, len(revs) - 1):
             a, b = revs[d], revs[d + 1]
             n = self.node(b)