Add a fast binary diff extension (not yet used)
authormpm@selenic.com
Tue, 21 Jun 2005 17:27:58 -0800
changeset 400 8b067bde6679
parent 399 f060d728fe54
child 401 af4848f83e68
Add a fast binary diff extension (not yet used) -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Add a fast binary diff extension (not yet used) manifest hash: fbfdcab2b1f4856af45176fcf2423d602dfe88c4 -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.0 (GNU/Linux) iD8DBQFCuL6eywK+sNU5EO8RAoT+AJ95z9gx2IrucBS30CnCQFkZT7lSbgCgnXh5 ScZcgwzJ/mEo9i5vuKNlCIs= =Gr58 -----END PGP SIGNATURE-----
mercurial/bdiff.c
setup.py
tests/test-bdiff
tests/test-bdiff.out
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/bdiff.c	Tue Jun 21 17:27:58 2005 -0800
@@ -0,0 +1,279 @@
+/*
+ bdiff.c - efficient binary diff extension for Mercurial
+
+ Copyright 2005 Matt Mackall <mpm@selenic.com>
+
+ This software may be used and distributed according to the terms of
+ the GNU General Public License, incorporated herein by reference.
+
+ Based roughly on Python difflib
+*/
+
+#include <Python.h>
+#include <stdlib.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+
+struct line {
+	int h, len, n;
+	const char *l;
+};
+
+struct hunk {
+	int a1, a2, b1, b2;
+};
+
+struct hunklist {
+	struct hunk *base, *head;
+};
+
+static inline uint32_t rol32(uint32_t word, unsigned int shift)
+{
+        return (word << shift) | (word >> (32 - shift));
+}
+
+int splitlines(const char *a, int len, struct line **lr)
+{
+	int h, i;
+	const char *p, *b = a;
+	struct line *l;
+
+	/* count the lines */
+	i = 1; /* extra line for sentinel */
+	for (p = a; p < a + len; p++)
+		if (*p == '\n' || p == a + len - 1)
+			i++;
+
+	*lr = l = malloc(sizeof(struct line) * i);
+	if (!l)
+		return -1;
+
+	/* build the line array and calculate hashes */
+	h = 0;
+	for (p = a; p < a + len; p++) {
+		h = *p + rol32(h, 7); /* a simple hash from GNU diff */
+		if (*p == '\n' || p == a + len - 1) {
+			l->len = p - b + 1;
+			l->h = h;
+			l->l = b;
+			l->n = -1;
+			l++;
+			b = p + 1;
+			h = 0;
+		}
+	}
+
+	/* set up a sentinel */
+	l->h = l->len = 0;
+	l->l = a + len;
+	return i - 1;
+}
+
+int inline cmp(struct line *a, struct line *b)
+{
+	return a->len != b->len || memcmp(a->l, b->l, a->len);
+}
+
+static int equatelines(struct line *a, int an, struct line *b, int bn)
+{
+	int i, j, buckets = 1, t, *h, *l;
+
+	/* build a hash table of the next highest power of 2 */
+	while (buckets < bn + 1)
+		buckets *= 2;
+
+	h = malloc(buckets * sizeof(int));
+	l = calloc(buckets, sizeof(int));
+	buckets = buckets - 1;
+	if (!h || !l) {
+		free(h);
+		return 0;
+	}
+
+	/* clear the hash table */
+	for (i = 0; i <= buckets; i++)
+		h[i] = -1;
+
+	/* add lines to the hash table chains */
+	for (i = bn - 1; i >= 0; i--) {
+		/* find the equivalence class */
+		for (j = b[i].h & buckets; h[j] != -1; j = (j + 1) & buckets)
+			if (!cmp(b + i, b + h[j]))
+				break;
+
+		/* add to the head of the equivalence class */
+		b[i].n = h[j];
+		b[i].h = j;
+		h[j] = i;
+		l[j]++; /* keep track of popularity */
+	}
+
+	/* compute popularity threshold */
+	t = (bn >= 200) ? bn / 100 : bn + 1;
+
+	/* match items in a to their equivalence class in b */
+	for (i = 0; i < an; i++) {
+		/* find the equivalence class */
+		for (j = a[i].h & buckets; h[j] != -1; j = (j + 1) & buckets)
+			if (!cmp(a + i, b + h[j]))
+				break;
+
+		a[i].h = j; /* use equivalence class for quick compare */
+		if(l[j] <= t)
+			a[i].n = h[j]; /* point to head of match list */
+		else
+			a[i].n = -1; /* too popular */
+	}
+
+	/* discard hash tables */
+	free(h);
+	free(l);
+	return 1;
+}
+
+static int longest_match(struct line *a, struct line *b, int *jpos, int *jlen,
+			 int a1, int a2, int b1, int b2, int *omi, int *omj)
+{
+	int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k;
+
+	for (i = a1; i < a2; i++) {
+		/* skip things before the current block */
+		for (j = a[i].n; j != -1 && j < b1; j = b[j].n)
+			;
+
+		/* loop through all lines match a[i] in b */
+		for (; j != -1 && j < b2; j = b[j].n) {
+			/* does this extend an earlier match? */
+			if (i > a1 && j > b1 && jpos[j - 1] == i)
+				k = jlen[j - 1] + 1;
+			else
+				k = 1;
+			jpos[j] = i + 1;
+			jlen[j] = k;
+
+			/* best match so far? */
+			if (k > mk) {
+				mi = i;
+				mj = j;
+				mk = k;
+			}
+		}
+	}
+
+	if (mk) {
+		mi = mi - mk + 1;
+		mj = mj - mk + 1;
+	}
+
+	/* expand match to include neighboring popular lines */
+	while (mi - mb > a1 && mj - mb > b1 &&
+	       a[mi - mb - 1].h == b[mj - mb - 1].h)
+		mb++;
+	while (mi + mk < a2 && mj + mk < b2 &&
+	       a[mi + mk].h == b[mj + mk].h)
+		mk++;
+
+	*omi = mi - mb;
+	*omj = mj - mb;
+	return mk + mb;
+}
+
+static void recurse(struct line *a, struct line *b, int *jpos, int *jlen,
+		    int a1, int a2, int b1, int b2, struct hunklist *l)
+{
+	int i, j, k;
+
+	/* find the longest match in this chunk */
+	k = longest_match(a, b, jpos, jlen, a1, a2, b1, b2, &i, &j);
+	if (!k)
+		return;
+
+	/* and recurse on the remaining chunks on either side */
+	recurse(a, b, jpos, jlen, a1, i, b1, j, l);
+	l->head->a1 = i;
+	l->head->a2 = i + k;
+	l->head->b1 = j;
+	l->head->b2 = j + k;
+	l->head++;
+	recurse(a, b, jpos, jlen, i + k, a2, j + k, b2, l);
+}
+
+static PyObject *bdiff(PyObject *self, PyObject *args)
+{
+	PyObject *sa, *sb, *result = NULL;
+	struct hunklist l;
+	struct hunk *h;
+	struct line *al, *bl;
+	char encode[12], *rb;
+	int an, bn, len = 0, t, la = 0, lb = 0, *jpos, *jlen;
+
+	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
+		return NULL;
+
+	/* allocate and fill arrays */
+	an = splitlines(PyString_AsString(sa), PyString_Size(sa), &al);
+	bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl);
+	t = equatelines(al, an, bl, bn);
+	jpos = calloc(bn, sizeof(int));
+	jlen = calloc(bn, sizeof(int));
+	l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2));
+	if (!al || !bl || !jpos || !jlen || !l.base || !t)
+		goto nomem;
+
+	/* generate the matching block list */
+	recurse(al, bl, jpos, jlen, 0, an, 0, bn, &l);
+	l.head->a1 = an;
+	l.head->b1 = bn;
+	l.head++;
+
+	/* calculate length of output */
+	for(h = l.base; h != l.head; h++) {
+		if (h->a1 != la || h->b1 != lb)
+			len += 12 + bl[h->b1].l - bl[lb].l;
+		la = h->a2;
+		lb = h->b2;
+	}
+
+	result = PyString_FromStringAndSize(NULL, len);
+	if (!result)
+		goto nomem;
+
+	/* build binary patch */
+	rb = PyString_AsString(result);
+	la = lb = 0;
+
+	for(h = l.base; h != l.head; h++) {
+		if (h->a1 != la || h->b1 != lb) {
+			len = bl[h->b1].l - bl[lb].l;
+			*(uint32_t *)(encode)     = htonl(al[la].l - al->l);
+			*(uint32_t *)(encode + 4) = htonl(al[h->a1].l - al->l);
+			*(uint32_t *)(encode + 8) = htonl(len);
+			memcpy(rb, encode, 12);
+			memcpy(rb + 12, bl[lb].l, len);
+			rb += 12 + len;
+		}
+		la = h->a2;
+		lb = h->b2;
+	}
+
+nomem:
+	free(al);
+	free(bl);
+	free(jpos);
+	free(jlen);
+	free(l.base);
+	return result ? result : PyErr_NoMemory();
+}
+
+static char mdiff_doc[] = "Efficient binary diff.";
+
+static PyMethodDef methods[] = {
+	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
+	{NULL, NULL}
+};
+
+PyMODINIT_FUNC initbdiff(void)
+{
+	Py_InitModule3("bdiff", methods, mdiff_doc);
+}
--- a/setup.py	Tue Jun 21 17:24:41 2005 -0800
+++ b/setup.py	Tue Jun 21 17:27:58 2005 -0800
@@ -23,7 +23,8 @@
       description='scalable distributed SCM',
       license='GNU GPL',
       packages=['mercurial'],
-      ext_modules=[Extension('mercurial.mpatch', ['mercurial/mpatch.c'])],
+      ext_modules=[Extension('mercurial.mpatch', ['mercurial/mpatch.c']),
+                   Extension('mercurial.bdiff', ['mercurial/bdiff.c'])],
       data_files=[('mercurial/templates',
                    ['templates/map'] +
                    glob.glob('templates/map-*') +
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-bdiff	Tue Jun 21 17:27:58 2005 -0800
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+import sys
+from mercurial import bdiff, mpatch
+
+def test1(a, b):
+    d = bdiff.bdiff(a, b)
+    c = a
+    if d:
+        c = mpatch.patches(a, [d])
+    if c != b:
+        print "***", `a`, `b`
+        print "bad:"
+        print `c`[:200]
+        print `d`
+
+def test(a, b):
+    print "***", `a`, `b`
+    test1(a, b)
+    test1(b, a)
+
+test("a\nc\n\n\n\n", "a\nb\n\n\n")
+test("a\nb\nc\n", "a\nc\n")
+test("", "")
+test("a\nb\nc", "a\nb\nc")
+test("a\nb\nc\nd\n", "a\nd\n")
+test("a\nb\nc\nd\n", "a\nc\ne\n")
+test("a\nb\nc\n", "a\nc\n")
+test("a\n", "c\na\nb\n")
+test("a\n", "")
+test("a\n", "b\nc\n")
+test("a\n", "c\na\n")
+test("", "adjfkjdjksdhfksj")
+test("", "ab")
+test("", "abc")
+test("a", "a")
+test("ab", "ab")
+test("abc", "abc")
+test("a\n", "a\n")
+test("a\nb", "a\nb")
+
+print "done"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-bdiff.out	Tue Jun 21 17:27:58 2005 -0800
@@ -0,0 +1,20 @@
+*** 'a\nc\n\n\n\n' 'a\nb\n\n\n'
+*** 'a\nb\nc\n' 'a\nc\n'
+*** '' ''
+*** 'a\nb\nc' 'a\nb\nc'
+*** 'a\nb\nc\nd\n' 'a\nd\n'
+*** 'a\nb\nc\nd\n' 'a\nc\ne\n'
+*** 'a\nb\nc\n' 'a\nc\n'
+*** 'a\n' 'c\na\nb\n'
+*** 'a\n' ''
+*** 'a\n' 'b\nc\n'
+*** 'a\n' 'c\na\n'
+*** '' 'adjfkjdjksdhfksj'
+*** '' 'ab'
+*** '' 'abc'
+*** 'a' 'a'
+*** 'ab' 'ab'
+*** 'abc' 'abc'
+*** 'a\n' 'a\n'
+*** 'a\nb' 'a\nb'
+done