comparison mercurial/thirdparty/xdiff/xutils.c @ 36671:34e2ff1f9cd8

xdiff: vendor xdiff library from git Vendor git's xdiff library from git commit d7c6c2369d7c6c2369ac21141b7c6cceaebc6414ec3da14ad using GPL2+ license. There is another recent user report that hg diff generates suboptimal result. It seems the fix to issue4074 isn't good enough. I crafted some other interesting cases, and hg diff barely has any advantage compared with gnu diffutils or git diff. | testcase | gnu diffutils | hg diff | git diff | | | lines time | lines time | lines time | | patience | 6 0.00 | 602 0.08 | 6 0.00 | | random | 91772 0.90 | 109462 0.70 | 91772 0.24 | | json | 2 0.03 | 1264814 1.81 | 2 0.29 | "lines" means the size of the output, i.e. the count of "+/-" lines. "time" means seconds needed to do the calculation. Both are the smaller the better. "hg diff" counts Python startup overhead. Git and GNU diffutils generate optimal results. For the "json" case, git can have an optimization that does a scan for common prefix and suffix first, and match them if the length is greater than half of the text. See https://neil.fraser.name/news/2006/03/12/. That would make git the fastest for all above cases. About testcases: patience: Aiming for the weakness of the greedy "patience diff" algorithm. Using git's patience diff option would also get suboptimal result. Generated using the Python script: ``` open('a', 'w').write('\n'.join(list('a' + 'x' * 300 + 'u' + 'x' * 700 + 'a\n'))) open('b', 'w').write('\n'.join(list('b' + 'x' * 700 + 'u' + 'x' * 300 + 'b\n'))) ``` random: Generated using the script in `test-issue4074.t`. It practically makes the algorithm suffer. Impressively, git wins in both performance and diff quality. json: The recent user reported case. It's a single line movement near the end of a very large (800K lines) JSON file. Test Plan: Code taken as-is. Differential Revision: https://phab.mercurial-scm.org/D2572 # no-check-commit for vendored code
author Jun Wu <quark@fb.com>
date Sat, 03 Mar 2018 10:39:43 -0800
parents
children 09f320067591
comparison
equal deleted inserted replaced
36670:44048f1bcee5 36671:34e2ff1f9cd8
1 /*
2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003 Davide Libenzi
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see
17 * <http://www.gnu.org/licenses/>.
18 *
19 * Davide Libenzi <davidel@xmailserver.org>
20 *
21 */
22
23 #include <limits.h>
24 #include <assert.h>
25 #include "xinclude.h"
26
27
28
29
30 long xdl_bogosqrt(long n) {
31 long i;
32
33 /*
34 * Classical integer square root approximation using shifts.
35 */
36 for (i = 1; n > 0; n >>= 2)
37 i <<= 1;
38
39 return i;
40 }
41
42
43 int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
44 xdemitcb_t *ecb) {
45 int i = 2;
46 mmbuffer_t mb[3];
47
48 mb[0].ptr = (char *) pre;
49 mb[0].size = psize;
50 mb[1].ptr = (char *) rec;
51 mb[1].size = size;
52 if (size > 0 && rec[size - 1] != '\n') {
53 mb[2].ptr = (char *) "\n\\ No newline at end of file\n";
54 mb[2].size = strlen(mb[2].ptr);
55 i++;
56 }
57 if (ecb->outf(ecb->priv, mb, i) < 0) {
58
59 return -1;
60 }
61
62 return 0;
63 }
64
65 void *xdl_mmfile_first(mmfile_t *mmf, long *size)
66 {
67 *size = mmf->size;
68 return mmf->ptr;
69 }
70
71
72 long xdl_mmfile_size(mmfile_t *mmf)
73 {
74 return mmf->size;
75 }
76
77
78 int xdl_cha_init(chastore_t *cha, long isize, long icount) {
79
80 cha->head = cha->tail = NULL;
81 cha->isize = isize;
82 cha->nsize = icount * isize;
83 cha->ancur = cha->sncur = NULL;
84 cha->scurr = 0;
85
86 return 0;
87 }
88
89
90 void xdl_cha_free(chastore_t *cha) {
91 chanode_t *cur, *tmp;
92
93 for (cur = cha->head; (tmp = cur) != NULL;) {
94 cur = cur->next;
95 xdl_free(tmp);
96 }
97 }
98
99
100 void *xdl_cha_alloc(chastore_t *cha) {
101 chanode_t *ancur;
102 void *data;
103
104 if (!(ancur = cha->ancur) || ancur->icurr == cha->nsize) {
105 if (!(ancur = (chanode_t *) xdl_malloc(sizeof(chanode_t) + cha->nsize))) {
106
107 return NULL;
108 }
109 ancur->icurr = 0;
110 ancur->next = NULL;
111 if (cha->tail)
112 cha->tail->next = ancur;
113 if (!cha->head)
114 cha->head = ancur;
115 cha->tail = ancur;
116 cha->ancur = ancur;
117 }
118
119 data = (char *) ancur + sizeof(chanode_t) + ancur->icurr;
120 ancur->icurr += cha->isize;
121
122 return data;
123 }
124
125 long xdl_guess_lines(mmfile_t *mf, long sample) {
126 long nl = 0, size, tsize = 0;
127 char const *data, *cur, *top;
128
129 if ((cur = data = xdl_mmfile_first(mf, &size)) != NULL) {
130 for (top = data + size; nl < sample && cur < top; ) {
131 nl++;
132 if (!(cur = memchr(cur, '\n', top - cur)))
133 cur = top;
134 else
135 cur++;
136 }
137 tsize += (long) (cur - data);
138 }
139
140 if (nl && tsize)
141 nl = xdl_mmfile_size(mf) / (tsize / nl);
142
143 return nl + 1;
144 }
145
146 int xdl_blankline(const char *line, long size, long flags)
147 {
148 long i;
149
150 if (!(flags & XDF_WHITESPACE_FLAGS))
151 return (size <= 1);
152
153 for (i = 0; i < size && XDL_ISSPACE(line[i]); i++)
154 ;
155
156 return (i == size);
157 }
158
159 /*
160 * Have we eaten everything on the line, except for an optional
161 * CR at the very end?
162 */
163 static int ends_with_optional_cr(const char *l, long s, long i)
164 {
165 int complete = s && l[s-1] == '\n';
166
167 if (complete)
168 s--;
169 if (s == i)
170 return 1;
171 /* do not ignore CR at the end of an incomplete line */
172 if (complete && s == i + 1 && l[i] == '\r')
173 return 1;
174 return 0;
175 }
176
177 int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags)
178 {
179 int i1, i2;
180
181 if (s1 == s2 && !memcmp(l1, l2, s1))
182 return 1;
183 if (!(flags & XDF_WHITESPACE_FLAGS))
184 return 0;
185
186 i1 = 0;
187 i2 = 0;
188
189 /*
190 * -w matches everything that matches with -b, and -b in turn
191 * matches everything that matches with --ignore-space-at-eol,
192 * which in turn matches everything that matches with --ignore-cr-at-eol.
193 *
194 * Each flavor of ignoring needs different logic to skip whitespaces
195 * while we have both sides to compare.
196 */
197 if (flags & XDF_IGNORE_WHITESPACE) {
198 goto skip_ws;
199 while (i1 < s1 && i2 < s2) {
200 if (l1[i1++] != l2[i2++])
201 return 0;
202 skip_ws:
203 while (i1 < s1 && XDL_ISSPACE(l1[i1]))
204 i1++;
205 while (i2 < s2 && XDL_ISSPACE(l2[i2]))
206 i2++;
207 }
208 } else if (flags & XDF_IGNORE_WHITESPACE_CHANGE) {
209 while (i1 < s1 && i2 < s2) {
210 if (XDL_ISSPACE(l1[i1]) && XDL_ISSPACE(l2[i2])) {
211 /* Skip matching spaces and try again */
212 while (i1 < s1 && XDL_ISSPACE(l1[i1]))
213 i1++;
214 while (i2 < s2 && XDL_ISSPACE(l2[i2]))
215 i2++;
216 continue;
217 }
218 if (l1[i1++] != l2[i2++])
219 return 0;
220 }
221 } else if (flags & XDF_IGNORE_WHITESPACE_AT_EOL) {
222 while (i1 < s1 && i2 < s2 && l1[i1] == l2[i2]) {
223 i1++;
224 i2++;
225 }
226 } else if (flags & XDF_IGNORE_CR_AT_EOL) {
227 /* Find the first difference and see how the line ends */
228 while (i1 < s1 && i2 < s2 && l1[i1] == l2[i2]) {
229 i1++;
230 i2++;
231 }
232 return (ends_with_optional_cr(l1, s1, i1) &&
233 ends_with_optional_cr(l2, s2, i2));
234 }
235
236 /*
237 * After running out of one side, the remaining side must have
238 * nothing but whitespace for the lines to match. Note that
239 * ignore-whitespace-at-eol case may break out of the loop
240 * while there still are characters remaining on both lines.
241 */
242 if (i1 < s1) {
243 while (i1 < s1 && XDL_ISSPACE(l1[i1]))
244 i1++;
245 if (s1 != i1)
246 return 0;
247 }
248 if (i2 < s2) {
249 while (i2 < s2 && XDL_ISSPACE(l2[i2]))
250 i2++;
251 return (s2 == i2);
252 }
253 return 1;
254 }
255
256 static unsigned long xdl_hash_record_with_whitespace(char const **data,
257 char const *top, long flags) {
258 unsigned long ha = 5381;
259 char const *ptr = *data;
260 int cr_at_eol_only = (flags & XDF_WHITESPACE_FLAGS) == XDF_IGNORE_CR_AT_EOL;
261
262 for (; ptr < top && *ptr != '\n'; ptr++) {
263 if (cr_at_eol_only) {
264 /* do not ignore CR at the end of an incomplete line */
265 if (*ptr == '\r' &&
266 (ptr + 1 < top && ptr[1] == '\n'))
267 continue;
268 }
269 else if (XDL_ISSPACE(*ptr)) {
270 const char *ptr2 = ptr;
271 int at_eol;
272 while (ptr + 1 < top && XDL_ISSPACE(ptr[1])
273 && ptr[1] != '\n')
274 ptr++;
275 at_eol = (top <= ptr + 1 || ptr[1] == '\n');
276 if (flags & XDF_IGNORE_WHITESPACE)
277 ; /* already handled */
278 else if (flags & XDF_IGNORE_WHITESPACE_CHANGE
279 && !at_eol) {
280 ha += (ha << 5);
281 ha ^= (unsigned long) ' ';
282 }
283 else if (flags & XDF_IGNORE_WHITESPACE_AT_EOL
284 && !at_eol) {
285 while (ptr2 != ptr + 1) {
286 ha += (ha << 5);
287 ha ^= (unsigned long) *ptr2;
288 ptr2++;
289 }
290 }
291 continue;
292 }
293 ha += (ha << 5);
294 ha ^= (unsigned long) *ptr;
295 }
296 *data = ptr < top ? ptr + 1: ptr;
297
298 return ha;
299 }
300
301 unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
302 unsigned long ha = 5381;
303 char const *ptr = *data;
304
305 if (flags & XDF_WHITESPACE_FLAGS)
306 return xdl_hash_record_with_whitespace(data, top, flags);
307
308 for (; ptr < top && *ptr != '\n'; ptr++) {
309 ha += (ha << 5);
310 ha ^= (unsigned long) *ptr;
311 }
312 *data = ptr < top ? ptr + 1: ptr;
313
314 return ha;
315 }
316
317 unsigned int xdl_hashbits(unsigned int size) {
318 unsigned int val = 1, bits = 0;
319
320 for (; val < size && bits < CHAR_BIT * sizeof(unsigned int); val <<= 1, bits++);
321 return bits ? bits: 1;
322 }
323
324
325 int xdl_num_out(char *out, long val) {
326 char *ptr, *str = out;
327 char buf[32];
328
329 ptr = buf + sizeof(buf) - 1;
330 *ptr = '\0';
331 if (val < 0) {
332 *--ptr = '-';
333 val = -val;
334 }
335 for (; val && ptr > buf; val /= 10)
336 *--ptr = "0123456789"[val % 10];
337 if (*ptr)
338 for (; *ptr; ptr++, str++)
339 *str = *ptr;
340 else
341 *str++ = '0';
342 *str = '\0';
343
344 return str - out;
345 }
346
347 int xdl_emit_hunk_hdr(long s1, long c1, long s2, long c2,
348 const char *func, long funclen, xdemitcb_t *ecb) {
349 int nb = 0;
350 mmbuffer_t mb;
351 char buf[128];
352
353 memcpy(buf, "@@ -", 4);
354 nb += 4;
355
356 nb += xdl_num_out(buf + nb, c1 ? s1: s1 - 1);
357
358 if (c1 != 1) {
359 memcpy(buf + nb, ",", 1);
360 nb += 1;
361
362 nb += xdl_num_out(buf + nb, c1);
363 }
364
365 memcpy(buf + nb, " +", 2);
366 nb += 2;
367
368 nb += xdl_num_out(buf + nb, c2 ? s2: s2 - 1);
369
370 if (c2 != 1) {
371 memcpy(buf + nb, ",", 1);
372 nb += 1;
373
374 nb += xdl_num_out(buf + nb, c2);
375 }
376
377 memcpy(buf + nb, " @@", 3);
378 nb += 3;
379 if (func && funclen) {
380 buf[nb++] = ' ';
381 if (funclen > sizeof(buf) - nb - 1)
382 funclen = sizeof(buf) - nb - 1;
383 memcpy(buf + nb, func, funclen);
384 nb += funclen;
385 }
386 buf[nb++] = '\n';
387
388 mb.ptr = buf;
389 mb.size = nb;
390 if (ecb->outf(ecb->priv, &mb, 1) < 0)
391 return -1;
392
393 return 0;
394 }
395
396 int xdl_fall_back_diff(xdfenv_t *diff_env, xpparam_t const *xpp,
397 int line1, int count1, int line2, int count2)
398 {
399 /*
400 * This probably does not work outside Git, since
401 * we have a very simple mmfile structure.
402 *
403 * Note: ideally, we would reuse the prepared environment, but
404 * the libxdiff interface does not (yet) allow for diffing only
405 * ranges of lines instead of the whole files.
406 */
407 mmfile_t subfile1, subfile2;
408 xdfenv_t env;
409
410 subfile1.ptr = (char *)diff_env->xdf1.recs[line1 - 1]->ptr;
411 subfile1.size = diff_env->xdf1.recs[line1 + count1 - 2]->ptr +
412 diff_env->xdf1.recs[line1 + count1 - 2]->size - subfile1.ptr;
413 subfile2.ptr = (char *)diff_env->xdf2.recs[line2 - 1]->ptr;
414 subfile2.size = diff_env->xdf2.recs[line2 + count2 - 2]->ptr +
415 diff_env->xdf2.recs[line2 + count2 - 2]->size - subfile2.ptr;
416 if (xdl_do_diff(&subfile1, &subfile2, xpp, &env) < 0)
417 return -1;
418
419 memcpy(diff_env->xdf1.rchg + line1 - 1, env.xdf1.rchg, count1);
420 memcpy(diff_env->xdf2.rchg + line2 - 1, env.xdf2.rchg, count2);
421
422 xdl_free_env(&env);
423
424 return 0;
425 }