Mercurial > hg
comparison mercurial/thirdparty/xdiff/xutils.c @ 36671:34e2ff1f9cd8
xdiff: vendor xdiff library from git
Vendor git's xdiff library from git commit
d7c6c2369d7c6c2369ac21141b7c6cceaebc6414ec3da14ad using GPL2+ license.
There is another recent user report that hg diff generates suboptimal
result. It seems the fix to issue4074 isn't good enough. I crafted some
other interesting cases, and hg diff barely has any advantage compared with
gnu diffutils or git diff.
| testcase | gnu diffutils | hg diff | git diff |
| | lines time | lines time | lines time |
| patience | 6 0.00 | 602 0.08 | 6 0.00 |
| random | 91772 0.90 | 109462 0.70 | 91772 0.24 |
| json | 2 0.03 | 1264814 1.81 | 2 0.29 |
"lines" means the size of the output, i.e. the count of "+/-" lines. "time"
means seconds needed to do the calculation. Both are the smaller the better.
"hg diff" counts Python startup overhead.
Git and GNU diffutils generate optimal results. For the "json" case, git can
have an optimization that does a scan for common prefix and suffix first,
and match them if the length is greater than half of the text. See
https://neil.fraser.name/news/2006/03/12/. That would make git the fastest
for all above cases.
About testcases:
patience:
Aiming for the weakness of the greedy "patience diff" algorithm. Using
git's patience diff option would also get suboptimal result. Generated using
the Python script:
```
open('a', 'w').write('\n'.join(list('a' + 'x' * 300 + 'u' + 'x' * 700 + 'a\n')))
open('b', 'w').write('\n'.join(list('b' + 'x' * 700 + 'u' + 'x' * 300 + 'b\n')))
```
random:
Generated using the script in `test-issue4074.t`. It practically makes the
algorithm suffer. Impressively, git wins in both performance and diff
quality.
json:
The recent user reported case. It's a single line movement near the end of a
very large (800K lines) JSON file.
Test Plan:
Code taken as-is.
Differential Revision: https://phab.mercurial-scm.org/D2572
# no-check-commit for vendored code
author | Jun Wu <quark@fb.com> |
---|---|
date | Sat, 03 Mar 2018 10:39:43 -0800 |
parents | |
children | 09f320067591 |
comparison
equal
deleted
inserted
replaced
36670:44048f1bcee5 | 36671:34e2ff1f9cd8 |
---|---|
1 /* | |
2 * LibXDiff by Davide Libenzi ( File Differential Library ) | |
3 * Copyright (C) 2003 Davide Libenzi | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2.1 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, see | |
17 * <http://www.gnu.org/licenses/>. | |
18 * | |
19 * Davide Libenzi <davidel@xmailserver.org> | |
20 * | |
21 */ | |
22 | |
23 #include <limits.h> | |
24 #include <assert.h> | |
25 #include "xinclude.h" | |
26 | |
27 | |
28 | |
29 | |
30 long xdl_bogosqrt(long n) { | |
31 long i; | |
32 | |
33 /* | |
34 * Classical integer square root approximation using shifts. | |
35 */ | |
36 for (i = 1; n > 0; n >>= 2) | |
37 i <<= 1; | |
38 | |
39 return i; | |
40 } | |
41 | |
42 | |
43 int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize, | |
44 xdemitcb_t *ecb) { | |
45 int i = 2; | |
46 mmbuffer_t mb[3]; | |
47 | |
48 mb[0].ptr = (char *) pre; | |
49 mb[0].size = psize; | |
50 mb[1].ptr = (char *) rec; | |
51 mb[1].size = size; | |
52 if (size > 0 && rec[size - 1] != '\n') { | |
53 mb[2].ptr = (char *) "\n\\ No newline at end of file\n"; | |
54 mb[2].size = strlen(mb[2].ptr); | |
55 i++; | |
56 } | |
57 if (ecb->outf(ecb->priv, mb, i) < 0) { | |
58 | |
59 return -1; | |
60 } | |
61 | |
62 return 0; | |
63 } | |
64 | |
65 void *xdl_mmfile_first(mmfile_t *mmf, long *size) | |
66 { | |
67 *size = mmf->size; | |
68 return mmf->ptr; | |
69 } | |
70 | |
71 | |
72 long xdl_mmfile_size(mmfile_t *mmf) | |
73 { | |
74 return mmf->size; | |
75 } | |
76 | |
77 | |
78 int xdl_cha_init(chastore_t *cha, long isize, long icount) { | |
79 | |
80 cha->head = cha->tail = NULL; | |
81 cha->isize = isize; | |
82 cha->nsize = icount * isize; | |
83 cha->ancur = cha->sncur = NULL; | |
84 cha->scurr = 0; | |
85 | |
86 return 0; | |
87 } | |
88 | |
89 | |
90 void xdl_cha_free(chastore_t *cha) { | |
91 chanode_t *cur, *tmp; | |
92 | |
93 for (cur = cha->head; (tmp = cur) != NULL;) { | |
94 cur = cur->next; | |
95 xdl_free(tmp); | |
96 } | |
97 } | |
98 | |
99 | |
100 void *xdl_cha_alloc(chastore_t *cha) { | |
101 chanode_t *ancur; | |
102 void *data; | |
103 | |
104 if (!(ancur = cha->ancur) || ancur->icurr == cha->nsize) { | |
105 if (!(ancur = (chanode_t *) xdl_malloc(sizeof(chanode_t) + cha->nsize))) { | |
106 | |
107 return NULL; | |
108 } | |
109 ancur->icurr = 0; | |
110 ancur->next = NULL; | |
111 if (cha->tail) | |
112 cha->tail->next = ancur; | |
113 if (!cha->head) | |
114 cha->head = ancur; | |
115 cha->tail = ancur; | |
116 cha->ancur = ancur; | |
117 } | |
118 | |
119 data = (char *) ancur + sizeof(chanode_t) + ancur->icurr; | |
120 ancur->icurr += cha->isize; | |
121 | |
122 return data; | |
123 } | |
124 | |
125 long xdl_guess_lines(mmfile_t *mf, long sample) { | |
126 long nl = 0, size, tsize = 0; | |
127 char const *data, *cur, *top; | |
128 | |
129 if ((cur = data = xdl_mmfile_first(mf, &size)) != NULL) { | |
130 for (top = data + size; nl < sample && cur < top; ) { | |
131 nl++; | |
132 if (!(cur = memchr(cur, '\n', top - cur))) | |
133 cur = top; | |
134 else | |
135 cur++; | |
136 } | |
137 tsize += (long) (cur - data); | |
138 } | |
139 | |
140 if (nl && tsize) | |
141 nl = xdl_mmfile_size(mf) / (tsize / nl); | |
142 | |
143 return nl + 1; | |
144 } | |
145 | |
146 int xdl_blankline(const char *line, long size, long flags) | |
147 { | |
148 long i; | |
149 | |
150 if (!(flags & XDF_WHITESPACE_FLAGS)) | |
151 return (size <= 1); | |
152 | |
153 for (i = 0; i < size && XDL_ISSPACE(line[i]); i++) | |
154 ; | |
155 | |
156 return (i == size); | |
157 } | |
158 | |
159 /* | |
160 * Have we eaten everything on the line, except for an optional | |
161 * CR at the very end? | |
162 */ | |
163 static int ends_with_optional_cr(const char *l, long s, long i) | |
164 { | |
165 int complete = s && l[s-1] == '\n'; | |
166 | |
167 if (complete) | |
168 s--; | |
169 if (s == i) | |
170 return 1; | |
171 /* do not ignore CR at the end of an incomplete line */ | |
172 if (complete && s == i + 1 && l[i] == '\r') | |
173 return 1; | |
174 return 0; | |
175 } | |
176 | |
177 int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags) | |
178 { | |
179 int i1, i2; | |
180 | |
181 if (s1 == s2 && !memcmp(l1, l2, s1)) | |
182 return 1; | |
183 if (!(flags & XDF_WHITESPACE_FLAGS)) | |
184 return 0; | |
185 | |
186 i1 = 0; | |
187 i2 = 0; | |
188 | |
189 /* | |
190 * -w matches everything that matches with -b, and -b in turn | |
191 * matches everything that matches with --ignore-space-at-eol, | |
192 * which in turn matches everything that matches with --ignore-cr-at-eol. | |
193 * | |
194 * Each flavor of ignoring needs different logic to skip whitespaces | |
195 * while we have both sides to compare. | |
196 */ | |
197 if (flags & XDF_IGNORE_WHITESPACE) { | |
198 goto skip_ws; | |
199 while (i1 < s1 && i2 < s2) { | |
200 if (l1[i1++] != l2[i2++]) | |
201 return 0; | |
202 skip_ws: | |
203 while (i1 < s1 && XDL_ISSPACE(l1[i1])) | |
204 i1++; | |
205 while (i2 < s2 && XDL_ISSPACE(l2[i2])) | |
206 i2++; | |
207 } | |
208 } else if (flags & XDF_IGNORE_WHITESPACE_CHANGE) { | |
209 while (i1 < s1 && i2 < s2) { | |
210 if (XDL_ISSPACE(l1[i1]) && XDL_ISSPACE(l2[i2])) { | |
211 /* Skip matching spaces and try again */ | |
212 while (i1 < s1 && XDL_ISSPACE(l1[i1])) | |
213 i1++; | |
214 while (i2 < s2 && XDL_ISSPACE(l2[i2])) | |
215 i2++; | |
216 continue; | |
217 } | |
218 if (l1[i1++] != l2[i2++]) | |
219 return 0; | |
220 } | |
221 } else if (flags & XDF_IGNORE_WHITESPACE_AT_EOL) { | |
222 while (i1 < s1 && i2 < s2 && l1[i1] == l2[i2]) { | |
223 i1++; | |
224 i2++; | |
225 } | |
226 } else if (flags & XDF_IGNORE_CR_AT_EOL) { | |
227 /* Find the first difference and see how the line ends */ | |
228 while (i1 < s1 && i2 < s2 && l1[i1] == l2[i2]) { | |
229 i1++; | |
230 i2++; | |
231 } | |
232 return (ends_with_optional_cr(l1, s1, i1) && | |
233 ends_with_optional_cr(l2, s2, i2)); | |
234 } | |
235 | |
236 /* | |
237 * After running out of one side, the remaining side must have | |
238 * nothing but whitespace for the lines to match. Note that | |
239 * ignore-whitespace-at-eol case may break out of the loop | |
240 * while there still are characters remaining on both lines. | |
241 */ | |
242 if (i1 < s1) { | |
243 while (i1 < s1 && XDL_ISSPACE(l1[i1])) | |
244 i1++; | |
245 if (s1 != i1) | |
246 return 0; | |
247 } | |
248 if (i2 < s2) { | |
249 while (i2 < s2 && XDL_ISSPACE(l2[i2])) | |
250 i2++; | |
251 return (s2 == i2); | |
252 } | |
253 return 1; | |
254 } | |
255 | |
256 static unsigned long xdl_hash_record_with_whitespace(char const **data, | |
257 char const *top, long flags) { | |
258 unsigned long ha = 5381; | |
259 char const *ptr = *data; | |
260 int cr_at_eol_only = (flags & XDF_WHITESPACE_FLAGS) == XDF_IGNORE_CR_AT_EOL; | |
261 | |
262 for (; ptr < top && *ptr != '\n'; ptr++) { | |
263 if (cr_at_eol_only) { | |
264 /* do not ignore CR at the end of an incomplete line */ | |
265 if (*ptr == '\r' && | |
266 (ptr + 1 < top && ptr[1] == '\n')) | |
267 continue; | |
268 } | |
269 else if (XDL_ISSPACE(*ptr)) { | |
270 const char *ptr2 = ptr; | |
271 int at_eol; | |
272 while (ptr + 1 < top && XDL_ISSPACE(ptr[1]) | |
273 && ptr[1] != '\n') | |
274 ptr++; | |
275 at_eol = (top <= ptr + 1 || ptr[1] == '\n'); | |
276 if (flags & XDF_IGNORE_WHITESPACE) | |
277 ; /* already handled */ | |
278 else if (flags & XDF_IGNORE_WHITESPACE_CHANGE | |
279 && !at_eol) { | |
280 ha += (ha << 5); | |
281 ha ^= (unsigned long) ' '; | |
282 } | |
283 else if (flags & XDF_IGNORE_WHITESPACE_AT_EOL | |
284 && !at_eol) { | |
285 while (ptr2 != ptr + 1) { | |
286 ha += (ha << 5); | |
287 ha ^= (unsigned long) *ptr2; | |
288 ptr2++; | |
289 } | |
290 } | |
291 continue; | |
292 } | |
293 ha += (ha << 5); | |
294 ha ^= (unsigned long) *ptr; | |
295 } | |
296 *data = ptr < top ? ptr + 1: ptr; | |
297 | |
298 return ha; | |
299 } | |
300 | |
301 unsigned long xdl_hash_record(char const **data, char const *top, long flags) { | |
302 unsigned long ha = 5381; | |
303 char const *ptr = *data; | |
304 | |
305 if (flags & XDF_WHITESPACE_FLAGS) | |
306 return xdl_hash_record_with_whitespace(data, top, flags); | |
307 | |
308 for (; ptr < top && *ptr != '\n'; ptr++) { | |
309 ha += (ha << 5); | |
310 ha ^= (unsigned long) *ptr; | |
311 } | |
312 *data = ptr < top ? ptr + 1: ptr; | |
313 | |
314 return ha; | |
315 } | |
316 | |
317 unsigned int xdl_hashbits(unsigned int size) { | |
318 unsigned int val = 1, bits = 0; | |
319 | |
320 for (; val < size && bits < CHAR_BIT * sizeof(unsigned int); val <<= 1, bits++); | |
321 return bits ? bits: 1; | |
322 } | |
323 | |
324 | |
325 int xdl_num_out(char *out, long val) { | |
326 char *ptr, *str = out; | |
327 char buf[32]; | |
328 | |
329 ptr = buf + sizeof(buf) - 1; | |
330 *ptr = '\0'; | |
331 if (val < 0) { | |
332 *--ptr = '-'; | |
333 val = -val; | |
334 } | |
335 for (; val && ptr > buf; val /= 10) | |
336 *--ptr = "0123456789"[val % 10]; | |
337 if (*ptr) | |
338 for (; *ptr; ptr++, str++) | |
339 *str = *ptr; | |
340 else | |
341 *str++ = '0'; | |
342 *str = '\0'; | |
343 | |
344 return str - out; | |
345 } | |
346 | |
347 int xdl_emit_hunk_hdr(long s1, long c1, long s2, long c2, | |
348 const char *func, long funclen, xdemitcb_t *ecb) { | |
349 int nb = 0; | |
350 mmbuffer_t mb; | |
351 char buf[128]; | |
352 | |
353 memcpy(buf, "@@ -", 4); | |
354 nb += 4; | |
355 | |
356 nb += xdl_num_out(buf + nb, c1 ? s1: s1 - 1); | |
357 | |
358 if (c1 != 1) { | |
359 memcpy(buf + nb, ",", 1); | |
360 nb += 1; | |
361 | |
362 nb += xdl_num_out(buf + nb, c1); | |
363 } | |
364 | |
365 memcpy(buf + nb, " +", 2); | |
366 nb += 2; | |
367 | |
368 nb += xdl_num_out(buf + nb, c2 ? s2: s2 - 1); | |
369 | |
370 if (c2 != 1) { | |
371 memcpy(buf + nb, ",", 1); | |
372 nb += 1; | |
373 | |
374 nb += xdl_num_out(buf + nb, c2); | |
375 } | |
376 | |
377 memcpy(buf + nb, " @@", 3); | |
378 nb += 3; | |
379 if (func && funclen) { | |
380 buf[nb++] = ' '; | |
381 if (funclen > sizeof(buf) - nb - 1) | |
382 funclen = sizeof(buf) - nb - 1; | |
383 memcpy(buf + nb, func, funclen); | |
384 nb += funclen; | |
385 } | |
386 buf[nb++] = '\n'; | |
387 | |
388 mb.ptr = buf; | |
389 mb.size = nb; | |
390 if (ecb->outf(ecb->priv, &mb, 1) < 0) | |
391 return -1; | |
392 | |
393 return 0; | |
394 } | |
395 | |
396 int xdl_fall_back_diff(xdfenv_t *diff_env, xpparam_t const *xpp, | |
397 int line1, int count1, int line2, int count2) | |
398 { | |
399 /* | |
400 * This probably does not work outside Git, since | |
401 * we have a very simple mmfile structure. | |
402 * | |
403 * Note: ideally, we would reuse the prepared environment, but | |
404 * the libxdiff interface does not (yet) allow for diffing only | |
405 * ranges of lines instead of the whole files. | |
406 */ | |
407 mmfile_t subfile1, subfile2; | |
408 xdfenv_t env; | |
409 | |
410 subfile1.ptr = (char *)diff_env->xdf1.recs[line1 - 1]->ptr; | |
411 subfile1.size = diff_env->xdf1.recs[line1 + count1 - 2]->ptr + | |
412 diff_env->xdf1.recs[line1 + count1 - 2]->size - subfile1.ptr; | |
413 subfile2.ptr = (char *)diff_env->xdf2.recs[line2 - 1]->ptr; | |
414 subfile2.size = diff_env->xdf2.recs[line2 + count2 - 2]->ptr + | |
415 diff_env->xdf2.recs[line2 + count2 - 2]->size - subfile2.ptr; | |
416 if (xdl_do_diff(&subfile1, &subfile2, xpp, &env) < 0) | |
417 return -1; | |
418 | |
419 memcpy(diff_env->xdf1.rchg + line1 - 1, env.xdf1.rchg, count1); | |
420 memcpy(diff_env->xdf2.rchg + line2 - 1, env.xdf2.rchg, count2); | |
421 | |
422 xdl_free_env(&env); | |
423 | |
424 return 0; | |
425 } |