comparison mercurial/thirdparty/xdiff/xdiffi.c @ 36671:34e2ff1f9cd8
xdiff: vendor xdiff library from git
Vendor git's xdiff library from git commit
d7c6c2369ac21141b7c6cceaebc6414ec3da14ad, under the GPL2+ license.
There is another recent user report that hg diff generates suboptimal
results. It seems the fix for issue4074 isn't good enough. I crafted some
other interesting cases, and hg diff barely has any advantage over GNU
diffutils or git diff.
| testcase | gnu diffutils (lines / time) | hg diff (lines / time) | git diff (lines / time) |
|---|---|---|---|
| patience | 6 / 0.00 | 602 / 0.08 | 6 / 0.00 |
| random | 91772 / 0.90 | 109462 / 0.70 | 91772 / 0.24 |
| json | 2 / 0.03 | 1264814 / 1.81 | 2 / 0.29 |
"lines" means the size of the output, i.e. the count of "+/-" lines. "time"
means seconds needed to do the calculation. Both are the smaller the better.
"hg diff" counts Python startup overhead.
Git and GNU diffutils generate optimal results. For the "json" case, git
could add an optimization that first scans for a common prefix and suffix
and matches them if the length is greater than half of the text; see
https://neil.fraser.name/news/2006/03/12/. That would make git the fastest
for all of the above cases.
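A minimal sketch of that prefix/suffix scan, assuming the library's hashed-line representation; the function name `trim_common` and the half-length threshold handling are illustrative, not part of the vendored code:

```c
#include <stddef.h>

/*
 * Illustrative prefix/suffix trimming: shrink [0, n1) x [0, n2) to the
 * region that actually differs, and only keep the trimmed box when the
 * common part covers more than half of the shorter input. Hash equality
 * stands in for a full line comparison here.
 */
static void trim_common(const unsigned long *ha1, size_t n1,
                        const unsigned long *ha2, size_t n2,
                        size_t *off, size_t *lim1, size_t *lim2)
{
	size_t pre = 0, post = 0;
	size_t min = n1 < n2 ? n1 : n2;

	while (pre < min && ha1[pre] == ha2[pre])
		pre++;
	while (post < min - pre &&
	       ha1[n1 - 1 - post] == ha2[n2 - 1 - post])
		post++;

	if (pre + post > min / 2) {
		*off = pre;
		*lim1 = n1 - post;
		*lim2 = n2 - post;
	} else {
		*off = 0;
		*lim1 = n1;
		*lim2 = n2;
	}
}
```

Running the main algorithm only on the trimmed region would shrink the "json" case to a small box around the moved line.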
About the test cases:

patience:

This case aims at the weakness of the greedy "patience diff" algorithm; using
git's patience diff option would also give a suboptimal result, because the
only unique common line ("u") sits at a different offset in the two files, so
anchoring on it forces many of the "x" lines to be reported as changed.
Generated using the Python script:
```
open('a', 'w').write('\n'.join(list('a' + 'x' * 300 + 'u' + 'x' * 700 + 'a\n')))
open('b', 'w').write('\n'.join(list('b' + 'x' * 700 + 'u' + 'x' * 300 + 'b\n')))
```
random:
Generated using the script in `test-issue4074.t`. It makes the algorithm
struggle in practice. Impressively, git wins on both performance and diff
quality.
json:
The recently reported user case: a single-line movement near the end of a
very large (800K-line) JSON file.
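The user's file isn't part of this change; a hypothetical reproduction of the same shape (file names, line count, and the position of the moved line are all made up here) could be generated like this:

```c
#include <stdio.h>

/*
 * Write two ~800K-line JSON-ish files that differ only by one line moved
 * near the end. Everything here is illustrative, not the user's data.
 */
int main(void)
{
	const long nlines = 800000;
	const long moved = nlines - 1000;	/* line to shift down by one */
	FILE *a = fopen("a.json", "w");
	FILE *b = fopen("b.json", "w");
	long i;

	if (!a || !b)
		return 1;
	for (i = 0; i < nlines; i++) {
		fprintf(a, "  \"key%ld\": %ld,\n", i, i);
		if (i == moved)
			continue;		/* hold this line back in b ... */
		fprintf(b, "  \"key%ld\": %ld,\n", i, i);
		if (i == moved + 1)		/* ... and emit it one line later */
			fprintf(b, "  \"key%ld\": %ld,\n", moved, moved);
	}
	fclose(a);
	fclose(b);
	return 0;
}
```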
Test Plan:
Code taken as-is.
Differential Revision: https://phab.mercurial-scm.org/D2572
# no-check-commit for vendored code
| field | value |
|---|---|
| author | Jun Wu <quark@fb.com> |
| date | Sat, 03 Mar 2018 10:39:43 -0800 |
| parents | |
| children | 9e7b14caf67f |
comparison of 36670:44048f1bcee5 with 36671:34e2ff1f9cd8:
```c
/*
 * LibXDiff by Davide Libenzi ( File Differential Library )
 * Copyright (C) 2003 Davide Libenzi
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see
 * <http://www.gnu.org/licenses/>.
 *
 * Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include "xinclude.h"



#define XDL_MAX_COST_MIN 256
#define XDL_HEUR_MIN_COST 256
#define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
#define XDL_SNAKE_CNT 20
#define XDL_K_HEUR 4



typedef struct s_xdpsplit {
	long i1, i2;
	int min_lo, min_hi;
} xdpsplit_t;




static long xdl_split(unsigned long const *ha1, long off1, long lim1,
		      unsigned long const *ha2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv);
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2);





/*
 * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
 * Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
 * the forward diagonal starting from (off1, off2) and the backward diagonal
 * starting from (lim1, lim2). If the K values on the same diagonal crosses
 * returns the furthest point of reach. We might end up having to expensive
 * cases using this algorithm is full, so a little bit of heuristic is needed
 * to cut the search and to return a suboptimal point.
 */
static long xdl_split(unsigned long const *ha1, long off1, long lim1,
		      unsigned long const *ha2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv) {
	long dmin = off1 - lim2, dmax = lim1 - off2;
	long fmid = off1 - off2, bmid = lim1 - lim2;
	long odd = (fmid - bmid) & 1;
	long fmin = fmid, fmax = fmid;
	long bmin = bmid, bmax = bmid;
	long ec, d, i1, i2, prev1, best, dd, v, k;

	/*
	 * Set initial diagonal values for both forward and backward path.
	 */
	kvdf[fmid] = off1;
	kvdb[bmid] = lim1;

	for (ec = 1;; ec++) {
		int got_snake = 0;

		/*
		 * We need to extent the diagonal "domain" by one. If the next
		 * values exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must be a power of two.
		 * Also we initialize the external K value to -1 so that we can
		 * avoid extra conditions check inside the core loop.
		 */
		if (fmin > dmin)
			kvdf[--fmin - 1] = -1;
		else
			++fmin;
		if (fmax < dmax)
			kvdf[++fmax + 1] = -1;
		else
			--fmax;

		for (d = fmax; d >= fmin; d -= 2) {
			if (kvdf[d - 1] >= kvdf[d + 1])
				i1 = kvdf[d - 1] + 1;
			else
				i1 = kvdf[d + 1];
			prev1 = i1;
			i2 = i1 - d;
			for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
			if (i1 - prev1 > xenv->snake_cnt)
				got_snake = 1;
			kvdf[d] = i1;
			if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * We need to extent the diagonal "domain" by one. If the next
		 * values exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must be a power of two.
		 * Also we initialize the external K value to -1 so that we can
		 * avoid extra conditions check inside the core loop.
		 */
		if (bmin > dmin)
			kvdb[--bmin - 1] = XDL_LINE_MAX;
		else
			++bmin;
		if (bmax < dmax)
			kvdb[++bmax + 1] = XDL_LINE_MAX;
		else
			--bmax;

		for (d = bmax; d >= bmin; d -= 2) {
			if (kvdb[d - 1] < kvdb[d + 1])
				i1 = kvdb[d - 1];
			else
				i1 = kvdb[d + 1] - 1;
			prev1 = i1;
			i2 = i1 - d;
			for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
			if (prev1 - i1 > xenv->snake_cnt)
				got_snake = 1;
			kvdb[d] = i1;
			if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		if (need_min)
			continue;

		/*
		 * If the edit cost is above the heuristic trigger and if
		 * we got a good snake, we sample current diagonals to see
		 * if some of the, have reached an "interesting" path. Our
		 * measure is a function of the distance from the diagonal
		 * corner (i1 + i2) penalized with the distance from the
		 * mid diagonal itself. If this value is above the current
		 * edit cost times a magic factor (XDL_K_HEUR) we consider
		 * it interesting.
		 */
		if (got_snake && ec > xenv->heur_min) {
			for (best = 0, d = fmax; d >= fmin; d -= 2) {
				dd = d > fmid ? d - fmid: fmid - d;
				i1 = kvdf[d];
				i2 = i1 - d;
				v = (i1 - off1) + (i2 - off2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
				    off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
					for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
						if (k == xenv->snake_cnt) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 1;
				spl->min_hi = 0;
				return ec;
			}

			for (best = 0, d = bmax; d >= bmin; d -= 2) {
				dd = d > bmid ? d - bmid: bmid - d;
				i1 = kvdb[d];
				i2 = i1 - d;
				v = (lim1 - i1) + (lim2 - i2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
				    off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
					for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
						if (k == xenv->snake_cnt - 1) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 0;
				spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * Enough is enough. We spent too much time here and now we collect
		 * the furthest reaching path using the (i1 + i2) measure.
		 */
		if (ec >= xenv->mxcost) {
			long fbest, fbest1, bbest, bbest1;

			fbest = fbest1 = -1;
			for (d = fmax; d >= fmin; d -= 2) {
				i1 = XDL_MIN(kvdf[d], lim1);
				i2 = i1 - d;
				if (lim2 < i2)
					i1 = lim2 + d, i2 = lim2;
				if (fbest < i1 + i2) {
					fbest = i1 + i2;
					fbest1 = i1;
				}
			}

			bbest = bbest1 = XDL_LINE_MAX;
			for (d = bmax; d >= bmin; d -= 2) {
				i1 = XDL_MAX(off1, kvdb[d]);
				i2 = i1 - d;
				if (i2 < off2)
					i1 = off2 + d, i2 = off2;
				if (i1 + i2 < bbest) {
					bbest = i1 + i2;
					bbest1 = i1;
				}
			}

			if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
				spl->i1 = fbest1;
				spl->i2 = fbest - fbest1;
				spl->min_lo = 1;
				spl->min_hi = 0;
			} else {
				spl->i1 = bbest1;
				spl->i2 = bbest - bbest1;
				spl->min_lo = 0;
				spl->min_hi = 1;
			}
			return ec;
		}
	}
}


/*
 * Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling
 * the box splitting function. Note that the real job (marking changed lines)
 * is done in the two boundary reaching checks.
 */
int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
		 diffdata_t *dd2, long off2, long lim2,
		 long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
	unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;

	/*
	 * Shrink the box by walking through each diagonal snake (SW and NE).
	 */
	for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
	for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);

	/*
	 * If one dimension is empty, then all records on the other one must
	 * be obviously changed.
	 */
	if (off1 == lim1) {
		char *rchg2 = dd2->rchg;
		long *rindex2 = dd2->rindex;

		for (; off2 < lim2; off2++)
			rchg2[rindex2[off2]] = 1;
	} else if (off2 == lim2) {
		char *rchg1 = dd1->rchg;
		long *rindex1 = dd1->rindex;

		for (; off1 < lim1; off1++)
			rchg1[rindex1[off1]] = 1;
	} else {
		xdpsplit_t spl;
		spl.i1 = spl.i2 = 0;

		/*
		 * Divide ...
		 */
		if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
			      need_min, &spl, xenv) < 0) {

			return -1;
		}

		/*
		 * ... et Impera.
		 */
		if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
				 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
		    xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
				 kvdf, kvdb, spl.min_hi, xenv) < 0) {

			return -1;
		}
	}

	return 0;
}


int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
		xdfenv_t *xe) {
	long ndiags;
	long *kvd, *kvdf, *kvdb;
	xdalgoenv_t xenv;
	diffdata_t dd1, dd2;

	if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
		return xdl_do_patience_diff(mf1, mf2, xpp, xe);

	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
		return xdl_do_histogram_diff(mf1, mf2, xpp, xe);

	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {

		return -1;
	}

	/*
	 * Allocate and setup K vectors to be used by the differential algorithm.
	 * One is to store the forward path and one to store the backward path.
	 */
	ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3;
	if (!(kvd = (long *) xdl_malloc((2 * ndiags + 2) * sizeof(long)))) {

		xdl_free_env(xe);
		return -1;
	}
	kvdf = kvd;
	kvdb = kvdf + ndiags;
	kvdf += xe->xdf2.nreff + 1;
	kvdb += xe->xdf2.nreff + 1;

	xenv.mxcost = xdl_bogosqrt(ndiags);
	if (xenv.mxcost < XDL_MAX_COST_MIN)
		xenv.mxcost = XDL_MAX_COST_MIN;
	xenv.snake_cnt = XDL_SNAKE_CNT;
	xenv.heur_min = XDL_HEUR_MIN_COST;

	dd1.nrec = xe->xdf1.nreff;
	dd1.ha = xe->xdf1.ha;
	dd1.rchg = xe->xdf1.rchg;
	dd1.rindex = xe->xdf1.rindex;
	dd2.nrec = xe->xdf2.nreff;
	dd2.ha = xe->xdf2.ha;
	dd2.rchg = xe->xdf2.rchg;
	dd2.rindex = xe->xdf2.rindex;

	if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
			 kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {

		xdl_free(kvd);
		xdl_free_env(xe);
		return -1;
	}

	xdl_free(kvd);

	return 0;
}


static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
	xdchange_t *xch;

	if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
		return NULL;

	xch->next = xscr;
	xch->i1 = i1;
	xch->i2 = i2;
	xch->chg1 = chg1;
	xch->chg2 = chg2;
	xch->ignore = 0;

	return xch;
}


static int recs_match(xrecord_t *rec1, xrecord_t *rec2, long flags)
{
	return (rec1->ha == rec2->ha &&
		xdl_recmatch(rec1->ptr, rec1->size,
			     rec2->ptr, rec2->size,
			     flags));
}

/*
 * If a line is indented more than this, get_indent() just returns this value.
 * This avoids having to do absurd amounts of work for data that are not
 * human-readable text, and also ensures that the output of get_indent fits within
 * an int.
 */
#define MAX_INDENT 200

/*
 * Return the amount of indentation of the specified line, treating TAB as 8
 * columns. Return -1 if line is empty or contains only whitespace. Clamp the
 * output value at MAX_INDENT.
 */
static int get_indent(xrecord_t *rec)
{
	long i;
	int ret = 0;

	for (i = 0; i < rec->size; i++) {
		char c = rec->ptr[i];

		if (!XDL_ISSPACE(c))
			return ret;
		else if (c == ' ')
			ret += 1;
		else if (c == '\t')
			ret += 8 - ret % 8;
		/* ignore other whitespace characters */

		if (ret >= MAX_INDENT)
			return MAX_INDENT;
	}

	/* The line contains only whitespace. */
	return -1;
}

/*
 * If more than this number of consecutive blank rows are found, just return this
 * value. This avoids requiring O(N^2) work for pathological cases, and also
 * ensures that the output of score_split fits in an int.
 */
#define MAX_BLANKS 20

/* Characteristics measured about a hypothetical split position. */
struct split_measurement {
	/*
	 * Is the split at the end of the file (aside from any blank lines)?
	 */
	int end_of_file;

	/*
	 * How much is the line immediately following the split indented (or -1 if
	 * the line is blank):
	 */
	int indent;

	/*
	 * How many consecutive lines above the split are blank?
	 */
	int pre_blank;

	/*
	 * How much is the nearest non-blank line above the split indented (or -1
	 * if there is no such line)?
	 */
	int pre_indent;

	/*
	 * How many lines after the line following the split are blank?
	 */
	int post_blank;

	/*
	 * How much is the nearest non-blank line after the line following the
	 * split indented (or -1 if there is no such line)?
	 */
	int post_indent;
};

struct split_score {
	/* The effective indent of this split (smaller is preferred). */
	int effective_indent;

	/* Penalty for this split (smaller is preferred). */
	int penalty;
};

/*
 * Fill m with information about a hypothetical split of xdf above line split.
 */
static void measure_split(const xdfile_t *xdf, long split,
			  struct split_measurement *m)
{
	long i;

	if (split >= xdf->nrec) {
		m->end_of_file = 1;
		m->indent = -1;
	} else {
		m->end_of_file = 0;
		m->indent = get_indent(xdf->recs[split]);
	}

	m->pre_blank = 0;
	m->pre_indent = -1;
	for (i = split - 1; i >= 0; i--) {
		m->pre_indent = get_indent(xdf->recs[i]);
		if (m->pre_indent != -1)
			break;
		m->pre_blank += 1;
		if (m->pre_blank == MAX_BLANKS) {
			m->pre_indent = 0;
			break;
		}
	}

	m->post_blank = 0;
	m->post_indent = -1;
	for (i = split + 1; i < xdf->nrec; i++) {
		m->post_indent = get_indent(xdf->recs[i]);
		if (m->post_indent != -1)
			break;
		m->post_blank += 1;
		if (m->post_blank == MAX_BLANKS) {
			m->post_indent = 0;
			break;
		}
	}
}

/*
 * The empirically-determined weight factors used by score_split() below.
 * Larger values means that the position is a less favorable place to split.
 *
 * Note that scores are only ever compared against each other, so multiplying
 * all of these weight/penalty values by the same factor wouldn't change the
 * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
 * In practice, these numbers are chosen to be large enough that they can be
 * adjusted relative to each other with sufficient precision despite using
 * integer math.
 */

/* Penalty if there are no non-blank lines before the split */
#define START_OF_FILE_PENALTY 1

/* Penalty if there are no non-blank lines after the split */
#define END_OF_FILE_PENALTY 21

/* Multiplier for the number of blank lines around the split */
#define TOTAL_BLANK_WEIGHT (-30)

/* Multiplier for the number of blank lines after the split */
#define POST_BLANK_WEIGHT 6

/*
 * Penalties applied if the line is indented more than its predecessor
 */
#define RELATIVE_INDENT_PENALTY (-4)
#define RELATIVE_INDENT_WITH_BLANK_PENALTY 10

/*
 * Penalties applied if the line is indented less than both its predecessor and
 * its successor
 */
#define RELATIVE_OUTDENT_PENALTY 24
#define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17

/*
 * Penalties applied if the line is indented less than its predecessor but not
 * less than its successor
 */
#define RELATIVE_DEDENT_PENALTY 23
#define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17

/*
 * We only consider whether the sum of the effective indents for splits are
 * less than (-1), equal to (0), or greater than (+1) each other. The resulting
 * value is multiplied by the following weight and combined with the penalty to
 * determine the better of two scores.
 */
#define INDENT_WEIGHT 60

/*
 * Compute a badness score for the hypothetical split whose measurements are
 * stored in m. The weight factors were determined empirically using the tools and
 * corpus described in
 *
 *     https://github.com/mhagger/diff-slider-tools
 *
 * Also see that project if you want to improve the weights based on, for example,
 * a larger or more diverse corpus.
 */
static void score_add_split(const struct split_measurement *m, struct split_score *s)
{
	/*
	 * A place to accumulate penalty factors (positive makes this index more
	 * favored):
	 */
	int post_blank, total_blank, indent, any_blanks;

	if (m->pre_indent == -1 && m->pre_blank == 0)
		s->penalty += START_OF_FILE_PENALTY;

	if (m->end_of_file)
		s->penalty += END_OF_FILE_PENALTY;

	/*
	 * Set post_blank to the number of blank lines following the split,
	 * including the line immediately after the split:
	 */
	post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
	total_blank = m->pre_blank + post_blank;

	/* Penalties based on nearby blank lines: */
	s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
	s->penalty += POST_BLANK_WEIGHT * post_blank;

	if (m->indent != -1)
		indent = m->indent;
	else
		indent = m->post_indent;

	any_blanks = (total_blank != 0);

	/* Note that the effective indent is -1 at the end of the file: */
	s->effective_indent += indent;

	if (indent == -1) {
		/* No additional adjustments needed. */
	} else if (m->pre_indent == -1) {
		/* No additional adjustments needed. */
	} else if (indent > m->pre_indent) {
		/*
		 * The line is indented more than its predecessor.
		 */
		s->penalty += any_blanks ?
			RELATIVE_INDENT_WITH_BLANK_PENALTY :
			RELATIVE_INDENT_PENALTY;
	} else if (indent == m->pre_indent) {
		/*
		 * The line has the same indentation level as its predecessor.
		 * No additional adjustments needed.
		 */
	} else {
		/*
		 * The line is indented less than its predecessor. It could be
		 * the block terminator of the previous block, but it could
		 * also be the start of a new block (e.g., an "else" block, or
		 * maybe the previous block didn't have a block terminator).
		 * Try to distinguish those cases based on what comes next:
		 */
		if (m->post_indent != -1 && m->post_indent > indent) {
			/*
			 * The following line is indented more. So it is likely
			 * that this line is the start of a block.
			 */
			s->penalty += any_blanks ?
				RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
				RELATIVE_OUTDENT_PENALTY;
		} else {
			/*
			 * That was probably the end of a block.
			 */
			s->penalty += any_blanks ?
				RELATIVE_DEDENT_WITH_BLANK_PENALTY :
				RELATIVE_DEDENT_PENALTY;
		}
	}
}

static int score_cmp(struct split_score *s1, struct split_score *s2)
{
	/* -1 if s1.effective_indent < s2->effective_indent, etc. */
	int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
			   (s1->effective_indent < s2->effective_indent));

	return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
}

/*
 * Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
 * of lines that was inserted or deleted from the corresponding version of the
 * file). We consider there to be such a group at the beginning of the file, at
 * the end of the file, and between any two unchanged lines, though most such
 * groups will usually be empty.
 *
 * If the first line in a group is equal to the line following the group, then
 * the group can be slid down. Similarly, if the last line in a group is equal
 * to the line preceding the group, then the group can be slid up. See
 * group_slide_down() and group_slide_up().
 *
 * Note that loops that are testing for changed lines in xdf->rchg do not need
 * index bounding since the array is prepared with a zero at position -1 and N.
 */
struct xdlgroup {
	/*
	 * The index of the first changed line in the group, or the index of
	 * the unchanged line above which the (empty) group is located.
	 */
	long start;

	/*
	 * The index of the first unchanged line after the group. For an empty
	 * group, end is equal to start.
	 */
	long end;
};

/*
 * Initialize g to point at the first group in xdf.
 */
static void group_init(xdfile_t *xdf, struct xdlgroup *g)
{
	g->start = g->end = 0;
	while (xdf->rchg[g->end])
		g->end++;
}

/*
 * Move g to describe the next (possibly empty) group in xdf and return 0. If g
 * is already at the end of the file, do nothing and return -1.
 */
static inline int group_next(xdfile_t *xdf, struct xdlgroup *g)
{
	if (g->end == xdf->nrec)
		return -1;

	g->start = g->end + 1;
	for (g->end = g->start; xdf->rchg[g->end]; g->end++)
		;

	return 0;
}

/*
 * Move g to describe the previous (possibly empty) group in xdf and return 0.
 * If g is already at the beginning of the file, do nothing and return -1.
 */
static inline int group_previous(xdfile_t *xdf, struct xdlgroup *g)
{
	if (g->start == 0)
		return -1;

	g->end = g->start - 1;
	for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--)
		;

	return 0;
}

/*
 * If g can be slid toward the end of the file, do so, and if it bumps into a
 * following group, expand this group to include it. Return 0 on success or -1
 * if g cannot be slid down.
 */
static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g, long flags)
{
	if (g->end < xdf->nrec &&
	    recs_match(xdf->recs[g->start], xdf->recs[g->end], flags)) {
		xdf->rchg[g->start++] = 0;
		xdf->rchg[g->end++] = 1;

		while (xdf->rchg[g->end])
			g->end++;

		return 0;
	} else {
		return -1;
	}
}

/*
 * If g can be slid toward the beginning of the file, do so, and if it bumps
 * into a previous group, expand this group to include it. Return 0 on success
 * or -1 if g cannot be slid up.
 */
static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g, long flags)
{
	if (g->start > 0 &&
	    recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1], flags)) {
		xdf->rchg[--g->start] = 1;
		xdf->rchg[--g->end] = 0;

		while (xdf->rchg[g->start - 1])
			g->start--;

		return 0;
	} else {
		return -1;
	}
}

static void xdl_bug(const char *msg)
{
	fprintf(stderr, "BUG: %s\n", msg);
	exit(1);
}

/*
 * Move back and forward change groups for a consistent and pretty diff output.
 * This also helps in finding joinable change groups and reducing the diff
 * size.
 */
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
	struct xdlgroup g, go;
	long earliest_end, end_matching_other;
	long groupsize;

	group_init(xdf, &g);
	group_init(xdfo, &go);

	while (1) {
		/* If the group is empty in the to-be-compacted file, skip it: */
		if (g.end == g.start)
			goto next;

		/*
		 * Now shift the change up and then down as far as possible in
		 * each direction. If it bumps into any other changes, merge them.
		 */
		do {
			groupsize = g.end - g.start;

			/*
			 * Keep track of the last "end" index that causes this
			 * group to align with a group of changed lines in the
			 * other file. -1 indicates that we haven't found such
			 * a match yet:
			 */
			end_matching_other = -1;

			/* Shift the group backward as much as possible: */
			while (!group_slide_up(xdf, &g, flags))
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding up");

			/*
			 * This is this highest that this group can be shifted.
			 * Record its end index:
			 */
			earliest_end = g.end;

			if (go.end > go.start)
				end_matching_other = g.end;

			/* Now shift the group forward as far as possible: */
			while (1) {
				if (group_slide_down(xdf, &g, flags))
					break;
				if (group_next(xdfo, &go))
					xdl_bug("group sync broken sliding down");

				if (go.end > go.start)
					end_matching_other = g.end;
			}
		} while (groupsize != g.end - g.start);

		/*
		 * If the group can be shifted, then we can possibly use this
		 * freedom to produce a more intuitive diff.
		 *
		 * The group is currently shifted as far down as possible, so the
		 * heuristics below only have to handle upwards shifts.
		 */

		if (g.end == earliest_end) {
			/* no shifting was possible */
		} else if (end_matching_other != -1) {
			/*
			 * Move the possibly merged group of changes back to line
			 * up with the last group of changes from the other file
			 * that it can align with.
			 */
			while (go.end == go.start) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("match disappeared");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to match");
			}
		} else if (flags & XDF_INDENT_HEURISTIC) {
			/*
			 * Indent heuristic: a group of pure add/delete lines
			 * implies two splits, one between the end of the "before"
			 * context and the start of the group, and another between
			 * the end of the group and the beginning of the "after"
			 * context. Some splits are aesthetically better and some
			 * are worse. We compute a badness "score" for each split,
			 * and add the scores for the two splits to define a
			 * "score" for each position that the group can be shifted
			 * to. Then we pick the shift with the lowest score.
			 */
			long shift, best_shift = -1;
			struct split_score best_score;

			for (shift = earliest_end; shift <= g.end; shift++) {
				struct split_measurement m;
				struct split_score score = {0, 0};

				measure_split(xdf, shift, &m);
				score_add_split(&m, &score);
				measure_split(xdf, shift - groupsize, &m);
				score_add_split(&m, &score);
				if (best_shift == -1 ||
				    score_cmp(&score, &best_score) <= 0) {
					best_score.effective_indent = score.effective_indent;
					best_score.penalty = score.penalty;
					best_shift = shift;
				}
			}

			while (g.end > best_shift) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("best shift unreached");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to blank line");
			}
		}

	next:
		/* Move past the just-processed group: */
		if (group_next(xdf, &g))
			break;
		if (group_next(xdfo, &go))
			xdl_bug("group sync broken moving to next group");
	}

	if (!group_next(xdfo, &go))
		xdl_bug("group sync broken at end of file");

	return 0;
}


int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
	xdchange_t *cscr = NULL, *xch;
	char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
	long i1, i2, l1, l2;

	/*
	 * Trivial. Collects "groups" of changes and creates an edit script.
	 */
	for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
		if (rchg1[i1 - 1] || rchg2[i2 - 1]) {
			for (l1 = i1; rchg1[i1 - 1]; i1--);
			for (l2 = i2; rchg2[i2 - 1]; i2--);

			if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
				xdl_free_script(cscr);
				return -1;
			}
			cscr = xch;
		}

	*xscr = cscr;

	return 0;
}


void xdl_free_script(xdchange_t *xscr) {
	xdchange_t *xch;

	while ((xch = xscr) != NULL) {
		xscr = xscr->next;
		xdl_free(xch);
	}
}

static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
			      xdemitconf_t const *xecfg)
{
	xdchange_t *xch, *xche;

	for (xch = xscr; xch; xch = xche->next) {
		xche = xdl_get_hunk(&xch, xecfg);
		if (!xch)
			break;
		if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
				     xch->i2, xche->i2 + xche->chg2 - xch->i2,
				     ecb->priv) < 0)
			return -1;
	}
	return 0;
}

static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
{
	xdchange_t *xch;

	for (xch = xscr; xch; xch = xch->next) {
		int ignore = 1;
		xrecord_t **rec;
		long i;

		rec = &xe->xdf1.recs[xch->i1];
		for (i = 0; i < xch->chg1 && ignore; i++)
			ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);

		rec = &xe->xdf2.recs[xch->i2];
		for (i = 0; i < xch->chg2 && ignore; i++)
			ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);

		xch->ignore = ignore;
	}
}

int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
	xdchange_t *xscr;
	xdfenv_t xe;
	emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;

	if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {

		return -1;
	}
	if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
	    xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
	    xdl_build_script(&xe, &xscr) < 0) {

		xdl_free_env(&xe);
		return -1;
	}
	if (xscr) {
		if (xpp->flags & XDF_IGNORE_BLANK_LINES)
			xdl_mark_ignorable(xscr, &xe, xpp->flags);

		if (ef(&xe, xscr, ecb, xecfg) < 0) {

			xdl_free_script(xscr);
			xdl_free_env(&xe);
			return -1;
		}
		xdl_free_script(xscr);
	}
	xdl_free_env(&xe);

	return 0;
}
```
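For context, here is a hedged sketch of how a caller typically drives this entry point with a hunk callback; the callback and wrapper names are illustrative, while the types, the `XDF_INDENT_HEURISTIC` flag, and `xdl_diff()` itself come from the vendored headers:

```c
#include <string.h>
#include <stdio.h>
#include "xdiff.h"

/* Illustrative consumer: print one line per hunk. */
static int print_hunk(long start_a, long count_a,
                      long start_b, long count_b, void *priv)
{
	(void)priv;
	printf("-%ld,%ld +%ld,%ld\n", start_a, count_a, start_b, count_b);
	return 0;
}

/* Diff two in-memory buffers and report hunks through the callback. */
static int diff_buffers(char *a, long asize, char *b, long bsize)
{
	mmfile_t mf1 = { a, asize }, mf2 = { b, bsize };
	xpparam_t xpp;
	xdemitconf_t xecfg;
	xdemitcb_t ecb;

	memset(&xpp, 0, sizeof(xpp));
	memset(&xecfg, 0, sizeof(xecfg));
	memset(&ecb, 0, sizeof(ecb));
	xpp.flags = XDF_INDENT_HEURISTIC;	/* enable the slider heuristic above */
	xecfg.hunk_func = print_hunk;		/* routed through xdl_call_hunk_func() */

	return xdl_diff(&mf1, &mf2, &xpp, &xecfg, &ecb);
}
```

Leaving `hunk_func` unset would instead route the output through `xdl_emit_diff()`, as the `ef` selection at the top of `xdl_diff()` shows.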