comparison mercurial/thirdparty/xdiff/xdiffi.c @ 36671:34e2ff1f9cd8

xdiff: vendor xdiff library from git Vendor git's xdiff library from git commit d7c6c2369d7c6c2369ac21141b7c6cceaebc6414ec3da14ad using GPL2+ license. There is another recent user report that hg diff generates suboptimal result. It seems the fix to issue4074 isn't good enough. I crafted some other interesting cases, and hg diff barely has any advantage compared with gnu diffutils or git diff. | testcase | gnu diffutils | hg diff | git diff | | | lines time | lines time | lines time | | patience | 6 0.00 | 602 0.08 | 6 0.00 | | random | 91772 0.90 | 109462 0.70 | 91772 0.24 | | json | 2 0.03 | 1264814 1.81 | 2 0.29 | "lines" means the size of the output, i.e. the count of "+/-" lines. "time" means seconds needed to do the calculation. Both are the smaller the better. "hg diff" counts Python startup overhead. Git and GNU diffutils generate optimal results. For the "json" case, git can have an optimization that does a scan for common prefix and suffix first, and match them if the length is greater than half of the text. See https://neil.fraser.name/news/2006/03/12/. That would make git the fastest for all above cases. About testcases: patience: Aiming for the weakness of the greedy "patience diff" algorithm. Using git's patience diff option would also get suboptimal result. Generated using the Python script: ``` open('a', 'w').write('\n'.join(list('a' + 'x' * 300 + 'u' + 'x' * 700 + 'a\n'))) open('b', 'w').write('\n'.join(list('b' + 'x' * 700 + 'u' + 'x' * 300 + 'b\n'))) ``` random: Generated using the script in `test-issue4074.t`. It practically makes the algorithm suffer. Impressively, git wins in both performance and diff quality. json: The recent user reported case. It's a single line movement near the end of a very large (800K lines) JSON file. Test Plan: Code taken as-is. Differential Revision: https://phab.mercurial-scm.org/D2572 # no-check-commit for vendored code
author Jun Wu <quark@fb.com>
date Sat, 03 Mar 2018 10:39:43 -0800
parents
children 9e7b14caf67f
comparison
equal deleted inserted replaced
36670:44048f1bcee5 36671:34e2ff1f9cd8
1 /*
2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003 Davide Libenzi
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see
17 * <http://www.gnu.org/licenses/>.
18 *
19 * Davide Libenzi <davidel@xmailserver.org>
20 *
21 */
22
23 #include "xinclude.h"
24
25
26
27 #define XDL_MAX_COST_MIN 256
28 #define XDL_HEUR_MIN_COST 256
29 #define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
30 #define XDL_SNAKE_CNT 20
31 #define XDL_K_HEUR 4
32
33
34
35 typedef struct s_xdpsplit {
36 long i1, i2;
37 int min_lo, min_hi;
38 } xdpsplit_t;
39
40
41
42
43 static long xdl_split(unsigned long const *ha1, long off1, long lim1,
44 unsigned long const *ha2, long off2, long lim2,
45 long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
46 xdalgoenv_t *xenv);
47 static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2);
48
49
50
51
52
53 /*
54 * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
55 * Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
56 * the forward diagonal starting from (off1, off2) and the backward diagonal
57 * starting from (lim1, lim2). If the K values on the same diagonal crosses
58 * returns the furthest point of reach. We might end up having to expensive
59 * cases using this algorithm is full, so a little bit of heuristic is needed
60 * to cut the search and to return a suboptimal point.
61 */
62 static long xdl_split(unsigned long const *ha1, long off1, long lim1,
63 unsigned long const *ha2, long off2, long lim2,
64 long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
65 xdalgoenv_t *xenv) {
66 long dmin = off1 - lim2, dmax = lim1 - off2;
67 long fmid = off1 - off2, bmid = lim1 - lim2;
68 long odd = (fmid - bmid) & 1;
69 long fmin = fmid, fmax = fmid;
70 long bmin = bmid, bmax = bmid;
71 long ec, d, i1, i2, prev1, best, dd, v, k;
72
73 /*
74 * Set initial diagonal values for both forward and backward path.
75 */
76 kvdf[fmid] = off1;
77 kvdb[bmid] = lim1;
78
79 for (ec = 1;; ec++) {
80 int got_snake = 0;
81
82 /*
83 * We need to extent the diagonal "domain" by one. If the next
84 * values exits the box boundaries we need to change it in the
85 * opposite direction because (max - min) must be a power of two.
86 * Also we initialize the external K value to -1 so that we can
87 * avoid extra conditions check inside the core loop.
88 */
89 if (fmin > dmin)
90 kvdf[--fmin - 1] = -1;
91 else
92 ++fmin;
93 if (fmax < dmax)
94 kvdf[++fmax + 1] = -1;
95 else
96 --fmax;
97
98 for (d = fmax; d >= fmin; d -= 2) {
99 if (kvdf[d - 1] >= kvdf[d + 1])
100 i1 = kvdf[d - 1] + 1;
101 else
102 i1 = kvdf[d + 1];
103 prev1 = i1;
104 i2 = i1 - d;
105 for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
106 if (i1 - prev1 > xenv->snake_cnt)
107 got_snake = 1;
108 kvdf[d] = i1;
109 if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
110 spl->i1 = i1;
111 spl->i2 = i2;
112 spl->min_lo = spl->min_hi = 1;
113 return ec;
114 }
115 }
116
117 /*
118 * We need to extent the diagonal "domain" by one. If the next
119 * values exits the box boundaries we need to change it in the
120 * opposite direction because (max - min) must be a power of two.
121 * Also we initialize the external K value to -1 so that we can
122 * avoid extra conditions check inside the core loop.
123 */
124 if (bmin > dmin)
125 kvdb[--bmin - 1] = XDL_LINE_MAX;
126 else
127 ++bmin;
128 if (bmax < dmax)
129 kvdb[++bmax + 1] = XDL_LINE_MAX;
130 else
131 --bmax;
132
133 for (d = bmax; d >= bmin; d -= 2) {
134 if (kvdb[d - 1] < kvdb[d + 1])
135 i1 = kvdb[d - 1];
136 else
137 i1 = kvdb[d + 1] - 1;
138 prev1 = i1;
139 i2 = i1 - d;
140 for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
141 if (prev1 - i1 > xenv->snake_cnt)
142 got_snake = 1;
143 kvdb[d] = i1;
144 if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
145 spl->i1 = i1;
146 spl->i2 = i2;
147 spl->min_lo = spl->min_hi = 1;
148 return ec;
149 }
150 }
151
152 if (need_min)
153 continue;
154
155 /*
156 * If the edit cost is above the heuristic trigger and if
157 * we got a good snake, we sample current diagonals to see
158 * if some of the, have reached an "interesting" path. Our
159 * measure is a function of the distance from the diagonal
160 * corner (i1 + i2) penalized with the distance from the
161 * mid diagonal itself. If this value is above the current
162 * edit cost times a magic factor (XDL_K_HEUR) we consider
163 * it interesting.
164 */
165 if (got_snake && ec > xenv->heur_min) {
166 for (best = 0, d = fmax; d >= fmin; d -= 2) {
167 dd = d > fmid ? d - fmid: fmid - d;
168 i1 = kvdf[d];
169 i2 = i1 - d;
170 v = (i1 - off1) + (i2 - off2) - dd;
171
172 if (v > XDL_K_HEUR * ec && v > best &&
173 off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
174 off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
175 for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
176 if (k == xenv->snake_cnt) {
177 best = v;
178 spl->i1 = i1;
179 spl->i2 = i2;
180 break;
181 }
182 }
183 }
184 if (best > 0) {
185 spl->min_lo = 1;
186 spl->min_hi = 0;
187 return ec;
188 }
189
190 for (best = 0, d = bmax; d >= bmin; d -= 2) {
191 dd = d > bmid ? d - bmid: bmid - d;
192 i1 = kvdb[d];
193 i2 = i1 - d;
194 v = (lim1 - i1) + (lim2 - i2) - dd;
195
196 if (v > XDL_K_HEUR * ec && v > best &&
197 off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
198 off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
199 for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
200 if (k == xenv->snake_cnt - 1) {
201 best = v;
202 spl->i1 = i1;
203 spl->i2 = i2;
204 break;
205 }
206 }
207 }
208 if (best > 0) {
209 spl->min_lo = 0;
210 spl->min_hi = 1;
211 return ec;
212 }
213 }
214
215 /*
216 * Enough is enough. We spent too much time here and now we collect
217 * the furthest reaching path using the (i1 + i2) measure.
218 */
219 if (ec >= xenv->mxcost) {
220 long fbest, fbest1, bbest, bbest1;
221
222 fbest = fbest1 = -1;
223 for (d = fmax; d >= fmin; d -= 2) {
224 i1 = XDL_MIN(kvdf[d], lim1);
225 i2 = i1 - d;
226 if (lim2 < i2)
227 i1 = lim2 + d, i2 = lim2;
228 if (fbest < i1 + i2) {
229 fbest = i1 + i2;
230 fbest1 = i1;
231 }
232 }
233
234 bbest = bbest1 = XDL_LINE_MAX;
235 for (d = bmax; d >= bmin; d -= 2) {
236 i1 = XDL_MAX(off1, kvdb[d]);
237 i2 = i1 - d;
238 if (i2 < off2)
239 i1 = off2 + d, i2 = off2;
240 if (i1 + i2 < bbest) {
241 bbest = i1 + i2;
242 bbest1 = i1;
243 }
244 }
245
246 if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
247 spl->i1 = fbest1;
248 spl->i2 = fbest - fbest1;
249 spl->min_lo = 1;
250 spl->min_hi = 0;
251 } else {
252 spl->i1 = bbest1;
253 spl->i2 = bbest - bbest1;
254 spl->min_lo = 0;
255 spl->min_hi = 1;
256 }
257 return ec;
258 }
259 }
260 }
261
262
263 /*
264 * Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling
265 * the box splitting function. Note that the real job (marking changed lines)
266 * is done in the two boundary reaching checks.
267 */
268 int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
269 diffdata_t *dd2, long off2, long lim2,
270 long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
271 unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;
272
273 /*
274 * Shrink the box by walking through each diagonal snake (SW and NE).
275 */
276 for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
277 for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);
278
279 /*
280 * If one dimension is empty, then all records on the other one must
281 * be obviously changed.
282 */
283 if (off1 == lim1) {
284 char *rchg2 = dd2->rchg;
285 long *rindex2 = dd2->rindex;
286
287 for (; off2 < lim2; off2++)
288 rchg2[rindex2[off2]] = 1;
289 } else if (off2 == lim2) {
290 char *rchg1 = dd1->rchg;
291 long *rindex1 = dd1->rindex;
292
293 for (; off1 < lim1; off1++)
294 rchg1[rindex1[off1]] = 1;
295 } else {
296 xdpsplit_t spl;
297 spl.i1 = spl.i2 = 0;
298
299 /*
300 * Divide ...
301 */
302 if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
303 need_min, &spl, xenv) < 0) {
304
305 return -1;
306 }
307
308 /*
309 * ... et Impera.
310 */
311 if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
312 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
313 xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
314 kvdf, kvdb, spl.min_hi, xenv) < 0) {
315
316 return -1;
317 }
318 }
319
320 return 0;
321 }
322
323
324 int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
325 xdfenv_t *xe) {
326 long ndiags;
327 long *kvd, *kvdf, *kvdb;
328 xdalgoenv_t xenv;
329 diffdata_t dd1, dd2;
330
331 if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
332 return xdl_do_patience_diff(mf1, mf2, xpp, xe);
333
334 if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
335 return xdl_do_histogram_diff(mf1, mf2, xpp, xe);
336
337 if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
338
339 return -1;
340 }
341
342 /*
343 * Allocate and setup K vectors to be used by the differential algorithm.
344 * One is to store the forward path and one to store the backward path.
345 */
346 ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3;
347 if (!(kvd = (long *) xdl_malloc((2 * ndiags + 2) * sizeof(long)))) {
348
349 xdl_free_env(xe);
350 return -1;
351 }
352 kvdf = kvd;
353 kvdb = kvdf + ndiags;
354 kvdf += xe->xdf2.nreff + 1;
355 kvdb += xe->xdf2.nreff + 1;
356
357 xenv.mxcost = xdl_bogosqrt(ndiags);
358 if (xenv.mxcost < XDL_MAX_COST_MIN)
359 xenv.mxcost = XDL_MAX_COST_MIN;
360 xenv.snake_cnt = XDL_SNAKE_CNT;
361 xenv.heur_min = XDL_HEUR_MIN_COST;
362
363 dd1.nrec = xe->xdf1.nreff;
364 dd1.ha = xe->xdf1.ha;
365 dd1.rchg = xe->xdf1.rchg;
366 dd1.rindex = xe->xdf1.rindex;
367 dd2.nrec = xe->xdf2.nreff;
368 dd2.ha = xe->xdf2.ha;
369 dd2.rchg = xe->xdf2.rchg;
370 dd2.rindex = xe->xdf2.rindex;
371
372 if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
373 kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {
374
375 xdl_free(kvd);
376 xdl_free_env(xe);
377 return -1;
378 }
379
380 xdl_free(kvd);
381
382 return 0;
383 }
384
385
386 static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
387 xdchange_t *xch;
388
389 if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
390 return NULL;
391
392 xch->next = xscr;
393 xch->i1 = i1;
394 xch->i2 = i2;
395 xch->chg1 = chg1;
396 xch->chg2 = chg2;
397 xch->ignore = 0;
398
399 return xch;
400 }
401
402
403 static int recs_match(xrecord_t *rec1, xrecord_t *rec2, long flags)
404 {
405 return (rec1->ha == rec2->ha &&
406 xdl_recmatch(rec1->ptr, rec1->size,
407 rec2->ptr, rec2->size,
408 flags));
409 }
410
411 /*
412 * If a line is indented more than this, get_indent() just returns this value.
413 * This avoids having to do absurd amounts of work for data that are not
414 * human-readable text, and also ensures that the output of get_indent fits within
415 * an int.
416 */
417 #define MAX_INDENT 200
418
419 /*
420 * Return the amount of indentation of the specified line, treating TAB as 8
421 * columns. Return -1 if line is empty or contains only whitespace. Clamp the
422 * output value at MAX_INDENT.
423 */
424 static int get_indent(xrecord_t *rec)
425 {
426 long i;
427 int ret = 0;
428
429 for (i = 0; i < rec->size; i++) {
430 char c = rec->ptr[i];
431
432 if (!XDL_ISSPACE(c))
433 return ret;
434 else if (c == ' ')
435 ret += 1;
436 else if (c == '\t')
437 ret += 8 - ret % 8;
438 /* ignore other whitespace characters */
439
440 if (ret >= MAX_INDENT)
441 return MAX_INDENT;
442 }
443
444 /* The line contains only whitespace. */
445 return -1;
446 }
447
448 /*
449 * If more than this number of consecutive blank rows are found, just return this
450 * value. This avoids requiring O(N^2) work for pathological cases, and also
451 * ensures that the output of score_split fits in an int.
452 */
453 #define MAX_BLANKS 20
454
455 /* Characteristics measured about a hypothetical split position. */
456 struct split_measurement {
457 /*
458 * Is the split at the end of the file (aside from any blank lines)?
459 */
460 int end_of_file;
461
462 /*
463 * How much is the line immediately following the split indented (or -1 if
464 * the line is blank):
465 */
466 int indent;
467
468 /*
469 * How many consecutive lines above the split are blank?
470 */
471 int pre_blank;
472
473 /*
474 * How much is the nearest non-blank line above the split indented (or -1
475 * if there is no such line)?
476 */
477 int pre_indent;
478
479 /*
480 * How many lines after the line following the split are blank?
481 */
482 int post_blank;
483
484 /*
485 * How much is the nearest non-blank line after the line following the
486 * split indented (or -1 if there is no such line)?
487 */
488 int post_indent;
489 };
490
491 struct split_score {
492 /* The effective indent of this split (smaller is preferred). */
493 int effective_indent;
494
495 /* Penalty for this split (smaller is preferred). */
496 int penalty;
497 };
498
499 /*
500 * Fill m with information about a hypothetical split of xdf above line split.
501 */
502 static void measure_split(const xdfile_t *xdf, long split,
503 struct split_measurement *m)
504 {
505 long i;
506
507 if (split >= xdf->nrec) {
508 m->end_of_file = 1;
509 m->indent = -1;
510 } else {
511 m->end_of_file = 0;
512 m->indent = get_indent(xdf->recs[split]);
513 }
514
515 m->pre_blank = 0;
516 m->pre_indent = -1;
517 for (i = split - 1; i >= 0; i--) {
518 m->pre_indent = get_indent(xdf->recs[i]);
519 if (m->pre_indent != -1)
520 break;
521 m->pre_blank += 1;
522 if (m->pre_blank == MAX_BLANKS) {
523 m->pre_indent = 0;
524 break;
525 }
526 }
527
528 m->post_blank = 0;
529 m->post_indent = -1;
530 for (i = split + 1; i < xdf->nrec; i++) {
531 m->post_indent = get_indent(xdf->recs[i]);
532 if (m->post_indent != -1)
533 break;
534 m->post_blank += 1;
535 if (m->post_blank == MAX_BLANKS) {
536 m->post_indent = 0;
537 break;
538 }
539 }
540 }
541
542 /*
543 * The empirically-determined weight factors used by score_split() below.
544 * Larger values means that the position is a less favorable place to split.
545 *
546 * Note that scores are only ever compared against each other, so multiplying
547 * all of these weight/penalty values by the same factor wouldn't change the
548 * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
549 * In practice, these numbers are chosen to be large enough that they can be
550 * adjusted relative to each other with sufficient precision despite using
551 * integer math.
552 */
553
554 /* Penalty if there are no non-blank lines before the split */
555 #define START_OF_FILE_PENALTY 1
556
557 /* Penalty if there are no non-blank lines after the split */
558 #define END_OF_FILE_PENALTY 21
559
560 /* Multiplier for the number of blank lines around the split */
561 #define TOTAL_BLANK_WEIGHT (-30)
562
563 /* Multiplier for the number of blank lines after the split */
564 #define POST_BLANK_WEIGHT 6
565
566 /*
567 * Penalties applied if the line is indented more than its predecessor
568 */
569 #define RELATIVE_INDENT_PENALTY (-4)
570 #define RELATIVE_INDENT_WITH_BLANK_PENALTY 10
571
572 /*
573 * Penalties applied if the line is indented less than both its predecessor and
574 * its successor
575 */
576 #define RELATIVE_OUTDENT_PENALTY 24
577 #define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17
578
579 /*
580 * Penalties applied if the line is indented less than its predecessor but not
581 * less than its successor
582 */
583 #define RELATIVE_DEDENT_PENALTY 23
584 #define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17
585
586 /*
587 * We only consider whether the sum of the effective indents for splits are
588 * less than (-1), equal to (0), or greater than (+1) each other. The resulting
589 * value is multiplied by the following weight and combined with the penalty to
590 * determine the better of two scores.
591 */
592 #define INDENT_WEIGHT 60
593
594 /*
595 * Compute a badness score for the hypothetical split whose measurements are
596 * stored in m. The weight factors were determined empirically using the tools and
597 * corpus described in
598 *
599 * https://github.com/mhagger/diff-slider-tools
600 *
601 * Also see that project if you want to improve the weights based on, for example,
602 * a larger or more diverse corpus.
603 */
604 static void score_add_split(const struct split_measurement *m, struct split_score *s)
605 {
606 /*
607 * A place to accumulate penalty factors (positive makes this index more
608 * favored):
609 */
610 int post_blank, total_blank, indent, any_blanks;
611
612 if (m->pre_indent == -1 && m->pre_blank == 0)
613 s->penalty += START_OF_FILE_PENALTY;
614
615 if (m->end_of_file)
616 s->penalty += END_OF_FILE_PENALTY;
617
618 /*
619 * Set post_blank to the number of blank lines following the split,
620 * including the line immediately after the split:
621 */
622 post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
623 total_blank = m->pre_blank + post_blank;
624
625 /* Penalties based on nearby blank lines: */
626 s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
627 s->penalty += POST_BLANK_WEIGHT * post_blank;
628
629 if (m->indent != -1)
630 indent = m->indent;
631 else
632 indent = m->post_indent;
633
634 any_blanks = (total_blank != 0);
635
636 /* Note that the effective indent is -1 at the end of the file: */
637 s->effective_indent += indent;
638
639 if (indent == -1) {
640 /* No additional adjustments needed. */
641 } else if (m->pre_indent == -1) {
642 /* No additional adjustments needed. */
643 } else if (indent > m->pre_indent) {
644 /*
645 * The line is indented more than its predecessor.
646 */
647 s->penalty += any_blanks ?
648 RELATIVE_INDENT_WITH_BLANK_PENALTY :
649 RELATIVE_INDENT_PENALTY;
650 } else if (indent == m->pre_indent) {
651 /*
652 * The line has the same indentation level as its predecessor.
653 * No additional adjustments needed.
654 */
655 } else {
656 /*
657 * The line is indented less than its predecessor. It could be
658 * the block terminator of the previous block, but it could
659 * also be the start of a new block (e.g., an "else" block, or
660 * maybe the previous block didn't have a block terminator).
661 * Try to distinguish those cases based on what comes next:
662 */
663 if (m->post_indent != -1 && m->post_indent > indent) {
664 /*
665 * The following line is indented more. So it is likely
666 * that this line is the start of a block.
667 */
668 s->penalty += any_blanks ?
669 RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
670 RELATIVE_OUTDENT_PENALTY;
671 } else {
672 /*
673 * That was probably the end of a block.
674 */
675 s->penalty += any_blanks ?
676 RELATIVE_DEDENT_WITH_BLANK_PENALTY :
677 RELATIVE_DEDENT_PENALTY;
678 }
679 }
680 }
681
682 static int score_cmp(struct split_score *s1, struct split_score *s2)
683 {
684 /* -1 if s1.effective_indent < s2->effective_indent, etc. */
685 int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
686 (s1->effective_indent < s2->effective_indent));
687
688 return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
689 }
690
691 /*
692 * Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
693 * of lines that was inserted or deleted from the corresponding version of the
694 * file). We consider there to be such a group at the beginning of the file, at
695 * the end of the file, and between any two unchanged lines, though most such
696 * groups will usually be empty.
697 *
698 * If the first line in a group is equal to the line following the group, then
699 * the group can be slid down. Similarly, if the last line in a group is equal
700 * to the line preceding the group, then the group can be slid up. See
701 * group_slide_down() and group_slide_up().
702 *
703 * Note that loops that are testing for changed lines in xdf->rchg do not need
704 * index bounding since the array is prepared with a zero at position -1 and N.
705 */
706 struct xdlgroup {
707 /*
708 * The index of the first changed line in the group, or the index of
709 * the unchanged line above which the (empty) group is located.
710 */
711 long start;
712
713 /*
714 * The index of the first unchanged line after the group. For an empty
715 * group, end is equal to start.
716 */
717 long end;
718 };
719
720 /*
721 * Initialize g to point at the first group in xdf.
722 */
723 static void group_init(xdfile_t *xdf, struct xdlgroup *g)
724 {
725 g->start = g->end = 0;
726 while (xdf->rchg[g->end])
727 g->end++;
728 }
729
730 /*
731 * Move g to describe the next (possibly empty) group in xdf and return 0. If g
732 * is already at the end of the file, do nothing and return -1.
733 */
734 static inline int group_next(xdfile_t *xdf, struct xdlgroup *g)
735 {
736 if (g->end == xdf->nrec)
737 return -1;
738
739 g->start = g->end + 1;
740 for (g->end = g->start; xdf->rchg[g->end]; g->end++)
741 ;
742
743 return 0;
744 }
745
746 /*
747 * Move g to describe the previous (possibly empty) group in xdf and return 0.
748 * If g is already at the beginning of the file, do nothing and return -1.
749 */
750 static inline int group_previous(xdfile_t *xdf, struct xdlgroup *g)
751 {
752 if (g->start == 0)
753 return -1;
754
755 g->end = g->start - 1;
756 for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--)
757 ;
758
759 return 0;
760 }
761
762 /*
763 * If g can be slid toward the end of the file, do so, and if it bumps into a
764 * following group, expand this group to include it. Return 0 on success or -1
765 * if g cannot be slid down.
766 */
767 static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g, long flags)
768 {
769 if (g->end < xdf->nrec &&
770 recs_match(xdf->recs[g->start], xdf->recs[g->end], flags)) {
771 xdf->rchg[g->start++] = 0;
772 xdf->rchg[g->end++] = 1;
773
774 while (xdf->rchg[g->end])
775 g->end++;
776
777 return 0;
778 } else {
779 return -1;
780 }
781 }
782
783 /*
784 * If g can be slid toward the beginning of the file, do so, and if it bumps
785 * into a previous group, expand this group to include it. Return 0 on success
786 * or -1 if g cannot be slid up.
787 */
788 static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g, long flags)
789 {
790 if (g->start > 0 &&
791 recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1], flags)) {
792 xdf->rchg[--g->start] = 1;
793 xdf->rchg[--g->end] = 0;
794
795 while (xdf->rchg[g->start - 1])
796 g->start--;
797
798 return 0;
799 } else {
800 return -1;
801 }
802 }
803
804 static void xdl_bug(const char *msg)
805 {
806 fprintf(stderr, "BUG: %s\n", msg);
807 exit(1);
808 }
809
810 /*
811 * Move back and forward change groups for a consistent and pretty diff output.
812 * This also helps in finding joinable change groups and reducing the diff
813 * size.
814 */
815 int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
816 struct xdlgroup g, go;
817 long earliest_end, end_matching_other;
818 long groupsize;
819
820 group_init(xdf, &g);
821 group_init(xdfo, &go);
822
823 while (1) {
824 /* If the group is empty in the to-be-compacted file, skip it: */
825 if (g.end == g.start)
826 goto next;
827
828 /*
829 * Now shift the change up and then down as far as possible in
830 * each direction. If it bumps into any other changes, merge them.
831 */
832 do {
833 groupsize = g.end - g.start;
834
835 /*
836 * Keep track of the last "end" index that causes this
837 * group to align with a group of changed lines in the
838 * other file. -1 indicates that we haven't found such
839 * a match yet:
840 */
841 end_matching_other = -1;
842
843 /* Shift the group backward as much as possible: */
844 while (!group_slide_up(xdf, &g, flags))
845 if (group_previous(xdfo, &go))
846 xdl_bug("group sync broken sliding up");
847
848 /*
849 * This is this highest that this group can be shifted.
850 * Record its end index:
851 */
852 earliest_end = g.end;
853
854 if (go.end > go.start)
855 end_matching_other = g.end;
856
857 /* Now shift the group forward as far as possible: */
858 while (1) {
859 if (group_slide_down(xdf, &g, flags))
860 break;
861 if (group_next(xdfo, &go))
862 xdl_bug("group sync broken sliding down");
863
864 if (go.end > go.start)
865 end_matching_other = g.end;
866 }
867 } while (groupsize != g.end - g.start);
868
869 /*
870 * If the group can be shifted, then we can possibly use this
871 * freedom to produce a more intuitive diff.
872 *
873 * The group is currently shifted as far down as possible, so the
874 * heuristics below only have to handle upwards shifts.
875 */
876
877 if (g.end == earliest_end) {
878 /* no shifting was possible */
879 } else if (end_matching_other != -1) {
880 /*
881 * Move the possibly merged group of changes back to line
882 * up with the last group of changes from the other file
883 * that it can align with.
884 */
885 while (go.end == go.start) {
886 if (group_slide_up(xdf, &g, flags))
887 xdl_bug("match disappeared");
888 if (group_previous(xdfo, &go))
889 xdl_bug("group sync broken sliding to match");
890 }
891 } else if (flags & XDF_INDENT_HEURISTIC) {
892 /*
893 * Indent heuristic: a group of pure add/delete lines
894 * implies two splits, one between the end of the "before"
895 * context and the start of the group, and another between
896 * the end of the group and the beginning of the "after"
897 * context. Some splits are aesthetically better and some
898 * are worse. We compute a badness "score" for each split,
899 * and add the scores for the two splits to define a
900 * "score" for each position that the group can be shifted
901 * to. Then we pick the shift with the lowest score.
902 */
903 long shift, best_shift = -1;
904 struct split_score best_score;
905
906 for (shift = earliest_end; shift <= g.end; shift++) {
907 struct split_measurement m;
908 struct split_score score = {0, 0};
909
910 measure_split(xdf, shift, &m);
911 score_add_split(&m, &score);
912 measure_split(xdf, shift - groupsize, &m);
913 score_add_split(&m, &score);
914 if (best_shift == -1 ||
915 score_cmp(&score, &best_score) <= 0) {
916 best_score.effective_indent = score.effective_indent;
917 best_score.penalty = score.penalty;
918 best_shift = shift;
919 }
920 }
921
922 while (g.end > best_shift) {
923 if (group_slide_up(xdf, &g, flags))
924 xdl_bug("best shift unreached");
925 if (group_previous(xdfo, &go))
926 xdl_bug("group sync broken sliding to blank line");
927 }
928 }
929
930 next:
931 /* Move past the just-processed group: */
932 if (group_next(xdf, &g))
933 break;
934 if (group_next(xdfo, &go))
935 xdl_bug("group sync broken moving to next group");
936 }
937
938 if (!group_next(xdfo, &go))
939 xdl_bug("group sync broken at end of file");
940
941 return 0;
942 }
943
944
945 int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
946 xdchange_t *cscr = NULL, *xch;
947 char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
948 long i1, i2, l1, l2;
949
950 /*
951 * Trivial. Collects "groups" of changes and creates an edit script.
952 */
953 for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
954 if (rchg1[i1 - 1] || rchg2[i2 - 1]) {
955 for (l1 = i1; rchg1[i1 - 1]; i1--);
956 for (l2 = i2; rchg2[i2 - 1]; i2--);
957
958 if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
959 xdl_free_script(cscr);
960 return -1;
961 }
962 cscr = xch;
963 }
964
965 *xscr = cscr;
966
967 return 0;
968 }
969
970
971 void xdl_free_script(xdchange_t *xscr) {
972 xdchange_t *xch;
973
974 while ((xch = xscr) != NULL) {
975 xscr = xscr->next;
976 xdl_free(xch);
977 }
978 }
979
980 static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
981 xdemitconf_t const *xecfg)
982 {
983 xdchange_t *xch, *xche;
984
985 for (xch = xscr; xch; xch = xche->next) {
986 xche = xdl_get_hunk(&xch, xecfg);
987 if (!xch)
988 break;
989 if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
990 xch->i2, xche->i2 + xche->chg2 - xch->i2,
991 ecb->priv) < 0)
992 return -1;
993 }
994 return 0;
995 }
996
997 static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
998 {
999 xdchange_t *xch;
1000
1001 for (xch = xscr; xch; xch = xch->next) {
1002 int ignore = 1;
1003 xrecord_t **rec;
1004 long i;
1005
1006 rec = &xe->xdf1.recs[xch->i1];
1007 for (i = 0; i < xch->chg1 && ignore; i++)
1008 ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
1009
1010 rec = &xe->xdf2.recs[xch->i2];
1011 for (i = 0; i < xch->chg2 && ignore; i++)
1012 ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
1013
1014 xch->ignore = ignore;
1015 }
1016 }
1017
1018 int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
1019 xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
1020 xdchange_t *xscr;
1021 xdfenv_t xe;
1022 emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;
1023
1024 if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {
1025
1026 return -1;
1027 }
1028 if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
1029 xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
1030 xdl_build_script(&xe, &xscr) < 0) {
1031
1032 xdl_free_env(&xe);
1033 return -1;
1034 }
1035 if (xscr) {
1036 if (xpp->flags & XDF_IGNORE_BLANK_LINES)
1037 xdl_mark_ignorable(xscr, &xe, xpp->flags);
1038
1039 if (ef(&xe, xscr, ecb, xecfg) < 0) {
1040
1041 xdl_free_script(xscr);
1042 xdl_free_env(&xe);
1043 return -1;
1044 }
1045 xdl_free_script(xscr);
1046 }
1047 xdl_free_env(&xe);
1048
1049 return 0;
1050 }