author | Pierre-Yves David <pierre-yves.david@octobus.net> |
Tue, 20 Apr 2021 04:27:03 +0200 | |
changeset 46990 | 0b569c75d180 |
parent 36923 | d40b9e29c114 |
permissions | -rw-r--r-- |
36671 | 1 |
/* |
2 |
* LibXDiff by Davide Libenzi ( File Differential Library ) |
|
3 |
* Copyright (C) 2003 Davide Libenzi |
|
4 |
* |
|
5 |
* This library is free software; you can redistribute it and/or |
|
6 |
* modify it under the terms of the GNU Lesser General Public |
|
7 |
* License as published by the Free Software Foundation; either |
|
8 |
* version 2.1 of the License, or (at your option) any later version. |
|
9 |
* |
|
10 |
* This library is distributed in the hope that it will be useful, |
|
11 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
* Lesser General Public License for more details. |
|
14 |
* |
|
15 |
* You should have received a copy of the GNU Lesser General Public |
|
16 |
* License along with this library; if not, see |
|
17 |
* <http://www.gnu.org/licenses/>. |
|
18 |
* |
|
19 |
* Davide Libenzi <davidel@xmailserver.org> |
|
20 |
* |
|
21 |
*/ |
|
22 |
||
23 |
#include "xinclude.h" |
|
24 |
||
25 |
||
26 |
||
27 |
#define XDL_MAX_COST_MIN 256 |
|
28 |
#define XDL_HEUR_MIN_COST 256 |
|
29 |
#define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1) |
|
30 |
#define XDL_SNAKE_CNT 20 |
|
31 |
#define XDL_K_HEUR 4 |
|
32 |
||
36723
1f9bbd1d6b8a
xdiff: fix builds on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36674
diff
changeset
|
33 |
/* VC 2008 doesn't know about the inline keyword. */ |
1f9bbd1d6b8a
xdiff: fix builds on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36674
diff
changeset
|
34 |
#if defined(_MSC_VER) |
1f9bbd1d6b8a
xdiff: fix builds on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36674
diff
changeset
|
35 |
#define inline __forceinline |
1f9bbd1d6b8a
xdiff: fix builds on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36674
diff
changeset
|
36 |
#endif |
36671 | 37 |
|
38 |
||
39 |
typedef struct s_xdpsplit { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
40 |
int64_t i1, i2; |
36671 | 41 |
int min_lo, min_hi; |
42 |
} xdpsplit_t; |
|
43 |
||
44 |
||
45 |
||
46 |
||
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
47 |
static int64_t xdl_split(uint64_t const *ha1, int64_t off1, int64_t lim1, |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
48 |
uint64_t const *ha2, int64_t off2, int64_t lim2, |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
49 |
int64_t *kvdf, int64_t *kvdb, int need_min, xdpsplit_t *spl, |
36671 | 50 |
xdalgoenv_t *xenv); |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
51 |
static xdchange_t *xdl_add_change(xdchange_t *xscr, int64_t i1, int64_t i2, int64_t chg1, int64_t chg2); |
36671 | 52 |
|
53 |
||
54 |
||
55 |
||
56 |
||
57 |
/* |
|
58 |
* See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers. |
|
59 |
* Basically considers a "box" (off1, off2, lim1, lim2) and scan from both |
|
60 |
* the forward diagonal starting from (off1, off2) and the backward diagonal |
|
61 |
* starting from (lim1, lim2). If the K values on the same diagonal crosses |
|
62 |
* returns the furthest point of reach. We might end up having to expensive |
|
63 |
* cases using this algorithm is full, so a little bit of heuristic is needed |
|
64 |
* to cut the search and to return a suboptimal point. |
|
65 |
*/ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
66 |
static int64_t xdl_split(uint64_t const *ha1, int64_t off1, int64_t lim1, |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
67 |
uint64_t const *ha2, int64_t off2, int64_t lim2, |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
68 |
int64_t *kvdf, int64_t *kvdb, int need_min, xdpsplit_t *spl, |
36671 | 69 |
xdalgoenv_t *xenv) { |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
70 |
int64_t dmin = off1 - lim2, dmax = lim1 - off2; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
71 |
int64_t fmid = off1 - off2, bmid = lim1 - lim2; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
72 |
int64_t odd = (fmid - bmid) & 1; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
73 |
int64_t fmin = fmid, fmax = fmid; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
74 |
int64_t bmin = bmid, bmax = bmid; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
75 |
int64_t ec, d, i1, i2, prev1, best, dd, v, k; |
36671 | 76 |
|
77 |
/* |
|
78 |
* Set initial diagonal values for both forward and backward path. |
|
79 |
*/ |
|
80 |
kvdf[fmid] = off1; |
|
81 |
kvdb[bmid] = lim1; |
|
82 |
||
83 |
for (ec = 1;; ec++) { |
|
84 |
int got_snake = 0; |
|
85 |
||
86 |
/* |
|
87 |
* We need to extent the diagonal "domain" by one. If the next |
|
88 |
* values exits the box boundaries we need to change it in the |
|
89 |
* opposite direction because (max - min) must be a power of two. |
|
90 |
* Also we initialize the external K value to -1 so that we can |
|
91 |
* avoid extra conditions check inside the core loop. |
|
92 |
*/ |
|
93 |
if (fmin > dmin) |
|
94 |
kvdf[--fmin - 1] = -1; |
|
95 |
else |
|
96 |
++fmin; |
|
97 |
if (fmax < dmax) |
|
98 |
kvdf[++fmax + 1] = -1; |
|
99 |
else |
|
100 |
--fmax; |
|
101 |
||
102 |
for (d = fmax; d >= fmin; d -= 2) { |
|
103 |
if (kvdf[d - 1] >= kvdf[d + 1]) |
|
104 |
i1 = kvdf[d - 1] + 1; |
|
105 |
else |
|
106 |
i1 = kvdf[d + 1]; |
|
107 |
prev1 = i1; |
|
108 |
i2 = i1 - d; |
|
109 |
for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++); |
|
110 |
if (i1 - prev1 > xenv->snake_cnt) |
|
111 |
got_snake = 1; |
|
112 |
kvdf[d] = i1; |
|
113 |
if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) { |
|
114 |
spl->i1 = i1; |
|
115 |
spl->i2 = i2; |
|
116 |
spl->min_lo = spl->min_hi = 1; |
|
117 |
return ec; |
|
118 |
} |
|
119 |
} |
|
120 |
||
121 |
/* |
|
122 |
* We need to extent the diagonal "domain" by one. If the next |
|
123 |
* values exits the box boundaries we need to change it in the |
|
124 |
* opposite direction because (max - min) must be a power of two. |
|
125 |
* Also we initialize the external K value to -1 so that we can |
|
126 |
* avoid extra conditions check inside the core loop. |
|
127 |
*/ |
|
128 |
if (bmin > dmin) |
|
129 |
kvdb[--bmin - 1] = XDL_LINE_MAX; |
|
130 |
else |
|
131 |
++bmin; |
|
132 |
if (bmax < dmax) |
|
133 |
kvdb[++bmax + 1] = XDL_LINE_MAX; |
|
134 |
else |
|
135 |
--bmax; |
|
136 |
||
137 |
for (d = bmax; d >= bmin; d -= 2) { |
|
138 |
if (kvdb[d - 1] < kvdb[d + 1]) |
|
139 |
i1 = kvdb[d - 1]; |
|
140 |
else |
|
141 |
i1 = kvdb[d + 1] - 1; |
|
142 |
prev1 = i1; |
|
143 |
i2 = i1 - d; |
|
144 |
for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--); |
|
145 |
if (prev1 - i1 > xenv->snake_cnt) |
|
146 |
got_snake = 1; |
|
147 |
kvdb[d] = i1; |
|
148 |
if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) { |
|
149 |
spl->i1 = i1; |
|
150 |
spl->i2 = i2; |
|
151 |
spl->min_lo = spl->min_hi = 1; |
|
152 |
return ec; |
|
153 |
} |
|
154 |
} |
|
155 |
||
156 |
if (need_min) |
|
157 |
continue; |
|
158 |
||
159 |
/* |
|
160 |
* If the edit cost is above the heuristic trigger and if |
|
161 |
* we got a good snake, we sample current diagonals to see |
|
162 |
* if some of the, have reached an "interesting" path. Our |
|
163 |
* measure is a function of the distance from the diagonal |
|
164 |
* corner (i1 + i2) penalized with the distance from the |
|
165 |
* mid diagonal itself. If this value is above the current |
|
166 |
* edit cost times a magic factor (XDL_K_HEUR) we consider |
|
167 |
* it interesting. |
|
168 |
*/ |
|
169 |
if (got_snake && ec > xenv->heur_min) { |
|
170 |
for (best = 0, d = fmax; d >= fmin; d -= 2) { |
|
171 |
dd = d > fmid ? d - fmid: fmid - d; |
|
172 |
i1 = kvdf[d]; |
|
173 |
i2 = i1 - d; |
|
174 |
v = (i1 - off1) + (i2 - off2) - dd; |
|
175 |
||
176 |
if (v > XDL_K_HEUR * ec && v > best && |
|
177 |
off1 + xenv->snake_cnt <= i1 && i1 < lim1 && |
|
178 |
off2 + xenv->snake_cnt <= i2 && i2 < lim2) { |
|
179 |
for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++) |
|
180 |
if (k == xenv->snake_cnt) { |
|
181 |
best = v; |
|
182 |
spl->i1 = i1; |
|
183 |
spl->i2 = i2; |
|
184 |
break; |
|
185 |
} |
|
186 |
} |
|
187 |
} |
|
188 |
if (best > 0) { |
|
189 |
spl->min_lo = 1; |
|
190 |
spl->min_hi = 0; |
|
191 |
return ec; |
|
192 |
} |
|
193 |
||
194 |
for (best = 0, d = bmax; d >= bmin; d -= 2) { |
|
195 |
dd = d > bmid ? d - bmid: bmid - d; |
|
196 |
i1 = kvdb[d]; |
|
197 |
i2 = i1 - d; |
|
198 |
v = (lim1 - i1) + (lim2 - i2) - dd; |
|
199 |
||
200 |
if (v > XDL_K_HEUR * ec && v > best && |
|
201 |
off1 < i1 && i1 <= lim1 - xenv->snake_cnt && |
|
202 |
off2 < i2 && i2 <= lim2 - xenv->snake_cnt) { |
|
203 |
for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++) |
|
204 |
if (k == xenv->snake_cnt - 1) { |
|
205 |
best = v; |
|
206 |
spl->i1 = i1; |
|
207 |
spl->i2 = i2; |
|
208 |
break; |
|
209 |
} |
|
210 |
} |
|
211 |
} |
|
212 |
if (best > 0) { |
|
213 |
spl->min_lo = 0; |
|
214 |
spl->min_hi = 1; |
|
215 |
return ec; |
|
216 |
} |
|
217 |
} |
|
218 |
||
219 |
/* |
|
220 |
* Enough is enough. We spent too much time here and now we collect |
|
221 |
* the furthest reaching path using the (i1 + i2) measure. |
|
222 |
*/ |
|
223 |
if (ec >= xenv->mxcost) { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
224 |
int64_t fbest, fbest1, bbest, bbest1; |
36671 | 225 |
|
226 |
fbest = fbest1 = -1; |
|
227 |
for (d = fmax; d >= fmin; d -= 2) { |
|
228 |
i1 = XDL_MIN(kvdf[d], lim1); |
|
229 |
i2 = i1 - d; |
|
230 |
if (lim2 < i2) |
|
231 |
i1 = lim2 + d, i2 = lim2; |
|
232 |
if (fbest < i1 + i2) { |
|
233 |
fbest = i1 + i2; |
|
234 |
fbest1 = i1; |
|
235 |
} |
|
236 |
} |
|
237 |
||
238 |
bbest = bbest1 = XDL_LINE_MAX; |
|
239 |
for (d = bmax; d >= bmin; d -= 2) { |
|
240 |
i1 = XDL_MAX(off1, kvdb[d]); |
|
241 |
i2 = i1 - d; |
|
242 |
if (i2 < off2) |
|
243 |
i1 = off2 + d, i2 = off2; |
|
244 |
if (i1 + i2 < bbest) { |
|
245 |
bbest = i1 + i2; |
|
246 |
bbest1 = i1; |
|
247 |
} |
|
248 |
} |
|
249 |
||
250 |
if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) { |
|
251 |
spl->i1 = fbest1; |
|
252 |
spl->i2 = fbest - fbest1; |
|
253 |
spl->min_lo = 1; |
|
254 |
spl->min_hi = 0; |
|
255 |
} else { |
|
256 |
spl->i1 = bbest1; |
|
257 |
spl->i2 = bbest - bbest1; |
|
258 |
spl->min_lo = 0; |
|
259 |
spl->min_hi = 1; |
|
260 |
} |
|
261 |
return ec; |
|
262 |
} |
|
263 |
} |
|
264 |
} |
|
265 |
||
266 |
||
267 |
/* |
|
268 |
* Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling |
|
269 |
* the box splitting function. Note that the real job (marking changed lines) |
|
270 |
* is done in the two boundary reaching checks. |
|
271 |
*/ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
272 |
int xdl_recs_cmp(diffdata_t *dd1, int64_t off1, int64_t lim1, |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
273 |
diffdata_t *dd2, int64_t off2, int64_t lim2, |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
274 |
int64_t *kvdf, int64_t *kvdb, int need_min, xdalgoenv_t *xenv) { |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
275 |
uint64_t const *ha1 = dd1->ha, *ha2 = dd2->ha; |
36671 | 276 |
|
277 |
/* |
|
278 |
* Shrink the box by walking through each diagonal snake (SW and NE). |
|
279 |
*/ |
|
280 |
for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++); |
|
281 |
for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--); |
|
282 |
||
283 |
/* |
|
284 |
* If one dimension is empty, then all records on the other one must |
|
285 |
* be obviously changed. |
|
286 |
*/ |
|
287 |
if (off1 == lim1) { |
|
288 |
char *rchg2 = dd2->rchg; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
289 |
int64_t *rindex2 = dd2->rindex; |
36671 | 290 |
|
291 |
for (; off2 < lim2; off2++) |
|
292 |
rchg2[rindex2[off2]] = 1; |
|
293 |
} else if (off2 == lim2) { |
|
294 |
char *rchg1 = dd1->rchg; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
295 |
int64_t *rindex1 = dd1->rindex; |
36671 | 296 |
|
297 |
for (; off1 < lim1; off1++) |
|
298 |
rchg1[rindex1[off1]] = 1; |
|
299 |
} else { |
|
300 |
xdpsplit_t spl; |
|
301 |
spl.i1 = spl.i2 = 0; |
|
302 |
||
303 |
/* |
|
304 |
* Divide ... |
|
305 |
*/ |
|
306 |
if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb, |
|
307 |
need_min, &spl, xenv) < 0) { |
|
308 |
||
309 |
return -1; |
|
310 |
} |
|
311 |
||
312 |
/* |
|
313 |
* ... et Impera. |
|
314 |
*/ |
|
315 |
if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2, |
|
316 |
kvdf, kvdb, spl.min_lo, xenv) < 0 || |
|
317 |
xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2, |
|
318 |
kvdf, kvdb, spl.min_hi, xenv) < 0) { |
|
319 |
||
320 |
return -1; |
|
321 |
} |
|
322 |
} |
|
323 |
||
324 |
return 0; |
|
325 |
} |
|
326 |
||
327 |
||
328 |
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, |
|
329 |
xdfenv_t *xe) { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
330 |
int64_t ndiags; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
331 |
int64_t *kvd, *kvdf, *kvdb; |
36671 | 332 |
xdalgoenv_t xenv; |
333 |
diffdata_t dd1, dd2; |
|
334 |
||
335 |
if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) { |
|
336 |
||
337 |
return -1; |
|
338 |
} |
|
339 |
||
340 |
/* |
|
341 |
* Allocate and setup K vectors to be used by the differential algorithm. |
|
342 |
* One is to store the forward path and one to store the backward path. |
|
343 |
*/ |
|
344 |
ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3; |
|
36923
d40b9e29c114
xdiff: fix a hard crash on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36826
diff
changeset
|
345 |
if (!(kvd = (int64_t *) xdl_malloc((2 * ndiags + 2) * sizeof(int64_t)))) { |
36671 | 346 |
|
347 |
xdl_free_env(xe); |
|
348 |
return -1; |
|
349 |
} |
|
350 |
kvdf = kvd; |
|
351 |
kvdb = kvdf + ndiags; |
|
352 |
kvdf += xe->xdf2.nreff + 1; |
|
353 |
kvdb += xe->xdf2.nreff + 1; |
|
354 |
||
355 |
xenv.mxcost = xdl_bogosqrt(ndiags); |
|
356 |
if (xenv.mxcost < XDL_MAX_COST_MIN) |
|
357 |
xenv.mxcost = XDL_MAX_COST_MIN; |
|
358 |
xenv.snake_cnt = XDL_SNAKE_CNT; |
|
359 |
xenv.heur_min = XDL_HEUR_MIN_COST; |
|
360 |
||
361 |
dd1.nrec = xe->xdf1.nreff; |
|
362 |
dd1.ha = xe->xdf1.ha; |
|
363 |
dd1.rchg = xe->xdf1.rchg; |
|
364 |
dd1.rindex = xe->xdf1.rindex; |
|
365 |
dd2.nrec = xe->xdf2.nreff; |
|
366 |
dd2.ha = xe->xdf2.ha; |
|
367 |
dd2.rchg = xe->xdf2.rchg; |
|
368 |
dd2.rindex = xe->xdf2.rindex; |
|
369 |
||
370 |
if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec, |
|
371 |
kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) { |
|
372 |
||
373 |
xdl_free(kvd); |
|
374 |
xdl_free_env(xe); |
|
375 |
return -1; |
|
376 |
} |
|
377 |
||
378 |
xdl_free(kvd); |
|
379 |
||
380 |
return 0; |
|
381 |
} |
|
382 |
||
383 |
||
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
384 |
static xdchange_t *xdl_add_change(xdchange_t *xscr, int64_t i1, int64_t i2, int64_t chg1, int64_t chg2) { |
36671 | 385 |
xdchange_t *xch; |
386 |
||
387 |
if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t)))) |
|
388 |
return NULL; |
|
389 |
||
390 |
xch->next = xscr; |
|
391 |
xch->i1 = i1; |
|
392 |
xch->i2 = i2; |
|
393 |
xch->chg1 = chg1; |
|
394 |
xch->chg2 = chg2; |
|
395 |
xch->ignore = 0; |
|
396 |
||
397 |
return xch; |
|
398 |
} |
|
399 |
||
400 |
||
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
401 |
static int recs_match(xrecord_t *rec1, xrecord_t *rec2) |
36671 | 402 |
{ |
403 |
return (rec1->ha == rec2->ha && |
|
404 |
xdl_recmatch(rec1->ptr, rec1->size, |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
405 |
rec2->ptr, rec2->size)); |
36671 | 406 |
} |
407 |
||
408 |
/* |
|
409 |
* If a line is indented more than this, get_indent() just returns this value. |
|
410 |
* This avoids having to do absurd amounts of work for data that are not |
|
411 |
* human-readable text, and also ensures that the output of get_indent fits within |
|
412 |
* an int. |
|
413 |
*/ |
|
414 |
#define MAX_INDENT 200 |
|
415 |
||
416 |
/* |
|
417 |
* Return the amount of indentation of the specified line, treating TAB as 8 |
|
418 |
* columns. Return -1 if line is empty or contains only whitespace. Clamp the |
|
419 |
* output value at MAX_INDENT. |
|
420 |
*/ |
|
421 |
static int get_indent(xrecord_t *rec) |
|
422 |
{ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
423 |
int64_t i; |
36671 | 424 |
int ret = 0; |
425 |
||
426 |
for (i = 0; i < rec->size; i++) { |
|
427 |
char c = rec->ptr[i]; |
|
428 |
||
429 |
if (!XDL_ISSPACE(c)) |
|
430 |
return ret; |
|
431 |
else if (c == ' ') |
|
432 |
ret += 1; |
|
433 |
else if (c == '\t') |
|
434 |
ret += 8 - ret % 8; |
|
435 |
/* ignore other whitespace characters */ |
|
436 |
||
437 |
if (ret >= MAX_INDENT) |
|
438 |
return MAX_INDENT; |
|
439 |
} |
|
440 |
||
441 |
/* The line contains only whitespace. */ |
|
442 |
return -1; |
|
443 |
} |
|
444 |
||
445 |
/* |
|
446 |
* If more than this number of consecutive blank rows are found, just return this |
|
447 |
* value. This avoids requiring O(N^2) work for pathological cases, and also |
|
448 |
* ensures that the output of score_split fits in an int. |
|
449 |
*/ |
|
450 |
#define MAX_BLANKS 20 |
|
451 |
||
452 |
/* Characteristics measured about a hypothetical split position. */ |
|
453 |
struct split_measurement { |
|
454 |
/* |
|
455 |
* Is the split at the end of the file (aside from any blank lines)? |
|
456 |
*/ |
|
457 |
int end_of_file; |
|
458 |
||
459 |
/* |
|
460 |
* How much is the line immediately following the split indented (or -1 if |
|
461 |
* the line is blank): |
|
462 |
*/ |
|
463 |
int indent; |
|
464 |
||
465 |
/* |
|
466 |
* How many consecutive lines above the split are blank? |
|
467 |
*/ |
|
468 |
int pre_blank; |
|
469 |
||
470 |
/* |
|
471 |
* How much is the nearest non-blank line above the split indented (or -1 |
|
472 |
* if there is no such line)? |
|
473 |
*/ |
|
474 |
int pre_indent; |
|
475 |
||
476 |
/* |
|
477 |
* How many lines after the line following the split are blank? |
|
478 |
*/ |
|
479 |
int post_blank; |
|
480 |
||
481 |
/* |
|
482 |
* How much is the nearest non-blank line after the line following the |
|
483 |
* split indented (or -1 if there is no such line)? |
|
484 |
*/ |
|
485 |
int post_indent; |
|
486 |
}; |
|
487 |
||
488 |
struct split_score { |
|
489 |
/* The effective indent of this split (smaller is preferred). */ |
|
490 |
int effective_indent; |
|
491 |
||
492 |
/* Penalty for this split (smaller is preferred). */ |
|
493 |
int penalty; |
|
494 |
}; |
|
495 |
||
496 |
/* |
|
497 |
* Fill m with information about a hypothetical split of xdf above line split. |
|
498 |
*/ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
499 |
static void measure_split(const xdfile_t *xdf, int64_t split, |
36671 | 500 |
struct split_measurement *m) |
501 |
{ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
502 |
int64_t i; |
36671 | 503 |
|
504 |
if (split >= xdf->nrec) { |
|
505 |
m->end_of_file = 1; |
|
506 |
m->indent = -1; |
|
507 |
} else { |
|
508 |
m->end_of_file = 0; |
|
509 |
m->indent = get_indent(xdf->recs[split]); |
|
510 |
} |
|
511 |
||
512 |
m->pre_blank = 0; |
|
513 |
m->pre_indent = -1; |
|
514 |
for (i = split - 1; i >= 0; i--) { |
|
515 |
m->pre_indent = get_indent(xdf->recs[i]); |
|
516 |
if (m->pre_indent != -1) |
|
517 |
break; |
|
518 |
m->pre_blank += 1; |
|
519 |
if (m->pre_blank == MAX_BLANKS) { |
|
520 |
m->pre_indent = 0; |
|
521 |
break; |
|
522 |
} |
|
523 |
} |
|
524 |
||
525 |
m->post_blank = 0; |
|
526 |
m->post_indent = -1; |
|
527 |
for (i = split + 1; i < xdf->nrec; i++) { |
|
528 |
m->post_indent = get_indent(xdf->recs[i]); |
|
529 |
if (m->post_indent != -1) |
|
530 |
break; |
|
531 |
m->post_blank += 1; |
|
532 |
if (m->post_blank == MAX_BLANKS) { |
|
533 |
m->post_indent = 0; |
|
534 |
break; |
|
535 |
} |
|
536 |
} |
|
537 |
} |
|
538 |
||
539 |
/* |
|
540 |
* The empirically-determined weight factors used by score_split() below. |
|
541 |
* Larger values means that the position is a less favorable place to split. |
|
542 |
* |
|
543 |
* Note that scores are only ever compared against each other, so multiplying |
|
544 |
* all of these weight/penalty values by the same factor wouldn't change the |
|
545 |
* heuristic's behavior. Still, we need to set that arbitrary scale *somehow*. |
|
546 |
* In practice, these numbers are chosen to be large enough that they can be |
|
547 |
* adjusted relative to each other with sufficient precision despite using |
|
548 |
* integer math. |
|
549 |
*/ |
|
550 |
||
551 |
/* Penalty if there are no non-blank lines before the split */ |
|
552 |
#define START_OF_FILE_PENALTY 1 |
|
553 |
||
554 |
/* Penalty if there are no non-blank lines after the split */ |
|
555 |
#define END_OF_FILE_PENALTY 21 |
|
556 |
||
557 |
/* Multiplier for the number of blank lines around the split */ |
|
558 |
#define TOTAL_BLANK_WEIGHT (-30) |
|
559 |
||
560 |
/* Multiplier for the number of blank lines after the split */ |
|
561 |
#define POST_BLANK_WEIGHT 6 |
|
562 |
||
563 |
/* |
|
564 |
* Penalties applied if the line is indented more than its predecessor |
|
565 |
*/ |
|
566 |
#define RELATIVE_INDENT_PENALTY (-4) |
|
567 |
#define RELATIVE_INDENT_WITH_BLANK_PENALTY 10 |
|
568 |
||
569 |
/* |
|
570 |
* Penalties applied if the line is indented less than both its predecessor and |
|
571 |
* its successor |
|
572 |
*/ |
|
573 |
#define RELATIVE_OUTDENT_PENALTY 24 |
|
574 |
#define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17 |
|
575 |
||
576 |
/* |
|
577 |
* Penalties applied if the line is indented less than its predecessor but not |
|
578 |
* less than its successor |
|
579 |
*/ |
|
580 |
#define RELATIVE_DEDENT_PENALTY 23 |
|
581 |
#define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17 |
|
582 |
||
583 |
/* |
|
584 |
* We only consider whether the sum of the effective indents for splits are |
|
585 |
* less than (-1), equal to (0), or greater than (+1) each other. The resulting |
|
586 |
* value is multiplied by the following weight and combined with the penalty to |
|
587 |
* determine the better of two scores. |
|
588 |
*/ |
|
589 |
#define INDENT_WEIGHT 60 |
|
590 |
||
591 |
/* |
|
592 |
* Compute a badness score for the hypothetical split whose measurements are |
|
593 |
* stored in m. The weight factors were determined empirically using the tools and |
|
594 |
* corpus described in |
|
595 |
* |
|
596 |
* https://github.com/mhagger/diff-slider-tools |
|
597 |
* |
|
598 |
* Also see that project if you want to improve the weights based on, for example, |
|
599 |
* a larger or more diverse corpus. |
|
600 |
*/ |
|
601 |
static void score_add_split(const struct split_measurement *m, struct split_score *s) |
|
602 |
{ |
|
603 |
/* |
|
604 |
* A place to accumulate penalty factors (positive makes this index more |
|
605 |
* favored): |
|
606 |
*/ |
|
607 |
int post_blank, total_blank, indent, any_blanks; |
|
608 |
||
609 |
if (m->pre_indent == -1 && m->pre_blank == 0) |
|
610 |
s->penalty += START_OF_FILE_PENALTY; |
|
611 |
||
612 |
if (m->end_of_file) |
|
613 |
s->penalty += END_OF_FILE_PENALTY; |
|
614 |
||
615 |
/* |
|
616 |
* Set post_blank to the number of blank lines following the split, |
|
617 |
* including the line immediately after the split: |
|
618 |
*/ |
|
619 |
post_blank = (m->indent == -1) ? 1 + m->post_blank : 0; |
|
620 |
total_blank = m->pre_blank + post_blank; |
|
621 |
||
622 |
/* Penalties based on nearby blank lines: */ |
|
623 |
s->penalty += TOTAL_BLANK_WEIGHT * total_blank; |
|
624 |
s->penalty += POST_BLANK_WEIGHT * post_blank; |
|
625 |
||
626 |
if (m->indent != -1) |
|
627 |
indent = m->indent; |
|
628 |
else |
|
629 |
indent = m->post_indent; |
|
630 |
||
631 |
any_blanks = (total_blank != 0); |
|
632 |
||
633 |
/* Note that the effective indent is -1 at the end of the file: */ |
|
634 |
s->effective_indent += indent; |
|
635 |
||
636 |
if (indent == -1) { |
|
637 |
/* No additional adjustments needed. */ |
|
638 |
} else if (m->pre_indent == -1) { |
|
639 |
/* No additional adjustments needed. */ |
|
640 |
} else if (indent > m->pre_indent) { |
|
641 |
/* |
|
642 |
* The line is indented more than its predecessor. |
|
643 |
*/ |
|
644 |
s->penalty += any_blanks ? |
|
645 |
RELATIVE_INDENT_WITH_BLANK_PENALTY : |
|
646 |
RELATIVE_INDENT_PENALTY; |
|
647 |
} else if (indent == m->pre_indent) { |
|
648 |
/* |
|
649 |
* The line has the same indentation level as its predecessor. |
|
650 |
* No additional adjustments needed. |
|
651 |
*/ |
|
652 |
} else { |
|
653 |
/* |
|
654 |
* The line is indented less than its predecessor. It could be |
|
655 |
* the block terminator of the previous block, but it could |
|
656 |
* also be the start of a new block (e.g., an "else" block, or |
|
657 |
* maybe the previous block didn't have a block terminator). |
|
658 |
* Try to distinguish those cases based on what comes next: |
|
659 |
*/ |
|
660 |
if (m->post_indent != -1 && m->post_indent > indent) { |
|
661 |
/* |
|
662 |
* The following line is indented more. So it is likely |
|
663 |
* that this line is the start of a block. |
|
664 |
*/ |
|
665 |
s->penalty += any_blanks ? |
|
666 |
RELATIVE_OUTDENT_WITH_BLANK_PENALTY : |
|
667 |
RELATIVE_OUTDENT_PENALTY; |
|
668 |
} else { |
|
669 |
/* |
|
670 |
* That was probably the end of a block. |
|
671 |
*/ |
|
672 |
s->penalty += any_blanks ? |
|
673 |
RELATIVE_DEDENT_WITH_BLANK_PENALTY : |
|
674 |
RELATIVE_DEDENT_PENALTY; |
|
675 |
} |
|
676 |
} |
|
677 |
} |
|
678 |
||
679 |
static int score_cmp(struct split_score *s1, struct split_score *s2) |
|
680 |
{ |
|
681 |
/* -1 if s1.effective_indent < s2->effective_indent, etc. */ |
|
682 |
int cmp_indents = ((s1->effective_indent > s2->effective_indent) - |
|
683 |
(s1->effective_indent < s2->effective_indent)); |
|
684 |
||
685 |
return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty); |
|
686 |
} |
|
687 |
||
688 |
/* |
|
689 |
* Represent a group of changed lines in an xdfile_t (i.e., a contiguous group |
|
690 |
* of lines that was inserted or deleted from the corresponding version of the |
|
691 |
* file). We consider there to be such a group at the beginning of the file, at |
|
692 |
* the end of the file, and between any two unchanged lines, though most such |
|
693 |
* groups will usually be empty. |
|
694 |
* |
|
695 |
* If the first line in a group is equal to the line following the group, then |
|
696 |
* the group can be slid down. Similarly, if the last line in a group is equal |
|
697 |
* to the line preceding the group, then the group can be slid up. See |
|
698 |
* group_slide_down() and group_slide_up(). |
|
699 |
* |
|
700 |
* Note that loops that are testing for changed lines in xdf->rchg do not need |
|
701 |
* index bounding since the array is prepared with a zero at position -1 and N. |
|
702 |
*/ |
|
703 |
struct xdlgroup { |
|
704 |
/* |
|
705 |
* The index of the first changed line in the group, or the index of |
|
706 |
* the unchanged line above which the (empty) group is located. |
|
707 |
*/ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
708 |
int64_t start; |
36671 | 709 |
|
710 |
/* |
|
711 |
* The index of the first unchanged line after the group. For an empty |
|
712 |
* group, end is equal to start. |
|
713 |
*/ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
714 |
int64_t end; |
36671 | 715 |
}; |
716 |
||
717 |
/* |
|
718 |
* Initialize g to point at the first group in xdf. |
|
719 |
*/ |
|
720 |
static void group_init(xdfile_t *xdf, struct xdlgroup *g) |
|
721 |
{ |
|
722 |
g->start = g->end = 0; |
|
723 |
while (xdf->rchg[g->end]) |
|
724 |
g->end++; |
|
725 |
} |
|
726 |
||
727 |
/* |
|
728 |
* Move g to describe the next (possibly empty) group in xdf and return 0. If g |
|
729 |
* is already at the end of the file, do nothing and return -1. |
|
730 |
*/ |
|
731 |
static inline int group_next(xdfile_t *xdf, struct xdlgroup *g) |
|
732 |
{ |
|
733 |
if (g->end == xdf->nrec) |
|
734 |
return -1; |
|
735 |
||
736 |
g->start = g->end + 1; |
|
737 |
for (g->end = g->start; xdf->rchg[g->end]; g->end++) |
|
738 |
; |
|
739 |
||
740 |
return 0; |
|
741 |
} |
|
742 |
||
743 |
/* |
|
744 |
* Move g to describe the previous (possibly empty) group in xdf and return 0. |
|
745 |
* If g is already at the beginning of the file, do nothing and return -1. |
|
746 |
*/ |
|
747 |
static inline int group_previous(xdfile_t *xdf, struct xdlgroup *g) |
|
748 |
{ |
|
749 |
if (g->start == 0) |
|
750 |
return -1; |
|
751 |
||
752 |
g->end = g->start - 1; |
|
753 |
for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--) |
|
754 |
; |
|
755 |
||
756 |
return 0; |
|
757 |
} |
|
758 |
||
759 |
/* |
|
760 |
* If g can be slid toward the end of the file, do so, and if it bumps into a |
|
761 |
* following group, expand this group to include it. Return 0 on success or -1 |
|
762 |
* if g cannot be slid down. |
|
763 |
*/ |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
764 |
static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g) |
36671 | 765 |
{ |
766 |
if (g->end < xdf->nrec && |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
767 |
recs_match(xdf->recs[g->start], xdf->recs[g->end])) { |
36671 | 768 |
xdf->rchg[g->start++] = 0; |
769 |
xdf->rchg[g->end++] = 1; |
|
770 |
||
771 |
while (xdf->rchg[g->end]) |
|
772 |
g->end++; |
|
773 |
||
774 |
return 0; |
|
775 |
} else { |
|
776 |
return -1; |
|
777 |
} |
|
778 |
} |
|
779 |
||
780 |
/* |
|
781 |
* If g can be slid toward the beginning of the file, do so, and if it bumps |
|
782 |
* into a previous group, expand this group to include it. Return 0 on success |
|
783 |
* or -1 if g cannot be slid up. |
|
784 |
*/ |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
785 |
static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g) |
36671 | 786 |
{ |
787 |
if (g->start > 0 && |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
788 |
recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1])) { |
36671 | 789 |
xdf->rchg[--g->start] = 1; |
790 |
xdf->rchg[--g->end] = 0; |
|
791 |
||
792 |
while (xdf->rchg[g->start - 1]) |
|
793 |
g->start--; |
|
794 |
||
795 |
return 0; |
|
796 |
} else { |
|
797 |
return -1; |
|
798 |
} |
|
799 |
} |
|
800 |
||
801 |
static void xdl_bug(const char *msg) |
|
802 |
{ |
|
803 |
fprintf(stderr, "BUG: %s\n", msg); |
|
804 |
exit(1); |
|
805 |
} |
|
806 |
||
807 |
/* |
|
36674
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
808 |
* For indentation heuristic, skip searching for better slide position after |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
809 |
* checking MAX_BORING lines without finding an improvement. This defends the |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
810 |
* indentation heuristic logic against pathological cases. The value is not |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
811 |
* picked scientifically but should be good enough. |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
812 |
*/ |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
813 |
#define MAX_BORING 100 |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
814 |
|
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
815 |
/* |
36671 | 816 |
* Move back and forward change groups for a consistent and pretty diff output. |
817 |
* This also helps in finding joinable change groups and reducing the diff |
|
818 |
* size. |
|
819 |
*/ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
820 |
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, int64_t flags) { |
36671 | 821 |
struct xdlgroup g, go; |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
822 |
int64_t earliest_end, end_matching_other; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
823 |
int64_t groupsize; |
36671 | 824 |
|
825 |
group_init(xdf, &g); |
|
826 |
group_init(xdfo, &go); |
|
827 |
||
828 |
while (1) { |
|
829 |
/* If the group is empty in the to-be-compacted file, skip it: */ |
|
830 |
if (g.end == g.start) |
|
831 |
goto next; |
|
832 |
||
833 |
/* |
|
834 |
* Now shift the change up and then down as far as possible in |
|
835 |
* each direction. If it bumps into any other changes, merge them. |
|
836 |
*/ |
|
837 |
do { |
|
838 |
groupsize = g.end - g.start; |
|
839 |
||
840 |
/* |
|
841 |
* Keep track of the last "end" index that causes this |
|
842 |
* group to align with a group of changed lines in the |
|
843 |
* other file. -1 indicates that we haven't found such |
|
844 |
* a match yet: |
|
845 |
*/ |
|
846 |
end_matching_other = -1; |
|
847 |
||
848 |
/* Shift the group backward as much as possible: */ |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
849 |
while (!group_slide_up(xdf, &g)) |
36671 | 850 |
if (group_previous(xdfo, &go)) |
851 |
xdl_bug("group sync broken sliding up"); |
|
852 |
||
853 |
/* |
|
854 |
* This is this highest that this group can be shifted. |
|
855 |
* Record its end index: |
|
856 |
*/ |
|
857 |
earliest_end = g.end; |
|
858 |
||
859 |
if (go.end > go.start) |
|
860 |
end_matching_other = g.end; |
|
861 |
||
862 |
/* Now shift the group forward as far as possible: */ |
|
863 |
while (1) { |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
864 |
if (group_slide_down(xdf, &g)) |
36671 | 865 |
break; |
866 |
if (group_next(xdfo, &go)) |
|
867 |
xdl_bug("group sync broken sliding down"); |
|
868 |
||
869 |
if (go.end > go.start) |
|
870 |
end_matching_other = g.end; |
|
871 |
} |
|
872 |
} while (groupsize != g.end - g.start); |
|
873 |
||
874 |
/* |
|
875 |
* If the group can be shifted, then we can possibly use this |
|
876 |
* freedom to produce a more intuitive diff. |
|
877 |
* |
|
878 |
* The group is currently shifted as far down as possible, so the |
|
879 |
* heuristics below only have to handle upwards shifts. |
|
880 |
*/ |
|
881 |
||
882 |
if (g.end == earliest_end) { |
|
883 |
/* no shifting was possible */ |
|
884 |
} else if (end_matching_other != -1) { |
|
885 |
/* |
|
886 |
* Move the possibly merged group of changes back to line |
|
887 |
* up with the last group of changes from the other file |
|
888 |
* that it can align with. |
|
889 |
*/ |
|
890 |
while (go.end == go.start) { |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
891 |
if (group_slide_up(xdf, &g)) |
36671 | 892 |
xdl_bug("match disappeared"); |
893 |
if (group_previous(xdfo, &go)) |
|
894 |
xdl_bug("group sync broken sliding to match"); |
|
895 |
} |
|
896 |
} else if (flags & XDF_INDENT_HEURISTIC) { |
|
897 |
/* |
|
898 |
* Indent heuristic: a group of pure add/delete lines |
|
899 |
* implies two splits, one between the end of the "before" |
|
900 |
* context and the start of the group, and another between |
|
901 |
* the end of the group and the beginning of the "after" |
|
902 |
* context. Some splits are aesthetically better and some |
|
903 |
* are worse. We compute a badness "score" for each split, |
|
904 |
* and add the scores for the two splits to define a |
|
905 |
* "score" for each position that the group can be shifted |
|
906 |
* to. Then we pick the shift with the lowest score. |
|
907 |
*/ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
908 |
int64_t shift, best_shift = -1; |
36671 | 909 |
struct split_score best_score; |
910 |
||
36674
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
911 |
/* |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
912 |
* This is O(N * MAX_BLANKS) (N = shift-able lines). |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
913 |
* Even with MAX_BLANKS bounded to a small value, a |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
914 |
* large N could still make this loop take several |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
915 |
* times longer than the main diff algorithm. The |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
916 |
* "boring" value is to help cut down N to something |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
917 |
* like (MAX_BORING + groupsize). |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
918 |
* |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
919 |
* Scan from bottom to top. So we can exit the loop |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
920 |
* without compromising the assumption "for a same best |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
921 |
* score, pick the bottommost shift". |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
922 |
*/ |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
923 |
int boring = 0; |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
924 |
for (shift = g.end; shift >= earliest_end; shift--) { |
36671 | 925 |
struct split_measurement m; |
926 |
struct split_score score = {0, 0}; |
|
36674
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
927 |
int cmp; |
36671 | 928 |
|
929 |
measure_split(xdf, shift, &m); |
|
930 |
score_add_split(&m, &score); |
|
931 |
measure_split(xdf, shift - groupsize, &m); |
|
932 |
score_add_split(&m, &score); |
|
36674
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
933 |
|
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
934 |
if (best_shift == -1) { |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
935 |
cmp = -1; |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
936 |
} else { |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
937 |
cmp = score_cmp(&score, &best_score); |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
938 |
} |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
939 |
if (cmp < 0) { |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
940 |
boring = 0; |
36671 | 941 |
best_score.effective_indent = score.effective_indent; |
942 |
best_score.penalty = score.penalty; |
|
943 |
best_shift = shift; |
|
36674
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
944 |
} else { |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
945 |
boring += 1; |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
946 |
if (boring >= MAX_BORING) |
c420792217c8
xdiff: reduce indent heuristic overhead
Jun Wu <quark@fb.com>
parents:
36673
diff
changeset
|
947 |
break; |
36671 | 948 |
} |
949 |
} |
|
950 |
||
951 |
while (g.end > best_shift) { |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
952 |
if (group_slide_up(xdf, &g)) |
36671 | 953 |
xdl_bug("best shift unreached"); |
954 |
if (group_previous(xdfo, &go)) |
|
955 |
xdl_bug("group sync broken sliding to blank line"); |
|
956 |
} |
|
957 |
} |
|
958 |
||
959 |
next: |
|
960 |
/* Move past the just-processed group: */ |
|
961 |
if (group_next(xdf, &g)) |
|
962 |
break; |
|
963 |
if (group_next(xdfo, &go)) |
|
964 |
xdl_bug("group sync broken moving to next group"); |
|
965 |
} |
|
966 |
||
967 |
if (!group_next(xdfo, &go)) |
|
968 |
xdl_bug("group sync broken at end of file"); |
|
969 |
||
970 |
return 0; |
|
971 |
} |
|
972 |
||
973 |
||
974 |
int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) { |
|
975 |
xdchange_t *cscr = NULL, *xch; |
|
976 |
char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
977 |
int64_t i1, i2, l1, l2; |
36671 | 978 |
|
979 |
/* |
|
980 |
* Trivial. Collects "groups" of changes and creates an edit script. |
|
981 |
*/ |
|
982 |
for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--) |
|
983 |
if (rchg1[i1 - 1] || rchg2[i2 - 1]) { |
|
984 |
for (l1 = i1; rchg1[i1 - 1]; i1--); |
|
985 |
for (l2 = i2; rchg2[i2 - 1]; i2--); |
|
986 |
||
987 |
if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) { |
|
988 |
xdl_free_script(cscr); |
|
989 |
return -1; |
|
990 |
} |
|
991 |
cscr = xch; |
|
992 |
} |
|
993 |
||
994 |
*xscr = cscr; |
|
995 |
||
996 |
return 0; |
|
997 |
} |
|
998 |
||
999 |
||
1000 |
void xdl_free_script(xdchange_t *xscr) { |
|
1001 |
xdchange_t *xch; |
|
1002 |
||
1003 |
while ((xch = xscr) != NULL) { |
|
1004 |
xscr = xscr->next; |
|
1005 |
xdl_free(xch); |
|
1006 |
} |
|
1007 |
} |
|
1008 |
||
36763 | 1009 |
|
1010 |
/* |
|
1011 |
* Starting at the passed change atom, find the latest change atom to be included |
|
1012 |
* inside the differential hunk according to the specified configuration. |
|
1013 |
* Also advance xscr if the first changes must be discarded. |
|
1014 |
*/ |
|
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
1015 |
xdchange_t *xdl_get_hunk(xdchange_t **xscr) |
36763 | 1016 |
{ |
1017 |
xdchange_t *xch, *xchp, *lxch; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
1018 |
uint64_t ignored = 0; /* number of ignored blank lines */ |
36763 | 1019 |
|
1020 |
/* remove ignorable changes that are too far before other changes */ |
|
1021 |
for (xchp = *xscr; xchp && xchp->ignore; xchp = xchp->next) { |
|
1022 |
xch = xchp->next; |
|
1023 |
||
1024 |
if (xch == NULL || |
|
36826
e5b14f5b8b94
xdiff: resolve signed unsigned comparison warning
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
1025 |
xch->i1 - (xchp->i1 + xchp->chg1) >= 0) |
36763 | 1026 |
*xscr = xch; |
1027 |
} |
|
1028 |
||
1029 |
if (*xscr == NULL) |
|
1030 |
return NULL; |
|
1031 |
||
1032 |
lxch = *xscr; |
|
1033 |
||
1034 |
for (xchp = *xscr, xch = xchp->next; xch; xchp = xch, xch = xch->next) { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
1035 |
int64_t distance = xch->i1 - (xchp->i1 + xchp->chg1); |
36826
e5b14f5b8b94
xdiff: resolve signed unsigned comparison warning
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
1036 |
if (distance > 0) |
36763 | 1037 |
break; |
1038 |
||
36826
e5b14f5b8b94
xdiff: resolve signed unsigned comparison warning
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
1039 |
if (distance < 0 && (!xch->ignore || lxch == xchp)) { |
36763 | 1040 |
lxch = xch; |
1041 |
ignored = 0; |
|
36826
e5b14f5b8b94
xdiff: resolve signed unsigned comparison warning
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
1042 |
} else if (distance < 0 && xch->ignore) { |
36763 | 1043 |
ignored += xch->chg2; |
1044 |
} else if (lxch != xchp && |
|
36826
e5b14f5b8b94
xdiff: resolve signed unsigned comparison warning
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
1045 |
xch->i1 + ignored - (lxch->i1 + lxch->chg1) > 0) { |
36763 | 1046 |
break; |
1047 |
} else if (!xch->ignore) { |
|
1048 |
lxch = xch; |
|
1049 |
ignored = 0; |
|
1050 |
} else { |
|
1051 |
ignored += xch->chg2; |
|
1052 |
} |
|
1053 |
} |
|
1054 |
||
1055 |
return lxch; |
|
1056 |
} |
|
1057 |
||
1058 |
||
36671 | 1059 |
static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb, |
1060 |
xdemitconf_t const *xecfg) |
|
1061 |
{ |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
1062 |
int64_t p = xe->nprefix, s = xe->nsuffix; |
36671 | 1063 |
xdchange_t *xch, *xche; |
36763 | 1064 |
|
1065 |
if (!xecfg->hunk_func) |
|
1066 |
return -1; |
|
1067 |
||
36673 | 1068 |
if ((xecfg->flags & XDL_EMIT_BDIFFHUNK) != 0) { |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
1069 |
int64_t i1 = 0, i2 = 0, n1 = xe->xdf1.nrec, n2 = xe->xdf2.nrec; |
36673 | 1070 |
for (xch = xscr; xch; xch = xche->next) { |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
1071 |
xche = xdl_get_hunk(&xch); |
36673 | 1072 |
if (!xch) |
1073 |
break; |
|
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1074 |
if (xch != xche) |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1075 |
xdl_bug("xch != xche"); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1076 |
xch->i1 += p; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1077 |
xch->i2 += p; |
36673 | 1078 |
if (xch->i1 > i1 || xch->i2 > i2) { |
1079 |
if (xecfg->hunk_func(i1, xch->i1, i2, xch->i2, ecb->priv) < 0) |
|
1080 |
return -1; |
|
1081 |
} |
|
1082 |
i1 = xche->i1 + xche->chg1; |
|
1083 |
i2 = xche->i2 + xche->chg2; |
|
1084 |
} |
|
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1085 |
if (xecfg->hunk_func(i1, n1 + p + s, i2, n2 + p + s, |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1086 |
ecb->priv) < 0) |
36671 | 1087 |
return -1; |
36673 | 1088 |
} else { |
1089 |
for (xch = xscr; xch; xch = xche->next) { |
|
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
1090 |
xche = xdl_get_hunk(&xch); |
36673 | 1091 |
if (!xch) |
1092 |
break; |
|
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1093 |
if (xecfg->hunk_func(xch->i1 + p, |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1094 |
xche->i1 + xche->chg1 - xch->i1, |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1095 |
xch->i2 + p, |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36763
diff
changeset
|
1096 |
xche->i2 + xche->chg2 - xch->i2, |
36673 | 1097 |
ecb->priv) < 0) |
1098 |
return -1; |
|
1099 |
} |
|
36671 | 1100 |
} |
1101 |
return 0; |
|
1102 |
} |
|
1103 |
||
1104 |
int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, |
|
1105 |
xdemitconf_t const *xecfg, xdemitcb_t *ecb) { |
|
1106 |
xdchange_t *xscr; |
|
1107 |
xdfenv_t xe; |
|
1108 |
||
1109 |
if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) { |
|
1110 |
||
1111 |
return -1; |
|
1112 |
} |
|
1113 |
if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 || |
|
1114 |
xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 || |
|
1115 |
xdl_build_script(&xe, &xscr) < 0) { |
|
1116 |
||
1117 |
xdl_free_env(&xe); |
|
1118 |
return -1; |
|
1119 |
} |
|
1120 |
||
36763 | 1121 |
if (xdl_call_hunk_func(&xe, xscr, ecb, xecfg) < 0) { |
36671 | 1122 |
xdl_free_script(xscr); |
36673 | 1123 |
xdl_free_env(&xe); |
1124 |
return -1; |
|
36671 | 1125 |
} |
36673 | 1126 |
xdl_free_script(xscr); |
36671 | 1127 |
xdl_free_env(&xe); |
1128 |
||
1129 |
return 0; |
|
1130 |
} |