Mercurial > hg
comparison mercurial/thirdparty/xdiff/xhistogram.c @ 36671:34e2ff1f9cd8
xdiff: vendor xdiff library from git
Vendor git's xdiff library from git commit
d7c6c2369d7c6c2369ac21141b7c6cceaebc6414ec3da14ad using GPL2+ license.
There is another recent user report that hg diff generates suboptimal
result. It seems the fix to issue4074 isn't good enough. I crafted some
other interesting cases, and hg diff barely has any advantage compared with
gnu diffutils or git diff.
| testcase | gnu diffutils | hg diff | git diff |
| | lines time | lines time | lines time |
| patience | 6 0.00 | 602 0.08 | 6 0.00 |
| random | 91772 0.90 | 109462 0.70 | 91772 0.24 |
| json | 2 0.03 | 1264814 1.81 | 2 0.29 |
"lines" means the size of the output, i.e. the count of "+/-" lines. "time"
means seconds needed to do the calculation. Both are the smaller the better.
"hg diff" counts Python startup overhead.
Git and GNU diffutils generate optimal results. For the "json" case, git can
have an optimization that does a scan for common prefix and suffix first,
and match them if the length is greater than half of the text. See
https://neil.fraser.name/news/2006/03/12/. That would make git the fastest
for all above cases.
About testcases:
patience:
Aiming for the weakness of the greedy "patience diff" algorithm. Using
git's patience diff option would also get suboptimal result. Generated using
the Python script:
```
open('a', 'w').write('\n'.join(list('a' + 'x' * 300 + 'u' + 'x' * 700 + 'a\n')))
open('b', 'w').write('\n'.join(list('b' + 'x' * 700 + 'u' + 'x' * 300 + 'b\n')))
```
random:
Generated using the script in `test-issue4074.t`. It practically makes the
algorithm suffer. Impressively, git wins in both performance and diff
quality.
json:
The recent user reported case. It's a single line movement near the end of a
very large (800K lines) JSON file.
Test Plan:
Code taken as-is.
Differential Revision: https://phab.mercurial-scm.org/D2572
# no-check-commit for vendored code
author | Jun Wu <quark@fb.com> |
---|---|
date | Sat, 03 Mar 2018 10:39:43 -0800 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
36670:44048f1bcee5 | 36671:34e2ff1f9cd8 |
---|---|
1 /* | |
2 * Copyright (C) 2010, Google Inc. | |
3 * and other copyright owners as documented in JGit's IP log. | |
4 * | |
5 * This program and the accompanying materials are made available | |
6 * under the terms of the Eclipse Distribution License v1.0 which | |
7 * accompanies this distribution, is reproduced below, and is | |
8 * available at http://www.eclipse.org/org/documents/edl-v10.php | |
9 * | |
10 * All rights reserved. | |
11 * | |
12 * Redistribution and use in source and binary forms, with or | |
13 * without modification, are permitted provided that the following | |
14 * conditions are met: | |
15 * | |
16 * - Redistributions of source code must retain the above copyright | |
17 * notice, this list of conditions and the following disclaimer. | |
18 * | |
19 * - Redistributions in binary form must reproduce the above | |
20 * copyright notice, this list of conditions and the following | |
21 * disclaimer in the documentation and/or other materials provided | |
22 * with the distribution. | |
23 * | |
24 * - Neither the name of the Eclipse Foundation, Inc. nor the | |
25 * names of its contributors may be used to endorse or promote | |
26 * products derived from this software without specific prior | |
27 * written permission. | |
28 * | |
29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND | |
30 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, | |
31 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | |
32 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |
34 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
35 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
36 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
37 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
38 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
39 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
40 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
41 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
42 */ | |
43 | |
44 #include "xinclude.h" | |
45 #include "xtypes.h" | |
46 #include "xdiff.h" | |
47 | |
48 #define MAX_PTR UINT_MAX | |
49 #define MAX_CNT UINT_MAX | |
50 | |
51 #define LINE_END(n) (line##n + count##n - 1) | |
52 #define LINE_END_PTR(n) (*line##n + *count##n - 1) | |
53 | |
54 struct histindex { | |
55 struct record { | |
56 unsigned int ptr, cnt; | |
57 struct record *next; | |
58 } **records, /* an occurrence */ | |
59 **line_map; /* map of line to record chain */ | |
60 chastore_t rcha; | |
61 unsigned int *next_ptrs; | |
62 unsigned int table_bits, | |
63 records_size, | |
64 line_map_size; | |
65 | |
66 unsigned int max_chain_length, | |
67 key_shift, | |
68 ptr_shift; | |
69 | |
70 unsigned int cnt, | |
71 has_common; | |
72 | |
73 xdfenv_t *env; | |
74 xpparam_t const *xpp; | |
75 }; | |
76 | |
77 struct region { | |
78 unsigned int begin1, end1; | |
79 unsigned int begin2, end2; | |
80 }; | |
81 | |
82 #define LINE_MAP(i, a) (i->line_map[(a) - i->ptr_shift]) | |
83 | |
84 #define NEXT_PTR(index, ptr) \ | |
85 (index->next_ptrs[(ptr) - index->ptr_shift]) | |
86 | |
87 #define CNT(index, ptr) \ | |
88 ((LINE_MAP(index, ptr))->cnt) | |
89 | |
90 #define REC(env, s, l) \ | |
91 (env->xdf##s.recs[l - 1]) | |
92 | |
93 static int cmp_recs(xpparam_t const *xpp, | |
94 xrecord_t *r1, xrecord_t *r2) | |
95 { | |
96 return r1->ha == r2->ha && | |
97 xdl_recmatch(r1->ptr, r1->size, r2->ptr, r2->size, | |
98 xpp->flags); | |
99 } | |
100 | |
101 #define CMP_ENV(xpp, env, s1, l1, s2, l2) \ | |
102 (cmp_recs(xpp, REC(env, s1, l1), REC(env, s2, l2))) | |
103 | |
104 #define CMP(i, s1, l1, s2, l2) \ | |
105 (cmp_recs(i->xpp, REC(i->env, s1, l1), REC(i->env, s2, l2))) | |
106 | |
107 #define TABLE_HASH(index, side, line) \ | |
108 XDL_HASHLONG((REC(index->env, side, line))->ha, index->table_bits) | |
109 | |
110 static int scanA(struct histindex *index, int line1, int count1) | |
111 { | |
112 unsigned int ptr, tbl_idx; | |
113 unsigned int chain_len; | |
114 struct record **rec_chain, *rec; | |
115 | |
116 for (ptr = LINE_END(1); line1 <= ptr; ptr--) { | |
117 tbl_idx = TABLE_HASH(index, 1, ptr); | |
118 rec_chain = index->records + tbl_idx; | |
119 rec = *rec_chain; | |
120 | |
121 chain_len = 0; | |
122 while (rec) { | |
123 if (CMP(index, 1, rec->ptr, 1, ptr)) { | |
124 /* | |
125 * ptr is identical to another element. Insert | |
126 * it onto the front of the existing element | |
127 * chain. | |
128 */ | |
129 NEXT_PTR(index, ptr) = rec->ptr; | |
130 rec->ptr = ptr; | |
131 /* cap rec->cnt at MAX_CNT */ | |
132 rec->cnt = XDL_MIN(MAX_CNT, rec->cnt + 1); | |
133 LINE_MAP(index, ptr) = rec; | |
134 goto continue_scan; | |
135 } | |
136 | |
137 rec = rec->next; | |
138 chain_len++; | |
139 } | |
140 | |
141 if (chain_len == index->max_chain_length) | |
142 return -1; | |
143 | |
144 /* | |
145 * This is the first time we have ever seen this particular | |
146 * element in the sequence. Construct a new chain for it. | |
147 */ | |
148 if (!(rec = xdl_cha_alloc(&index->rcha))) | |
149 return -1; | |
150 rec->ptr = ptr; | |
151 rec->cnt = 1; | |
152 rec->next = *rec_chain; | |
153 *rec_chain = rec; | |
154 LINE_MAP(index, ptr) = rec; | |
155 | |
156 continue_scan: | |
157 ; /* no op */ | |
158 } | |
159 | |
160 return 0; | |
161 } | |
162 | |
163 static int try_lcs(struct histindex *index, struct region *lcs, int b_ptr, | |
164 int line1, int count1, int line2, int count2) | |
165 { | |
166 unsigned int b_next = b_ptr + 1; | |
167 struct record *rec = index->records[TABLE_HASH(index, 2, b_ptr)]; | |
168 unsigned int as, ae, bs, be, np, rc; | |
169 int should_break; | |
170 | |
171 for (; rec; rec = rec->next) { | |
172 if (rec->cnt > index->cnt) { | |
173 if (!index->has_common) | |
174 index->has_common = CMP(index, 1, rec->ptr, 2, b_ptr); | |
175 continue; | |
176 } | |
177 | |
178 as = rec->ptr; | |
179 if (!CMP(index, 1, as, 2, b_ptr)) | |
180 continue; | |
181 | |
182 index->has_common = 1; | |
183 for (;;) { | |
184 should_break = 0; | |
185 np = NEXT_PTR(index, as); | |
186 bs = b_ptr; | |
187 ae = as; | |
188 be = bs; | |
189 rc = rec->cnt; | |
190 | |
191 while (line1 < as && line2 < bs | |
192 && CMP(index, 1, as - 1, 2, bs - 1)) { | |
193 as--; | |
194 bs--; | |
195 if (1 < rc) | |
196 rc = XDL_MIN(rc, CNT(index, as)); | |
197 } | |
198 while (ae < LINE_END(1) && be < LINE_END(2) | |
199 && CMP(index, 1, ae + 1, 2, be + 1)) { | |
200 ae++; | |
201 be++; | |
202 if (1 < rc) | |
203 rc = XDL_MIN(rc, CNT(index, ae)); | |
204 } | |
205 | |
206 if (b_next <= be) | |
207 b_next = be + 1; | |
208 if (lcs->end1 - lcs->begin1 < ae - as || rc < index->cnt) { | |
209 lcs->begin1 = as; | |
210 lcs->begin2 = bs; | |
211 lcs->end1 = ae; | |
212 lcs->end2 = be; | |
213 index->cnt = rc; | |
214 } | |
215 | |
216 if (np == 0) | |
217 break; | |
218 | |
219 while (np <= ae) { | |
220 np = NEXT_PTR(index, np); | |
221 if (np == 0) { | |
222 should_break = 1; | |
223 break; | |
224 } | |
225 } | |
226 | |
227 if (should_break) | |
228 break; | |
229 | |
230 as = np; | |
231 } | |
232 } | |
233 return b_next; | |
234 } | |
235 | |
236 static int find_lcs(struct histindex *index, struct region *lcs, | |
237 int line1, int count1, int line2, int count2) { | |
238 int b_ptr; | |
239 | |
240 if (scanA(index, line1, count1)) | |
241 return -1; | |
242 | |
243 index->cnt = index->max_chain_length + 1; | |
244 | |
245 for (b_ptr = line2; b_ptr <= LINE_END(2); ) | |
246 b_ptr = try_lcs(index, lcs, b_ptr, line1, count1, line2, count2); | |
247 | |
248 return index->has_common && index->max_chain_length < index->cnt; | |
249 } | |
250 | |
251 static int fall_back_to_classic_diff(struct histindex *index, | |
252 int line1, int count1, int line2, int count2) | |
253 { | |
254 xpparam_t xpp; | |
255 xpp.flags = index->xpp->flags & ~XDF_DIFF_ALGORITHM_MASK; | |
256 | |
257 return xdl_fall_back_diff(index->env, &xpp, | |
258 line1, count1, line2, count2); | |
259 } | |
260 | |
261 static int histogram_diff(xpparam_t const *xpp, xdfenv_t *env, | |
262 int line1, int count1, int line2, int count2) | |
263 { | |
264 struct histindex index; | |
265 struct region lcs; | |
266 int sz; | |
267 int result = -1; | |
268 | |
269 if (count1 <= 0 && count2 <= 0) | |
270 return 0; | |
271 | |
272 if (LINE_END(1) >= MAX_PTR) | |
273 return -1; | |
274 | |
275 if (!count1) { | |
276 while(count2--) | |
277 env->xdf2.rchg[line2++ - 1] = 1; | |
278 return 0; | |
279 } else if (!count2) { | |
280 while(count1--) | |
281 env->xdf1.rchg[line1++ - 1] = 1; | |
282 return 0; | |
283 } | |
284 | |
285 memset(&index, 0, sizeof(index)); | |
286 | |
287 index.env = env; | |
288 index.xpp = xpp; | |
289 | |
290 index.records = NULL; | |
291 index.line_map = NULL; | |
292 /* in case of early xdl_cha_free() */ | |
293 index.rcha.head = NULL; | |
294 | |
295 index.table_bits = xdl_hashbits(count1); | |
296 sz = index.records_size = 1 << index.table_bits; | |
297 sz *= sizeof(struct record *); | |
298 if (!(index.records = (struct record **) xdl_malloc(sz))) | |
299 goto cleanup; | |
300 memset(index.records, 0, sz); | |
301 | |
302 sz = index.line_map_size = count1; | |
303 sz *= sizeof(struct record *); | |
304 if (!(index.line_map = (struct record **) xdl_malloc(sz))) | |
305 goto cleanup; | |
306 memset(index.line_map, 0, sz); | |
307 | |
308 sz = index.line_map_size; | |
309 sz *= sizeof(unsigned int); | |
310 if (!(index.next_ptrs = (unsigned int *) xdl_malloc(sz))) | |
311 goto cleanup; | |
312 memset(index.next_ptrs, 0, sz); | |
313 | |
314 /* lines / 4 + 1 comes from xprepare.c:xdl_prepare_ctx() */ | |
315 if (xdl_cha_init(&index.rcha, sizeof(struct record), count1 / 4 + 1) < 0) | |
316 goto cleanup; | |
317 | |
318 index.ptr_shift = line1; | |
319 index.max_chain_length = 64; | |
320 | |
321 memset(&lcs, 0, sizeof(lcs)); | |
322 if (find_lcs(&index, &lcs, line1, count1, line2, count2)) | |
323 result = fall_back_to_classic_diff(&index, line1, count1, line2, count2); | |
324 else { | |
325 if (lcs.begin1 == 0 && lcs.begin2 == 0) { | |
326 while (count1--) | |
327 env->xdf1.rchg[line1++ - 1] = 1; | |
328 while (count2--) | |
329 env->xdf2.rchg[line2++ - 1] = 1; | |
330 result = 0; | |
331 } else { | |
332 result = histogram_diff(xpp, env, | |
333 line1, lcs.begin1 - line1, | |
334 line2, lcs.begin2 - line2); | |
335 if (result) | |
336 goto cleanup; | |
337 result = histogram_diff(xpp, env, | |
338 lcs.end1 + 1, LINE_END(1) - lcs.end1, | |
339 lcs.end2 + 1, LINE_END(2) - lcs.end2); | |
340 if (result) | |
341 goto cleanup; | |
342 } | |
343 } | |
344 | |
345 cleanup: | |
346 xdl_free(index.records); | |
347 xdl_free(index.line_map); | |
348 xdl_free(index.next_ptrs); | |
349 xdl_cha_free(&index.rcha); | |
350 | |
351 return result; | |
352 } | |
353 | |
354 int xdl_do_histogram_diff(mmfile_t *file1, mmfile_t *file2, | |
355 xpparam_t const *xpp, xdfenv_t *env) | |
356 { | |
357 if (xdl_prepare_env(file1, file2, xpp, env) < 0) | |
358 return -1; | |
359 | |
360 return histogram_diff(xpp, env, | |
361 env->xdf1.dstart + 1, env->xdf1.dend - env->xdf1.dstart + 1, | |
362 env->xdf2.dstart + 1, env->xdf2.dend - env->xdf2.dstart + 1); | |
363 } |