Mercurial > hg
annotate mercurial/thirdparty/xdiff/xprepare.c @ 36939:45bfcd16f27e
forget: add --dry-run mode
author | Sushil khanchi <sushilkhanchi97@gmail.com> |
---|---|
date | Sat, 10 Mar 2018 12:33:19 +0530 |
parents | d40b9e29c114 |
children |
rev | line source |
---|---|
36671 | 1 /* |
2 * LibXDiff by Davide Libenzi ( File Differential Library ) | |
3 * Copyright (C) 2003 Davide Libenzi | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2.1 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, see | |
17 * <http://www.gnu.org/licenses/>. | |
18 * | |
19 * Davide Libenzi <davidel@xmailserver.org> | |
20 * | |
21 */ | |
22 | |
23 #include "xinclude.h" | |
24 | |
25 | |
26 #define XDL_KPDIS_RUN 4 | |
27 #define XDL_MAX_EQLIMIT 1024 | |
28 #define XDL_SIMSCAN_WINDOW 100 | |
29 #define XDL_GUESS_NLINES1 256 | |
30 | |
31 | |
32 typedef struct s_xdlclass { | |
33 struct s_xdlclass *next; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
34 uint64_t ha; |
36671 | 35 char const *line; |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
36 int64_t size; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
37 int64_t idx; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
38 int64_t len1, len2; |
36671 | 39 } xdlclass_t; |
40 | |
41 typedef struct s_xdlclassifier { | |
42 unsigned int hbits; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
43 int64_t hsize; |
36671 | 44 xdlclass_t **rchash; |
45 chastore_t ncha; | |
46 xdlclass_t **rcrecs; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
47 int64_t alloc; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
48 int64_t count; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
49 int64_t flags; |
36671 | 50 } xdlclassifier_t; |
51 | |
52 | |
53 | |
54 | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
55 static int xdl_init_classifier(xdlclassifier_t *cf, int64_t size, int64_t flags); |
36671 | 56 static void xdl_free_classifier(xdlclassifier_t *cf); |
57 static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash, | |
58 unsigned int hbits, xrecord_t *rec); | |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
59 static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, int64_t narec, |
36671 | 60 xdlclassifier_t *cf, xdfile_t *xdf); |
61 static void xdl_free_ctx(xdfile_t *xdf); | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
62 static int xdl_clean_mmatch(char const *dis, int64_t i, int64_t s, int64_t e); |
36671 | 63 static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2); |
64 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2); | |
65 static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2); | |
66 | |
67 | |
68 | |
69 | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
70 static int xdl_init_classifier(xdlclassifier_t *cf, int64_t size, int64_t flags) { |
36671 | 71 cf->flags = flags; |
72 | |
36825
f1ef0e53e628
xdiff: use int64 for hash table size
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
73 cf->hbits = xdl_hashbits(size); |
36915
651c80720eed
xdiff: silence a 32-bit shift warning on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36825
diff
changeset
|
74 cf->hsize = ((uint64_t)1) << cf->hbits; |
36671 | 75 |
76 if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0) { | |
77 | |
78 return -1; | |
79 } | |
80 if (!(cf->rchash = (xdlclass_t **) xdl_malloc(cf->hsize * sizeof(xdlclass_t *)))) { | |
81 | |
82 xdl_cha_free(&cf->ncha); | |
83 return -1; | |
84 } | |
85 memset(cf->rchash, 0, cf->hsize * sizeof(xdlclass_t *)); | |
86 | |
87 cf->alloc = size; | |
88 if (!(cf->rcrecs = (xdlclass_t **) xdl_malloc(cf->alloc * sizeof(xdlclass_t *)))) { | |
89 | |
90 xdl_free(cf->rchash); | |
91 xdl_cha_free(&cf->ncha); | |
92 return -1; | |
93 } | |
94 | |
95 cf->count = 0; | |
96 | |
97 return 0; | |
98 } | |
99 | |
100 | |
101 static void xdl_free_classifier(xdlclassifier_t *cf) { | |
102 | |
103 xdl_free(cf->rcrecs); | |
104 xdl_free(cf->rchash); | |
105 xdl_cha_free(&cf->ncha); | |
106 } | |
107 | |
108 | |
109 static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash, | |
110 unsigned int hbits, xrecord_t *rec) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
111 int64_t hi; |
36671 | 112 char const *line; |
113 xdlclass_t *rcrec; | |
114 xdlclass_t **rcrecs; | |
115 | |
116 line = rec->ptr; | |
117 hi = (long) XDL_HASHLONG(rec->ha, cf->hbits); | |
118 for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next) | |
119 if (rcrec->ha == rec->ha && | |
120 xdl_recmatch(rcrec->line, rcrec->size, | |
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
121 rec->ptr, rec->size)) |
36671 | 122 break; |
123 | |
124 if (!rcrec) { | |
125 if (!(rcrec = xdl_cha_alloc(&cf->ncha))) { | |
126 | |
127 return -1; | |
128 } | |
129 rcrec->idx = cf->count++; | |
130 if (cf->count > cf->alloc) { | |
131 cf->alloc *= 2; | |
132 if (!(rcrecs = (xdlclass_t **) xdl_realloc(cf->rcrecs, cf->alloc * sizeof(xdlclass_t *)))) { | |
133 | |
134 return -1; | |
135 } | |
136 cf->rcrecs = rcrecs; | |
137 } | |
138 cf->rcrecs[rcrec->idx] = rcrec; | |
139 rcrec->line = line; | |
140 rcrec->size = rec->size; | |
141 rcrec->ha = rec->ha; | |
142 rcrec->len1 = rcrec->len2 = 0; | |
143 rcrec->next = cf->rchash[hi]; | |
144 cf->rchash[hi] = rcrec; | |
145 } | |
146 | |
147 (pass == 1) ? rcrec->len1++ : rcrec->len2++; | |
148 | |
149 rec->ha = (unsigned long) rcrec->idx; | |
150 | |
151 hi = (long) XDL_HASHLONG(rec->ha, hbits); | |
152 rec->next = rhash[hi]; | |
153 rhash[hi] = rec; | |
154 | |
155 return 0; | |
156 } | |
157 | |
158 | |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
159 /* |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
160 * Trim common prefix from files. |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
161 * |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
162 * Note: trimming could affect hunk shifting. But the performance benefit |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
163 * outweighs the shift change. A diff result with suboptimal shifting is still |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
164 * valid. |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
165 */ |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
166 static void xdl_trim_files(mmfile_t *mf1, mmfile_t *mf2, int64_t reserved, |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
167 xdfenv_t *xe, mmfile_t *out_mf1, mmfile_t *out_mf2) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
168 mmfile_t msmall, mlarge; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
169 /* prefix lines, prefix bytes, suffix lines, suffix bytes */ |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
170 int64_t plines = 0, pbytes = 0, slines = 0, sbytes = 0, i; |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
171 /* prefix char pointer for msmall and mlarge */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
172 const char *pp1, *pp2; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
173 /* suffix char pointer for msmall and mlarge */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
174 const char *ps1, *ps2; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
175 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
176 /* reserved must >= 0 for the line boundary adjustment to work */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
177 if (reserved < 0) |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
178 reserved = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
179 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
180 if (mf1->size < mf2->size) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
181 memcpy(&msmall, mf1, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
182 memcpy(&mlarge, mf2, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
183 } else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
184 memcpy(&msmall, mf2, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
185 memcpy(&mlarge, mf1, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
186 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
187 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
188 pp1 = msmall.ptr, pp2 = mlarge.ptr; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
189 for (i = 0; i < msmall.size && *pp1 == *pp2; ++i) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
190 plines += (*pp1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
191 pp1++, pp2++; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
192 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
193 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
194 ps1 = msmall.ptr + msmall.size - 1, ps2 = mlarge.ptr + mlarge.size - 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
195 while (ps1 > pp1 && *ps1 == *ps2) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
196 slines += (*ps1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
197 ps1--, ps2--; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
198 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
199 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
200 /* Retract common prefix and suffix boundaries for reserved lines */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
201 if (plines <= reserved + 1) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
202 plines = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
203 } else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
204 i = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
205 while (i <= reserved) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
206 pp1--; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
207 i += (*pp1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
208 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
209 /* The new mmfile starts at the next char just after '\n' */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
210 pbytes = pp1 - msmall.ptr + 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
211 plines -= reserved; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
212 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
213 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
214 if (slines <= reserved + 1) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
215 slines = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
216 } else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
217 /* Note: with compiler SIMD support (ex. -O3 -mavx2), this |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
218 * might perform better than memchr. */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
219 i = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
220 while (i <= reserved) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
221 ps1++; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
222 i += (*ps1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
223 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
224 /* The new mmfile includes this '\n' */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
225 sbytes = msmall.ptr + msmall.size - ps1 - 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
226 slines -= reserved; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
227 if (msmall.ptr[msmall.size - 1] == '\n') |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
228 slines -= 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
229 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
230 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
231 xe->nprefix = plines; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
232 xe->nsuffix = slines; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
233 out_mf1->ptr = mf1->ptr + pbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
234 out_mf1->size = mf1->size - pbytes - sbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
235 out_mf2->ptr = mf2->ptr + pbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
236 out_mf2->size = mf2->size - pbytes - sbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
237 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
238 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
239 |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
240 static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, int64_t narec, |
36671 | 241 xdlclassifier_t *cf, xdfile_t *xdf) { |
242 unsigned int hbits; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
243 int64_t nrec, hsize, bsize; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
244 uint64_t hav; |
36671 | 245 char const *blk, *cur, *top, *prev; |
246 xrecord_t *crec; | |
247 xrecord_t **recs, **rrecs; | |
248 xrecord_t **rhash; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
249 uint64_t *ha; |
36671 | 250 char *rchg; |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
251 int64_t *rindex; |
36671 | 252 |
253 ha = NULL; | |
254 rindex = NULL; | |
255 rchg = NULL; | |
256 rhash = NULL; | |
257 recs = NULL; | |
258 | |
259 if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0) | |
260 goto abort; | |
261 if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *)))) | |
262 goto abort; | |
263 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
264 { |
36825
f1ef0e53e628
xdiff: use int64 for hash table size
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
265 hbits = xdl_hashbits(narec); |
36915
651c80720eed
xdiff: silence a 32-bit shift warning on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36825
diff
changeset
|
266 hsize = ((uint64_t)1) << hbits; |
36671 | 267 if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *)))) |
268 goto abort; | |
269 memset(rhash, 0, hsize * sizeof(xrecord_t *)); | |
270 } | |
271 | |
272 nrec = 0; | |
273 if ((cur = blk = xdl_mmfile_first(mf, &bsize)) != NULL) { | |
274 for (top = blk + bsize; cur < top; ) { | |
275 prev = cur; | |
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
276 hav = xdl_hash_record(&cur, top); |
36671 | 277 if (nrec >= narec) { |
278 narec *= 2; | |
279 if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *)))) | |
280 goto abort; | |
281 recs = rrecs; | |
282 } | |
283 if (!(crec = xdl_cha_alloc(&xdf->rcha))) | |
284 goto abort; | |
285 crec->ptr = prev; | |
286 crec->size = (long) (cur - prev); | |
287 crec->ha = hav; | |
288 recs[nrec++] = crec; | |
289 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
290 if (xdl_classify_record(pass, cf, rhash, hbits, crec) < 0) |
36671 | 291 goto abort; |
292 } | |
293 } | |
294 | |
295 if (!(rchg = (char *) xdl_malloc((nrec + 2) * sizeof(char)))) | |
296 goto abort; | |
297 memset(rchg, 0, (nrec + 2) * sizeof(char)); | |
298 | |
36923
d40b9e29c114
xdiff: fix a hard crash on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36915
diff
changeset
|
299 if (!(rindex = (int64_t *) xdl_malloc((nrec + 1) * sizeof(int64_t)))) |
36671 | 300 goto abort; |
36923
d40b9e29c114
xdiff: fix a hard crash on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36915
diff
changeset
|
301 if (!(ha = (uint64_t *) xdl_malloc((nrec + 1) * sizeof(uint64_t)))) |
36671 | 302 goto abort; |
303 | |
304 xdf->nrec = nrec; | |
305 xdf->recs = recs; | |
306 xdf->hbits = hbits; | |
307 xdf->rhash = rhash; | |
308 xdf->rchg = rchg + 1; | |
309 xdf->rindex = rindex; | |
310 xdf->nreff = 0; | |
311 xdf->ha = ha; | |
312 xdf->dstart = 0; | |
313 xdf->dend = nrec - 1; | |
314 | |
315 return 0; | |
316 | |
317 abort: | |
318 xdl_free(ha); | |
319 xdl_free(rindex); | |
320 xdl_free(rchg); | |
321 xdl_free(rhash); | |
322 xdl_free(recs); | |
323 xdl_cha_free(&xdf->rcha); | |
324 return -1; | |
325 } | |
326 | |
327 | |
328 static void xdl_free_ctx(xdfile_t *xdf) { | |
329 | |
330 xdl_free(xdf->rhash); | |
331 xdl_free(xdf->rindex); | |
332 xdl_free(xdf->rchg - 1); | |
333 xdl_free(xdf->ha); | |
334 xdl_free(xdf->recs); | |
335 xdl_cha_free(&xdf->rcha); | |
336 } | |
337 | |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
338 /* Reserved lines for trimming, to leave room for shifting */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
339 #define TRIM_RESERVED_LINES 100 |
36671 | 340 |
341 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, | |
342 xdfenv_t *xe) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
343 int64_t enl1, enl2, sample; |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
344 mmfile_t tmf1, tmf2; |
36671 | 345 xdlclassifier_t cf; |
346 | |
347 memset(&cf, 0, sizeof(cf)); | |
348 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
349 sample = XDL_GUESS_NLINES1; |
36671 | 350 |
351 enl1 = xdl_guess_lines(mf1, sample) + 1; | |
352 enl2 = xdl_guess_lines(mf2, sample) + 1; | |
353 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
354 if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) |
36671 | 355 return -1; |
356 | |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
357 xdl_trim_files(mf1, mf2, TRIM_RESERVED_LINES, xe, &tmf1, &tmf2); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
358 |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
359 if (xdl_prepare_ctx(1, &tmf1, enl1, &cf, &xe->xdf1) < 0) { |
36671 | 360 |
361 xdl_free_classifier(&cf); | |
362 return -1; | |
363 } | |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
364 if (xdl_prepare_ctx(2, &tmf2, enl2, &cf, &xe->xdf2) < 0) { |
36671 | 365 |
366 xdl_free_ctx(&xe->xdf1); | |
367 xdl_free_classifier(&cf); | |
368 return -1; | |
369 } | |
370 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
371 if (xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) { |
36671 | 372 xdl_free_ctx(&xe->xdf2); |
373 xdl_free_ctx(&xe->xdf1); | |
374 xdl_free_classifier(&cf); | |
375 return -1; | |
376 } | |
377 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
378 xdl_free_classifier(&cf); |
36671 | 379 |
380 return 0; | |
381 } | |
382 | |
383 | |
384 void xdl_free_env(xdfenv_t *xe) { | |
385 | |
386 xdl_free_ctx(&xe->xdf2); | |
387 xdl_free_ctx(&xe->xdf1); | |
388 } | |
389 | |
390 | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
391 static int xdl_clean_mmatch(char const *dis, int64_t i, int64_t s, int64_t e) { |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
392 int64_t r, rdis0, rpdis0, rdis1, rpdis1; |
36671 | 393 |
394 /* | |
395 * Limits the window the is examined during the similar-lines | |
396 * scan. The loops below stops when dis[i - r] == 1 (line that | |
397 * has no match), but there are corner cases where the loop | |
398 * proceed all the way to the extremities by causing huge | |
399 * performance penalties in case of big files. | |
400 */ | |
401 if (i - s > XDL_SIMSCAN_WINDOW) | |
402 s = i - XDL_SIMSCAN_WINDOW; | |
403 if (e - i > XDL_SIMSCAN_WINDOW) | |
404 e = i + XDL_SIMSCAN_WINDOW; | |
405 | |
406 /* | |
407 * Scans the lines before 'i' to find a run of lines that either | |
408 * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1). | |
409 * Note that we always call this function with dis[i] > 1, so the | |
410 * current line (i) is already a multimatch line. | |
411 */ | |
412 for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) { | |
413 if (!dis[i - r]) | |
414 rdis0++; | |
415 else if (dis[i - r] == 2) | |
416 rpdis0++; | |
417 else | |
418 break; | |
419 } | |
420 /* | |
421 * If the run before the line 'i' found only multimatch lines, we | |
422 * return 0 and hence we don't make the current line (i) discarded. | |
423 * We want to discard multimatch lines only when they appear in the | |
424 * middle of runs with nomatch lines (dis[j] == 0). | |
425 */ | |
426 if (rdis0 == 0) | |
427 return 0; | |
428 for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) { | |
429 if (!dis[i + r]) | |
430 rdis1++; | |
431 else if (dis[i + r] == 2) | |
432 rpdis1++; | |
433 else | |
434 break; | |
435 } | |
436 /* | |
437 * If the run after the line 'i' found only multimatch lines, we | |
438 * return 0 and hence we don't make the current line (i) discarded. | |
439 */ | |
440 if (rdis1 == 0) | |
441 return 0; | |
442 rdis1 += rdis0; | |
443 rpdis1 += rpdis0; | |
444 | |
445 return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1); | |
446 } | |
447 | |
448 | |
449 /* | |
450 * Try to reduce the problem complexity, discard records that have no | |
451 * matches on the other file. Also, lines that have multiple matches | |
452 * might be potentially discarded if they happear in a run of discardable. | |
453 */ | |
454 static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
455 int64_t i, nm, nreff, mlim; |
36671 | 456 xrecord_t **recs; |
457 xdlclass_t *rcrec; | |
458 char *dis, *dis1, *dis2; | |
459 | |
460 if (!(dis = (char *) xdl_malloc(xdf1->nrec + xdf2->nrec + 2))) { | |
461 | |
462 return -1; | |
463 } | |
464 memset(dis, 0, xdf1->nrec + xdf2->nrec + 2); | |
465 dis1 = dis; | |
466 dis2 = dis1 + xdf1->nrec + 1; | |
467 | |
468 if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT) | |
469 mlim = XDL_MAX_EQLIMIT; | |
470 for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { | |
471 rcrec = cf->rcrecs[(*recs)->ha]; | |
472 nm = rcrec ? rcrec->len2 : 0; | |
473 dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1; | |
474 } | |
475 | |
476 if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT) | |
477 mlim = XDL_MAX_EQLIMIT; | |
478 for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { | |
479 rcrec = cf->rcrecs[(*recs)->ha]; | |
480 nm = rcrec ? rcrec->len1 : 0; | |
481 dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1; | |
482 } | |
483 | |
484 for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; | |
485 i <= xdf1->dend; i++, recs++) { | |
486 if (dis1[i] == 1 || | |
487 (dis1[i] == 2 && !xdl_clean_mmatch(dis1, i, xdf1->dstart, xdf1->dend))) { | |
488 xdf1->rindex[nreff] = i; | |
489 xdf1->ha[nreff] = (*recs)->ha; | |
490 nreff++; | |
491 } else | |
492 xdf1->rchg[i] = 1; | |
493 } | |
494 xdf1->nreff = nreff; | |
495 | |
496 for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; | |
497 i <= xdf2->dend; i++, recs++) { | |
498 if (dis2[i] == 1 || | |
499 (dis2[i] == 2 && !xdl_clean_mmatch(dis2, i, xdf2->dstart, xdf2->dend))) { | |
500 xdf2->rindex[nreff] = i; | |
501 xdf2->ha[nreff] = (*recs)->ha; | |
502 nreff++; | |
503 } else | |
504 xdf2->rchg[i] = 1; | |
505 } | |
506 xdf2->nreff = nreff; | |
507 | |
508 xdl_free(dis); | |
509 | |
510 return 0; | |
511 } | |
512 | |
513 | |
514 /* | |
515 * Early trim initial and terminal matching records. | |
516 */ | |
517 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
518 int64_t i, lim; |
36671 | 519 xrecord_t **recs1, **recs2; |
520 | |
521 recs1 = xdf1->recs; | |
522 recs2 = xdf2->recs; | |
523 for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim; | |
524 i++, recs1++, recs2++) | |
525 if ((*recs1)->ha != (*recs2)->ha) | |
526 break; | |
527 | |
528 xdf1->dstart = xdf2->dstart = i; | |
529 | |
530 recs1 = xdf1->recs + xdf1->nrec - 1; | |
531 recs2 = xdf2->recs + xdf2->nrec - 1; | |
532 for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--) | |
533 if ((*recs1)->ha != (*recs2)->ha) | |
534 break; | |
535 | |
536 xdf1->dend = xdf1->nrec - i - 1; | |
537 xdf2->dend = xdf2->nrec - i - 1; | |
538 | |
539 return 0; | |
540 } | |
541 | |
542 | |
543 static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { | |
544 | |
545 if (xdl_trim_ends(xdf1, xdf2) < 0 || | |
546 xdl_cleanup_records(cf, xdf1, xdf2) < 0) { | |
547 | |
548 return -1; | |
549 } | |
550 | |
551 return 0; | |
552 } |