author | Matt Harbison <matt_harbison@yahoo.com> |
Wed, 09 Jan 2019 16:02:05 -0500 | |
changeset 41180 | 69804c040a04 |
parent 36923 | d40b9e29c114 |
permissions | -rw-r--r-- |
36671 | 1 |
/* |
2 |
* LibXDiff by Davide Libenzi ( File Differential Library ) |
|
3 |
* Copyright (C) 2003 Davide Libenzi |
|
4 |
* |
|
5 |
* This library is free software; you can redistribute it and/or |
|
6 |
* modify it under the terms of the GNU Lesser General Public |
|
7 |
* License as published by the Free Software Foundation; either |
|
8 |
* version 2.1 of the License, or (at your option) any later version. |
|
9 |
* |
|
10 |
* This library is distributed in the hope that it will be useful, |
|
11 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
* Lesser General Public License for more details. |
|
14 |
* |
|
15 |
* You should have received a copy of the GNU Lesser General Public |
|
16 |
* License along with this library; if not, see |
|
17 |
* <http://www.gnu.org/licenses/>. |
|
18 |
* |
|
19 |
* Davide Libenzi <davidel@xmailserver.org> |
|
20 |
* |
|
21 |
*/ |
|
22 |
||
23 |
#include "xinclude.h" |
|
24 |
||
25 |
||
26 |
#define XDL_KPDIS_RUN 4 |
|
27 |
#define XDL_MAX_EQLIMIT 1024 |
|
28 |
#define XDL_SIMSCAN_WINDOW 100 |
|
29 |
#define XDL_GUESS_NLINES1 256 |
|
30 |
||
31 |
||
32 |
typedef struct s_xdlclass { |
|
33 |
struct s_xdlclass *next; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
34 |
uint64_t ha; |
36671 | 35 |
char const *line; |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
36 |
int64_t size; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
37 |
int64_t idx; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
38 |
int64_t len1, len2; |
36671 | 39 |
} xdlclass_t; |
40 |
||
41 |
typedef struct s_xdlclassifier { |
|
42 |
unsigned int hbits; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
43 |
int64_t hsize; |
36671 | 44 |
xdlclass_t **rchash; |
45 |
chastore_t ncha; |
|
46 |
xdlclass_t **rcrecs; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
47 |
int64_t alloc; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
48 |
int64_t count; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
49 |
int64_t flags; |
36671 | 50 |
} xdlclassifier_t; |
51 |
||
52 |
||
53 |
||
54 |
||
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
55 |
static int xdl_init_classifier(xdlclassifier_t *cf, int64_t size, int64_t flags); |
36671 | 56 |
static void xdl_free_classifier(xdlclassifier_t *cf); |
57 |
static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash, |
|
58 |
unsigned int hbits, xrecord_t *rec); |
|
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
59 |
static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, int64_t narec, |
36671 | 60 |
xdlclassifier_t *cf, xdfile_t *xdf); |
61 |
static void xdl_free_ctx(xdfile_t *xdf); |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
62 |
static int xdl_clean_mmatch(char const *dis, int64_t i, int64_t s, int64_t e); |
36671 | 63 |
static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2); |
64 |
static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2); |
|
65 |
static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2); |
|
66 |
||
67 |
||
68 |
||
69 |
||
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
70 |
static int xdl_init_classifier(xdlclassifier_t *cf, int64_t size, int64_t flags) { |
36671 | 71 |
cf->flags = flags; |
72 |
||
36825
f1ef0e53e628
xdiff: use int64 for hash table size
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
73 |
cf->hbits = xdl_hashbits(size); |
36915
651c80720eed
xdiff: silence a 32-bit shift warning on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36825
diff
changeset
|
74 |
cf->hsize = ((uint64_t)1) << cf->hbits; |
36671 | 75 |
|
76 |
if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0) { |
|
77 |
||
78 |
return -1; |
|
79 |
} |
|
80 |
if (!(cf->rchash = (xdlclass_t **) xdl_malloc(cf->hsize * sizeof(xdlclass_t *)))) { |
|
81 |
||
82 |
xdl_cha_free(&cf->ncha); |
|
83 |
return -1; |
|
84 |
} |
|
85 |
memset(cf->rchash, 0, cf->hsize * sizeof(xdlclass_t *)); |
|
86 |
||
87 |
cf->alloc = size; |
|
88 |
if (!(cf->rcrecs = (xdlclass_t **) xdl_malloc(cf->alloc * sizeof(xdlclass_t *)))) { |
|
89 |
||
90 |
xdl_free(cf->rchash); |
|
91 |
xdl_cha_free(&cf->ncha); |
|
92 |
return -1; |
|
93 |
} |
|
94 |
||
95 |
cf->count = 0; |
|
96 |
||
97 |
return 0; |
|
98 |
} |
|
99 |
||
100 |
||
101 |
static void xdl_free_classifier(xdlclassifier_t *cf) { |
|
102 |
||
103 |
xdl_free(cf->rcrecs); |
|
104 |
xdl_free(cf->rchash); |
|
105 |
xdl_cha_free(&cf->ncha); |
|
106 |
} |
|
107 |
||
108 |
||
109 |
static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash, |
|
110 |
unsigned int hbits, xrecord_t *rec) { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
111 |
int64_t hi; |
36671 | 112 |
char const *line; |
113 |
xdlclass_t *rcrec; |
|
114 |
xdlclass_t **rcrecs; |
|
115 |
||
116 |
line = rec->ptr; |
|
117 |
hi = (long) XDL_HASHLONG(rec->ha, cf->hbits); |
|
118 |
for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next) |
|
119 |
if (rcrec->ha == rec->ha && |
|
120 |
xdl_recmatch(rcrec->line, rcrec->size, |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
121 |
rec->ptr, rec->size)) |
36671 | 122 |
break; |
123 |
||
124 |
if (!rcrec) { |
|
125 |
if (!(rcrec = xdl_cha_alloc(&cf->ncha))) { |
|
126 |
||
127 |
return -1; |
|
128 |
} |
|
129 |
rcrec->idx = cf->count++; |
|
130 |
if (cf->count > cf->alloc) { |
|
131 |
cf->alloc *= 2; |
|
132 |
if (!(rcrecs = (xdlclass_t **) xdl_realloc(cf->rcrecs, cf->alloc * sizeof(xdlclass_t *)))) { |
|
133 |
||
134 |
return -1; |
|
135 |
} |
|
136 |
cf->rcrecs = rcrecs; |
|
137 |
} |
|
138 |
cf->rcrecs[rcrec->idx] = rcrec; |
|
139 |
rcrec->line = line; |
|
140 |
rcrec->size = rec->size; |
|
141 |
rcrec->ha = rec->ha; |
|
142 |
rcrec->len1 = rcrec->len2 = 0; |
|
143 |
rcrec->next = cf->rchash[hi]; |
|
144 |
cf->rchash[hi] = rcrec; |
|
145 |
} |
|
146 |
||
147 |
(pass == 1) ? rcrec->len1++ : rcrec->len2++; |
|
148 |
||
149 |
rec->ha = (unsigned long) rcrec->idx; |
|
150 |
||
151 |
hi = (long) XDL_HASHLONG(rec->ha, hbits); |
|
152 |
rec->next = rhash[hi]; |
|
153 |
rhash[hi] = rec; |
|
154 |
||
155 |
return 0; |
|
156 |
} |
|
157 |
||
158 |
||
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
159 |
/* |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
160 |
* Trim common prefix from files. |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
161 |
* |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
162 |
* Note: trimming could affect hunk shifting. But the performance benefit |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
163 |
* outweighs the shift change. A diff result with suboptimal shifting is still |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
164 |
* valid. |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
165 |
*/ |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
166 |
static void xdl_trim_files(mmfile_t *mf1, mmfile_t *mf2, int64_t reserved, |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
167 |
xdfenv_t *xe, mmfile_t *out_mf1, mmfile_t *out_mf2) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
168 |
mmfile_t msmall, mlarge; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
169 |
/* prefix lines, prefix bytes, suffix lines, suffix bytes */ |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
170 |
int64_t plines = 0, pbytes = 0, slines = 0, sbytes = 0, i; |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
171 |
/* prefix char pointer for msmall and mlarge */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
172 |
const char *pp1, *pp2; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
173 |
/* suffix char pointer for msmall and mlarge */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
174 |
const char *ps1, *ps2; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
175 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
176 |
/* reserved must >= 0 for the line boundary adjustment to work */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
177 |
if (reserved < 0) |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
178 |
reserved = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
179 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
180 |
if (mf1->size < mf2->size) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
181 |
memcpy(&msmall, mf1, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
182 |
memcpy(&mlarge, mf2, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
183 |
} else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
184 |
memcpy(&msmall, mf2, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
185 |
memcpy(&mlarge, mf1, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
186 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
187 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
188 |
pp1 = msmall.ptr, pp2 = mlarge.ptr; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
189 |
for (i = 0; i < msmall.size && *pp1 == *pp2; ++i) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
190 |
plines += (*pp1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
191 |
pp1++, pp2++; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
192 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
193 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
194 |
ps1 = msmall.ptr + msmall.size - 1, ps2 = mlarge.ptr + mlarge.size - 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
195 |
while (ps1 > pp1 && *ps1 == *ps2) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
196 |
slines += (*ps1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
197 |
ps1--, ps2--; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
198 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
199 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
200 |
/* Retract common prefix and suffix boundaries for reserved lines */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
201 |
if (plines <= reserved + 1) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
202 |
plines = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
203 |
} else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
204 |
i = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
205 |
while (i <= reserved) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
206 |
pp1--; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
207 |
i += (*pp1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
208 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
209 |
/* The new mmfile starts at the next char just after '\n' */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
210 |
pbytes = pp1 - msmall.ptr + 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
211 |
plines -= reserved; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
212 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
213 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
214 |
if (slines <= reserved + 1) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
215 |
slines = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
216 |
} else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
217 |
/* Note: with compiler SIMD support (ex. -O3 -mavx2), this |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
218 |
* might perform better than memchr. */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
219 |
i = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
220 |
while (i <= reserved) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
221 |
ps1++; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
222 |
i += (*ps1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
223 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
224 |
/* The new mmfile includes this '\n' */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
225 |
sbytes = msmall.ptr + msmall.size - ps1 - 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
226 |
slines -= reserved; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
227 |
if (msmall.ptr[msmall.size - 1] == '\n') |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
228 |
slines -= 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
229 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
230 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
231 |
xe->nprefix = plines; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
232 |
xe->nsuffix = slines; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
233 |
out_mf1->ptr = mf1->ptr + pbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
234 |
out_mf1->size = mf1->size - pbytes - sbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
235 |
out_mf2->ptr = mf2->ptr + pbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
236 |
out_mf2->size = mf2->size - pbytes - sbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
237 |
} |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
238 |
|
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
239 |
|
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
240 |
static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, int64_t narec, |
36671 | 241 |
xdlclassifier_t *cf, xdfile_t *xdf) { |
242 |
unsigned int hbits; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
243 |
int64_t nrec, hsize, bsize; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
244 |
uint64_t hav; |
36671 | 245 |
char const *blk, *cur, *top, *prev; |
246 |
xrecord_t *crec; |
|
247 |
xrecord_t **recs, **rrecs; |
|
248 |
xrecord_t **rhash; |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
249 |
uint64_t *ha; |
36671 | 250 |
char *rchg; |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
251 |
int64_t *rindex; |
36671 | 252 |
|
253 |
ha = NULL; |
|
254 |
rindex = NULL; |
|
255 |
rchg = NULL; |
|
256 |
rhash = NULL; |
|
257 |
recs = NULL; |
|
258 |
||
259 |
if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0) |
|
260 |
goto abort; |
|
261 |
if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *)))) |
|
262 |
goto abort; |
|
263 |
||
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
264 |
{ |
36825
f1ef0e53e628
xdiff: use int64 for hash table size
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
265 |
hbits = xdl_hashbits(narec); |
36915
651c80720eed
xdiff: silence a 32-bit shift warning on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36825
diff
changeset
|
266 |
hsize = ((uint64_t)1) << hbits; |
36671 | 267 |
if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *)))) |
268 |
goto abort; |
|
269 |
memset(rhash, 0, hsize * sizeof(xrecord_t *)); |
|
270 |
} |
|
271 |
||
272 |
nrec = 0; |
|
273 |
if ((cur = blk = xdl_mmfile_first(mf, &bsize)) != NULL) { |
|
274 |
for (top = blk + bsize; cur < top; ) { |
|
275 |
prev = cur; |
|
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
276 |
hav = xdl_hash_record(&cur, top); |
36671 | 277 |
if (nrec >= narec) { |
278 |
narec *= 2; |
|
279 |
if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *)))) |
|
280 |
goto abort; |
|
281 |
recs = rrecs; |
|
282 |
} |
|
283 |
if (!(crec = xdl_cha_alloc(&xdf->rcha))) |
|
284 |
goto abort; |
|
285 |
crec->ptr = prev; |
|
286 |
crec->size = (long) (cur - prev); |
|
287 |
crec->ha = hav; |
|
288 |
recs[nrec++] = crec; |
|
289 |
||
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
290 |
if (xdl_classify_record(pass, cf, rhash, hbits, crec) < 0) |
36671 | 291 |
goto abort; |
292 |
} |
|
293 |
} |
|
294 |
||
295 |
if (!(rchg = (char *) xdl_malloc((nrec + 2) * sizeof(char)))) |
|
296 |
goto abort; |
|
297 |
memset(rchg, 0, (nrec + 2) * sizeof(char)); |
|
298 |
||
36923
d40b9e29c114
xdiff: fix a hard crash on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36915
diff
changeset
|
299 |
if (!(rindex = (int64_t *) xdl_malloc((nrec + 1) * sizeof(int64_t)))) |
36671 | 300 |
goto abort; |
36923
d40b9e29c114
xdiff: fix a hard crash on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36915
diff
changeset
|
301 |
if (!(ha = (uint64_t *) xdl_malloc((nrec + 1) * sizeof(uint64_t)))) |
36671 | 302 |
goto abort; |
303 |
||
304 |
xdf->nrec = nrec; |
|
305 |
xdf->recs = recs; |
|
306 |
xdf->hbits = hbits; |
|
307 |
xdf->rhash = rhash; |
|
308 |
xdf->rchg = rchg + 1; |
|
309 |
xdf->rindex = rindex; |
|
310 |
xdf->nreff = 0; |
|
311 |
xdf->ha = ha; |
|
312 |
xdf->dstart = 0; |
|
313 |
xdf->dend = nrec - 1; |
|
314 |
||
315 |
return 0; |
|
316 |
||
317 |
abort: |
|
318 |
xdl_free(ha); |
|
319 |
xdl_free(rindex); |
|
320 |
xdl_free(rchg); |
|
321 |
xdl_free(rhash); |
|
322 |
xdl_free(recs); |
|
323 |
xdl_cha_free(&xdf->rcha); |
|
324 |
return -1; |
|
325 |
} |
|
326 |
||
327 |
||
328 |
static void xdl_free_ctx(xdfile_t *xdf) { |
|
329 |
||
330 |
xdl_free(xdf->rhash); |
|
331 |
xdl_free(xdf->rindex); |
|
332 |
xdl_free(xdf->rchg - 1); |
|
333 |
xdl_free(xdf->ha); |
|
334 |
xdl_free(xdf->recs); |
|
335 |
xdl_cha_free(&xdf->rcha); |
|
336 |
} |
|
337 |
||
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
338 |
/* Reserved lines for trimming, to leave room for shifting */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
339 |
#define TRIM_RESERVED_LINES 100 |
36671 | 340 |
|
341 |
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, |
|
342 |
xdfenv_t *xe) { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
343 |
int64_t enl1, enl2, sample; |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
344 |
mmfile_t tmf1, tmf2; |
36671 | 345 |
xdlclassifier_t cf; |
346 |
||
347 |
memset(&cf, 0, sizeof(cf)); |
|
348 |
||
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
349 |
sample = XDL_GUESS_NLINES1; |
36671 | 350 |
|
351 |
enl1 = xdl_guess_lines(mf1, sample) + 1; |
|
352 |
enl2 = xdl_guess_lines(mf2, sample) + 1; |
|
353 |
||
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
354 |
if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) |
36671 | 355 |
return -1; |
356 |
||
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
357 |
xdl_trim_files(mf1, mf2, TRIM_RESERVED_LINES, xe, &tmf1, &tmf2); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
358 |
|
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
359 |
if (xdl_prepare_ctx(1, &tmf1, enl1, &cf, &xe->xdf1) < 0) { |
36671 | 360 |
|
361 |
xdl_free_classifier(&cf); |
|
362 |
return -1; |
|
363 |
} |
|
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
364 |
if (xdl_prepare_ctx(2, &tmf2, enl2, &cf, &xe->xdf2) < 0) { |
36671 | 365 |
|
366 |
xdl_free_ctx(&xe->xdf1); |
|
367 |
xdl_free_classifier(&cf); |
|
368 |
return -1; |
|
369 |
} |
|
370 |
||
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
371 |
if (xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) { |
36671 | 372 |
xdl_free_ctx(&xe->xdf2); |
373 |
xdl_free_ctx(&xe->xdf1); |
|
374 |
xdl_free_classifier(&cf); |
|
375 |
return -1; |
|
376 |
} |
|
377 |
||
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
378 |
xdl_free_classifier(&cf); |
36671 | 379 |
|
380 |
return 0; |
|
381 |
} |
|
382 |
||
383 |
||
384 |
void xdl_free_env(xdfenv_t *xe) { |
|
385 |
||
386 |
xdl_free_ctx(&xe->xdf2); |
|
387 |
xdl_free_ctx(&xe->xdf1); |
|
388 |
} |
|
389 |
||
390 |
||
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
391 |
static int xdl_clean_mmatch(char const *dis, int64_t i, int64_t s, int64_t e) { |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
392 |
int64_t r, rdis0, rpdis0, rdis1, rpdis1; |
36671 | 393 |
|
394 |
/* |
|
395 |
* Limits the window the is examined during the similar-lines |
|
396 |
* scan. The loops below stops when dis[i - r] == 1 (line that |
|
397 |
* has no match), but there are corner cases where the loop |
|
398 |
* proceed all the way to the extremities by causing huge |
|
399 |
* performance penalties in case of big files. |
|
400 |
*/ |
|
401 |
if (i - s > XDL_SIMSCAN_WINDOW) |
|
402 |
s = i - XDL_SIMSCAN_WINDOW; |
|
403 |
if (e - i > XDL_SIMSCAN_WINDOW) |
|
404 |
e = i + XDL_SIMSCAN_WINDOW; |
|
405 |
||
406 |
/* |
|
407 |
* Scans the lines before 'i' to find a run of lines that either |
|
408 |
* have no match (dis[j] == 0) or have multiple matches (dis[j] > 1). |
|
409 |
* Note that we always call this function with dis[i] > 1, so the |
|
410 |
* current line (i) is already a multimatch line. |
|
411 |
*/ |
|
412 |
for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) { |
|
413 |
if (!dis[i - r]) |
|
414 |
rdis0++; |
|
415 |
else if (dis[i - r] == 2) |
|
416 |
rpdis0++; |
|
417 |
else |
|
418 |
break; |
|
419 |
} |
|
420 |
/* |
|
421 |
* If the run before the line 'i' found only multimatch lines, we |
|
422 |
* return 0 and hence we don't make the current line (i) discarded. |
|
423 |
* We want to discard multimatch lines only when they appear in the |
|
424 |
* middle of runs with nomatch lines (dis[j] == 0). |
|
425 |
*/ |
|
426 |
if (rdis0 == 0) |
|
427 |
return 0; |
|
428 |
for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) { |
|
429 |
if (!dis[i + r]) |
|
430 |
rdis1++; |
|
431 |
else if (dis[i + r] == 2) |
|
432 |
rpdis1++; |
|
433 |
else |
|
434 |
break; |
|
435 |
} |
|
436 |
/* |
|
437 |
* If the run after the line 'i' found only multimatch lines, we |
|
438 |
* return 0 and hence we don't make the current line (i) discarded. |
|
439 |
*/ |
|
440 |
if (rdis1 == 0) |
|
441 |
return 0; |
|
442 |
rdis1 += rdis0; |
|
443 |
rpdis1 += rpdis0; |
|
444 |
||
445 |
return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1); |
|
446 |
} |
|
447 |
||
448 |
||
449 |
/* |
|
450 |
* Try to reduce the problem complexity, discard records that have no |
|
451 |
* matches on the other file. Also, lines that have multiple matches |
|
452 |
* might be potentially discarded if they happear in a run of discardable. |
|
453 |
*/ |
|
454 |
static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
455 |
int64_t i, nm, nreff, mlim; |
36671 | 456 |
xrecord_t **recs; |
457 |
xdlclass_t *rcrec; |
|
458 |
char *dis, *dis1, *dis2; |
|
459 |
||
460 |
if (!(dis = (char *) xdl_malloc(xdf1->nrec + xdf2->nrec + 2))) { |
|
461 |
||
462 |
return -1; |
|
463 |
} |
|
464 |
memset(dis, 0, xdf1->nrec + xdf2->nrec + 2); |
|
465 |
dis1 = dis; |
|
466 |
dis2 = dis1 + xdf1->nrec + 1; |
|
467 |
||
468 |
if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT) |
|
469 |
mlim = XDL_MAX_EQLIMIT; |
|
470 |
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { |
|
471 |
rcrec = cf->rcrecs[(*recs)->ha]; |
|
472 |
nm = rcrec ? rcrec->len2 : 0; |
|
473 |
dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1; |
|
474 |
} |
|
475 |
||
476 |
if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT) |
|
477 |
mlim = XDL_MAX_EQLIMIT; |
|
478 |
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { |
|
479 |
rcrec = cf->rcrecs[(*recs)->ha]; |
|
480 |
nm = rcrec ? rcrec->len1 : 0; |
|
481 |
dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1; |
|
482 |
} |
|
483 |
||
484 |
for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; |
|
485 |
i <= xdf1->dend; i++, recs++) { |
|
486 |
if (dis1[i] == 1 || |
|
487 |
(dis1[i] == 2 && !xdl_clean_mmatch(dis1, i, xdf1->dstart, xdf1->dend))) { |
|
488 |
xdf1->rindex[nreff] = i; |
|
489 |
xdf1->ha[nreff] = (*recs)->ha; |
|
490 |
nreff++; |
|
491 |
} else |
|
492 |
xdf1->rchg[i] = 1; |
|
493 |
} |
|
494 |
xdf1->nreff = nreff; |
|
495 |
||
496 |
for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; |
|
497 |
i <= xdf2->dend; i++, recs++) { |
|
498 |
if (dis2[i] == 1 || |
|
499 |
(dis2[i] == 2 && !xdl_clean_mmatch(dis2, i, xdf2->dstart, xdf2->dend))) { |
|
500 |
xdf2->rindex[nreff] = i; |
|
501 |
xdf2->ha[nreff] = (*recs)->ha; |
|
502 |
nreff++; |
|
503 |
} else |
|
504 |
xdf2->rchg[i] = 1; |
|
505 |
} |
|
506 |
xdf2->nreff = nreff; |
|
507 |
||
508 |
xdl_free(dis); |
|
509 |
||
510 |
return 0; |
|
511 |
} |
|
512 |
||
513 |
||
514 |
/* |
|
515 |
* Early trim initial and terminal matching records. |
|
516 |
*/ |
|
517 |
static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { |
|
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
518 |
int64_t i, lim; |
36671 | 519 |
xrecord_t **recs1, **recs2; |
520 |
||
521 |
recs1 = xdf1->recs; |
|
522 |
recs2 = xdf2->recs; |
|
523 |
for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim; |
|
524 |
i++, recs1++, recs2++) |
|
525 |
if ((*recs1)->ha != (*recs2)->ha) |
|
526 |
break; |
|
527 |
||
528 |
xdf1->dstart = xdf2->dstart = i; |
|
529 |
||
530 |
recs1 = xdf1->recs + xdf1->nrec - 1; |
|
531 |
recs2 = xdf2->recs + xdf2->nrec - 1; |
|
532 |
for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--) |
|
533 |
if ((*recs1)->ha != (*recs2)->ha) |
|
534 |
break; |
|
535 |
||
536 |
xdf1->dend = xdf1->nrec - i - 1; |
|
537 |
xdf2->dend = xdf2->nrec - i - 1; |
|
538 |
||
539 |
return 0; |
|
540 |
} |
|
541 |
||
542 |
||
543 |
static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { |
|
544 |
||
545 |
if (xdl_trim_ends(xdf1, xdf2) < 0 || |
|
546 |
xdl_cleanup_records(cf, xdf1, xdf2) < 0) { |
|
547 |
||
548 |
return -1; |
|
549 |
} |
|
550 |
||
551 |
return 0; |
|
552 |
} |