Mercurial > hg
annotate mercurial/thirdparty/xdiff/xprepare.c @ 47890:3853e6ee160d
dirstatemap: replace `removefile` by an explicit `entry.set_untracked()`
All the other caller goes through `reset_state`, so we can safely have an
explicit method on `DirstateItem` object.
This means that all the logic to preserve the previous state (from p2, merged,
etc) is now properly encapsulated within the DirstateItem. This pave the way to
using different storage for these information.
Differential Revision: https://phab.mercurial-scm.org/D11315
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Fri, 20 Aug 2021 11:27:01 +0200 |
parents | d40b9e29c114 |
children |
rev | line source |
---|---|
36671 | 1 /* |
2 * LibXDiff by Davide Libenzi ( File Differential Library ) | |
3 * Copyright (C) 2003 Davide Libenzi | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2.1 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, see | |
17 * <http://www.gnu.org/licenses/>. | |
18 * | |
19 * Davide Libenzi <davidel@xmailserver.org> | |
20 * | |
21 */ | |
22 | |
23 #include "xinclude.h" | |
24 | |
25 | |
26 #define XDL_KPDIS_RUN 4 | |
27 #define XDL_MAX_EQLIMIT 1024 | |
28 #define XDL_SIMSCAN_WINDOW 100 | |
29 #define XDL_GUESS_NLINES1 256 | |
30 | |
31 | |
32 typedef struct s_xdlclass { | |
33 struct s_xdlclass *next; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
34 uint64_t ha; |
36671 | 35 char const *line; |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
36 int64_t size; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
37 int64_t idx; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
38 int64_t len1, len2; |
36671 | 39 } xdlclass_t; |
40 | |
41 typedef struct s_xdlclassifier { | |
42 unsigned int hbits; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
43 int64_t hsize; |
36671 | 44 xdlclass_t **rchash; |
45 chastore_t ncha; | |
46 xdlclass_t **rcrecs; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
47 int64_t alloc; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
48 int64_t count; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
49 int64_t flags; |
36671 | 50 } xdlclassifier_t; |
51 | |
52 | |
53 | |
54 | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
55 static int xdl_init_classifier(xdlclassifier_t *cf, int64_t size, int64_t flags); |
36671 | 56 static void xdl_free_classifier(xdlclassifier_t *cf); |
57 static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash, | |
58 unsigned int hbits, xrecord_t *rec); | |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
59 static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, int64_t narec, |
36671 | 60 xdlclassifier_t *cf, xdfile_t *xdf); |
61 static void xdl_free_ctx(xdfile_t *xdf); | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
62 static int xdl_clean_mmatch(char const *dis, int64_t i, int64_t s, int64_t e); |
36671 | 63 static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2); |
64 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2); | |
65 static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2); | |
66 | |
67 | |
68 | |
69 | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
70 static int xdl_init_classifier(xdlclassifier_t *cf, int64_t size, int64_t flags) { |
36671 | 71 cf->flags = flags; |
72 | |
36825
f1ef0e53e628
xdiff: use int64 for hash table size
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
73 cf->hbits = xdl_hashbits(size); |
36915
651c80720eed
xdiff: silence a 32-bit shift warning on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36825
diff
changeset
|
74 cf->hsize = ((uint64_t)1) << cf->hbits; |
36671 | 75 |
76 if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0) { | |
77 | |
78 return -1; | |
79 } | |
80 if (!(cf->rchash = (xdlclass_t **) xdl_malloc(cf->hsize * sizeof(xdlclass_t *)))) { | |
81 | |
82 xdl_cha_free(&cf->ncha); | |
83 return -1; | |
84 } | |
85 memset(cf->rchash, 0, cf->hsize * sizeof(xdlclass_t *)); | |
86 | |
87 cf->alloc = size; | |
88 if (!(cf->rcrecs = (xdlclass_t **) xdl_malloc(cf->alloc * sizeof(xdlclass_t *)))) { | |
89 | |
90 xdl_free(cf->rchash); | |
91 xdl_cha_free(&cf->ncha); | |
92 return -1; | |
93 } | |
94 | |
95 cf->count = 0; | |
96 | |
97 return 0; | |
98 } | |
99 | |
100 | |
101 static void xdl_free_classifier(xdlclassifier_t *cf) { | |
102 | |
103 xdl_free(cf->rcrecs); | |
104 xdl_free(cf->rchash); | |
105 xdl_cha_free(&cf->ncha); | |
106 } | |
107 | |
108 | |
109 static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash, | |
110 unsigned int hbits, xrecord_t *rec) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
111 int64_t hi; |
36671 | 112 char const *line; |
113 xdlclass_t *rcrec; | |
114 xdlclass_t **rcrecs; | |
115 | |
116 line = rec->ptr; | |
117 hi = (long) XDL_HASHLONG(rec->ha, cf->hbits); | |
118 for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next) | |
119 if (rcrec->ha == rec->ha && | |
120 xdl_recmatch(rcrec->line, rcrec->size, | |
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
121 rec->ptr, rec->size)) |
36671 | 122 break; |
123 | |
124 if (!rcrec) { | |
125 if (!(rcrec = xdl_cha_alloc(&cf->ncha))) { | |
126 | |
127 return -1; | |
128 } | |
129 rcrec->idx = cf->count++; | |
130 if (cf->count > cf->alloc) { | |
131 cf->alloc *= 2; | |
132 if (!(rcrecs = (xdlclass_t **) xdl_realloc(cf->rcrecs, cf->alloc * sizeof(xdlclass_t *)))) { | |
133 | |
134 return -1; | |
135 } | |
136 cf->rcrecs = rcrecs; | |
137 } | |
138 cf->rcrecs[rcrec->idx] = rcrec; | |
139 rcrec->line = line; | |
140 rcrec->size = rec->size; | |
141 rcrec->ha = rec->ha; | |
142 rcrec->len1 = rcrec->len2 = 0; | |
143 rcrec->next = cf->rchash[hi]; | |
144 cf->rchash[hi] = rcrec; | |
145 } | |
146 | |
147 (pass == 1) ? rcrec->len1++ : rcrec->len2++; | |
148 | |
149 rec->ha = (unsigned long) rcrec->idx; | |
150 | |
151 hi = (long) XDL_HASHLONG(rec->ha, hbits); | |
152 rec->next = rhash[hi]; | |
153 rhash[hi] = rec; | |
154 | |
155 return 0; | |
156 } | |
157 | |
158 | |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
159 /* |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
160 * Trim common prefix from files. |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
161 * |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
162 * Note: trimming could affect hunk shifting. But the performance benefit |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
163 * outweighs the shift change. A diff result with suboptimal shifting is still |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
164 * valid. |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
165 */ |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
166 static void xdl_trim_files(mmfile_t *mf1, mmfile_t *mf2, int64_t reserved, |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
167 xdfenv_t *xe, mmfile_t *out_mf1, mmfile_t *out_mf2) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
168 mmfile_t msmall, mlarge; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
169 /* prefix lines, prefix bytes, suffix lines, suffix bytes */ |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
170 int64_t plines = 0, pbytes = 0, slines = 0, sbytes = 0, i; |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
171 /* prefix char pointer for msmall and mlarge */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
172 const char *pp1, *pp2; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
173 /* suffix char pointer for msmall and mlarge */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
174 const char *ps1, *ps2; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
175 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
176 /* reserved must >= 0 for the line boundary adjustment to work */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
177 if (reserved < 0) |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
178 reserved = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
179 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
180 if (mf1->size < mf2->size) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
181 memcpy(&msmall, mf1, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
182 memcpy(&mlarge, mf2, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
183 } else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
184 memcpy(&msmall, mf2, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
185 memcpy(&mlarge, mf1, sizeof(mmfile_t)); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
186 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
187 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
188 pp1 = msmall.ptr, pp2 = mlarge.ptr; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
189 for (i = 0; i < msmall.size && *pp1 == *pp2; ++i) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
190 plines += (*pp1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
191 pp1++, pp2++; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
192 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
193 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
194 ps1 = msmall.ptr + msmall.size - 1, ps2 = mlarge.ptr + mlarge.size - 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
195 while (ps1 > pp1 && *ps1 == *ps2) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
196 slines += (*ps1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
197 ps1--, ps2--; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
198 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
199 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
200 /* Retract common prefix and suffix boundaries for reserved lines */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
201 if (plines <= reserved + 1) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
202 plines = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
203 } else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
204 i = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
205 while (i <= reserved) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
206 pp1--; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
207 i += (*pp1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
208 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
209 /* The new mmfile starts at the next char just after '\n' */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
210 pbytes = pp1 - msmall.ptr + 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
211 plines -= reserved; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
212 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
213 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
214 if (slines <= reserved + 1) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
215 slines = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
216 } else { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
217 /* Note: with compiler SIMD support (ex. -O3 -mavx2), this |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
218 * might perform better than memchr. */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
219 i = 0; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
220 while (i <= reserved) { |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
221 ps1++; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
222 i += (*ps1 == '\n'); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
223 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
224 /* The new mmfile includes this '\n' */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
225 sbytes = msmall.ptr + msmall.size - ps1 - 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
226 slines -= reserved; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
227 if (msmall.ptr[msmall.size - 1] == '\n') |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
228 slines -= 1; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
229 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
230 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
231 xe->nprefix = plines; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
232 xe->nsuffix = slines; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
233 out_mf1->ptr = mf1->ptr + pbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
234 out_mf1->size = mf1->size - pbytes - sbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
235 out_mf2->ptr = mf2->ptr + pbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
236 out_mf2->size = mf2->size - pbytes - sbytes; |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
237 } |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
238 |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
239 |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
240 static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, int64_t narec, |
36671 | 241 xdlclassifier_t *cf, xdfile_t *xdf) { |
242 unsigned int hbits; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
243 int64_t nrec, hsize, bsize; |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
244 uint64_t hav; |
36671 | 245 char const *blk, *cur, *top, *prev; |
246 xrecord_t *crec; | |
247 xrecord_t **recs, **rrecs; | |
248 xrecord_t **rhash; | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
249 uint64_t *ha; |
36671 | 250 char *rchg; |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
251 int64_t *rindex; |
36671 | 252 |
253 ha = NULL; | |
254 rindex = NULL; | |
255 rchg = NULL; | |
256 rhash = NULL; | |
257 recs = NULL; | |
258 | |
259 if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0) | |
260 goto abort; | |
261 if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *)))) | |
262 goto abort; | |
263 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
264 { |
36825
f1ef0e53e628
xdiff: use int64 for hash table size
Jun Wu <quark@fb.com>
parents:
36824
diff
changeset
|
265 hbits = xdl_hashbits(narec); |
36915
651c80720eed
xdiff: silence a 32-bit shift warning on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36825
diff
changeset
|
266 hsize = ((uint64_t)1) << hbits; |
36671 | 267 if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *)))) |
268 goto abort; | |
269 memset(rhash, 0, hsize * sizeof(xrecord_t *)); | |
270 } | |
271 | |
272 nrec = 0; | |
273 if ((cur = blk = xdl_mmfile_first(mf, &bsize)) != NULL) { | |
274 for (top = blk + bsize; cur < top; ) { | |
275 prev = cur; | |
36823
49fe6249937a
xdiff: remove unused flags parameter
Jun Wu <quark@fb.com>
parents:
36822
diff
changeset
|
276 hav = xdl_hash_record(&cur, top); |
36671 | 277 if (nrec >= narec) { |
278 narec *= 2; | |
279 if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *)))) | |
280 goto abort; | |
281 recs = rrecs; | |
282 } | |
283 if (!(crec = xdl_cha_alloc(&xdf->rcha))) | |
284 goto abort; | |
285 crec->ptr = prev; | |
286 crec->size = (long) (cur - prev); | |
287 crec->ha = hav; | |
288 recs[nrec++] = crec; | |
289 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
290 if (xdl_classify_record(pass, cf, rhash, hbits, crec) < 0) |
36671 | 291 goto abort; |
292 } | |
293 } | |
294 | |
295 if (!(rchg = (char *) xdl_malloc((nrec + 2) * sizeof(char)))) | |
296 goto abort; | |
297 memset(rchg, 0, (nrec + 2) * sizeof(char)); | |
298 | |
36923
d40b9e29c114
xdiff: fix a hard crash on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36915
diff
changeset
|
299 if (!(rindex = (int64_t *) xdl_malloc((nrec + 1) * sizeof(int64_t)))) |
36671 | 300 goto abort; |
36923
d40b9e29c114
xdiff: fix a hard crash on Windows
Matt Harbison <matt_harbison@yahoo.com>
parents:
36915
diff
changeset
|
301 if (!(ha = (uint64_t *) xdl_malloc((nrec + 1) * sizeof(uint64_t)))) |
36671 | 302 goto abort; |
303 | |
304 xdf->nrec = nrec; | |
305 xdf->recs = recs; | |
306 xdf->hbits = hbits; | |
307 xdf->rhash = rhash; | |
308 xdf->rchg = rchg + 1; | |
309 xdf->rindex = rindex; | |
310 xdf->nreff = 0; | |
311 xdf->ha = ha; | |
312 xdf->dstart = 0; | |
313 xdf->dend = nrec - 1; | |
314 | |
315 return 0; | |
316 | |
317 abort: | |
318 xdl_free(ha); | |
319 xdl_free(rindex); | |
320 xdl_free(rchg); | |
321 xdl_free(rhash); | |
322 xdl_free(recs); | |
323 xdl_cha_free(&xdf->rcha); | |
324 return -1; | |
325 } | |
326 | |
327 | |
328 static void xdl_free_ctx(xdfile_t *xdf) { | |
329 | |
330 xdl_free(xdf->rhash); | |
331 xdl_free(xdf->rindex); | |
332 xdl_free(xdf->rchg - 1); | |
333 xdl_free(xdf->ha); | |
334 xdl_free(xdf->recs); | |
335 xdl_cha_free(&xdf->rcha); | |
336 } | |
337 | |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
338 /* Reserved lines for trimming, to leave room for shifting */ |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
339 #define TRIM_RESERVED_LINES 100 |
36671 | 340 |
341 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, | |
342 xdfenv_t *xe) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
343 int64_t enl1, enl2, sample; |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
344 mmfile_t tmf1, tmf2; |
36671 | 345 xdlclassifier_t cf; |
346 | |
347 memset(&cf, 0, sizeof(cf)); | |
348 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
349 sample = XDL_GUESS_NLINES1; |
36671 | 350 |
351 enl1 = xdl_guess_lines(mf1, sample) + 1; | |
352 enl2 = xdl_guess_lines(mf2, sample) + 1; | |
353 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
354 if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) |
36671 | 355 return -1; |
356 | |
36820
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
357 xdl_trim_files(mf1, mf2, TRIM_RESERVED_LINES, xe, &tmf1, &tmf2); |
f33a87cf60cc
xdiff: add a preprocessing step that trims files
Jun Wu <quark@fb.com>
parents:
36672
diff
changeset
|
358 |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
359 if (xdl_prepare_ctx(1, &tmf1, enl1, &cf, &xe->xdf1) < 0) { |
36671 | 360 |
361 xdl_free_classifier(&cf); | |
362 return -1; | |
363 } | |
36824
f0d9811dda8e
xdiff: remove unused xpp and xecfg parameters
Jun Wu <quark@fb.com>
parents:
36823
diff
changeset
|
364 if (xdl_prepare_ctx(2, &tmf2, enl2, &cf, &xe->xdf2) < 0) { |
36671 | 365 |
366 xdl_free_ctx(&xe->xdf1); | |
367 xdl_free_classifier(&cf); | |
368 return -1; | |
369 } | |
370 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
371 if (xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) { |
36671 | 372 xdl_free_ctx(&xe->xdf2); |
373 xdl_free_ctx(&xe->xdf1); | |
374 xdl_free_classifier(&cf); | |
375 return -1; | |
376 } | |
377 | |
36672
9e7b14caf67f
xdiff: remove patience and histogram diff algorithms
Jun Wu <quark@fb.com>
parents:
36671
diff
changeset
|
378 xdl_free_classifier(&cf); |
36671 | 379 |
380 return 0; | |
381 } | |
382 | |
383 | |
384 void xdl_free_env(xdfenv_t *xe) { | |
385 | |
386 xdl_free_ctx(&xe->xdf2); | |
387 xdl_free_ctx(&xe->xdf1); | |
388 } | |
389 | |
390 | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
391 static int xdl_clean_mmatch(char const *dis, int64_t i, int64_t s, int64_t e) { |
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
392 int64_t r, rdis0, rpdis0, rdis1, rpdis1; |
36671 | 393 |
394 /* | |
395 * Limits the window the is examined during the similar-lines | |
396 * scan. The loops below stops when dis[i - r] == 1 (line that | |
397 * has no match), but there are corner cases where the loop | |
398 * proceed all the way to the extremities by causing huge | |
399 * performance penalties in case of big files. | |
400 */ | |
401 if (i - s > XDL_SIMSCAN_WINDOW) | |
402 s = i - XDL_SIMSCAN_WINDOW; | |
403 if (e - i > XDL_SIMSCAN_WINDOW) | |
404 e = i + XDL_SIMSCAN_WINDOW; | |
405 | |
406 /* | |
407 * Scans the lines before 'i' to find a run of lines that either | |
408 * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1). | |
409 * Note that we always call this function with dis[i] > 1, so the | |
410 * current line (i) is already a multimatch line. | |
411 */ | |
412 for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) { | |
413 if (!dis[i - r]) | |
414 rdis0++; | |
415 else if (dis[i - r] == 2) | |
416 rpdis0++; | |
417 else | |
418 break; | |
419 } | |
420 /* | |
421 * If the run before the line 'i' found only multimatch lines, we | |
422 * return 0 and hence we don't make the current line (i) discarded. | |
423 * We want to discard multimatch lines only when they appear in the | |
424 * middle of runs with nomatch lines (dis[j] == 0). | |
425 */ | |
426 if (rdis0 == 0) | |
427 return 0; | |
428 for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) { | |
429 if (!dis[i + r]) | |
430 rdis1++; | |
431 else if (dis[i + r] == 2) | |
432 rpdis1++; | |
433 else | |
434 break; | |
435 } | |
436 /* | |
437 * If the run after the line 'i' found only multimatch lines, we | |
438 * return 0 and hence we don't make the current line (i) discarded. | |
439 */ | |
440 if (rdis1 == 0) | |
441 return 0; | |
442 rdis1 += rdis0; | |
443 rpdis1 += rpdis0; | |
444 | |
445 return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1); | |
446 } | |
447 | |
448 | |
449 /* | |
450 * Try to reduce the problem complexity, discard records that have no | |
451 * matches on the other file. Also, lines that have multiple matches | |
452 * might be potentially discarded if they happear in a run of discardable. | |
453 */ | |
454 static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
455 int64_t i, nm, nreff, mlim; |
36671 | 456 xrecord_t **recs; |
457 xdlclass_t *rcrec; | |
458 char *dis, *dis1, *dis2; | |
459 | |
460 if (!(dis = (char *) xdl_malloc(xdf1->nrec + xdf2->nrec + 2))) { | |
461 | |
462 return -1; | |
463 } | |
464 memset(dis, 0, xdf1->nrec + xdf2->nrec + 2); | |
465 dis1 = dis; | |
466 dis2 = dis1 + xdf1->nrec + 1; | |
467 | |
468 if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT) | |
469 mlim = XDL_MAX_EQLIMIT; | |
470 for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { | |
471 rcrec = cf->rcrecs[(*recs)->ha]; | |
472 nm = rcrec ? rcrec->len2 : 0; | |
473 dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1; | |
474 } | |
475 | |
476 if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT) | |
477 mlim = XDL_MAX_EQLIMIT; | |
478 for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { | |
479 rcrec = cf->rcrecs[(*recs)->ha]; | |
480 nm = rcrec ? rcrec->len1 : 0; | |
481 dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1; | |
482 } | |
483 | |
484 for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; | |
485 i <= xdf1->dend; i++, recs++) { | |
486 if (dis1[i] == 1 || | |
487 (dis1[i] == 2 && !xdl_clean_mmatch(dis1, i, xdf1->dstart, xdf1->dend))) { | |
488 xdf1->rindex[nreff] = i; | |
489 xdf1->ha[nreff] = (*recs)->ha; | |
490 nreff++; | |
491 } else | |
492 xdf1->rchg[i] = 1; | |
493 } | |
494 xdf1->nreff = nreff; | |
495 | |
496 for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; | |
497 i <= xdf2->dend; i++, recs++) { | |
498 if (dis2[i] == 1 || | |
499 (dis2[i] == 2 && !xdl_clean_mmatch(dis2, i, xdf2->dstart, xdf2->dend))) { | |
500 xdf2->rindex[nreff] = i; | |
501 xdf2->ha[nreff] = (*recs)->ha; | |
502 nreff++; | |
503 } else | |
504 xdf2->rchg[i] = 1; | |
505 } | |
506 xdf2->nreff = nreff; | |
507 | |
508 xdl_free(dis); | |
509 | |
510 return 0; | |
511 } | |
512 | |
513 | |
514 /* | |
515 * Early trim initial and terminal matching records. | |
516 */ | |
517 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { | |
36822
882657a9f768
xdiff: replace {unsigned ,}long with {u,}int64_t
Jun Wu <quark@fb.com>
parents:
36820
diff
changeset
|
518 int64_t i, lim; |
36671 | 519 xrecord_t **recs1, **recs2; |
520 | |
521 recs1 = xdf1->recs; | |
522 recs2 = xdf2->recs; | |
523 for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim; | |
524 i++, recs1++, recs2++) | |
525 if ((*recs1)->ha != (*recs2)->ha) | |
526 break; | |
527 | |
528 xdf1->dstart = xdf2->dstart = i; | |
529 | |
530 recs1 = xdf1->recs + xdf1->nrec - 1; | |
531 recs2 = xdf2->recs + xdf2->nrec - 1; | |
532 for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--) | |
533 if ((*recs1)->ha != (*recs2)->ha) | |
534 break; | |
535 | |
536 xdf1->dend = xdf1->nrec - i - 1; | |
537 xdf2->dend = xdf2->nrec - i - 1; | |
538 | |
539 return 0; | |
540 } | |
541 | |
542 | |
543 static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { | |
544 | |
545 if (xdl_trim_ends(xdf1, xdf2) < 0 || | |
546 xdl_cleanup_records(cf, xdf1, xdf2) < 0) { | |
547 | |
548 return -1; | |
549 } | |
550 | |
551 return 0; | |
552 } |