Mercurial > hg
comparison contrib/python-zstandard/zstd/dictBuilder/zdict.c @ 30434:2e484bdea8c4
zstd: vendor zstd 1.1.1
zstd is a new compression format and it is awesome, yielding
higher compression ratios and significantly faster compression
and decompression operations compared to zlib (our current
compression engine of choice) across the board.
We want zstd to be a 1st class citizen in Mercurial and to eventually
be the preferred compression format for various operations.
This patch starts the formal process of supporting zstd by vendoring
a copy of zstd. Why do we need to vendor zstd? Good question.
First, zstd is relatively new and not widely available yet. If we
didn't vendor zstd or distribute it with Mercurial, most users likely
wouldn't have zstd installed or even available to install. What good
is a feature if you can't use it? Vendoring and distributing the zstd
sources gives us the highest liklihood that zstd will be available to
Mercurial installs.
Second, the Python bindings to zstd (which will be vendored in a
separate changeset) make use of zstd APIs that are only available
via static linking. One reason they are only available via static
linking is that they are unstable and could change at any time.
While it might be possible for the Python bindings to attempt to
talk to different versions of the zstd C library, the safest thing to
do is link against a specific, known-working version of zstd. This
is why the Python zstd bindings themselves vendor zstd and why we
must as well. This also explains why the added files are in a
"python-zstandard" directory.
The added files are from the 1.1.1 release of zstd (Git commit
4c0b44f8ced84c4c8edfa07b564d31e4fa3e8885 from
https://github.com/facebook/zstd) and are added without modifications.
Not all files from the zstd "distribution" have been added. Notably
missing are files to support interacting with "legacy," pre-1.0
versions of zstd. The decision of which files to include is made by
the upstream python-zstandard project (which I'm the author of). The
files in this commit are a snapshot of the files from the 0.5.0
release of that project, Git commit
e637c1b214d5f869cf8116c550dcae23ec13b677 from
https://github.com/indygreg/python-zstandard.
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Thu, 10 Nov 2016 21:45:29 -0800 |
parents | |
children | b54a2984cdd4 |
comparison
equal
deleted
inserted
replaced
30433:96f2f50d923f | 30434:2e484bdea8c4 |
---|---|
1 /** | |
2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. | |
3 * All rights reserved. | |
4 * | |
5 * This source code is licensed under the BSD-style license found in the | |
6 * LICENSE file in the root directory of this source tree. An additional grant | |
7 * of patent rights can be found in the PATENTS file in the same directory. | |
8 */ | |
9 | |
10 | |
11 /*-************************************** | |
12 * Tuning parameters | |
13 ****************************************/ | |
14 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) | |
15 #define ZDICT_MIN_SAMPLES_SIZE 512 | |
16 | |
17 | |
18 /*-************************************** | |
19 * Compiler Options | |
20 ****************************************/ | |
21 /* Unix Large Files support (>4GB) */ | |
22 #define _FILE_OFFSET_BITS 64 | |
23 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ | |
24 # define _LARGEFILE_SOURCE | |
25 #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ | |
26 # define _LARGEFILE64_SOURCE | |
27 #endif | |
28 | |
29 | |
30 /*-************************************* | |
31 * Dependencies | |
32 ***************************************/ | |
33 #include <stdlib.h> /* malloc, free */ | |
34 #include <string.h> /* memset */ | |
35 #include <stdio.h> /* fprintf, fopen, ftello64 */ | |
36 #include <time.h> /* clock */ | |
37 | |
38 #include "mem.h" /* read */ | |
39 #include "error_private.h" | |
40 #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */ | |
41 #define HUF_STATIC_LINKING_ONLY | |
42 #include "huf.h" | |
43 #include "zstd_internal.h" /* includes zstd.h */ | |
44 #include "xxhash.h" | |
45 #include "divsufsort.h" | |
46 #ifndef ZDICT_STATIC_LINKING_ONLY | |
47 # define ZDICT_STATIC_LINKING_ONLY | |
48 #endif | |
49 #include "zdict.h" | |
50 | |
51 | |
52 /*-************************************* | |
53 * Constants | |
54 ***************************************/ | |
55 #define KB *(1 <<10) | |
56 #define MB *(1 <<20) | |
57 #define GB *(1U<<30) | |
58 | |
59 #define DICTLISTSIZE_DEFAULT 10000 | |
60 | |
61 #define NOISELENGTH 32 | |
62 | |
63 #define MINRATIO 4 | |
64 static const int g_compressionLevel_default = 5; | |
65 static const U32 g_selectivity_default = 9; | |
66 static const size_t g_provision_entropySize = 200; | |
67 static const size_t g_min_fast_dictContent = 192; | |
68 | |
69 | |
70 /*-************************************* | |
71 * Console display | |
72 ***************************************/ | |
73 #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } | |
74 #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ | |
75 | |
76 static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; } | |
77 | |
78 static void ZDICT_printHex(const void* ptr, size_t length) | |
79 { | |
80 const BYTE* const b = (const BYTE*)ptr; | |
81 size_t u; | |
82 for (u=0; u<length; u++) { | |
83 BYTE c = b[u]; | |
84 if (c<32 || c>126) c = '.'; /* non-printable char */ | |
85 DISPLAY("%c", c); | |
86 } | |
87 } | |
88 | |
89 | |
90 /*-******************************************************** | |
91 * Helper functions | |
92 **********************************************************/ | |
93 unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); } | |
94 | |
95 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } | |
96 | |
97 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) | |
98 { | |
99 if (dictSize < 8) return 0; | |
100 if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0; | |
101 return MEM_readLE32((const char*)dictBuffer + 4); | |
102 } | |
103 | |
104 | |
105 /*-******************************************************** | |
106 * Dictionary training functions | |
107 **********************************************************/ | |
108 static unsigned ZDICT_NbCommonBytes (register size_t val) | |
109 { | |
110 if (MEM_isLittleEndian()) { | |
111 if (MEM_64bits()) { | |
112 # if defined(_MSC_VER) && defined(_WIN64) | |
113 unsigned long r = 0; | |
114 _BitScanForward64( &r, (U64)val ); | |
115 return (unsigned)(r>>3); | |
116 # elif defined(__GNUC__) && (__GNUC__ >= 3) | |
117 return (__builtin_ctzll((U64)val) >> 3); | |
118 # else | |
119 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; | |
120 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; | |
121 # endif | |
122 } else { /* 32 bits */ | |
123 # if defined(_MSC_VER) | |
124 unsigned long r=0; | |
125 _BitScanForward( &r, (U32)val ); | |
126 return (unsigned)(r>>3); | |
127 # elif defined(__GNUC__) && (__GNUC__ >= 3) | |
128 return (__builtin_ctz((U32)val) >> 3); | |
129 # else | |
130 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; | |
131 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; | |
132 # endif | |
133 } | |
134 } else { /* Big Endian CPU */ | |
135 if (MEM_64bits()) { | |
136 # if defined(_MSC_VER) && defined(_WIN64) | |
137 unsigned long r = 0; | |
138 _BitScanReverse64( &r, val ); | |
139 return (unsigned)(r>>3); | |
140 # elif defined(__GNUC__) && (__GNUC__ >= 3) | |
141 return (__builtin_clzll(val) >> 3); | |
142 # else | |
143 unsigned r; | |
144 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ | |
145 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } | |
146 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } | |
147 r += (!val); | |
148 return r; | |
149 # endif | |
150 } else { /* 32 bits */ | |
151 # if defined(_MSC_VER) | |
152 unsigned long r = 0; | |
153 _BitScanReverse( &r, (unsigned long)val ); | |
154 return (unsigned)(r>>3); | |
155 # elif defined(__GNUC__) && (__GNUC__ >= 3) | |
156 return (__builtin_clz((U32)val) >> 3); | |
157 # else | |
158 unsigned r; | |
159 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } | |
160 r += (!val); | |
161 return r; | |
162 # endif | |
163 } } | |
164 } | |
165 | |
166 | |
167 /*! ZDICT_count() : | |
168 Count the nb of common bytes between 2 pointers. | |
169 Note : this function presumes end of buffer followed by noisy guard band. | |
170 */ | |
171 static size_t ZDICT_count(const void* pIn, const void* pMatch) | |
172 { | |
173 const char* const pStart = (const char*)pIn; | |
174 for (;;) { | |
175 size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); | |
176 if (!diff) { | |
177 pIn = (const char*)pIn+sizeof(size_t); | |
178 pMatch = (const char*)pMatch+sizeof(size_t); | |
179 continue; | |
180 } | |
181 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff); | |
182 return (size_t)((const char*)pIn - pStart); | |
183 } | |
184 } | |
185 | |
186 | |
187 typedef struct { | |
188 U32 pos; | |
189 U32 length; | |
190 U32 savings; | |
191 } dictItem; | |
192 | |
193 static void ZDICT_initDictItem(dictItem* d) | |
194 { | |
195 d->pos = 1; | |
196 d->length = 0; | |
197 d->savings = (U32)(-1); | |
198 } | |
199 | |
200 | |
201 #define LLIMIT 64 /* heuristic determined experimentally */ | |
202 #define MINMATCHLENGTH 7 /* heuristic determined experimentally */ | |
203 static dictItem ZDICT_analyzePos( | |
204 BYTE* doneMarks, | |
205 const int* suffix, U32 start, | |
206 const void* buffer, U32 minRatio, U32 notificationLevel) | |
207 { | |
208 U32 lengthList[LLIMIT] = {0}; | |
209 U32 cumulLength[LLIMIT] = {0}; | |
210 U32 savings[LLIMIT] = {0}; | |
211 const BYTE* b = (const BYTE*)buffer; | |
212 size_t length; | |
213 size_t maxLength = LLIMIT; | |
214 size_t pos = suffix[start]; | |
215 U32 end = start; | |
216 dictItem solution; | |
217 | |
218 /* init */ | |
219 memset(&solution, 0, sizeof(solution)); | |
220 doneMarks[pos] = 1; | |
221 | |
222 /* trivial repetition cases */ | |
223 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) | |
224 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) | |
225 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { | |
226 /* skip and mark segment */ | |
227 U16 u16 = MEM_read16(b+pos+4); | |
228 U32 u, e = 6; | |
229 while (MEM_read16(b+pos+e) == u16) e+=2 ; | |
230 if (b[pos+e] == b[pos+e-1]) e++; | |
231 for (u=1; u<e; u++) | |
232 doneMarks[pos+u] = 1; | |
233 return solution; | |
234 } | |
235 | |
236 /* look forward */ | |
237 do { | |
238 end++; | |
239 length = ZDICT_count(b + pos, b + suffix[end]); | |
240 } while (length >=MINMATCHLENGTH); | |
241 | |
242 /* look backward */ | |
243 do { | |
244 length = ZDICT_count(b + pos, b + *(suffix+start-1)); | |
245 if (length >=MINMATCHLENGTH) start--; | |
246 } while(length >= MINMATCHLENGTH); | |
247 | |
248 /* exit if not found a minimum nb of repetitions */ | |
249 if (end-start < minRatio) { | |
250 U32 idx; | |
251 for(idx=start; idx<end; idx++) | |
252 doneMarks[suffix[idx]] = 1; | |
253 return solution; | |
254 } | |
255 | |
256 { int i; | |
257 U32 searchLength; | |
258 U32 refinedStart = start; | |
259 U32 refinedEnd = end; | |
260 | |
261 DISPLAYLEVEL(4, "\n"); | |
262 DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos); | |
263 DISPLAYLEVEL(4, "\n"); | |
264 | |
265 for (searchLength = MINMATCHLENGTH ; ; searchLength++) { | |
266 BYTE currentChar = 0; | |
267 U32 currentCount = 0; | |
268 U32 currentID = refinedStart; | |
269 U32 id; | |
270 U32 selectedCount = 0; | |
271 U32 selectedID = currentID; | |
272 for (id =refinedStart; id < refinedEnd; id++) { | |
273 if (b[ suffix[id] + searchLength] != currentChar) { | |
274 if (currentCount > selectedCount) { | |
275 selectedCount = currentCount; | |
276 selectedID = currentID; | |
277 } | |
278 currentID = id; | |
279 currentChar = b[ suffix[id] + searchLength]; | |
280 currentCount = 0; | |
281 } | |
282 currentCount ++; | |
283 } | |
284 if (currentCount > selectedCount) { /* for last */ | |
285 selectedCount = currentCount; | |
286 selectedID = currentID; | |
287 } | |
288 | |
289 if (selectedCount < minRatio) | |
290 break; | |
291 refinedStart = selectedID; | |
292 refinedEnd = refinedStart + selectedCount; | |
293 } | |
294 | |
295 /* evaluate gain based on new ref */ | |
296 start = refinedStart; | |
297 pos = suffix[refinedStart]; | |
298 end = start; | |
299 memset(lengthList, 0, sizeof(lengthList)); | |
300 | |
301 /* look forward */ | |
302 do { | |
303 end++; | |
304 length = ZDICT_count(b + pos, b + suffix[end]); | |
305 if (length >= LLIMIT) length = LLIMIT-1; | |
306 lengthList[length]++; | |
307 } while (length >=MINMATCHLENGTH); | |
308 | |
309 /* look backward */ | |
310 length = MINMATCHLENGTH; | |
311 while ((length >= MINMATCHLENGTH) & (start > 0)) { | |
312 length = ZDICT_count(b + pos, b + suffix[start - 1]); | |
313 if (length >= LLIMIT) length = LLIMIT - 1; | |
314 lengthList[length]++; | |
315 if (length >= MINMATCHLENGTH) start--; | |
316 } | |
317 | |
318 /* largest useful length */ | |
319 memset(cumulLength, 0, sizeof(cumulLength)); | |
320 cumulLength[maxLength-1] = lengthList[maxLength-1]; | |
321 for (i=(int)(maxLength-2); i>=0; i--) | |
322 cumulLength[i] = cumulLength[i+1] + lengthList[i]; | |
323 | |
324 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break; | |
325 maxLength = i; | |
326 | |
327 /* reduce maxLength in case of final into repetitive data */ | |
328 { U32 l = (U32)maxLength; | |
329 BYTE const c = b[pos + maxLength-1]; | |
330 while (b[pos+l-2]==c) l--; | |
331 maxLength = l; | |
332 } | |
333 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ | |
334 | |
335 /* calculate savings */ | |
336 savings[5] = 0; | |
337 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) | |
338 savings[i] = savings[i-1] + (lengthList[i] * (i-3)); | |
339 | |
340 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", | |
341 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); | |
342 | |
343 solution.pos = (U32)pos; | |
344 solution.length = (U32)maxLength; | |
345 solution.savings = savings[maxLength]; | |
346 | |
347 /* mark positions done */ | |
348 { U32 id; | |
349 for (id=start; id<end; id++) { | |
350 U32 p, pEnd; | |
351 U32 const testedPos = suffix[id]; | |
352 if (testedPos == pos) | |
353 length = solution.length; | |
354 else { | |
355 length = ZDICT_count(b+pos, b+testedPos); | |
356 if (length > solution.length) length = solution.length; | |
357 } | |
358 pEnd = (U32)(testedPos + length); | |
359 for (p=testedPos; p<pEnd; p++) | |
360 doneMarks[p] = 1; | |
361 } } } | |
362 | |
363 return solution; | |
364 } | |
365 | |
366 | |
367 /*! ZDICT_checkMerge | |
368 check if dictItem can be merged, do it if possible | |
369 @return : id of destination elt, 0 if not merged | |
370 */ | |
371 static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip) | |
372 { | |
373 const U32 tableSize = table->pos; | |
374 const U32 eltEnd = elt.pos + elt.length; | |
375 | |
376 /* tail overlap */ | |
377 U32 u; for (u=1; u<tableSize; u++) { | |
378 if (u==eltNbToSkip) continue; | |
379 if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ | |
380 /* append */ | |
381 U32 addedLength = table[u].pos - elt.pos; | |
382 table[u].length += addedLength; | |
383 table[u].pos = elt.pos; | |
384 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ | |
385 table[u].savings += elt.length / 8; /* rough approx bonus */ | |
386 elt = table[u]; | |
387 /* sort : improve rank */ | |
388 while ((u>1) && (table[u-1].savings < elt.savings)) | |
389 table[u] = table[u-1], u--; | |
390 table[u] = elt; | |
391 return u; | |
392 } } | |
393 | |
394 /* front overlap */ | |
395 for (u=1; u<tableSize; u++) { | |
396 if (u==eltNbToSkip) continue; | |
397 if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */ | |
398 /* append */ | |
399 int addedLength = (int)eltEnd - (table[u].pos + table[u].length); | |
400 table[u].savings += elt.length / 8; /* rough approx bonus */ | |
401 if (addedLength > 0) { /* otherwise, elt fully included into existing */ | |
402 table[u].length += addedLength; | |
403 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ | |
404 } | |
405 /* sort : improve rank */ | |
406 elt = table[u]; | |
407 while ((u>1) && (table[u-1].savings < elt.savings)) | |
408 table[u] = table[u-1], u--; | |
409 table[u] = elt; | |
410 return u; | |
411 } } | |
412 | |
413 return 0; | |
414 } | |
415 | |
416 | |
417 static void ZDICT_removeDictItem(dictItem* table, U32 id) | |
418 { | |
419 /* convention : first element is nb of elts */ | |
420 U32 const max = table->pos; | |
421 U32 u; | |
422 if (!id) return; /* protection, should never happen */ | |
423 for (u=id; u<max-1; u++) | |
424 table[u] = table[u+1]; | |
425 table->pos--; | |
426 } | |
427 | |
428 | |
429 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt) | |
430 { | |
431 /* merge if possible */ | |
432 U32 mergeId = ZDICT_checkMerge(table, elt, 0); | |
433 if (mergeId) { | |
434 U32 newMerge = 1; | |
435 while (newMerge) { | |
436 newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId); | |
437 if (newMerge) ZDICT_removeDictItem(table, mergeId); | |
438 mergeId = newMerge; | |
439 } | |
440 return; | |
441 } | |
442 | |
443 /* insert */ | |
444 { U32 current; | |
445 U32 nextElt = table->pos; | |
446 if (nextElt >= maxSize) nextElt = maxSize-1; | |
447 current = nextElt-1; | |
448 while (table[current].savings < elt.savings) { | |
449 table[current+1] = table[current]; | |
450 current--; | |
451 } | |
452 table[current+1] = elt; | |
453 table->pos = nextElt+1; | |
454 } | |
455 } | |
456 | |
457 | |
458 static U32 ZDICT_dictSize(const dictItem* dictList) | |
459 { | |
460 U32 u, dictSize = 0; | |
461 for (u=1; u<dictList[0].pos; u++) | |
462 dictSize += dictList[u].length; | |
463 return dictSize; | |
464 } | |
465 | |
466 | |
467 static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize, | |
468 const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */ | |
469 const size_t* fileSizes, unsigned nbFiles, | |
470 U32 minRatio, U32 notificationLevel) | |
471 { | |
472 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0)); | |
473 int* const suffix = suffix0+1; | |
474 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix)); | |
475 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */ | |
476 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos)); | |
477 size_t result = 0; | |
478 clock_t displayClock = 0; | |
479 clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10; | |
480 | |
481 # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \ | |
482 if (ZDICT_clockSpan(displayClock) > refreshRate) \ | |
483 { displayClock = clock(); DISPLAY(__VA_ARGS__); \ | |
484 if (notificationLevel>=4) fflush(stdout); } } | |
485 | |
486 /* init */ | |
487 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ | |
488 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) { | |
489 result = ERROR(memory_allocation); | |
490 goto _cleanup; | |
491 } | |
492 if (minRatio < MINRATIO) minRatio = MINRATIO; | |
493 memset(doneMarks, 0, bufferSize+16); | |
494 | |
495 /* limit sample set size (divsufsort limitation)*/ | |
496 if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20)); | |
497 while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles]; | |
498 | |
499 /* sort */ | |
500 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20)); | |
501 { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0); | |
502 if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; } | |
503 } | |
504 suffix[bufferSize] = (int)bufferSize; /* leads into noise */ | |
505 suffix0[0] = (int)bufferSize; /* leads into noise */ | |
506 /* build reverse suffix sort */ | |
507 { size_t pos; | |
508 for (pos=0; pos < bufferSize; pos++) | |
509 reverseSuffix[suffix[pos]] = (U32)pos; | |
510 /* note filePos tracks borders between samples. | |
511 It's not used at this stage, but planned to become useful in a later update */ | |
512 filePos[0] = 0; | |
513 for (pos=1; pos<nbFiles; pos++) | |
514 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]); | |
515 } | |
516 | |
517 DISPLAYLEVEL(2, "finding patterns ... \n"); | |
518 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio); | |
519 | |
520 { U32 cursor; for (cursor=0; cursor < bufferSize; ) { | |
521 dictItem solution; | |
522 if (doneMarks[cursor]) { cursor++; continue; } | |
523 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel); | |
524 if (solution.length==0) { cursor++; continue; } | |
525 ZDICT_insertDictItem(dictList, dictListSize, solution); | |
526 cursor += solution.length; | |
527 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); | |
528 } } | |
529 | |
530 _cleanup: | |
531 free(suffix0); | |
532 free(reverseSuffix); | |
533 free(doneMarks); | |
534 free(filePos); | |
535 return result; | |
536 } | |
537 | |
538 | |
539 static void ZDICT_fillNoise(void* buffer, size_t length) | |
540 { | |
541 unsigned const prime1 = 2654435761U; | |
542 unsigned const prime2 = 2246822519U; | |
543 unsigned acc = prime1; | |
544 size_t p=0;; | |
545 for (p=0; p<length; p++) { | |
546 acc *= prime2; | |
547 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21); | |
548 } | |
549 } | |
550 | |
551 | |
552 typedef struct | |
553 { | |
554 ZSTD_CCtx* ref; | |
555 ZSTD_CCtx* zc; | |
556 void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */ | |
557 } EStats_ress_t; | |
558 | |
559 #define MAXREPOFFSET 1024 | |
560 | |
561 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, | |
562 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, | |
563 const void* src, size_t srcSize, U32 notificationLevel) | |
564 { | |
565 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog); | |
566 size_t cSize; | |
567 | |
568 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ | |
569 { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); | |
570 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } | |
571 } | |
572 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize); | |
573 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; } | |
574 | |
575 if (cSize) { /* if == 0; block is not compressible */ | |
576 const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); | |
577 | |
578 /* literals stats */ | |
579 { const BYTE* bytePtr; | |
580 for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) | |
581 countLit[*bytePtr]++; | |
582 } | |
583 | |
584 /* seqStats */ | |
585 { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); | |
586 ZSTD_seqToCodes(seqStorePtr); | |
587 | |
588 { const BYTE* codePtr = seqStorePtr->ofCode; | |
589 U32 u; | |
590 for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++; | |
591 } | |
592 | |
593 { const BYTE* codePtr = seqStorePtr->mlCode; | |
594 U32 u; | |
595 for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++; | |
596 } | |
597 | |
598 { const BYTE* codePtr = seqStorePtr->llCode; | |
599 U32 u; | |
600 for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++; | |
601 } | |
602 | |
603 if (nbSeq >= 2) { /* rep offsets */ | |
604 const seqDef* const seq = seqStorePtr->sequencesStart; | |
605 U32 offset1 = seq[0].offset - 3; | |
606 U32 offset2 = seq[1].offset - 3; | |
607 if (offset1 >= MAXREPOFFSET) offset1 = 0; | |
608 if (offset2 >= MAXREPOFFSET) offset2 = 0; | |
609 repOffsets[offset1] += 3; | |
610 repOffsets[offset2] += 1; | |
611 } } } | |
612 } | |
613 | |
614 /* | |
615 static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles) | |
616 { | |
617 unsigned u; | |
618 size_t max=0; | |
619 for (u=0; u<nbFiles; u++) | |
620 if (max < fileSizes[u]) max = fileSizes[u]; | |
621 return max; | |
622 } | |
623 */ | |
624 | |
625 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) | |
626 { | |
627 size_t total=0; | |
628 unsigned u; | |
629 for (u=0; u<nbFiles; u++) total += fileSizes[u]; | |
630 return total; | |
631 } | |
632 | |
633 typedef struct { U32 offset; U32 count; } offsetCount_t; | |
634 | |
635 static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count) | |
636 { | |
637 U32 u; | |
638 table[ZSTD_REP_NUM].offset = val; | |
639 table[ZSTD_REP_NUM].count = count; | |
640 for (u=ZSTD_REP_NUM; u>0; u--) { | |
641 offsetCount_t tmp; | |
642 if (table[u-1].count >= table[u].count) break; | |
643 tmp = table[u-1]; | |
644 table[u-1] = table[u]; | |
645 table[u] = tmp; | |
646 } | |
647 } | |
648 | |
649 | |
650 #define OFFCODE_MAX 30 /* only applicable to first block */ | |
651 static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, | |
652 unsigned compressionLevel, | |
653 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, | |
654 const void* dictBuffer, size_t dictBufferSize, | |
655 unsigned notificationLevel) | |
656 { | |
657 U32 countLit[256]; | |
658 HUF_CREATE_STATIC_CTABLE(hufTable, 255); | |
659 U32 offcodeCount[OFFCODE_MAX+1]; | |
660 short offcodeNCount[OFFCODE_MAX+1]; | |
661 U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB)); | |
662 U32 matchLengthCount[MaxML+1]; | |
663 short matchLengthNCount[MaxML+1]; | |
664 U32 litLengthCount[MaxLL+1]; | |
665 short litLengthNCount[MaxLL+1]; | |
666 U32 repOffset[MAXREPOFFSET]; | |
667 offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; | |
668 EStats_ress_t esr; | |
669 ZSTD_parameters params; | |
670 U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; | |
671 size_t pos = 0, errorCode; | |
672 size_t eSize = 0; | |
673 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); | |
674 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); | |
675 BYTE* dstPtr = (BYTE*)dstBuffer; | |
676 | |
677 /* init */ | |
678 esr.ref = ZSTD_createCCtx(); | |
679 esr.zc = ZSTD_createCCtx(); | |
680 esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX); | |
681 if (!esr.ref || !esr.zc || !esr.workPlace) { | |
682 eSize = ERROR(memory_allocation); | |
683 DISPLAYLEVEL(1, "Not enough memory \n"); | |
684 goto _cleanup; | |
685 } | |
686 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */ | |
687 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */ | |
688 for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1; | |
689 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1; | |
690 for (u=0; u<=MaxLL; u++) litLengthCount[u]=1; | |
691 memset(repOffset, 0, sizeof(repOffset)); | |
692 repOffset[1] = repOffset[4] = repOffset[8] = 1; | |
693 memset(bestRepOffset, 0, sizeof(bestRepOffset)); | |
694 if (compressionLevel==0) compressionLevel=g_compressionLevel_default; | |
695 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); | |
696 { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); | |
697 if (ZSTD_isError(beginResult)) { | |
698 eSize = ERROR(GENERIC); | |
699 DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n"); | |
700 goto _cleanup; | |
701 } } | |
702 | |
703 /* collect stats on all files */ | |
704 for (u=0; u<nbFiles; u++) { | |
705 ZDICT_countEStats(esr, params, | |
706 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, | |
707 (const char*)srcBuffer + pos, fileSizes[u], | |
708 notificationLevel); | |
709 pos += fileSizes[u]; | |
710 } | |
711 | |
712 /* analyze */ | |
713 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); | |
714 if (HUF_isError(errorCode)) { | |
715 eSize = ERROR(GENERIC); | |
716 DISPLAYLEVEL(1, "HUF_buildCTable error \n"); | |
717 goto _cleanup; | |
718 } | |
719 huffLog = (U32)errorCode; | |
720 | |
721 /* looking for most common first offsets */ | |
722 { U32 offset; | |
723 for (offset=1; offset<MAXREPOFFSET; offset++) | |
724 ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]); | |
725 } | |
726 /* note : the result of this phase should be used to better appreciate the impact on statistics */ | |
727 | |
728 total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u]; | |
729 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax); | |
730 if (FSE_isError(errorCode)) { | |
731 eSize = ERROR(GENERIC); | |
732 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n"); | |
733 goto _cleanup; | |
734 } | |
735 Offlog = (U32)errorCode; | |
736 | |
737 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; | |
738 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML); | |
739 if (FSE_isError(errorCode)) { | |
740 eSize = ERROR(GENERIC); | |
741 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n"); | |
742 goto _cleanup; | |
743 } | |
744 mlLog = (U32)errorCode; | |
745 | |
746 total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u]; | |
747 errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL); | |
748 if (FSE_isError(errorCode)) { | |
749 eSize = ERROR(GENERIC); | |
750 DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n"); | |
751 goto _cleanup; | |
752 } | |
753 llLog = (U32)errorCode; | |
754 | |
755 /* write result to buffer */ | |
756 { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog); | |
757 if (HUF_isError(hhSize)) { | |
758 eSize = ERROR(GENERIC); | |
759 DISPLAYLEVEL(1, "HUF_writeCTable error \n"); | |
760 goto _cleanup; | |
761 } | |
762 dstPtr += hhSize; | |
763 maxDstSize -= hhSize; | |
764 eSize += hhSize; | |
765 } | |
766 | |
767 { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog); | |
768 if (FSE_isError(ohSize)) { | |
769 eSize = ERROR(GENERIC); | |
770 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n"); | |
771 goto _cleanup; | |
772 } | |
773 dstPtr += ohSize; | |
774 maxDstSize -= ohSize; | |
775 eSize += ohSize; | |
776 } | |
777 | |
778 { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog); | |
779 if (FSE_isError(mhSize)) { | |
780 eSize = ERROR(GENERIC); | |
781 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n"); | |
782 goto _cleanup; | |
783 } | |
784 dstPtr += mhSize; | |
785 maxDstSize -= mhSize; | |
786 eSize += mhSize; | |
787 } | |
788 | |
789 { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog); | |
790 if (FSE_isError(lhSize)) { | |
791 eSize = ERROR(GENERIC); | |
792 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n"); | |
793 goto _cleanup; | |
794 } | |
795 dstPtr += lhSize; | |
796 maxDstSize -= lhSize; | |
797 eSize += lhSize; | |
798 } | |
799 | |
800 if (maxDstSize<12) { | |
801 eSize = ERROR(GENERIC); | |
802 DISPLAYLEVEL(1, "not enough space to write RepOffsets \n"); | |
803 goto _cleanup; | |
804 } | |
805 # if 0 | |
806 MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset); | |
807 MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset); | |
808 MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset); | |
809 #else | |
810 /* at this stage, we don't use the result of "most common first offset", | |
811 as the impact of statistics is not properly evaluated */ | |
812 MEM_writeLE32(dstPtr+0, repStartValue[0]); | |
813 MEM_writeLE32(dstPtr+4, repStartValue[1]); | |
814 MEM_writeLE32(dstPtr+8, repStartValue[2]); | |
815 #endif | |
816 //dstPtr += 12; | |
817 eSize += 12; | |
818 | |
819 _cleanup: | |
820 ZSTD_freeCCtx(esr.ref); | |
821 ZSTD_freeCCtx(esr.zc); | |
822 free(esr.workPlace); | |
823 | |
824 return eSize; | |
825 } | |
826 | |
827 | |
828 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, | |
829 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, | |
830 ZDICT_params_t params) | |
831 { | |
832 size_t hSize; | |
833 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; | |
834 U32 const notificationLevel = params.notificationLevel; | |
835 | |
836 /* dictionary header */ | |
837 MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); | |
838 { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); | |
839 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; | |
840 U32 const dictID = params.dictID ? params.dictID : compliantID; | |
841 MEM_writeLE32((char*)dictBuffer+4, dictID); | |
842 } | |
843 hSize = 8; | |
844 | |
845 /* entropy tables */ | |
846 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ | |
847 DISPLAYLEVEL(2, "statistics ... \n"); | |
848 { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, | |
849 compressionLevel, | |
850 samplesBuffer, samplesSizes, nbSamples, | |
851 (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, | |
852 notificationLevel); | |
853 if (ZDICT_isError(eSize)) return eSize; | |
854 hSize += eSize; | |
855 } | |
856 | |
857 | |
858 if (hSize + dictContentSize < dictBufferCapacity) | |
859 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); | |
860 return MIN(dictBufferCapacity, hSize+dictContentSize); | |
861 } | |
862 | |
863 | |
864 /*! ZDICT_trainFromBuffer_unsafe() : | |
865 * Warning : `samplesBuffer` must be followed by noisy guard band. | |
866 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() | |
867 */ | |
868 size_t ZDICT_trainFromBuffer_unsafe( | |
869 void* dictBuffer, size_t maxDictSize, | |
870 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, | |
871 ZDICT_params_t params) | |
872 { | |
873 U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); | |
874 dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); | |
875 unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel; | |
876 unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity; | |
877 size_t const targetDictSize = maxDictSize; | |
878 size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); | |
879 size_t dictSize = 0; | |
880 U32 const notificationLevel = params.notificationLevel; | |
881 | |
882 /* checks */ | |
883 if (!dictList) return ERROR(memory_allocation); | |
884 if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); } | |
885 if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */ | |
886 | |
887 /* init */ | |
888 ZDICT_initDictItem(dictList); | |
889 | |
890 /* build dictionary */ | |
891 ZDICT_trainBuffer(dictList, dictListSize, | |
892 samplesBuffer, samplesBuffSize, | |
893 samplesSizes, nbSamples, | |
894 minRep, notificationLevel); | |
895 | |
896 /* display best matches */ | |
897 if (params.notificationLevel>= 3) { | |
898 U32 const nb = MIN(25, dictList[0].pos); | |
899 U32 const dictContentSize = ZDICT_dictSize(dictList); | |
900 U32 u; | |
901 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); | |
902 DISPLAYLEVEL(3, "list %u best segments \n", nb); | |
903 for (u=1; u<=nb; u++) { | |
904 U32 pos = dictList[u].pos; | |
905 U32 length = dictList[u].length; | |
906 U32 printedLength = MIN(40, length); | |
907 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", | |
908 u, length, pos, dictList[u].savings); | |
909 ZDICT_printHex((const char*)samplesBuffer+pos, printedLength); | |
910 DISPLAYLEVEL(3, "| \n"); | |
911 } } | |
912 | |
913 | |
914 /* create dictionary */ | |
915 { U32 dictContentSize = ZDICT_dictSize(dictList); | |
916 if (dictContentSize < targetDictSize/3) { | |
917 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize); | |
918 if (minRep > MINRATIO) { | |
919 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); | |
920 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); | |
921 } | |
922 if (samplesBuffSize < 10 * targetDictSize) | |
923 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20)); | |
924 } | |
925 | |
926 if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { | |
927 U32 proposedSelectivity = selectivity-1; | |
928 while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } | |
929 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize); | |
930 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); | |
931 DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n"); | |
932 } | |
933 | |
934 /* limit dictionary size */ | |
935 { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ | |
936 U32 currentSize = 0; | |
937 U32 n; for (n=1; n<max; n++) { | |
938 currentSize += dictList[n].length; | |
939 if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; } | |
940 } | |
941 dictList->pos = n; | |
942 dictContentSize = currentSize; | |
943 } | |
944 | |
945 /* build dict content */ | |
946 { U32 u; | |
947 BYTE* ptr = (BYTE*)dictBuffer + maxDictSize; | |
948 for (u=1; u<dictList->pos; u++) { | |
949 U32 l = dictList[u].length; | |
950 ptr -= l; | |
951 if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */ | |
952 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); | |
953 } } | |
954 | |
955 dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, | |
956 samplesBuffer, samplesSizes, nbSamples, | |
957 params); | |
958 } | |
959 | |
960 /* clean up */ | |
961 free(dictList); | |
962 return dictSize; | |
963 } | |
964 | |
965 | |
966 /* issue : samplesBuffer need to be followed by a noisy guard band. | |
967 * work around : duplicate the buffer, and add the noise */ | |
968 size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, | |
969 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, | |
970 ZDICT_params_t params) | |
971 { | |
972 size_t result; | |
973 void* newBuff; | |
974 size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); | |
975 if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */ | |
976 | |
977 newBuff = malloc(sBuffSize + NOISELENGTH); | |
978 if (!newBuff) return ERROR(memory_allocation); | |
979 | |
980 memcpy(newBuff, samplesBuffer, sBuffSize); | |
981 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ | |
982 | |
983 result = ZDICT_trainFromBuffer_unsafe( | |
984 dictBuffer, dictBufferCapacity, | |
985 newBuff, samplesSizes, nbSamples, | |
986 params); | |
987 free(newBuff); | |
988 return result; | |
989 } | |
990 | |
991 | |
992 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, | |
993 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) | |
994 { | |
995 ZDICT_params_t params; | |
996 memset(¶ms, 0, sizeof(params)); | |
997 return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity, | |
998 samplesBuffer, samplesSizes, nbSamples, | |
999 params); | |
1000 } | |
1001 | |
1002 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, | |
1003 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) | |
1004 { | |
1005 ZDICT_params_t params; | |
1006 memset(¶ms, 0, sizeof(params)); | |
1007 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, | |
1008 samplesBuffer, samplesSizes, nbSamples, | |
1009 params); | |
1010 } |