comparison contrib/python-zstandard/zstd/dictBuilder/zdict.c @ 30434:2e484bdea8c4

zstd: vendor zstd 1.1.1 zstd is a new compression format and it is awesome, yielding higher compression ratios and significantly faster compression and decompression operations compared to zlib (our current compression engine of choice) across the board. We want zstd to be a 1st class citizen in Mercurial and to eventually be the preferred compression format for various operations. This patch starts the formal process of supporting zstd by vendoring a copy of zstd. Why do we need to vendor zstd? Good question. First, zstd is relatively new and not widely available yet. If we didn't vendor zstd or distribute it with Mercurial, most users likely wouldn't have zstd installed or even available to install. What good is a feature if you can't use it? Vendoring and distributing the zstd sources gives us the highest liklihood that zstd will be available to Mercurial installs. Second, the Python bindings to zstd (which will be vendored in a separate changeset) make use of zstd APIs that are only available via static linking. One reason they are only available via static linking is that they are unstable and could change at any time. While it might be possible for the Python bindings to attempt to talk to different versions of the zstd C library, the safest thing to do is link against a specific, known-working version of zstd. This is why the Python zstd bindings themselves vendor zstd and why we must as well. This also explains why the added files are in a "python-zstandard" directory. The added files are from the 1.1.1 release of zstd (Git commit 4c0b44f8ced84c4c8edfa07b564d31e4fa3e8885 from https://github.com/facebook/zstd) and are added without modifications. Not all files from the zstd "distribution" have been added. Notably missing are files to support interacting with "legacy," pre-1.0 versions of zstd. The decision of which files to include is made by the upstream python-zstandard project (which I'm the author of). The files in this commit are a snapshot of the files from the 0.5.0 release of that project, Git commit e637c1b214d5f869cf8116c550dcae23ec13b677 from https://github.com/indygreg/python-zstandard.
author Gregory Szorc <gregory.szorc@gmail.com>
date Thu, 10 Nov 2016 21:45:29 -0800
parents
children b54a2984cdd4
comparison
equal deleted inserted replaced
30433:96f2f50d923f 30434:2e484bdea8c4
1 /**
2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
8 */
9
10
11 /*-**************************************
12 * Tuning parameters
13 ****************************************/
14 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
15 #define ZDICT_MIN_SAMPLES_SIZE 512
16
17
18 /*-**************************************
19 * Compiler Options
20 ****************************************/
21 /* Unix Large Files support (>4GB) */
22 #define _FILE_OFFSET_BITS 64
23 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
24 # define _LARGEFILE_SOURCE
25 #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
26 # define _LARGEFILE64_SOURCE
27 #endif
28
29
30 /*-*************************************
31 * Dependencies
32 ***************************************/
33 #include <stdlib.h> /* malloc, free */
34 #include <string.h> /* memset */
35 #include <stdio.h> /* fprintf, fopen, ftello64 */
36 #include <time.h> /* clock */
37
38 #include "mem.h" /* read */
39 #include "error_private.h"
40 #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
41 #define HUF_STATIC_LINKING_ONLY
42 #include "huf.h"
43 #include "zstd_internal.h" /* includes zstd.h */
44 #include "xxhash.h"
45 #include "divsufsort.h"
46 #ifndef ZDICT_STATIC_LINKING_ONLY
47 # define ZDICT_STATIC_LINKING_ONLY
48 #endif
49 #include "zdict.h"
50
51
52 /*-*************************************
53 * Constants
54 ***************************************/
55 #define KB *(1 <<10)
56 #define MB *(1 <<20)
57 #define GB *(1U<<30)
58
59 #define DICTLISTSIZE_DEFAULT 10000
60
61 #define NOISELENGTH 32
62
63 #define MINRATIO 4
64 static const int g_compressionLevel_default = 5;
65 static const U32 g_selectivity_default = 9;
66 static const size_t g_provision_entropySize = 200;
67 static const size_t g_min_fast_dictContent = 192;
68
69
70 /*-*************************************
71 * Console display
72 ***************************************/
73 #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
74 #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
75
76 static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
77
78 static void ZDICT_printHex(const void* ptr, size_t length)
79 {
80 const BYTE* const b = (const BYTE*)ptr;
81 size_t u;
82 for (u=0; u<length; u++) {
83 BYTE c = b[u];
84 if (c<32 || c>126) c = '.'; /* non-printable char */
85 DISPLAY("%c", c);
86 }
87 }
88
89
90 /*-********************************************************
91 * Helper functions
92 **********************************************************/
93 unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
94
95 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
96
97 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
98 {
99 if (dictSize < 8) return 0;
100 if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
101 return MEM_readLE32((const char*)dictBuffer + 4);
102 }
103
104
105 /*-********************************************************
106 * Dictionary training functions
107 **********************************************************/
108 static unsigned ZDICT_NbCommonBytes (register size_t val)
109 {
110 if (MEM_isLittleEndian()) {
111 if (MEM_64bits()) {
112 # if defined(_MSC_VER) && defined(_WIN64)
113 unsigned long r = 0;
114 _BitScanForward64( &r, (U64)val );
115 return (unsigned)(r>>3);
116 # elif defined(__GNUC__) && (__GNUC__ >= 3)
117 return (__builtin_ctzll((U64)val) >> 3);
118 # else
119 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
120 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
121 # endif
122 } else { /* 32 bits */
123 # if defined(_MSC_VER)
124 unsigned long r=0;
125 _BitScanForward( &r, (U32)val );
126 return (unsigned)(r>>3);
127 # elif defined(__GNUC__) && (__GNUC__ >= 3)
128 return (__builtin_ctz((U32)val) >> 3);
129 # else
130 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
131 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
132 # endif
133 }
134 } else { /* Big Endian CPU */
135 if (MEM_64bits()) {
136 # if defined(_MSC_VER) && defined(_WIN64)
137 unsigned long r = 0;
138 _BitScanReverse64( &r, val );
139 return (unsigned)(r>>3);
140 # elif defined(__GNUC__) && (__GNUC__ >= 3)
141 return (__builtin_clzll(val) >> 3);
142 # else
143 unsigned r;
144 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
145 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
146 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
147 r += (!val);
148 return r;
149 # endif
150 } else { /* 32 bits */
151 # if defined(_MSC_VER)
152 unsigned long r = 0;
153 _BitScanReverse( &r, (unsigned long)val );
154 return (unsigned)(r>>3);
155 # elif defined(__GNUC__) && (__GNUC__ >= 3)
156 return (__builtin_clz((U32)val) >> 3);
157 # else
158 unsigned r;
159 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
160 r += (!val);
161 return r;
162 # endif
163 } }
164 }
165
166
167 /*! ZDICT_count() :
168 Count the nb of common bytes between 2 pointers.
169 Note : this function presumes end of buffer followed by noisy guard band.
170 */
171 static size_t ZDICT_count(const void* pIn, const void* pMatch)
172 {
173 const char* const pStart = (const char*)pIn;
174 for (;;) {
175 size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
176 if (!diff) {
177 pIn = (const char*)pIn+sizeof(size_t);
178 pMatch = (const char*)pMatch+sizeof(size_t);
179 continue;
180 }
181 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
182 return (size_t)((const char*)pIn - pStart);
183 }
184 }
185
186
187 typedef struct {
188 U32 pos;
189 U32 length;
190 U32 savings;
191 } dictItem;
192
193 static void ZDICT_initDictItem(dictItem* d)
194 {
195 d->pos = 1;
196 d->length = 0;
197 d->savings = (U32)(-1);
198 }
199
200
201 #define LLIMIT 64 /* heuristic determined experimentally */
202 #define MINMATCHLENGTH 7 /* heuristic determined experimentally */
203 static dictItem ZDICT_analyzePos(
204 BYTE* doneMarks,
205 const int* suffix, U32 start,
206 const void* buffer, U32 minRatio, U32 notificationLevel)
207 {
208 U32 lengthList[LLIMIT] = {0};
209 U32 cumulLength[LLIMIT] = {0};
210 U32 savings[LLIMIT] = {0};
211 const BYTE* b = (const BYTE*)buffer;
212 size_t length;
213 size_t maxLength = LLIMIT;
214 size_t pos = suffix[start];
215 U32 end = start;
216 dictItem solution;
217
218 /* init */
219 memset(&solution, 0, sizeof(solution));
220 doneMarks[pos] = 1;
221
222 /* trivial repetition cases */
223 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
224 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
225 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
226 /* skip and mark segment */
227 U16 u16 = MEM_read16(b+pos+4);
228 U32 u, e = 6;
229 while (MEM_read16(b+pos+e) == u16) e+=2 ;
230 if (b[pos+e] == b[pos+e-1]) e++;
231 for (u=1; u<e; u++)
232 doneMarks[pos+u] = 1;
233 return solution;
234 }
235
236 /* look forward */
237 do {
238 end++;
239 length = ZDICT_count(b + pos, b + suffix[end]);
240 } while (length >=MINMATCHLENGTH);
241
242 /* look backward */
243 do {
244 length = ZDICT_count(b + pos, b + *(suffix+start-1));
245 if (length >=MINMATCHLENGTH) start--;
246 } while(length >= MINMATCHLENGTH);
247
248 /* exit if not found a minimum nb of repetitions */
249 if (end-start < minRatio) {
250 U32 idx;
251 for(idx=start; idx<end; idx++)
252 doneMarks[suffix[idx]] = 1;
253 return solution;
254 }
255
256 { int i;
257 U32 searchLength;
258 U32 refinedStart = start;
259 U32 refinedEnd = end;
260
261 DISPLAYLEVEL(4, "\n");
262 DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
263 DISPLAYLEVEL(4, "\n");
264
265 for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
266 BYTE currentChar = 0;
267 U32 currentCount = 0;
268 U32 currentID = refinedStart;
269 U32 id;
270 U32 selectedCount = 0;
271 U32 selectedID = currentID;
272 for (id =refinedStart; id < refinedEnd; id++) {
273 if (b[ suffix[id] + searchLength] != currentChar) {
274 if (currentCount > selectedCount) {
275 selectedCount = currentCount;
276 selectedID = currentID;
277 }
278 currentID = id;
279 currentChar = b[ suffix[id] + searchLength];
280 currentCount = 0;
281 }
282 currentCount ++;
283 }
284 if (currentCount > selectedCount) { /* for last */
285 selectedCount = currentCount;
286 selectedID = currentID;
287 }
288
289 if (selectedCount < minRatio)
290 break;
291 refinedStart = selectedID;
292 refinedEnd = refinedStart + selectedCount;
293 }
294
295 /* evaluate gain based on new ref */
296 start = refinedStart;
297 pos = suffix[refinedStart];
298 end = start;
299 memset(lengthList, 0, sizeof(lengthList));
300
301 /* look forward */
302 do {
303 end++;
304 length = ZDICT_count(b + pos, b + suffix[end]);
305 if (length >= LLIMIT) length = LLIMIT-1;
306 lengthList[length]++;
307 } while (length >=MINMATCHLENGTH);
308
309 /* look backward */
310 length = MINMATCHLENGTH;
311 while ((length >= MINMATCHLENGTH) & (start > 0)) {
312 length = ZDICT_count(b + pos, b + suffix[start - 1]);
313 if (length >= LLIMIT) length = LLIMIT - 1;
314 lengthList[length]++;
315 if (length >= MINMATCHLENGTH) start--;
316 }
317
318 /* largest useful length */
319 memset(cumulLength, 0, sizeof(cumulLength));
320 cumulLength[maxLength-1] = lengthList[maxLength-1];
321 for (i=(int)(maxLength-2); i>=0; i--)
322 cumulLength[i] = cumulLength[i+1] + lengthList[i];
323
324 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
325 maxLength = i;
326
327 /* reduce maxLength in case of final into repetitive data */
328 { U32 l = (U32)maxLength;
329 BYTE const c = b[pos + maxLength-1];
330 while (b[pos+l-2]==c) l--;
331 maxLength = l;
332 }
333 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
334
335 /* calculate savings */
336 savings[5] = 0;
337 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
338 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
339
340 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
341 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
342
343 solution.pos = (U32)pos;
344 solution.length = (U32)maxLength;
345 solution.savings = savings[maxLength];
346
347 /* mark positions done */
348 { U32 id;
349 for (id=start; id<end; id++) {
350 U32 p, pEnd;
351 U32 const testedPos = suffix[id];
352 if (testedPos == pos)
353 length = solution.length;
354 else {
355 length = ZDICT_count(b+pos, b+testedPos);
356 if (length > solution.length) length = solution.length;
357 }
358 pEnd = (U32)(testedPos + length);
359 for (p=testedPos; p<pEnd; p++)
360 doneMarks[p] = 1;
361 } } }
362
363 return solution;
364 }
365
366
367 /*! ZDICT_checkMerge
368 check if dictItem can be merged, do it if possible
369 @return : id of destination elt, 0 if not merged
370 */
371 static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
372 {
373 const U32 tableSize = table->pos;
374 const U32 eltEnd = elt.pos + elt.length;
375
376 /* tail overlap */
377 U32 u; for (u=1; u<tableSize; u++) {
378 if (u==eltNbToSkip) continue;
379 if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
380 /* append */
381 U32 addedLength = table[u].pos - elt.pos;
382 table[u].length += addedLength;
383 table[u].pos = elt.pos;
384 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
385 table[u].savings += elt.length / 8; /* rough approx bonus */
386 elt = table[u];
387 /* sort : improve rank */
388 while ((u>1) && (table[u-1].savings < elt.savings))
389 table[u] = table[u-1], u--;
390 table[u] = elt;
391 return u;
392 } }
393
394 /* front overlap */
395 for (u=1; u<tableSize; u++) {
396 if (u==eltNbToSkip) continue;
397 if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
398 /* append */
399 int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
400 table[u].savings += elt.length / 8; /* rough approx bonus */
401 if (addedLength > 0) { /* otherwise, elt fully included into existing */
402 table[u].length += addedLength;
403 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
404 }
405 /* sort : improve rank */
406 elt = table[u];
407 while ((u>1) && (table[u-1].savings < elt.savings))
408 table[u] = table[u-1], u--;
409 table[u] = elt;
410 return u;
411 } }
412
413 return 0;
414 }
415
416
417 static void ZDICT_removeDictItem(dictItem* table, U32 id)
418 {
419 /* convention : first element is nb of elts */
420 U32 const max = table->pos;
421 U32 u;
422 if (!id) return; /* protection, should never happen */
423 for (u=id; u<max-1; u++)
424 table[u] = table[u+1];
425 table->pos--;
426 }
427
428
429 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
430 {
431 /* merge if possible */
432 U32 mergeId = ZDICT_checkMerge(table, elt, 0);
433 if (mergeId) {
434 U32 newMerge = 1;
435 while (newMerge) {
436 newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
437 if (newMerge) ZDICT_removeDictItem(table, mergeId);
438 mergeId = newMerge;
439 }
440 return;
441 }
442
443 /* insert */
444 { U32 current;
445 U32 nextElt = table->pos;
446 if (nextElt >= maxSize) nextElt = maxSize-1;
447 current = nextElt-1;
448 while (table[current].savings < elt.savings) {
449 table[current+1] = table[current];
450 current--;
451 }
452 table[current+1] = elt;
453 table->pos = nextElt+1;
454 }
455 }
456
457
458 static U32 ZDICT_dictSize(const dictItem* dictList)
459 {
460 U32 u, dictSize = 0;
461 for (u=1; u<dictList[0].pos; u++)
462 dictSize += dictList[u].length;
463 return dictSize;
464 }
465
466
467 static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
468 const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
469 const size_t* fileSizes, unsigned nbFiles,
470 U32 minRatio, U32 notificationLevel)
471 {
472 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
473 int* const suffix = suffix0+1;
474 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
475 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
476 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
477 size_t result = 0;
478 clock_t displayClock = 0;
479 clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
480
481 # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
482 if (ZDICT_clockSpan(displayClock) > refreshRate) \
483 { displayClock = clock(); DISPLAY(__VA_ARGS__); \
484 if (notificationLevel>=4) fflush(stdout); } }
485
486 /* init */
487 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
488 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
489 result = ERROR(memory_allocation);
490 goto _cleanup;
491 }
492 if (minRatio < MINRATIO) minRatio = MINRATIO;
493 memset(doneMarks, 0, bufferSize+16);
494
495 /* limit sample set size (divsufsort limitation)*/
496 if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
497 while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
498
499 /* sort */
500 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
501 { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
502 if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
503 }
504 suffix[bufferSize] = (int)bufferSize; /* leads into noise */
505 suffix0[0] = (int)bufferSize; /* leads into noise */
506 /* build reverse suffix sort */
507 { size_t pos;
508 for (pos=0; pos < bufferSize; pos++)
509 reverseSuffix[suffix[pos]] = (U32)pos;
510 /* note filePos tracks borders between samples.
511 It's not used at this stage, but planned to become useful in a later update */
512 filePos[0] = 0;
513 for (pos=1; pos<nbFiles; pos++)
514 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
515 }
516
517 DISPLAYLEVEL(2, "finding patterns ... \n");
518 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
519
520 { U32 cursor; for (cursor=0; cursor < bufferSize; ) {
521 dictItem solution;
522 if (doneMarks[cursor]) { cursor++; continue; }
523 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
524 if (solution.length==0) { cursor++; continue; }
525 ZDICT_insertDictItem(dictList, dictListSize, solution);
526 cursor += solution.length;
527 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
528 } }
529
530 _cleanup:
531 free(suffix0);
532 free(reverseSuffix);
533 free(doneMarks);
534 free(filePos);
535 return result;
536 }
537
538
539 static void ZDICT_fillNoise(void* buffer, size_t length)
540 {
541 unsigned const prime1 = 2654435761U;
542 unsigned const prime2 = 2246822519U;
543 unsigned acc = prime1;
544 size_t p=0;;
545 for (p=0; p<length; p++) {
546 acc *= prime2;
547 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
548 }
549 }
550
551
552 typedef struct
553 {
554 ZSTD_CCtx* ref;
555 ZSTD_CCtx* zc;
556 void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
557 } EStats_ress_t;
558
559 #define MAXREPOFFSET 1024
560
561 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
562 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
563 const void* src, size_t srcSize, U32 notificationLevel)
564 {
565 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
566 size_t cSize;
567
568 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
569 { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
570 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
571 }
572 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
573 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
574
575 if (cSize) { /* if == 0; block is not compressible */
576 const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
577
578 /* literals stats */
579 { const BYTE* bytePtr;
580 for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
581 countLit[*bytePtr]++;
582 }
583
584 /* seqStats */
585 { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
586 ZSTD_seqToCodes(seqStorePtr);
587
588 { const BYTE* codePtr = seqStorePtr->ofCode;
589 U32 u;
590 for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
591 }
592
593 { const BYTE* codePtr = seqStorePtr->mlCode;
594 U32 u;
595 for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
596 }
597
598 { const BYTE* codePtr = seqStorePtr->llCode;
599 U32 u;
600 for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
601 }
602
603 if (nbSeq >= 2) { /* rep offsets */
604 const seqDef* const seq = seqStorePtr->sequencesStart;
605 U32 offset1 = seq[0].offset - 3;
606 U32 offset2 = seq[1].offset - 3;
607 if (offset1 >= MAXREPOFFSET) offset1 = 0;
608 if (offset2 >= MAXREPOFFSET) offset2 = 0;
609 repOffsets[offset1] += 3;
610 repOffsets[offset2] += 1;
611 } } }
612 }
613
614 /*
615 static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
616 {
617 unsigned u;
618 size_t max=0;
619 for (u=0; u<nbFiles; u++)
620 if (max < fileSizes[u]) max = fileSizes[u];
621 return max;
622 }
623 */
624
625 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
626 {
627 size_t total=0;
628 unsigned u;
629 for (u=0; u<nbFiles; u++) total += fileSizes[u];
630 return total;
631 }
632
633 typedef struct { U32 offset; U32 count; } offsetCount_t;
634
635 static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
636 {
637 U32 u;
638 table[ZSTD_REP_NUM].offset = val;
639 table[ZSTD_REP_NUM].count = count;
640 for (u=ZSTD_REP_NUM; u>0; u--) {
641 offsetCount_t tmp;
642 if (table[u-1].count >= table[u].count) break;
643 tmp = table[u-1];
644 table[u-1] = table[u];
645 table[u] = tmp;
646 }
647 }
648
649
650 #define OFFCODE_MAX 30 /* only applicable to first block */
651 static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
652 unsigned compressionLevel,
653 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
654 const void* dictBuffer, size_t dictBufferSize,
655 unsigned notificationLevel)
656 {
657 U32 countLit[256];
658 HUF_CREATE_STATIC_CTABLE(hufTable, 255);
659 U32 offcodeCount[OFFCODE_MAX+1];
660 short offcodeNCount[OFFCODE_MAX+1];
661 U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
662 U32 matchLengthCount[MaxML+1];
663 short matchLengthNCount[MaxML+1];
664 U32 litLengthCount[MaxLL+1];
665 short litLengthNCount[MaxLL+1];
666 U32 repOffset[MAXREPOFFSET];
667 offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
668 EStats_ress_t esr;
669 ZSTD_parameters params;
670 U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
671 size_t pos = 0, errorCode;
672 size_t eSize = 0;
673 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
674 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
675 BYTE* dstPtr = (BYTE*)dstBuffer;
676
677 /* init */
678 esr.ref = ZSTD_createCCtx();
679 esr.zc = ZSTD_createCCtx();
680 esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
681 if (!esr.ref || !esr.zc || !esr.workPlace) {
682 eSize = ERROR(memory_allocation);
683 DISPLAYLEVEL(1, "Not enough memory \n");
684 goto _cleanup;
685 }
686 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
687 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
688 for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
689 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
690 for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
691 memset(repOffset, 0, sizeof(repOffset));
692 repOffset[1] = repOffset[4] = repOffset[8] = 1;
693 memset(bestRepOffset, 0, sizeof(bestRepOffset));
694 if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
695 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
696 { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
697 if (ZSTD_isError(beginResult)) {
698 eSize = ERROR(GENERIC);
699 DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
700 goto _cleanup;
701 } }
702
703 /* collect stats on all files */
704 for (u=0; u<nbFiles; u++) {
705 ZDICT_countEStats(esr, params,
706 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
707 (const char*)srcBuffer + pos, fileSizes[u],
708 notificationLevel);
709 pos += fileSizes[u];
710 }
711
712 /* analyze */
713 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
714 if (HUF_isError(errorCode)) {
715 eSize = ERROR(GENERIC);
716 DISPLAYLEVEL(1, "HUF_buildCTable error \n");
717 goto _cleanup;
718 }
719 huffLog = (U32)errorCode;
720
721 /* looking for most common first offsets */
722 { U32 offset;
723 for (offset=1; offset<MAXREPOFFSET; offset++)
724 ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
725 }
726 /* note : the result of this phase should be used to better appreciate the impact on statistics */
727
728 total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
729 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
730 if (FSE_isError(errorCode)) {
731 eSize = ERROR(GENERIC);
732 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
733 goto _cleanup;
734 }
735 Offlog = (U32)errorCode;
736
737 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
738 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
739 if (FSE_isError(errorCode)) {
740 eSize = ERROR(GENERIC);
741 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
742 goto _cleanup;
743 }
744 mlLog = (U32)errorCode;
745
746 total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
747 errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
748 if (FSE_isError(errorCode)) {
749 eSize = ERROR(GENERIC);
750 DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
751 goto _cleanup;
752 }
753 llLog = (U32)errorCode;
754
755 /* write result to buffer */
756 { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
757 if (HUF_isError(hhSize)) {
758 eSize = ERROR(GENERIC);
759 DISPLAYLEVEL(1, "HUF_writeCTable error \n");
760 goto _cleanup;
761 }
762 dstPtr += hhSize;
763 maxDstSize -= hhSize;
764 eSize += hhSize;
765 }
766
767 { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
768 if (FSE_isError(ohSize)) {
769 eSize = ERROR(GENERIC);
770 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
771 goto _cleanup;
772 }
773 dstPtr += ohSize;
774 maxDstSize -= ohSize;
775 eSize += ohSize;
776 }
777
778 { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
779 if (FSE_isError(mhSize)) {
780 eSize = ERROR(GENERIC);
781 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
782 goto _cleanup;
783 }
784 dstPtr += mhSize;
785 maxDstSize -= mhSize;
786 eSize += mhSize;
787 }
788
789 { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
790 if (FSE_isError(lhSize)) {
791 eSize = ERROR(GENERIC);
792 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
793 goto _cleanup;
794 }
795 dstPtr += lhSize;
796 maxDstSize -= lhSize;
797 eSize += lhSize;
798 }
799
800 if (maxDstSize<12) {
801 eSize = ERROR(GENERIC);
802 DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
803 goto _cleanup;
804 }
805 # if 0
806 MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
807 MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
808 MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
809 #else
810 /* at this stage, we don't use the result of "most common first offset",
811 as the impact of statistics is not properly evaluated */
812 MEM_writeLE32(dstPtr+0, repStartValue[0]);
813 MEM_writeLE32(dstPtr+4, repStartValue[1]);
814 MEM_writeLE32(dstPtr+8, repStartValue[2]);
815 #endif
816 //dstPtr += 12;
817 eSize += 12;
818
819 _cleanup:
820 ZSTD_freeCCtx(esr.ref);
821 ZSTD_freeCCtx(esr.zc);
822 free(esr.workPlace);
823
824 return eSize;
825 }
826
827
828 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
829 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
830 ZDICT_params_t params)
831 {
832 size_t hSize;
833 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
834 U32 const notificationLevel = params.notificationLevel;
835
836 /* dictionary header */
837 MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
838 { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
839 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
840 U32 const dictID = params.dictID ? params.dictID : compliantID;
841 MEM_writeLE32((char*)dictBuffer+4, dictID);
842 }
843 hSize = 8;
844
845 /* entropy tables */
846 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
847 DISPLAYLEVEL(2, "statistics ... \n");
848 { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
849 compressionLevel,
850 samplesBuffer, samplesSizes, nbSamples,
851 (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
852 notificationLevel);
853 if (ZDICT_isError(eSize)) return eSize;
854 hSize += eSize;
855 }
856
857
858 if (hSize + dictContentSize < dictBufferCapacity)
859 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
860 return MIN(dictBufferCapacity, hSize+dictContentSize);
861 }
862
863
864 /*! ZDICT_trainFromBuffer_unsafe() :
865 * Warning : `samplesBuffer` must be followed by noisy guard band.
866 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
867 */
868 size_t ZDICT_trainFromBuffer_unsafe(
869 void* dictBuffer, size_t maxDictSize,
870 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
871 ZDICT_params_t params)
872 {
873 U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
874 dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
875 unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
876 unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
877 size_t const targetDictSize = maxDictSize;
878 size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
879 size_t dictSize = 0;
880 U32 const notificationLevel = params.notificationLevel;
881
882 /* checks */
883 if (!dictList) return ERROR(memory_allocation);
884 if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
885 if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
886
887 /* init */
888 ZDICT_initDictItem(dictList);
889
890 /* build dictionary */
891 ZDICT_trainBuffer(dictList, dictListSize,
892 samplesBuffer, samplesBuffSize,
893 samplesSizes, nbSamples,
894 minRep, notificationLevel);
895
896 /* display best matches */
897 if (params.notificationLevel>= 3) {
898 U32 const nb = MIN(25, dictList[0].pos);
899 U32 const dictContentSize = ZDICT_dictSize(dictList);
900 U32 u;
901 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
902 DISPLAYLEVEL(3, "list %u best segments \n", nb);
903 for (u=1; u<=nb; u++) {
904 U32 pos = dictList[u].pos;
905 U32 length = dictList[u].length;
906 U32 printedLength = MIN(40, length);
907 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
908 u, length, pos, dictList[u].savings);
909 ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
910 DISPLAYLEVEL(3, "| \n");
911 } }
912
913
914 /* create dictionary */
915 { U32 dictContentSize = ZDICT_dictSize(dictList);
916 if (dictContentSize < targetDictSize/3) {
917 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
918 if (minRep > MINRATIO) {
919 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
920 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
921 }
922 if (samplesBuffSize < 10 * targetDictSize)
923 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
924 }
925
926 if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
927 U32 proposedSelectivity = selectivity-1;
928 while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
929 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
930 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
931 DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
932 }
933
934 /* limit dictionary size */
935 { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
936 U32 currentSize = 0;
937 U32 n; for (n=1; n<max; n++) {
938 currentSize += dictList[n].length;
939 if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
940 }
941 dictList->pos = n;
942 dictContentSize = currentSize;
943 }
944
945 /* build dict content */
946 { U32 u;
947 BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
948 for (u=1; u<dictList->pos; u++) {
949 U32 l = dictList[u].length;
950 ptr -= l;
951 if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */
952 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
953 } }
954
955 dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
956 samplesBuffer, samplesSizes, nbSamples,
957 params);
958 }
959
960 /* clean up */
961 free(dictList);
962 return dictSize;
963 }
964
965
966 /* issue : samplesBuffer need to be followed by a noisy guard band.
967 * work around : duplicate the buffer, and add the noise */
968 size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
969 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
970 ZDICT_params_t params)
971 {
972 size_t result;
973 void* newBuff;
974 size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
975 if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
976
977 newBuff = malloc(sBuffSize + NOISELENGTH);
978 if (!newBuff) return ERROR(memory_allocation);
979
980 memcpy(newBuff, samplesBuffer, sBuffSize);
981 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
982
983 result = ZDICT_trainFromBuffer_unsafe(
984 dictBuffer, dictBufferCapacity,
985 newBuff, samplesSizes, nbSamples,
986 params);
987 free(newBuff);
988 return result;
989 }
990
991
992 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
993 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
994 {
995 ZDICT_params_t params;
996 memset(&params, 0, sizeof(params));
997 return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
998 samplesBuffer, samplesSizes, nbSamples,
999 params);
1000 }
1001
1002 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1003 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1004 {
1005 ZDICT_params_t params;
1006 memset(&params, 0, sizeof(params));
1007 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1008 samplesBuffer, samplesSizes, nbSamples,
1009 params);
1010 }