|
/**
 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree. An additional grant
 * of patent rights can be found in the PATENTS file in the same directory.
 */
|
/*-*************************************
* Dependencies
***************************************/
#include <stdio.h>  /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h>   /* clock */

#include "mem.h" /* read */
#include "pool.h"
#include "threading.h"
#include "zstd_internal.h" /* includes zstd.h */
#ifndef ZDICT_STATIC_LINKING_ONLY
#define ZDICT_STATIC_LINKING_ONLY
#endif
#include "zdict.h"
|
/*-*************************************
* Constants
***************************************/
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))

/*-*************************************
* Console display
***************************************/
static int g_displayLevel = 2;
#define DISPLAY(...)                                                           \
  {                                                                            \
    fprintf(stderr, __VA_ARGS__);                                              \
    fflush(stderr);                                                            \
  }
#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
  if (displayLevel >= l) {                                                     \
    DISPLAY(__VA_ARGS__);                                                      \
  } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)

#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
  if (displayLevel >= l) {                                                     \
    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
      g_time = clock();                                                        \
      DISPLAY(__VA_ARGS__);                                                    \
      if (displayLevel >= 4)                                                   \
        fflush(stdout);                                                        \
    }                                                                          \
  }
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
static clock_t g_time = 0;
|
/*-*************************************
* Hash table
***************************************
* A small specialized hash map for storing activeDmers.
* The map does not resize, so if it becomes full it will loop forever.
* Thus, the map must be large enough to store every value.
* The map implements linear probing and keeps its load less than 0.5.
*/
|
68 |
|
69 #define MAP_EMPTY_VALUE ((U32)-1) |
|
70 typedef struct COVER_map_pair_t_s { |
|
71 U32 key; |
|
72 U32 value; |
|
73 } COVER_map_pair_t; |
|
74 |
|
75 typedef struct COVER_map_s { |
|
76 COVER_map_pair_t *data; |
|
77 U32 sizeLog; |
|
78 U32 size; |
|
79 U32 sizeMask; |
|
80 } COVER_map_t; |
|
81 |
|
82 /** |
|
83 * Clear the map. |
|
84 */ |
|
85 static void COVER_map_clear(COVER_map_t *map) { |
|
86 memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t)); |
|
87 } |
|
88 |
|
89 /** |
|
90 * Initializes a map of the given size. |
|
91 * Returns 1 on success and 0 on failure. |
|
92 * The map must be destroyed with COVER_map_destroy(). |
|
93 * The map is only guaranteed to be large enough to hold size elements. |
|
94 */ |
|
95 static int COVER_map_init(COVER_map_t *map, U32 size) { |
|
96 map->sizeLog = ZSTD_highbit32(size) + 2; |
|
97 map->size = (U32)1 << map->sizeLog; |
|
98 map->sizeMask = map->size - 1; |
|
99 map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t)); |
|
100 if (!map->data) { |
|
101 map->sizeLog = 0; |
|
102 map->size = 0; |
|
103 return 0; |
|
104 } |
|
105 COVER_map_clear(map); |
|
106 return 1; |
|
107 } |
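
/* Sizing sketch (illustrative arithmetic, not part of the API): a call like
 * COVER_map_init(&map, 48) computes sizeLog = ZSTD_highbit32(48) + 2 = 7, so
 * at most 48 entries share 128 slots, a load factor of 0.375. The slot count
 * is always more than twice the requested size, so the load stays below 0.5,
 * keeping probe chains short and guaranteeing COVER_map_index() terminates.
 */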
|
/**
 * Internal hash function
 */
static const U32 prime4bytes = 2654435761U;
static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
  return (key * prime4bytes) >> (32 - map->sizeLog);
}

/**
 * Helper function that returns the index that a key should be placed into.
 */
static U32 COVER_map_index(COVER_map_t *map, U32 key) {
  const U32 hash = COVER_map_hash(map, key);
  U32 i;
  for (i = hash;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *pos = &map->data[i];
    if (pos->value == MAP_EMPTY_VALUE) {
      return i;
    }
    if (pos->key == key) {
      return i;
    }
  }
}

/**
 * Returns the pointer to the value for key.
 * If key is not in the map, it is inserted and the value is set to 0.
 * The map must not be full.
 */
static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
  COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
  if (pos->value == MAP_EMPTY_VALUE) {
    pos->key = key;
    pos->value = 0;
  }
  return &pos->value;
}
|
/**
 * Deletes key from the map if present.
 */
static void COVER_map_remove(COVER_map_t *map, U32 key) {
  U32 i = COVER_map_index(map, key);
  COVER_map_pair_t *del = &map->data[i];
  U32 shift = 1;
  if (del->value == MAP_EMPTY_VALUE) {
    return;
  }
  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *const pos = &map->data[i];
    /* If the position is empty we are done */
    if (pos->value == MAP_EMPTY_VALUE) {
      del->value = MAP_EMPTY_VALUE;
      return;
    }
    /* If pos can be moved to del do so */
    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
      del->key = pos->key;
      del->value = pos->value;
      del = pos;
      shift = 1;
    } else {
      ++shift;
    }
  }
}
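
/* Removal sketch (hypothetical slots): suppose keys A and B both hash to
 * slot 2, so A occupies slot 2 and B was probed forward into slot 3. Simply
 * emptying slot 2 after deleting A would break lookups of B, which would stop
 * at the empty slot. The loop above instead shifts B back into slot 2
 * (backward-shift deletion) and only writes MAP_EMPTY_VALUE when it reaches a
 * genuinely empty slot, so every displaced key stays reachable.
 */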
|
/**
 * Destroys a map that was initialized with COVER_map_init().
 */
static void COVER_map_destroy(COVER_map_t *map) {
  if (map->data) {
    free(map->data);
  }
  map->data = NULL;
  map->size = 0;
}
|
/*-*************************************
* Context
***************************************/

typedef struct {
  const BYTE *samples;
  size_t *offsets;
  const size_t *samplesSizes;
  size_t nbSamples;
  U32 *suffix;
  size_t suffixSize;
  U32 *freqs;
  U32 *dmerAt;
  unsigned d;
} COVER_ctx_t;

/* We need a global context for qsort... */
static COVER_ctx_t *g_ctx = NULL;

/*-*************************************
* Helper functions
***************************************/

/**
 * Returns the sum of the sample sizes.
 */
static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  size_t sum = 0;
  size_t i;
  for (i = 0; i < nbSamples; ++i) {
    sum += samplesSizes[i];
  }
  return sum;
}
|
/**
 * Returns a negative number if the dmer at lp is less than the dmer at rp,
 * zero if they are equal, and a positive number if the dmer at lp is greater.
 * (This is the sign convention of memcmp(), which does the comparison.)
 */
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  const U32 lhs = *(const U32 *)lp;
  const U32 rhs = *(const U32 *)rp;
  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
}

/**
 * Same as COVER_cmp() except ties are broken by pointer value.
 * NOTE: g_ctx must be set to call this function. A global is required because
 * qsort doesn't take an opaque pointer.
 */
static int COVER_strict_cmp(const void *lp, const void *rp) {
  int result = COVER_cmp(g_ctx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}
|
/**
 * Returns the first pointer in [first, last) whose element does not compare
 * less than value. If no such element exists it returns last.
 */
static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
                                       size_t value) {
  size_t count = last - first;
  while (count != 0) {
    size_t step = count / 2;
    const size_t *ptr = first;
    ptr += step;
    if (*ptr < value) {
      first = ++ptr;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return first;
}
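
/* Usage sketch (illustrative values): given size_t offsets[] = {10, 25, 40},
 * COVER_lower_bound(offsets, offsets + 3, 17) returns &offsets[1], because 25
 * is the first element not less than 17, while a query for 41 returns
 * offsets + 3, i.e. `last`. COVER_group() below relies on this to find the
 * end offset of the sample containing a given dmer position.
 */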
|
/**
 * Generic groupBy function.
 * Groups an array sorted by cmp into groups with equivalent values.
 * Calls grp for each group.
 */
static void
COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
              int (*cmp)(COVER_ctx_t *, const void *, const void *),
              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
  const BYTE *ptr = (const BYTE *)data;
  size_t num = 0;
  while (num < count) {
    const BYTE *grpEnd = ptr + size;
    ++num;
    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
      grpEnd += size;
      ++num;
    }
    grp(ctx, ptr, grpEnd);
    ptr = grpEnd;
  }
}
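
/* Usage sketch: on a sorted U32 array {1, 1, 2, 3, 3} with a comparator over
 * the values, COVER_groupBy() would call grp() three times, with [group,
 * groupEnd) ranges covering {1, 1}, {2}, and {3, 3}. In this file it is only
 * ever applied to the partial suffix array, with COVER_cmp() as cmp and
 * COVER_group() as grp.
 */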
|
/*-*************************************
* Cover functions
***************************************/

/**
 * Called on each group of positions with the same dmer.
 * Counts the frequency of each dmer and saves it in the suffix array.
 * Fills `ctx->dmerAt`.
 */
static void COVER_group(COVER_ctx_t *ctx, const void *group,
                        const void *groupEnd) {
  /* The group consists of all the positions with the same first d bytes. */
  const U32 *grpPtr = (const U32 *)group;
  const U32 *grpEnd = (const U32 *)groupEnd;
  /* The dmerId is how we will reference this dmer.
   * This allows us to map the whole dmer space to a much smaller space, the
   * size of the suffix array.
   */
  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
  /* Count the number of samples this dmer shows up in */
  U32 freq = 0;
  /* Details */
  const size_t *curOffsetPtr = ctx->offsets;
  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
   * different sample than the last.
   */
  size_t curSampleEnd = ctx->offsets[0];
  for (; grpPtr != grpEnd; ++grpPtr) {
    /* Save the dmerId for this position so we can get back to it. */
    ctx->dmerAt[*grpPtr] = dmerId;
    /* Dictionaries only help for the first reference to the dmer.
     * After that zstd can reference the match from the previous reference.
     * So only count each dmer once for each sample it is in.
     */
    if (*grpPtr < curSampleEnd) {
      continue;
    }
    freq += 1;
    /* Binary search to find the end of the sample *grpPtr is in.
     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
     * search because the loop is over.
     */
    if (grpPtr + 1 != grpEnd) {
      const size_t *sampleEndPtr =
          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
      curSampleEnd = *sampleEndPtr;
      curOffsetPtr = sampleEndPtr + 1;
    }
  }
  /* At this point we are never going to look at this segment of the suffix
   * array again. We take advantage of this fact to save memory.
   * We store the frequency of the dmer in the first position of the group,
   * which is dmerId.
   */
  ctx->suffix[dmerId] = freq;
}
|
/**
 * A segment is a range in the source as well as the score of the segment.
 */
typedef struct {
  U32 begin;
  U32 end;
  double score;
} COVER_segment_t;

/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of dmer d.
 * Let S_i be the dmer at position i of segment S which has length k.
 *
 *   Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer d is in the dictionary we set F(d) = 0.
 */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                           COVER_map_t *activeDmers, U32 begin,
                                           U32 end, COVER_params_t parameters) {
  /* Constants */
  const U32 k = parameters.k;
  const U32 d = parameters.d;
  const U32 dmersInK = k - d + 1;
  /* Try each segment (activeSegment) and save the best (bestSegment) */
  COVER_segment_t bestSegment = {0, 0, 0};
  COVER_segment_t activeSegment;
  /* Reset the activeDmers in the segment */
  COVER_map_clear(activeDmers);
  /* The activeSegment starts at the beginning of the epoch. */
  activeSegment.begin = begin;
  activeSegment.end = begin;
  activeSegment.score = 0;
  /* Slide the activeSegment through the whole epoch.
   * Save the best segment in bestSegment.
   */
  while (activeSegment.end < end) {
    /* The dmerId for the dmer at the next position */
    U32 newDmer = ctx->dmerAt[activeSegment.end];
    /* The entry in activeDmers for this dmerId */
    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
    /* If the dmer isn't already present in the segment add its score. */
    if (*newDmerOcc == 0) {
      /* The paper suggests using the L-0.5 norm, but experiments show that it
       * doesn't help.
       */
      activeSegment.score += freqs[newDmer];
    }
    /* Add the dmer to the segment */
    activeSegment.end += 1;
    *newDmerOcc += 1;

    /* If the window is now too large, drop the first position */
    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
      U32 delDmer = ctx->dmerAt[activeSegment.begin];
      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
      activeSegment.begin += 1;
      *delDmerOcc -= 1;
      /* If this is the last occurrence of the dmer, subtract its score */
      if (*delDmerOcc == 0) {
        COVER_map_remove(activeDmers, delDmer);
        activeSegment.score -= freqs[delDmer];
      }
    }

    /* If this segment is the best so far save it */
    if (activeSegment.score > bestSegment.score) {
      bestSegment = activeSegment;
    }
  }
  {
    /* Trim off the zero frequency head and tail from the segment. */
    U32 newBegin = bestSegment.end;
    U32 newEnd = bestSegment.begin;
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      U32 freq = freqs[ctx->dmerAt[pos]];
      if (freq != 0) {
        newBegin = MIN(newBegin, pos);
        newEnd = pos + 1;
      }
    }
    bestSegment.begin = newBegin;
    bestSegment.end = newEnd;
  }
  {
    /* Zero out the frequency of each dmer covered by the chosen segment. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      freqs[ctx->dmerAt[pos]] = 0;
    }
  }
  return bestSegment;
}
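
/* Worked example of the score above (made-up numbers): for k = 6 and d = 4 a
 * segment holds k - d + 1 = 3 dmers. If F(S_1) = 5, F(S_2) = 0, and
 * F(S_3) = 2, then Score(S) = 5 + 0 + 2 = 7. Because F(d) is zeroed once d
 * enters the dictionary, later segments containing d score lower, steering
 * the search toward dmers that are not yet covered.
 */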
|
/**
 * Check the validity of the parameters.
 * Returns non-zero if the parameters are valid and 0 otherwise.
 */
static int COVER_checkParameters(COVER_params_t parameters) {
  /* k and d are required parameters */
  if (parameters.d == 0 || parameters.k == 0) {
    return 0;
  }
  /* d <= k */
  if (parameters.d > parameters.k) {
    return 0;
  }
  return 1;
}
|
/**
 * Clean up a context initialized with `COVER_ctx_init()`.
 */
static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  if (!ctx) {
    return;
  }
  if (ctx->suffix) {
    free(ctx->suffix);
    ctx->suffix = NULL;
  }
  if (ctx->freqs) {
    free(ctx->freqs);
    ctx->freqs = NULL;
  }
  if (ctx->dmerAt) {
    free(ctx->dmerAt);
    ctx->dmerAt = NULL;
  }
  if (ctx->offsets) {
    free(ctx->offsets);
    ctx->offsets = NULL;
  }
}

/**
 * Prepare a context for dictionary building.
 * The context is only dependent on the parameter `d` and can be used multiple
 * times.
 * Returns 1 on success or zero on error.
 * The context must be destroyed with `COVER_ctx_destroy()`.
 */
static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                          const size_t *samplesSizes, unsigned nbSamples,
                          unsigned d) {
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Checks */
  if (totalSamplesSize < d ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
                 (COVER_MAX_SAMPLES_SIZE >> 20));
    return 0;
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
               (U32)totalSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
  /* Partial suffix array */
  ctx->suffixSize = totalSamplesSize - d + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return 0;
  }
  ctx->freqs = NULL;
  ctx->d = d;

  /* Fill offsets from the samplesSizes */
  {
    U32 i;
    ctx->offsets[0] = 0;
    for (i = 1; i <= nbSamples; ++i) {
      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
    }
  }
  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
  {
    /* suffix is a partial suffix array.
     * It only sorts suffixes by their first parameters.d bytes.
     * The sort is stable, so each dmer group is sorted by position in input.
     */
    U32 i;
    for (i = 0; i < ctx->suffixSize; ++i) {
      ctx->suffix[i] = i;
    }
    /* qsort doesn't take an opaque pointer, so pass as a global */
    g_ctx = ctx;
    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp);
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
   * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
   *    (groupBeginPtr - suffix). This allows us to go from position to
   *    dmerID so we can look up values in freq.
   * 2. We calculate how many samples the dmer occurs in and save it in
   *    freqs[dmerId].
   */
  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp,
                &COVER_group);
  ctx->freqs = ctx->suffix;
  ctx->suffix = NULL;
  return 1;
}
|
/**
 * Given the prepared context, build the dictionary.
 */
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                    COVER_map_t *activeDmers, void *dictBuffer,
                                    size_t dictBufferCapacity,
                                    COVER_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  size_t tail = dictBufferCapacity;
  /* Divide the data up into epochs of equal size.
   * We will select at least one segment from each epoch.
   */
  const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
  const U32 epochSize = (U32)(ctx->suffixSize / epochs);
  size_t epoch;
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
               epochSize);
  /* Loop through the epochs until there are no more segments or the dictionary
   * is full.
   */
  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
    const U32 epochBegin = (U32)(epoch * epochSize);
    const U32 epochEnd = epochBegin + epochSize;
    size_t segmentSize;
    /* Select a segment */
    COVER_segment_t segment = COVER_selectSegment(
        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
    /* Trim the segment if necessary and if it is empty then we are done */
    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
    if (segmentSize == 0) {
      break;
    }
    /* We fill the dictionary from the back to allow the best segments to be
     * referenced with the smallest offsets.
     */
    tail -= segmentSize;
    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
    DISPLAYUPDATE(
        2, "\r%u%% ",
        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  return tail;
}
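
/* Epoch arithmetic sketch (illustrative numbers): with dictBufferCapacity =
 * 65536 and parameters.k = 1024 there are 64 epochs, and a suffixSize of
 * 1048576 gives an epochSize of 16384 dmer positions. Taking one segment per
 * epoch in round-robin order spreads the dictionary across the whole training
 * set instead of letting a single region dominate.
 */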
|
/**
 * Translate from COVER_params_t to ZDICT_params_t required for finalizing the
 * dictionary.
 */
static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
  ZDICT_params_t zdictParams;
  memset(&zdictParams, 0, sizeof(zdictParams));
  zdictParams.notificationLevel = 1;
  zdictParams.dictID = parameters.dictID;
  zdictParams.compressionLevel = parameters.compressionLevel;
  return zdictParams;
}
|
/**
 * Constructs a dictionary using a heuristic based on the following paper:
 *
 * Liao, Petri, Moffat, Wirth
 * Effective Construction of Relative Lempel-Ziv Dictionaries
 * Published in WWW 2016.
 */
ZDICTLIB_API size_t COVER_trainFromBuffer(
    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
    const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
  /* Checks */
  if (!COVER_checkParameters(parameters)) {
    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
    return ERROR(GENERIC);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(GENERIC);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  /* Initialize global data */
  g_displayLevel = parameters.notificationLevel;
  /* Initialize context and activeDmers */
  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                      parameters.d)) {
    return ERROR(GENERIC);
  }
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    COVER_ctx_destroy(&ctx);
    return ERROR(GENERIC);
  }

  DISPLAYLEVEL(2, "Building dictionary\n");
  {
    const size_t tail =
        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                              dictBufferCapacity, parameters);
    ZDICT_params_t zdictParams = COVER_translateParams(parameters);
    const size_t dictionarySize = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        samplesBuffer, samplesSizes, nbSamples, zdictParams);
    if (!ZSTD_isError(dictionarySize)) {
      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                   (U32)dictionarySize);
    }
    COVER_ctx_destroy(&ctx);
    COVER_map_destroy(&activeDmers);
    return dictionarySize;
  }
}
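
/* Caller sketch (buffer names and sizes are hypothetical, for illustration
 * only):
 *
 *   COVER_params_t params;
 *   memset(&params, 0, sizeof(params));
 *   params.k = 1024;
 *   params.d = 8;    // k and d are required by COVER_checkParameters()
 *   {   size_t const dictSize = COVER_trainFromBuffer(
 *           dictBuffer, dictCapacity, samples, samplesSizes, nbSamples,
 *           params);
 *       if (ZDICT_isError(dictSize)) {
 *           // training failed; the contents of dictBuffer are unspecified
 *       }
 *   }
 */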
|
/**
 * COVER_best_t is used for two purposes:
 * 1. Synchronizing threads.
 * 2. Saving the best parameters and dictionary.
 *
 * All of the methods except COVER_best_init() are thread safe if zstd is
 * compiled with multithreaded support.
 */
typedef struct COVER_best_s {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  size_t liveJobs;
  void *dict;
  size_t dictSize;
  COVER_params_t parameters;
  size_t compressedSize;
} COVER_best_t;
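
/* Lifecycle sketch (illustrative): the owner calls COVER_best_init() once,
 * then COVER_best_start() immediately before dispatching each
 * COVER_tryParameters() job. Every job must end with COVER_best_finish(),
 * even on failure, so that liveJobs reaches zero and COVER_best_wait() /
 * COVER_best_destroy() can return.
 */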
|
/**
 * Initialize the `COVER_best_t`.
 */
static void COVER_best_init(COVER_best_t *best) {
  if (!best) {
    return;
  }
  pthread_mutex_init(&best->mutex, NULL);
  pthread_cond_init(&best->cond, NULL);
  best->liveJobs = 0;
  best->dict = NULL;
  best->dictSize = 0;
  best->compressedSize = (size_t)-1;
  memset(&best->parameters, 0, sizeof(best->parameters));
}

/**
 * Wait until liveJobs == 0.
 */
static void COVER_best_wait(COVER_best_t *best) {
  if (!best) {
    return;
  }
  pthread_mutex_lock(&best->mutex);
  while (best->liveJobs != 0) {
    pthread_cond_wait(&best->cond, &best->mutex);
  }
  pthread_mutex_unlock(&best->mutex);
}

/**
 * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
static void COVER_best_destroy(COVER_best_t *best) {
  if (!best) {
    return;
  }
  COVER_best_wait(best);
  if (best->dict) {
    free(best->dict);
  }
  pthread_mutex_destroy(&best->mutex);
  pthread_cond_destroy(&best->cond);
}

/**
 * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
static void COVER_best_start(COVER_best_t *best) {
  if (!best) {
    return;
  }
  pthread_mutex_lock(&best->mutex);
  ++best->liveJobs;
  pthread_mutex_unlock(&best->mutex);
}

/**
 * Called when a thread finishes executing, whether on success or error.
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
                              COVER_params_t parameters, void *dict,
                              size_t dictSize) {
  if (!best) {
    return;
  }
  {
    size_t liveJobs;
    pthread_mutex_lock(&best->mutex);
    --best->liveJobs;
    liveJobs = best->liveJobs;
    /* If the new dictionary is better */
    if (compressedSize < best->compressedSize) {
      /* Allocate space if necessary */
      if (!best->dict || best->dictSize < dictSize) {
        if (best->dict) {
          free(best->dict);
        }
        best->dict = malloc(dictSize);
        if (!best->dict) {
          /* Record the failure, then unlock and signal waiters before
           * returning, so the mutex is never left held. */
          best->compressedSize = ERROR(GENERIC);
          best->dictSize = 0;
          pthread_mutex_unlock(&best->mutex);
          if (liveJobs == 0) {
            pthread_cond_broadcast(&best->cond);
          }
          return;
        }
      }
      /* Save the dictionary, parameters, and size */
      memcpy(best->dict, dict, dictSize);
      best->dictSize = dictSize;
      best->parameters = parameters;
      best->compressedSize = compressedSize;
    }
    pthread_mutex_unlock(&best->mutex);
    if (liveJobs == 0) {
      pthread_cond_broadcast(&best->cond);
    }
  }
}
|
/**
 * Parameters for COVER_tryParameters().
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;
  COVER_best_t *best;
  size_t dictBufferCapacity;
  COVER_params_t parameters;
} COVER_tryParameters_data_t;

/**
 * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading.
 */
static void COVER_tryParameters(void *opaque) {
  /* Save parameters as local variables */
  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
  const COVER_ctx_t *const ctx = data->ctx;
  const COVER_params_t parameters = data->parameters;
  size_t dictBufferCapacity = data->dictBufferCapacity;
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Allocate space for hash table, dict, and freqs */
  COVER_map_t activeDmers;
  BYTE *const dict = (BYTE *)malloc(dictBufferCapacity);
  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    goto _cleanup;
  }
  if (!dict || !freqs) {
    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
    goto _cleanup;
  }
  /* Copy the frequencies because we need to modify them */
  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
  /* Build the dictionary */
  {
    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                              dictBufferCapacity, parameters);
    const ZDICT_params_t zdictParams = COVER_translateParams(parameters);
    dictBufferCapacity = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams);
    if (ZDICT_isError(dictBufferCapacity)) {
      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
      goto _cleanup;
    }
  }
  /* Check total compressed size */
  {
    /* Pointers */
    ZSTD_CCtx *cctx;
    ZSTD_CDict *cdict;
    void *dst;
    /* Local variables */
    size_t dstCapacity;
    size_t i;
    /* Allocate dst with enough space to compress the maximum sized sample */
    {
      size_t maxSampleSize = 0;
      for (i = 0; i < ctx->nbSamples; ++i) {
        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
      }
      dstCapacity = ZSTD_compressBound(maxSampleSize);
      dst = malloc(dstCapacity);
    }
    /* Create the cctx and cdict */
    cctx = ZSTD_createCCtx();
    cdict =
        ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel);
    if (!dst || !cctx || !cdict) {
      goto _compressCleanup;
    }
    /* Compress each sample and sum their sizes (or error) */
    totalCompressedSize = 0;
    for (i = 0; i < ctx->nbSamples; ++i) {
      const size_t size = ZSTD_compress_usingCDict(
          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
          ctx->samplesSizes[i], cdict);
      if (ZSTD_isError(size)) {
        totalCompressedSize = ERROR(GENERIC);
        goto _compressCleanup;
      }
      totalCompressedSize += size;
    }
  _compressCleanup:
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCDict(cdict);
    if (dst) {
      free(dst);
    }
  }

_cleanup:
  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
                    dictBufferCapacity);
  free(data);
  COVER_map_destroy(&activeDmers);
  if (dict) {
    free(dict);
  }
  if (freqs) {
    free(freqs);
  }
}
|
ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
                                                  size_t dictBufferCapacity,
                                                  const void *samplesBuffer,
                                                  const size_t *samplesSizes,
                                                  unsigned nbSamples,
                                                  COVER_params_t *parameters) {
  /* constants */
  const unsigned nbThreads = parameters->nbThreads;
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
  const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d;
  const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k;
  const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k;
  const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps;
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
  const unsigned kIterations =
      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
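
  /* Grid-size sketch with all-default parameters (illustrative): kMinD = 6
   * and kMaxD = 16 give 1 + (16 - 6) / 2 = 6 values of d; kMinK = 16,
   * kMaxK = 2048, and kSteps = 32 give kStepSize = 2032 / 32 = 63, hence
   * 1 + 2032 / 63 = 33 values of k, so kIterations = 6 * 33 = 198.
   */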
|
  /* Local variables */
  const int displayLevel = parameters->notificationLevel;
  unsigned iteration = 1;
  unsigned d;
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
  /* Checks */
  if (kMinK < kMaxD || kMaxK < kMinK) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(GENERIC);
  }
  if (nbSamples == 0) {
    LOCALDISPLAYLEVEL(displayLevel, 1,
                      "Cover must have at least one input file\n");
    return ERROR(GENERIC);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
                      ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  if (nbThreads > 1) {
    pool = POOL_create(nbThreads, 1);
    if (!pool) {
      return ERROR(memory_allocation);
    }
  }
  /* Initialization */
  COVER_best_init(&best);
  /* Turn down global display level to clean up display at level 2 and below */
  g_displayLevel = parameters->notificationLevel - 1;
  /* Loop through d first because each new value needs a new context */
  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
                    kIterations);
  for (d = kMinD; d <= kMaxD; d += 2) {
    /* Initialize the context for this value of d */
    COVER_ctx_t ctx;
    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
      COVER_best_destroy(&best);
      POOL_free(pool);
      return ERROR(GENERIC);
    }
    /* Loop through k reusing the same context */
    for (k = kMinK; k <= kMaxK; k += kStepSize) {
      /* Prepare the arguments */
      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
          sizeof(COVER_tryParameters_data_t));
      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
      if (!data) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
        COVER_best_destroy(&best);
        COVER_ctx_destroy(&ctx);
        POOL_free(pool);
        return ERROR(GENERIC);
      }
      data->ctx = &ctx;
      data->best = &best;
      data->dictBufferCapacity = dictBufferCapacity;
      data->parameters = *parameters;
      data->parameters.k = k;
      data->parameters.d = d;
      data->parameters.steps = kSteps;
      /* Check the parameters */
      if (!COVER_checkParameters(data->parameters)) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Cover parameters incorrect\n");
        free(data); /* ownership was never transferred */
        continue;
      }
      /* Call the function and pass ownership of data to it */
      COVER_best_start(&best);
      if (pool) {
        POOL_add(pool, &COVER_tryParameters, data);
      } else {
        COVER_tryParameters(data);
      }
      /* Print status */
      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
                         (U32)((iteration * 100) / kIterations));
      ++iteration;
    }
    COVER_best_wait(&best);
    COVER_ctx_destroy(&ctx);
  }
  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
  /* Fill the output buffer and parameters with output of the best parameters */
  {
    const size_t dictSize = best.dictSize;
    if (ZSTD_isError(best.compressedSize)) {
      /* Save the error code before destroying best, which owns it */
      const size_t compressedSize = best.compressedSize;
      COVER_best_destroy(&best);
      POOL_free(pool);
      return compressedSize;
    }
    *parameters = best.parameters;
    memcpy(dictBuffer, best.dict, dictSize);
    COVER_best_destroy(&best);
    POOL_free(pool);
    return dictSize;
  }
}