contrib/python-zstandard/zstd/dictBuilder/cover.c
changeset 30895 c32454d69b85
child 37495 b1fb341d8a61
equal deleted inserted replaced
30894:5b60464efbde 30895:c32454d69b85
       
     1 /**
       
     2  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
       
     3  * All rights reserved.
       
     4  *
       
     5  * This source code is licensed under the BSD-style license found in the
       
     6  * LICENSE file in the root directory of this source tree. An additional grant
       
     7  * of patent rights can be found in the PATENTS file in the same directory.
       
     8  */
       
     9 
       
    10 /*-*************************************
       
    11 *  Dependencies
       
    12 ***************************************/
       
    13 #include <stdio.h>  /* fprintf */
       
    14 #include <stdlib.h> /* malloc, free, qsort */
       
    15 #include <string.h> /* memset */
       
    16 #include <time.h>   /* clock */
       
    17 
       
    18 #include "mem.h" /* read */
       
    19 #include "pool.h"
       
    20 #include "threading.h"
       
    21 #include "zstd_internal.h" /* includes zstd.h */
       
    22 #ifndef ZDICT_STATIC_LINKING_ONLY
       
    23 #define ZDICT_STATIC_LINKING_ONLY
       
    24 #endif
       
    25 #include "zdict.h"
       
    26 
       
    27 /*-*************************************
       
    28 *  Constants
       
    29 ***************************************/
       
/* Cap on the combined size of all training samples: the full 32-bit range on
 * 64-bit builds, 1 GB on 32-bit builds.  The suffix/dmerAt scratch arrays in
 * COVER_ctx_init() cost ~8 bytes per input byte, hence the 32-bit limit. */
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
       
    31 
       
    32 /*-*************************************
       
    33 *  Console display
       
    34 ***************************************/
       
/* Verbosity for all DISPLAY* macros; set from parameters.notificationLevel in
 * COVER_trainFromBuffer(). */
static int g_displayLevel = 2;
/* Print to stderr and flush immediately so progress output is visible.
 * NOTE(review): brace-block macro without do { } while (0) -- fine at the
 * current call sites but fragile next to an unbraced if/else; confirm before
 * adding new call sites. */
#define DISPLAY(...)                                                           \
  {                                                                            \
    fprintf(stderr, __VA_ARGS__);                                              \
    fflush(stderr);                                                            \
  }
#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
  if (displayLevel >= l) {                                                     \
    DISPLAY(__VA_ARGS__);                                                      \
  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)

/* Rate-limited progress display: refresh only once refreshRate clocks have
 * elapsed since the last update, unless level >= 4 (debug), which always
 * prints. */
#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
  if (displayLevel >= l) {                                                     \
    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
      g_time = clock();                                                        \
      DISPLAY(__VA_ARGS__);                                                    \
      if (displayLevel >= 4)                                                   \
        fflush(stdout);                                                        \
    }                                                                          \
  }
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
/* Minimum interval between progress refreshes: 15% of a second of CPU time. */
static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
/* Timestamp of the last progress refresh. */
static clock_t g_time = 0;
       
    59 
       
    60 /*-*************************************
       
    61 * Hash table
       
    62 ***************************************
       
    63 * A small specialized hash map for storing activeDmers.
       
    64 * The map does not resize, so if it becomes full it will loop forever.
       
    65 * Thus, the map must be large enough to store every value.
       
    66 * The map implements linear probing and keeps its load less than 0.5.
       
    67 */
       
    68 
       
/* Sentinel stored in `value` to mark an empty slot. */
#define MAP_EMPTY_VALUE ((U32)-1)
/* One open-addressing slot: a dmer id and its occurrence count. */
typedef struct COVER_map_pair_t_s {
  U32 key;
  U32 value;
} COVER_map_pair_t;

/* Fixed-capacity linear-probing hash map (see section comment above). */
typedef struct COVER_map_s {
  COVER_map_pair_t *data; /* array of `size` slots, owned by the map */
  U32 sizeLog;            /* log2(size), used by the hash to select top bits */
  U32 size;               /* slot count, always a power of two */
  U32 sizeMask;           /* size - 1, wraps probe indices */
} COVER_map_t;
       
    81 
       
/**
 * Clear the map.
 * memset fills every byte with 0xFF (the low byte of MAP_EMPTY_VALUE), which
 * sets each slot's key AND value to (U32)-1; value == MAP_EMPTY_VALUE is what
 * marks a slot empty throughout this file.
 */
static void COVER_map_clear(COVER_map_t *map) {
  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
}
       
    88 
       
    89 /**
       
    90  * Initializes a map of the given size.
       
    91  * Returns 1 on success and 0 on failure.
       
    92  * The map must be destroyed with COVER_map_destroy().
       
    93  * The map is only guaranteed to be large enough to hold size elements.
       
    94  */
       
    95 static int COVER_map_init(COVER_map_t *map, U32 size) {
       
    96   map->sizeLog = ZSTD_highbit32(size) + 2;
       
    97   map->size = (U32)1 << map->sizeLog;
       
    98   map->sizeMask = map->size - 1;
       
    99   map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
       
   100   if (!map->data) {
       
   101     map->sizeLog = 0;
       
   102     map->size = 0;
       
   103     return 0;
       
   104   }
       
   105   COVER_map_clear(map);
       
   106   return 1;
       
   107 }
       
   108 
       
   109 /**
       
   110  * Internal hash function
       
   111  */
       
   112 static const U32 prime4bytes = 2654435761U;
       
   113 static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
       
   114   return (key * prime4bytes) >> (32 - map->sizeLog);
       
   115 }
       
   116 
       
   117 /**
       
   118  * Helper function that returns the index that a key should be placed into.
       
   119  */
       
   120 static U32 COVER_map_index(COVER_map_t *map, U32 key) {
       
   121   const U32 hash = COVER_map_hash(map, key);
       
   122   U32 i;
       
   123   for (i = hash;; i = (i + 1) & map->sizeMask) {
       
   124     COVER_map_pair_t *pos = &map->data[i];
       
   125     if (pos->value == MAP_EMPTY_VALUE) {
       
   126       return i;
       
   127     }
       
   128     if (pos->key == key) {
       
   129       return i;
       
   130     }
       
   131   }
       
   132 }
       
   133 
       
   134 /**
       
   135  * Returns the pointer to the value for key.
       
   136  * If key is not in the map, it is inserted and the value is set to 0.
       
   137  * The map must not be full.
       
   138  */
       
   139 static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
       
   140   COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
       
   141   if (pos->value == MAP_EMPTY_VALUE) {
       
   142     pos->key = key;
       
   143     pos->value = 0;
       
   144   }
       
   145   return &pos->value;
       
   146 }
       
   147 
       
/**
 * Deletes key from the map if present.
 * Uses backward-shift deletion for linear-probing tables: instead of leaving
 * a tombstone, later entries in the probe chain are shifted back into the
 * hole so lookups never cross a false "empty" slot.
 */
static void COVER_map_remove(COVER_map_t *map, U32 key) {
  U32 i = COVER_map_index(map, key);
  COVER_map_pair_t *del = &map->data[i];
  /* Probe distance between the hole (del) and the slot being examined. */
  U32 shift = 1;
  /* COVER_map_index() returned an empty slot, so the key is not present. */
  if (del->value == MAP_EMPTY_VALUE) {
    return;
  }
  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *const pos = &map->data[i];
    /* If the position is empty we are done */
    if (pos->value == MAP_EMPTY_VALUE) {
      del->value = MAP_EMPTY_VALUE;
      return;
    }
    /* If pos can be moved to del do so.
     * It may move iff its displacement from its home bucket is >= shift,
     * i.e. shifting it back never places it before its hash bucket. */
    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
      del->key = pos->key;
      del->value = pos->value;
      del = pos;
      shift = 1;
    } else {
      ++shift;
    }
  }
}
       
   176 
       
   177 /**
       
   178  * Destroyes a map that is inited with COVER_map_init().
       
   179  */
       
   180 static void COVER_map_destroy(COVER_map_t *map) {
       
   181   if (map->data) {
       
   182     free(map->data);
       
   183   }
       
   184   map->data = NULL;
       
   185   map->size = 0;
       
   186 }
       
   187 
       
   188 /*-*************************************
       
   189 * Context
       
   190 ***************************************/
       
   191 
       
/* Shared state for one training run.  samples/samplesSizes are borrowed from
 * the caller; suffix, freqs, dmerAt and offsets are owned and released by
 * COVER_ctx_destroy(). */
typedef struct {
  const BYTE *samples;        /* concatenated sample bytes (not owned) */
  size_t *offsets;            /* offsets[i] = start of sample i;
                               * offsets[nbSamples] = total size */
  const size_t *samplesSizes; /* per-sample sizes (not owned) */
  size_t nbSamples;
  U32 *suffix;                /* partial suffix array during init; handed off
                               * to freqs afterwards (see COVER_ctx_init) */
  size_t suffixSize;
  U32 *freqs;                 /* freqs[dmerId] = number of samples containing
                               * that dmer (filled by COVER_group) */
  U32 *dmerAt;                /* dmerAt[position] = dmerId at that position */
  unsigned d;                 /* dmer length in bytes */
} COVER_ctx_t;

/* We need a global context for qsort... */
static COVER_ctx_t *g_ctx = NULL;
       
   206 
       
   207 /*-*************************************
       
   208 *  Helper functions
       
   209 ***************************************/
       
   210 
       
   211 /**
       
   212  * Returns the sum of the sample sizes.
       
   213  */
       
   214 static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
       
   215   size_t sum = 0;
       
   216   size_t i;
       
   217   for (i = 0; i < nbSamples; ++i) {
       
   218     sum += samplesSizes[i];
       
   219   }
       
   220   return sum;
       
   221 }
       
   222 
       
   223 /**
       
   224  * Returns -1 if the dmer at lp is less than the dmer at rp.
       
   225  * Return 0 if the dmers at lp and rp are equal.
       
   226  * Returns 1 if the dmer at lp is greater than the dmer at rp.
       
   227  */
       
   228 static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
       
   229   const U32 lhs = *(const U32 *)lp;
       
   230   const U32 rhs = *(const U32 *)rp;
       
   231   return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
       
   232 }
       
   233 
       
   234 /**
       
   235  * Same as COVER_cmp() except ties are broken by pointer value
       
   236  * NOTE: g_ctx must be set to call this function.  A global is required because
       
   237  * qsort doesn't take an opaque pointer.
       
   238  */
       
   239 static int COVER_strict_cmp(const void *lp, const void *rp) {
       
   240   int result = COVER_cmp(g_ctx, lp, rp);
       
   241   if (result == 0) {
       
   242     result = lp < rp ? -1 : 1;
       
   243   }
       
   244   return result;
       
   245 }
       
   246 
       
   247 /**
       
   248  * Returns the first pointer in [first, last) whose element does not compare
       
   249  * less than value.  If no such element exists it returns last.
       
   250  */
       
   251 static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
       
   252                                        size_t value) {
       
   253   size_t count = last - first;
       
   254   while (count != 0) {
       
   255     size_t step = count / 2;
       
   256     const size_t *ptr = first;
       
   257     ptr += step;
       
   258     if (*ptr < value) {
       
   259       first = ++ptr;
       
   260       count -= step + 1;
       
   261     } else {
       
   262       count = step;
       
   263     }
       
   264   }
       
   265   return first;
       
   266 }
       
   267 
       
   268 /**
       
   269  * Generic groupBy function.
       
   270  * Groups an array sorted by cmp into groups with equivalent values.
       
   271  * Calls grp for each group.
       
   272  */
       
   273 static void
       
   274 COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
       
   275               int (*cmp)(COVER_ctx_t *, const void *, const void *),
       
   276               void (*grp)(COVER_ctx_t *, const void *, const void *)) {
       
   277   const BYTE *ptr = (const BYTE *)data;
       
   278   size_t num = 0;
       
   279   while (num < count) {
       
   280     const BYTE *grpEnd = ptr + size;
       
   281     ++num;
       
   282     while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
       
   283       grpEnd += size;
       
   284       ++num;
       
   285     }
       
   286     grp(ctx, ptr, grpEnd);
       
   287     ptr = grpEnd;
       
   288   }
       
   289 }
       
   290 
       
   291 /*-*************************************
       
   292 *  Cover functions
       
   293 ***************************************/
       
   294 
       
/**
 * Called on each group of positions with the same dmer.
 * Counts the frequency of each dmer and saves it in the suffix array.
 * Fills `ctx->dmerAt`.
 */
static void COVER_group(COVER_ctx_t *ctx, const void *group,
                        const void *groupEnd) {
  /* The group consists of all the positions with the same first d bytes. */
  const U32 *grpPtr = (const U32 *)group;
  const U32 *grpEnd = (const U32 *)groupEnd;
  /* The dmerId is how we will reference this dmer.
   * This allows us to map the whole dmer space to a much smaller space, the
   * size of the suffix array.
   */
  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
  /* Count the number of samples this dmer shows up in */
  U32 freq = 0;
  /* Walk the sorted sample offsets alongside the sorted positions. */
  const size_t *curOffsetPtr = ctx->offsets;
  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
   * different sample than the last.
   * Starts at offsets[0] == 0, so the first occurrence always counts.
   */
  size_t curSampleEnd = ctx->offsets[0];
  for (; grpPtr != grpEnd; ++grpPtr) {
    /* Save the dmerId for this position so we can get back to it. */
    ctx->dmerAt[*grpPtr] = dmerId;
    /* Dictionaries only help for the first reference to the dmer.
     * After that zstd can reference the match from the previous reference.
     * So only count each dmer once for each sample it is in.
     */
    if (*grpPtr < curSampleEnd) {
      continue;
    }
    freq += 1;
    /* Binary search to find the end of the sample *grpPtr is in.
     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
     * search because the loop is over.
     */
    if (grpPtr + 1 != grpEnd) {
      const size_t *sampleEndPtr =
          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
      curSampleEnd = *sampleEndPtr;
      curOffsetPtr = sampleEndPtr + 1;
    }
  }
  /* At this point we are never going to look at this segment of the suffix
   * array again.  We take advantage of this fact to save memory.
   * We store the frequency of the dmer in the first position of the group,
   * which is dmerId.
   */
  ctx->suffix[dmerId] = freq;
}
       
   348 
       
/**
 * A segment is a range in the source as well as the score of the segment.
 */
typedef struct {
  U32 begin;    /* first dmer position of the segment (inclusive) */
  U32 end;      /* one past the last dmer position (exclusive) */
  double score; /* sum of freqs of the distinct dmers in the segment */
} COVER_segment_t;
       
   357 
       
/**
 * Selects the best segment in an epoch.
 * Segments of are scored according to the function:
 *
 * Let F(d) be the frequency of dmer d.
 * Let S_i be the dmer at position i of segment S which has length k.
 *
 *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer d is in the dictionary we set F(d) = 0.
 *
 * Implemented as a sliding window of dmersInK dmers over [begin, end);
 * activeDmers tracks the multiplicity of each dmer inside the window so the
 * score only counts distinct dmers.  activeDmers must have been sized to
 * hold at least dmersInK entries (see COVER_map_init in the caller).
 */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                           COVER_map_t *activeDmers, U32 begin,
                                           U32 end, COVER_params_t parameters) {
  /* Constants */
  const U32 k = parameters.k;
  const U32 d = parameters.d;
  const U32 dmersInK = k - d + 1;
  /* Try each segment (activeSegment) and save the best (bestSegment) */
  COVER_segment_t bestSegment = {0, 0, 0};
  COVER_segment_t activeSegment;
  /* Reset the activeDmers in the segment */
  COVER_map_clear(activeDmers);
  /* The activeSegment starts at the beginning of the epoch. */
  activeSegment.begin = begin;
  activeSegment.end = begin;
  activeSegment.score = 0;
  /* Slide the activeSegment through the whole epoch.
   * Save the best segment in bestSegment.
   */
  while (activeSegment.end < end) {
    /* The dmerId for the dmer at the next position */
    U32 newDmer = ctx->dmerAt[activeSegment.end];
    /* The entry in activeDmers for this dmerId */
    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
    /* If the dmer isn't already present in the segment add its score. */
    if (*newDmerOcc == 0) {
      /* The paper suggests using the L-0.5 norm, but experiments show that it
       * doesn't help.
       */
      activeSegment.score += freqs[newDmer];
    }
    /* Add the dmer to the segment */
    activeSegment.end += 1;
    *newDmerOcc += 1;

    /* If the window is now too large, drop the first position */
    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
      U32 delDmer = ctx->dmerAt[activeSegment.begin];
      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
      activeSegment.begin += 1;
      *delDmerOcc -= 1;
      /* If this is the last occurrence of the dmer, subtract its score */
      if (*delDmerOcc == 0) {
        COVER_map_remove(activeDmers, delDmer);
        activeSegment.score -= freqs[delDmer];
      }
    }

    /* If this segment is the best so far save it */
    if (activeSegment.score > bestSegment.score) {
      bestSegment = activeSegment;
    }
  }
  {
    /* Trim off the zero frequency head and tail from the segment.
     * (Dmers already in the dictionary have freq 0 and add no value.) */
    U32 newBegin = bestSegment.end;
    U32 newEnd = bestSegment.begin;
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      U32 freq = freqs[ctx->dmerAt[pos]];
      if (freq != 0) {
        newBegin = MIN(newBegin, pos);
        newEnd = pos + 1;
      }
    }
    bestSegment.begin = newBegin;
    bestSegment.end = newEnd;
  }
  {
    /* Zero out the frequency of each dmer covered by the chosen segment. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      freqs[ctx->dmerAt[pos]] = 0;
    }
  }
  return bestSegment;
}
       
   446 
       
   447 /**
       
   448  * Check the validity of the parameters.
       
   449  * Returns non-zero if the parameters are valid and 0 otherwise.
       
   450  */
       
   451 static int COVER_checkParameters(COVER_params_t parameters) {
       
   452   /* k and d are required parameters */
       
   453   if (parameters.d == 0 || parameters.k == 0) {
       
   454     return 0;
       
   455   }
       
   456   /* d <= k */
       
   457   if (parameters.d > parameters.k) {
       
   458     return 0;
       
   459   }
       
   460   return 1;
       
   461 }
       
   462 
       
   463 /**
       
   464  * Clean up a context initialized with `COVER_ctx_init()`.
       
   465  */
       
   466 static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
       
   467   if (!ctx) {
       
   468     return;
       
   469   }
       
   470   if (ctx->suffix) {
       
   471     free(ctx->suffix);
       
   472     ctx->suffix = NULL;
       
   473   }
       
   474   if (ctx->freqs) {
       
   475     free(ctx->freqs);
       
   476     ctx->freqs = NULL;
       
   477   }
       
   478   if (ctx->dmerAt) {
       
   479     free(ctx->dmerAt);
       
   480     ctx->dmerAt = NULL;
       
   481   }
       
   482   if (ctx->offsets) {
       
   483     free(ctx->offsets);
       
   484     ctx->offsets = NULL;
       
   485   }
       
   486 }
       
   487 
       
/**
 * Prepare a context for dictionary building.
 * The context is only dependent on the parameter `d` and can be used multiple
 * times.
 * Returns 1 on success or zero on error.
 * The context must be destroyed with `COVER_ctx_destroy()`.
 * NOTE(review): writes the module global g_ctx for qsort, so concurrent
 * calls race -- confirm single-threaded use at every call site.
 */
static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                          const size_t *samplesSizes, unsigned nbSamples,
                          unsigned d) {
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Checks: need at least one full dmer, and the scratch arrays below are
   * proportional to the input, hence the size cap. */
  if (totalSamplesSize < d ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
                 (COVER_MAX_SAMPLES_SIZE >> 20));
    return 0;
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
               (U32)totalSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
  /* Partial suffix array: one entry per dmer position. */
  ctx->suffixSize = totalSamplesSize - d + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return 0;
  }
  ctx->freqs = NULL;
  ctx->d = d;

  /* Fill offsets from the samplesSizes: offsets[i] is where sample i begins,
   * offsets[nbSamples] is the total size. */
  {
    U32 i;
    ctx->offsets[0] = 0;
    for (i = 1; i <= nbSamples; ++i) {
      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
    }
  }
  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
  {
    /* suffix is a partial suffix array.
     * It only sorts suffixes by their first parameters.d bytes.
     * The sort is stable, so each dmer group is sorted by position in input.
     */
    U32 i;
    for (i = 0; i < ctx->suffixSize; ++i) {
      ctx->suffix[i] = i;
    }
    /* qsort doesn't take an opaque pointer, so pass as a global */
    g_ctx = ctx;
    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp);
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
   * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
   *    (groupBeginPtr - suffix).  This allows us to go from position to
   *    dmerID so we can look up values in freq.
   * 2. We calculate how many samples the dmer occurs in and save it in
   *    freqs[dmerId].
   */
  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp,
                &COVER_group);
  /* The suffix array buffer is reused as the frequency table; ownership
   * moves to freqs so COVER_ctx_destroy() frees it exactly once. */
  ctx->freqs = ctx->suffix;
  ctx->suffix = NULL;
  return 1;
}
       
   565 
       
   566 /**
       
   567  * Given the prepared context build the dictionary.
       
   568  */
       
   569 static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
       
   570                                     COVER_map_t *activeDmers, void *dictBuffer,
       
   571                                     size_t dictBufferCapacity,
       
   572                                     COVER_params_t parameters) {
       
   573   BYTE *const dict = (BYTE *)dictBuffer;
       
   574   size_t tail = dictBufferCapacity;
       
   575   /* Divide the data up into epochs of equal size.
       
   576    * We will select at least one segment from each epoch.
       
   577    */
       
   578   const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
       
   579   const U32 epochSize = (U32)(ctx->suffixSize / epochs);
       
   580   size_t epoch;
       
   581   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
       
   582                epochSize);
       
   583   /* Loop through the epochs until there are no more segments or the dictionary
       
   584    * is full.
       
   585    */
       
   586   for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
       
   587     const U32 epochBegin = (U32)(epoch * epochSize);
       
   588     const U32 epochEnd = epochBegin + epochSize;
       
   589     size_t segmentSize;
       
   590     /* Select a segment */
       
   591     COVER_segment_t segment = COVER_selectSegment(
       
   592         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
       
   593     /* Trim the segment if necessary and if it is empty then we are done */
       
   594     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
       
   595     if (segmentSize == 0) {
       
   596       break;
       
   597     }
       
   598     /* We fill the dictionary from the back to allow the best segments to be
       
   599      * referenced with the smallest offsets.
       
   600      */
       
   601     tail -= segmentSize;
       
   602     memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
       
   603     DISPLAYUPDATE(
       
   604         2, "\r%u%%       ",
       
   605         (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
       
   606   }
       
   607   DISPLAYLEVEL(2, "\r%79s\r", "");
       
   608   return tail;
       
   609 }
       
   610 
       
   611 /**
       
   612  * Translate from COVER_params_t to ZDICT_params_t required for finalizing the
       
   613  * dictionary.
       
   614  */
       
   615 static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
       
   616   ZDICT_params_t zdictParams;
       
   617   memset(&zdictParams, 0, sizeof(zdictParams));
       
   618   zdictParams.notificationLevel = 1;
       
   619   zdictParams.dictID = parameters.dictID;
       
   620   zdictParams.compressionLevel = parameters.compressionLevel;
       
   621   return zdictParams;
       
   622 }
       
   623 
       
/**
 * Constructs a dictionary using a heuristic based on the following paper:
 *
 * Liao, Petri, Moffat, Wirth
 * Effective Construction of Relative Lempel-Ziv Dictionaries
 * Published in WWW 2016.
 *
 * Returns the dictionary size on success, or a zstd error code (checkable
 * with ZDICT_isError) on failure.
 * NOTE(review): writes module globals (g_displayLevel here, g_ctx inside
 * COVER_ctx_init), so this function is not safe to call concurrently.
 */
ZDICTLIB_API size_t COVER_trainFromBuffer(
    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
    const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
  /* Checks */
  if (!COVER_checkParameters(parameters)) {
    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
    return ERROR(GENERIC);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(GENERIC);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  /* Initialize global data */
  g_displayLevel = parameters.notificationLevel;
  /* Initialize context and activeDmers.  The map must hold the k - d + 1
   * dmers of one segment window. */
  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                      parameters.d)) {
    return ERROR(GENERIC);
  }
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    COVER_ctx_destroy(&ctx);
    return ERROR(GENERIC);
  }

  DISPLAYLEVEL(2, "Building dictionary\n");
  {
    /* buildDictionary writes the selected content at the back of the buffer
     * ([dict + tail, dict + dictBufferCapacity)); finalize uses it to emit
     * the complete dictionary at the front. */
    const size_t tail =
        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                              dictBufferCapacity, parameters);
    ZDICT_params_t zdictParams = COVER_translateParams(parameters);
    const size_t dictionarySize = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        samplesBuffer, samplesSizes, nbSamples, zdictParams);
    if (!ZSTD_isError(dictionarySize)) {
      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                   (U32)dictionarySize);
    }
    /* Release scratch state in both the success and error cases; the error
     * code (if any) from finalize is propagated to the caller. */
    COVER_ctx_destroy(&ctx);
    COVER_map_destroy(&activeDmers);
    return dictionarySize;
  }
}
       
   682 
       
/**
 * COVER_best_t is used for two purposes:
 * 1. Synchronizing threads.
 * 2. Saving the best parameters and dictionary.
 *
 * All of the methods except COVER_best_init() are thread safe if zstd is
 * compiled with multithreaded support.
 */
typedef struct COVER_best_s {
  pthread_mutex_t mutex; /* guards all fields below */
  pthread_cond_t cond;   /* broadcast when liveJobs reaches 0 */
  size_t liveJobs;       /* number of started-but-unfinished jobs */
  void *dict;            /* best dictionary found so far (heap-allocated), or NULL */
  size_t dictSize;       /* size in bytes of the buffer at `dict` */
  COVER_params_t parameters; /* parameters that produced `dict` */
  size_t compressedSize; /* total compressed size achieved by `dict`;
                          * initialized to (size_t)-1 so any real result wins */
} COVER_best_t;
       
   700 
       
   701 /**
       
   702  * Initialize the `COVER_best_t`.
       
   703  */
       
   704 static void COVER_best_init(COVER_best_t *best) {
       
   705   if (!best) {
       
   706     return;
       
   707   }
       
   708   pthread_mutex_init(&best->mutex, NULL);
       
   709   pthread_cond_init(&best->cond, NULL);
       
   710   best->liveJobs = 0;
       
   711   best->dict = NULL;
       
   712   best->dictSize = 0;
       
   713   best->compressedSize = (size_t)-1;
       
   714   memset(&best->parameters, 0, sizeof(best->parameters));
       
   715 }
       
   716 
       
   717 /**
       
   718  * Wait until liveJobs == 0.
       
   719  */
       
   720 static void COVER_best_wait(COVER_best_t *best) {
       
   721   if (!best) {
       
   722     return;
       
   723   }
       
   724   pthread_mutex_lock(&best->mutex);
       
   725   while (best->liveJobs != 0) {
       
   726     pthread_cond_wait(&best->cond, &best->mutex);
       
   727   }
       
   728   pthread_mutex_unlock(&best->mutex);
       
   729 }
       
   730 
       
   731 /**
       
   732  * Call COVER_best_wait() and then destroy the COVER_best_t.
       
   733  */
       
   734 static void COVER_best_destroy(COVER_best_t *best) {
       
   735   if (!best) {
       
   736     return;
       
   737   }
       
   738   COVER_best_wait(best);
       
   739   if (best->dict) {
       
   740     free(best->dict);
       
   741   }
       
   742   pthread_mutex_destroy(&best->mutex);
       
   743   pthread_cond_destroy(&best->cond);
       
   744 }
       
   745 
       
   746 /**
       
   747  * Called when a thread is about to be launched.
       
   748  * Increments liveJobs.
       
   749  */
       
   750 static void COVER_best_start(COVER_best_t *best) {
       
   751   if (!best) {
       
   752     return;
       
   753   }
       
   754   pthread_mutex_lock(&best->mutex);
       
   755   ++best->liveJobs;
       
   756   pthread_mutex_unlock(&best->mutex);
       
   757 }
       
   758 
       
   759 /**
       
   760  * Called when a thread finishes executing, both on error or success.
       
   761  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
       
   762  * If this dictionary is the best so far save it and its parameters.
       
   763  */
       
   764 static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
       
   765                               COVER_params_t parameters, void *dict,
       
   766                               size_t dictSize) {
       
   767   if (!best) {
       
   768     return;
       
   769   }
       
   770   {
       
   771     size_t liveJobs;
       
   772     pthread_mutex_lock(&best->mutex);
       
   773     --best->liveJobs;
       
   774     liveJobs = best->liveJobs;
       
   775     /* If the new dictionary is better */
       
   776     if (compressedSize < best->compressedSize) {
       
   777       /* Allocate space if necessary */
       
   778       if (!best->dict || best->dictSize < dictSize) {
       
   779         if (best->dict) {
       
   780           free(best->dict);
       
   781         }
       
   782         best->dict = malloc(dictSize);
       
   783         if (!best->dict) {
       
   784           best->compressedSize = ERROR(GENERIC);
       
   785           best->dictSize = 0;
       
   786           return;
       
   787         }
       
   788       }
       
   789       /* Save the dictionary, parameters, and size */
       
   790       memcpy(best->dict, dict, dictSize);
       
   791       best->dictSize = dictSize;
       
   792       best->parameters = parameters;
       
   793       best->compressedSize = compressedSize;
       
   794     }
       
   795     pthread_mutex_unlock(&best->mutex);
       
   796     if (liveJobs == 0) {
       
   797       pthread_cond_broadcast(&best->cond);
       
   798     }
       
   799   }
       
   800 }
       
   801 
       
/**
 * Parameters for COVER_tryParameters().
 * Ownership of this struct transfers to COVER_tryParameters(), which frees it.
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;    /* shared, read-only training context */
  COVER_best_t *best;        /* result sink, updated via COVER_best_finish() */
  size_t dictBufferCapacity; /* maximum dictionary size to build */
  COVER_params_t parameters; /* the (k, d, ...) combination to evaluate */
} COVER_tryParameters_data_t;
       
   811 
       
   812 /**
       
   813  * Tries a set of parameters and upates the COVER_best_t with the results.
       
   814  * This function is thread safe if zstd is compiled with multithreaded support.
       
   815  * It takes its parameters as an *OWNING* opaque pointer to support threading.
       
   816  */
       
   817 static void COVER_tryParameters(void *opaque) {
       
   818   /* Save parameters as local variables */
       
   819   COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
       
   820   const COVER_ctx_t *const ctx = data->ctx;
       
   821   const COVER_params_t parameters = data->parameters;
       
   822   size_t dictBufferCapacity = data->dictBufferCapacity;
       
   823   size_t totalCompressedSize = ERROR(GENERIC);
       
   824   /* Allocate space for hash table, dict, and freqs */
       
   825   COVER_map_t activeDmers;
       
   826   BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
       
   827   U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
       
   828   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
       
   829     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
       
   830     goto _cleanup;
       
   831   }
       
   832   if (!dict || !freqs) {
       
   833     DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
       
   834     goto _cleanup;
       
   835   }
       
   836   /* Copy the frequencies because we need to modify them */
       
   837   memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
       
   838   /* Build the dictionary */
       
   839   {
       
   840     const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
       
   841                                               dictBufferCapacity, parameters);
       
   842     const ZDICT_params_t zdictParams = COVER_translateParams(parameters);
       
   843     dictBufferCapacity = ZDICT_finalizeDictionary(
       
   844         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
       
   845         ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams);
       
   846     if (ZDICT_isError(dictBufferCapacity)) {
       
   847       DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
       
   848       goto _cleanup;
       
   849     }
       
   850   }
       
   851   /* Check total compressed size */
       
   852   {
       
   853     /* Pointers */
       
   854     ZSTD_CCtx *cctx;
       
   855     ZSTD_CDict *cdict;
       
   856     void *dst;
       
   857     /* Local variables */
       
   858     size_t dstCapacity;
       
   859     size_t i;
       
   860     /* Allocate dst with enough space to compress the maximum sized sample */
       
   861     {
       
   862       size_t maxSampleSize = 0;
       
   863       for (i = 0; i < ctx->nbSamples; ++i) {
       
   864         maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
       
   865       }
       
   866       dstCapacity = ZSTD_compressBound(maxSampleSize);
       
   867       dst = malloc(dstCapacity);
       
   868     }
       
   869     /* Create the cctx and cdict */
       
   870     cctx = ZSTD_createCCtx();
       
   871     cdict =
       
   872         ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel);
       
   873     if (!dst || !cctx || !cdict) {
       
   874       goto _compressCleanup;
       
   875     }
       
   876     /* Compress each sample and sum their sizes (or error) */
       
   877     totalCompressedSize = 0;
       
   878     for (i = 0; i < ctx->nbSamples; ++i) {
       
   879       const size_t size = ZSTD_compress_usingCDict(
       
   880           cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
       
   881           ctx->samplesSizes[i], cdict);
       
   882       if (ZSTD_isError(size)) {
       
   883         totalCompressedSize = ERROR(GENERIC);
       
   884         goto _compressCleanup;
       
   885       }
       
   886       totalCompressedSize += size;
       
   887     }
       
   888   _compressCleanup:
       
   889     ZSTD_freeCCtx(cctx);
       
   890     ZSTD_freeCDict(cdict);
       
   891     if (dst) {
       
   892       free(dst);
       
   893     }
       
   894   }
       
   895 
       
   896 _cleanup:
       
   897   COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
       
   898                     dictBufferCapacity);
       
   899   free(data);
       
   900   COVER_map_destroy(&activeDmers);
       
   901   if (dict) {
       
   902     free(dict);
       
   903   }
       
   904   if (freqs) {
       
   905     free(freqs);
       
   906   }
       
   907 }
       
   908 
       
   909 ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
       
   910                                                   size_t dictBufferCapacity,
       
   911                                                   const void *samplesBuffer,
       
   912                                                   const size_t *samplesSizes,
       
   913                                                   unsigned nbSamples,
       
   914                                                   COVER_params_t *parameters) {
       
   915   /* constants */
       
   916   const unsigned nbThreads = parameters->nbThreads;
       
   917   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
       
   918   const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d;
       
   919   const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k;
       
   920   const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k;
       
   921   const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps;
       
   922   const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
       
   923   const unsigned kIterations =
       
   924       (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
       
   925   /* Local variables */
       
   926   const int displayLevel = parameters->notificationLevel;
       
   927   unsigned iteration = 1;
       
   928   unsigned d;
       
   929   unsigned k;
       
   930   COVER_best_t best;
       
   931   POOL_ctx *pool = NULL;
       
   932   /* Checks */
       
   933   if (kMinK < kMaxD || kMaxK < kMinK) {
       
   934     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
       
   935     return ERROR(GENERIC);
       
   936   }
       
   937   if (nbSamples == 0) {
       
   938     DISPLAYLEVEL(1, "Cover must have at least one input file\n");
       
   939     return ERROR(GENERIC);
       
   940   }
       
   941   if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
       
   942     DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
       
   943                  ZDICT_DICTSIZE_MIN);
       
   944     return ERROR(dstSize_tooSmall);
       
   945   }
       
   946   if (nbThreads > 1) {
       
   947     pool = POOL_create(nbThreads, 1);
       
   948     if (!pool) {
       
   949       return ERROR(memory_allocation);
       
   950     }
       
   951   }
       
   952   /* Initialization */
       
   953   COVER_best_init(&best);
       
   954   /* Turn down global display level to clean up display at level 2 and below */
       
   955   g_displayLevel = parameters->notificationLevel - 1;
       
   956   /* Loop through d first because each new value needs a new context */
       
   957   LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
       
   958                     kIterations);
       
   959   for (d = kMinD; d <= kMaxD; d += 2) {
       
   960     /* Initialize the context for this value of d */
       
   961     COVER_ctx_t ctx;
       
   962     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
       
   963     if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
       
   964       LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
       
   965       COVER_best_destroy(&best);
       
   966       return ERROR(GENERIC);
       
   967     }
       
   968     /* Loop through k reusing the same context */
       
   969     for (k = kMinK; k <= kMaxK; k += kStepSize) {
       
   970       /* Prepare the arguments */
       
   971       COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
       
   972           sizeof(COVER_tryParameters_data_t));
       
   973       LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
       
   974       if (!data) {
       
   975         LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
       
   976         COVER_best_destroy(&best);
       
   977         COVER_ctx_destroy(&ctx);
       
   978         return ERROR(GENERIC);
       
   979       }
       
   980       data->ctx = &ctx;
       
   981       data->best = &best;
       
   982       data->dictBufferCapacity = dictBufferCapacity;
       
   983       data->parameters = *parameters;
       
   984       data->parameters.k = k;
       
   985       data->parameters.d = d;
       
   986       data->parameters.steps = kSteps;
       
   987       /* Check the parameters */
       
   988       if (!COVER_checkParameters(data->parameters)) {
       
   989         DISPLAYLEVEL(1, "Cover parameters incorrect\n");
       
   990         continue;
       
   991       }
       
   992       /* Call the function and pass ownership of data to it */
       
   993       COVER_best_start(&best);
       
   994       if (pool) {
       
   995         POOL_add(pool, &COVER_tryParameters, data);
       
   996       } else {
       
   997         COVER_tryParameters(data);
       
   998       }
       
   999       /* Print status */
       
  1000       LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
       
  1001                          (U32)((iteration * 100) / kIterations));
       
  1002       ++iteration;
       
  1003     }
       
  1004     COVER_best_wait(&best);
       
  1005     COVER_ctx_destroy(&ctx);
       
  1006   }
       
  1007   LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
       
  1008   /* Fill the output buffer and parameters with output of the best parameters */
       
  1009   {
       
  1010     const size_t dictSize = best.dictSize;
       
  1011     if (ZSTD_isError(best.compressedSize)) {
       
  1012       COVER_best_destroy(&best);
       
  1013       return best.compressedSize;
       
  1014     }
       
  1015     *parameters = best.parameters;
       
  1016     memcpy(dictBuffer, best.dict, dictSize);
       
  1017     COVER_best_destroy(&best);
       
  1018     POOL_free(pool);
       
  1019     return dictSize;
       
  1020   }
       
  1021 }