view contrib/python-zstandard/zstd/dictBuilder/cover.h @ 52148:1dbbb957bbe6 stable

run-tests: add a --hg-wheel options to test a pre-built wheel This will be useful to test the wheel we intend to publish. A future changeset will integrate this in the CI.
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Sun, 27 Oct 2024 08:54:48 +0100
parents 69de49c4e39c
children
line wrap: on
line source

#include <stdio.h>  /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h>   /* clock */
#include "mem.h" /* read */
#include "pool.h"
#include "threading.h"
#include "zstd_internal.h" /* includes zstd.h */
#ifndef ZDICT_STATIC_LINKING_ONLY
#define ZDICT_STATIC_LINKING_ONLY
#endif
#include "zdict.h"

/**
 * COVER_best_t is used for two purposes:
 * 1. Synchronizing threads.
 * 2. Saving the best parameters and dictionary.
 *
 * All of the methods except COVER_best_init() are thread safe if zstd is
 * compiled with multithreaded support.
 */
typedef struct COVER_best_s {
  ZSTD_pthread_mutex_t mutex;
  ZSTD_pthread_cond_t cond;
  size_t liveJobs;
  void *dict;
  size_t dictSize;
  ZDICT_cover_params_t parameters;
  size_t compressedSize;
} COVER_best_t;

/**
 * A segment is a range in the source as well as the score of the segment.
 */
typedef struct {
  U32 begin;
  U32 end;
  U32 score;
} COVER_segment_t;

/**
 *Number of epochs and size of each epoch.
 */
typedef struct {
  U32 num;
  U32 size;
} COVER_epoch_info_t;

/**
 * Struct used for the dictionary selection function.
 */
typedef struct COVER_dictSelection {
  BYTE* dictContent;
  size_t dictSize;
  size_t totalCompressedSize;
} COVER_dictSelection_t;

/**
 * Computes the number of epochs and the size of each epoch.
 * We will make sure that each epoch gets at least 10 * k bytes.
 *
 * The COVER algorithms divide the data up into epochs of equal size and
 * select one segment from each epoch.
 *
 * @param maxDictSize The maximum allowed dictionary size.
 * @param nbDmers     The number of dmers we are training on.
 * @param k           The parameter k (segment size).
 * @param passes      The target number of passes over the dmer corpus.
 *                    More passes means a better dictionary.
 */
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
                                       U32 k, U32 passes);

/**
 * Warns the user when their corpus is too small.
 */
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);

/**
 *  Checks total compressed size of a dictionary
 */
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
                                      const size_t *samplesSizes, const BYTE *samples,
                                      size_t *offsets,
                                      size_t nbTrainSamples, size_t nbSamples,
                                      BYTE *const dict, size_t dictBufferCapacity);

/**
 * Returns the sum of the sample sizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;

/**
 * Initialize the `COVER_best_t`.
 */
void COVER_best_init(COVER_best_t *best);

/**
 * Wait until liveJobs == 0.
 */
void COVER_best_wait(COVER_best_t *best);

/**
 * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
void COVER_best_destroy(COVER_best_t *best);

/**
 * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
void COVER_best_start(COVER_best_t *best);

/**
 * Called when a thread finishes executing, both on error or success.
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
                       COVER_dictSelection_t selection);
/**
 * Error function for COVER_selectDict function. Checks if the return
 * value is an error.
 */
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);

 /**
  * Error function for COVER_selectDict function. Returns a struct where
  * return.totalCompressedSize is a ZSTD error.
  */
COVER_dictSelection_t COVER_dictSelectionError(size_t error);

/**
 * Always call after selectDict is called to free up used memory from
 * newly created dictionary.
 */
void COVER_dictSelectionFree(COVER_dictSelection_t selection);

/**
 * Called to finalize the dictionary and select one based on whether or not
 * the shrink-dict flag was enabled. If enabled the dictionary used is the
 * smallest dictionary within a specified regression of the compressed size
 * from the largest dictionary.
 */
 COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
                       size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
                       size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);