comparison contrib/python-zstandard/zstd/dictBuilder/cover.c @ 40121:73fef626dae3

zstandard: vendor python-zstandard 0.10.1 This was just released. The upstream source distribution from PyPI was extracted. Unwanted files were removed. The clang-format ignore list was updated to reflect the new source of files. setup.py was updated to pass a new argument to python-zstandard's function for returning an Extension instance. Upstream had to change to use relative paths because Python 3.7's packaging doesn't seem to like absolute paths when defining sources, includes, etc. The default relative path calculation is relative to setup_zstd.py which is different from the directory of Mercurial's setup.py. The project contains a vendored copy of zstandard 1.3.6. The old version was 1.3.4. The API should be backwards compatible and nothing in core should need adjusted. However, there is a new "chunker" API that we may find useful in places where we want to emit compressed chunks of a fixed size. There are a pair of bug fixes in 0.10.0 with regards to compressobj() and decompressobj() when block flushing is used. I actually found these bugs when introducing these APIs in Mercurial! But existing Mercurial code is not affected because we don't perform block flushing. # no-check-commit because 3rd party code has different style guidelines Differential Revision: https://phab.mercurial-scm.org/D4911
author Gregory Szorc <gregory.szorc@gmail.com>
date Mon, 08 Oct 2018 16:27:40 -0700
parents b1fb341d8a61
children 675775c33ab6
comparison
equal deleted inserted replaced
40120:89742f1fa6cb 40121:73fef626dae3
27 #include <time.h> /* clock */ 27 #include <time.h> /* clock */
28 28
29 #include "mem.h" /* read */ 29 #include "mem.h" /* read */
30 #include "pool.h" 30 #include "pool.h"
31 #include "threading.h" 31 #include "threading.h"
32 #include "cover.h"
32 #include "zstd_internal.h" /* includes zstd.h */ 33 #include "zstd_internal.h" /* includes zstd.h */
33 #ifndef ZDICT_STATIC_LINKING_ONLY 34 #ifndef ZDICT_STATIC_LINKING_ONLY
34 #define ZDICT_STATIC_LINKING_ONLY 35 #define ZDICT_STATIC_LINKING_ONLY
35 #endif 36 #endif
36 #include "zdict.h" 37 #include "zdict.h"
37 38
38 /*-************************************* 39 /*-*************************************
39 * Constants 40 * Constants
40 ***************************************/ 41 ***************************************/
41 #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) 42 #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
43 #define DEFAULT_SPLITPOINT 1.0
42 44
43 /*-************************************* 45 /*-*************************************
44 * Console display 46 * Console display
45 ***************************************/ 47 ***************************************/
46 static int g_displayLevel = 2; 48 static int g_displayLevel = 2;
182 } 184 }
183 } 185 }
184 } 186 }
185 187
186 /** 188 /**
187 * Destroyes a map that is inited with COVER_map_init(). 189 * Destroys a map that is inited with COVER_map_init().
188 */ 190 */
189 static void COVER_map_destroy(COVER_map_t *map) { 191 static void COVER_map_destroy(COVER_map_t *map) {
190 if (map->data) { 192 if (map->data) {
191 free(map->data); 193 free(map->data);
192 } 194 }
201 typedef struct { 203 typedef struct {
202 const BYTE *samples; 204 const BYTE *samples;
203 size_t *offsets; 205 size_t *offsets;
204 const size_t *samplesSizes; 206 const size_t *samplesSizes;
205 size_t nbSamples; 207 size_t nbSamples;
208 size_t nbTrainSamples;
209 size_t nbTestSamples;
206 U32 *suffix; 210 U32 *suffix;
207 size_t suffixSize; 211 size_t suffixSize;
208 U32 *freqs; 212 U32 *freqs;
209 U32 *dmerAt; 213 U32 *dmerAt;
210 unsigned d; 214 unsigned d;
218 ***************************************/ 222 ***************************************/
219 223
220 /** 224 /**
221 * Returns the sum of the sample sizes. 225 * Returns the sum of the sample sizes.
222 */ 226 */
223 static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) { 227 size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
224 size_t sum = 0; 228 size_t sum = 0;
225 size_t i; 229 unsigned i;
226 for (i = 0; i < nbSamples; ++i) { 230 for (i = 0; i < nbSamples; ++i) {
227 sum += samplesSizes[i]; 231 sum += samplesSizes[i];
228 } 232 }
229 return sum; 233 return sum;
230 } 234 }
375 * which is dmerId. 379 * which is dmerId.
376 */ 380 */
377 ctx->suffix[dmerId] = freq; 381 ctx->suffix[dmerId] = freq;
378 } 382 }
379 383
380 /**
381 * A segment is a range in the source as well as the score of the segment.
382 */
383 typedef struct {
384 U32 begin;
385 U32 end;
386 U32 score;
387 } COVER_segment_t;
388 384
389 /** 385 /**
390 * Selects the best segment in an epoch. 386 * Selects the best segment in an epoch.
391 * Segments of are scored according to the function: 387 * Segments of are scored according to the function:
392 * 388 *
492 } 488 }
493 /* d <= k */ 489 /* d <= k */
494 if (parameters.d > parameters.k) { 490 if (parameters.d > parameters.k) {
495 return 0; 491 return 0;
496 } 492 }
493 /* 0 < splitPoint <= 1 */
494 if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
495 return 0;
496 }
497 return 1; 497 return 1;
498 } 498 }
499 499
500 /** 500 /**
501 * Clean up a context initialized with `COVER_ctx_init()`. 501 * Clean up a context initialized with `COVER_ctx_init()`.
529 * Returns 1 on success or zero on error. 529 * Returns 1 on success or zero on error.
530 * The context must be destroyed with `COVER_ctx_destroy()`. 530 * The context must be destroyed with `COVER_ctx_destroy()`.
531 */ 531 */
532 static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, 532 static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
533 const size_t *samplesSizes, unsigned nbSamples, 533 const size_t *samplesSizes, unsigned nbSamples,
534 unsigned d) { 534 unsigned d, double splitPoint) {
535 const BYTE *const samples = (const BYTE *)samplesBuffer; 535 const BYTE *const samples = (const BYTE *)samplesBuffer;
536 const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); 536 const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
537 /* Split samples into testing and training sets */
538 const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
539 const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
540 const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
541 const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
537 /* Checks */ 542 /* Checks */
538 if (totalSamplesSize < MAX(d, sizeof(U64)) || 543 if (totalSamplesSize < MAX(d, sizeof(U64)) ||
539 totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { 544 totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
540 DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", 545 DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
541 (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); 546 (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
542 return 0; 547 return 0;
543 } 548 }
549 /* Check if there are at least 5 training samples */
550 if (nbTrainSamples < 5) {
551 DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
552 return 0;
553 }
554 /* Check if there's testing sample */
555 if (nbTestSamples < 1) {
556 DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
557 return 0;
558 }
544 /* Zero the context */ 559 /* Zero the context */
545 memset(ctx, 0, sizeof(*ctx)); 560 memset(ctx, 0, sizeof(*ctx));
546 DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples, 561 DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
547 (U32)totalSamplesSize); 562 (U32)trainingSamplesSize);
563 DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
564 (U32)testSamplesSize);
548 ctx->samples = samples; 565 ctx->samples = samples;
549 ctx->samplesSizes = samplesSizes; 566 ctx->samplesSizes = samplesSizes;
550 ctx->nbSamples = nbSamples; 567 ctx->nbSamples = nbSamples;
568 ctx->nbTrainSamples = nbTrainSamples;
569 ctx->nbTestSamples = nbTestSamples;
551 /* Partial suffix array */ 570 /* Partial suffix array */
552 ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1; 571 ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
553 ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); 572 ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
554 /* Maps index to the dmerID */ 573 /* Maps index to the dmerID */
555 ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); 574 ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
556 /* The offsets of each file */ 575 /* The offsets of each file */
557 ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); 576 ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
561 return 0; 580 return 0;
562 } 581 }
563 ctx->freqs = NULL; 582 ctx->freqs = NULL;
564 ctx->d = d; 583 ctx->d = d;
565 584
566 /* Fill offsets from the samlesSizes */ 585 /* Fill offsets from the samplesSizes */
567 { 586 {
568 U32 i; 587 U32 i;
569 ctx->offsets[0] = 0; 588 ctx->offsets[0] = 0;
570 for (i = 1; i <= nbSamples; ++i) { 589 for (i = 1; i <= nbSamples; ++i) {
571 ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; 590 ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
579 */ 598 */
580 U32 i; 599 U32 i;
581 for (i = 0; i < ctx->suffixSize; ++i) { 600 for (i = 0; i < ctx->suffixSize; ++i) {
582 ctx->suffix[i] = i; 601 ctx->suffix[i] = i;
583 } 602 }
584 /* qsort doesn't take an opaque pointer, so pass as a global */ 603 /* qsort doesn't take an opaque pointer, so pass as a global.
604 * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
605 */
585 g_ctx = ctx; 606 g_ctx = ctx;
607 #if defined(__OpenBSD__)
608 mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
609 (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
610 #else
586 qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), 611 qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
587 (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); 612 (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
613 #endif
588 } 614 }
589 DISPLAYLEVEL(2, "Computing frequencies\n"); 615 DISPLAYLEVEL(2, "Computing frequencies\n");
590 /* For each dmer group (group of positions with the same first d bytes): 616 /* For each dmer group (group of positions with the same first d bytes):
591 * 1. For each position we set dmerAt[position] = dmerID. The dmerID is 617 * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
592 * (groupBeginPtr - suffix). This allows us to go from position to 618 * (groupBeginPtr - suffix). This allows us to go from position to
611 BYTE *const dict = (BYTE *)dictBuffer; 637 BYTE *const dict = (BYTE *)dictBuffer;
612 size_t tail = dictBufferCapacity; 638 size_t tail = dictBufferCapacity;
613 /* Divide the data up into epochs of equal size. 639 /* Divide the data up into epochs of equal size.
614 * We will select at least one segment from each epoch. 640 * We will select at least one segment from each epoch.
615 */ 641 */
616 const U32 epochs = (U32)(dictBufferCapacity / parameters.k); 642 const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
617 const U32 epochSize = (U32)(ctx->suffixSize / epochs); 643 const U32 epochSize = (U32)(ctx->suffixSize / epochs);
618 size_t epoch; 644 size_t epoch;
619 DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs, 645 DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
620 epochSize); 646 epochSize);
621 /* Loop through the epochs until there are no more segments or the dictionary 647 /* Loop through the epochs until there are no more segments or the dictionary
656 ZDICT_cover_params_t parameters) 682 ZDICT_cover_params_t parameters)
657 { 683 {
658 BYTE* const dict = (BYTE*)dictBuffer; 684 BYTE* const dict = (BYTE*)dictBuffer;
659 COVER_ctx_t ctx; 685 COVER_ctx_t ctx;
660 COVER_map_t activeDmers; 686 COVER_map_t activeDmers;
661 687 parameters.splitPoint = 1.0;
662 /* Initialize global data */ 688 /* Initialize global data */
663 g_displayLevel = parameters.zParams.notificationLevel; 689 g_displayLevel = parameters.zParams.notificationLevel;
664 /* Checks */ 690 /* Checks */
665 if (!COVER_checkParameters(parameters, dictBufferCapacity)) { 691 if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
666 DISPLAYLEVEL(1, "Cover parameters incorrect\n"); 692 DISPLAYLEVEL(1, "Cover parameters incorrect\n");
675 ZDICT_DICTSIZE_MIN); 701 ZDICT_DICTSIZE_MIN);
676 return ERROR(dstSize_tooSmall); 702 return ERROR(dstSize_tooSmall);
677 } 703 }
678 /* Initialize context and activeDmers */ 704 /* Initialize context and activeDmers */
679 if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, 705 if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
680 parameters.d)) { 706 parameters.d, parameters.splitPoint)) {
681 return ERROR(GENERIC); 707 return ERROR(GENERIC);
682 } 708 }
683 if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { 709 if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
684 DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); 710 DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
685 COVER_ctx_destroy(&ctx); 711 COVER_ctx_destroy(&ctx);
702 COVER_map_destroy(&activeDmers); 728 COVER_map_destroy(&activeDmers);
703 return dictionarySize; 729 return dictionarySize;
704 } 730 }
705 } 731 }
706 732
707 /** 733
708 * COVER_best_t is used for two purposes: 734
709 * 1. Synchronizing threads. 735 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
710 * 2. Saving the best parameters and dictionary. 736 const size_t *samplesSizes, const BYTE *samples,
711 * 737 size_t *offsets,
712 * All of the methods except COVER_best_init() are thread safe if zstd is 738 size_t nbTrainSamples, size_t nbSamples,
713 * compiled with multithreaded support. 739 BYTE *const dict, size_t dictBufferCapacity) {
714 */ 740 size_t totalCompressedSize = ERROR(GENERIC);
715 typedef struct COVER_best_s { 741 /* Pointers */
716 ZSTD_pthread_mutex_t mutex; 742 ZSTD_CCtx *cctx;
717 ZSTD_pthread_cond_t cond; 743 ZSTD_CDict *cdict;
718 size_t liveJobs; 744 void *dst;
719 void *dict; 745 /* Local variables */
720 size_t dictSize; 746 size_t dstCapacity;
721 ZDICT_cover_params_t parameters; 747 size_t i;
722 size_t compressedSize; 748 /* Allocate dst with enough space to compress the maximum sized sample */
723 } COVER_best_t; 749 {
750 size_t maxSampleSize = 0;
751 i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
752 for (; i < nbSamples; ++i) {
753 maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
754 }
755 dstCapacity = ZSTD_compressBound(maxSampleSize);
756 dst = malloc(dstCapacity);
757 }
758 /* Create the cctx and cdict */
759 cctx = ZSTD_createCCtx();
760 cdict = ZSTD_createCDict(dict, dictBufferCapacity,
761 parameters.zParams.compressionLevel);
762 if (!dst || !cctx || !cdict) {
763 goto _compressCleanup;
764 }
765 /* Compress each sample and sum their sizes (or error) */
766 totalCompressedSize = dictBufferCapacity;
767 i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
768 for (; i < nbSamples; ++i) {
769 const size_t size = ZSTD_compress_usingCDict(
770 cctx, dst, dstCapacity, samples + offsets[i],
771 samplesSizes[i], cdict);
772 if (ZSTD_isError(size)) {
773 totalCompressedSize = ERROR(GENERIC);
774 goto _compressCleanup;
775 }
776 totalCompressedSize += size;
777 }
778 _compressCleanup:
779 ZSTD_freeCCtx(cctx);
780 ZSTD_freeCDict(cdict);
781 if (dst) {
782 free(dst);
783 }
784 return totalCompressedSize;
785 }
786
724 787
725 /** 788 /**
726 * Initialize the `COVER_best_t`. 789 * Initialize the `COVER_best_t`.
727 */ 790 */
728 static void COVER_best_init(COVER_best_t *best) { 791 void COVER_best_init(COVER_best_t *best) {
729 if (best==NULL) return; /* compatible with init on NULL */ 792 if (best==NULL) return; /* compatible with init on NULL */
730 (void)ZSTD_pthread_mutex_init(&best->mutex, NULL); 793 (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
731 (void)ZSTD_pthread_cond_init(&best->cond, NULL); 794 (void)ZSTD_pthread_cond_init(&best->cond, NULL);
732 best->liveJobs = 0; 795 best->liveJobs = 0;
733 best->dict = NULL; 796 best->dict = NULL;
737 } 800 }
738 801
739 /** 802 /**
740 * Wait until liveJobs == 0. 803 * Wait until liveJobs == 0.
741 */ 804 */
742 static void COVER_best_wait(COVER_best_t *best) { 805 void COVER_best_wait(COVER_best_t *best) {
743 if (!best) { 806 if (!best) {
744 return; 807 return;
745 } 808 }
746 ZSTD_pthread_mutex_lock(&best->mutex); 809 ZSTD_pthread_mutex_lock(&best->mutex);
747 while (best->liveJobs != 0) { 810 while (best->liveJobs != 0) {
751 } 814 }
752 815
753 /** 816 /**
754 * Call COVER_best_wait() and then destroy the COVER_best_t. 817 * Call COVER_best_wait() and then destroy the COVER_best_t.
755 */ 818 */
756 static void COVER_best_destroy(COVER_best_t *best) { 819 void COVER_best_destroy(COVER_best_t *best) {
757 if (!best) { 820 if (!best) {
758 return; 821 return;
759 } 822 }
760 COVER_best_wait(best); 823 COVER_best_wait(best);
761 if (best->dict) { 824 if (best->dict) {
767 830
768 /** 831 /**
769 * Called when a thread is about to be launched. 832 * Called when a thread is about to be launched.
770 * Increments liveJobs. 833 * Increments liveJobs.
771 */ 834 */
772 static void COVER_best_start(COVER_best_t *best) { 835 void COVER_best_start(COVER_best_t *best) {
773 if (!best) { 836 if (!best) {
774 return; 837 return;
775 } 838 }
776 ZSTD_pthread_mutex_lock(&best->mutex); 839 ZSTD_pthread_mutex_lock(&best->mutex);
777 ++best->liveJobs; 840 ++best->liveJobs;
781 /** 844 /**
782 * Called when a thread finishes executing, both on error or success. 845 * Called when a thread finishes executing, both on error or success.
783 * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 846 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
784 * If this dictionary is the best so far save it and its parameters. 847 * If this dictionary is the best so far save it and its parameters.
785 */ 848 */
786 static void COVER_best_finish(COVER_best_t *best, size_t compressedSize, 849 void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
787 ZDICT_cover_params_t parameters, void *dict, 850 ZDICT_cover_params_t parameters, void *dict,
788 size_t dictSize) { 851 size_t dictSize) {
789 if (!best) { 852 if (!best) {
790 return; 853 return;
791 } 854 }
812 memcpy(best->dict, dict, dictSize); 875 memcpy(best->dict, dict, dictSize);
813 best->dictSize = dictSize; 876 best->dictSize = dictSize;
814 best->parameters = parameters; 877 best->parameters = parameters;
815 best->compressedSize = compressedSize; 878 best->compressedSize = compressedSize;
816 } 879 }
817 ZSTD_pthread_mutex_unlock(&best->mutex);
818 if (liveJobs == 0) { 880 if (liveJobs == 0) {
819 ZSTD_pthread_cond_broadcast(&best->cond); 881 ZSTD_pthread_cond_broadcast(&best->cond);
820 } 882 }
883 ZSTD_pthread_mutex_unlock(&best->mutex);
821 } 884 }
822 } 885 }
823 886
824 /** 887 /**
825 * Parameters for COVER_tryParameters(). 888 * Parameters for COVER_tryParameters().
830 size_t dictBufferCapacity; 893 size_t dictBufferCapacity;
831 ZDICT_cover_params_t parameters; 894 ZDICT_cover_params_t parameters;
832 } COVER_tryParameters_data_t; 895 } COVER_tryParameters_data_t;
833 896
834 /** 897 /**
835 * Tries a set of parameters and upates the COVER_best_t with the results. 898 * Tries a set of parameters and updates the COVER_best_t with the results.
836 * This function is thread safe if zstd is compiled with multithreaded support. 899 * This function is thread safe if zstd is compiled with multithreaded support.
837 * It takes its parameters as an *OWNING* opaque pointer to support threading. 900 * It takes its parameters as an *OWNING* opaque pointer to support threading.
838 */ 901 */
839 static void COVER_tryParameters(void *opaque) { 902 static void COVER_tryParameters(void *opaque) {
840 /* Save parameters as local variables */ 903 /* Save parameters as local variables */
861 { 924 {
862 const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, 925 const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
863 dictBufferCapacity, parameters); 926 dictBufferCapacity, parameters);
864 dictBufferCapacity = ZDICT_finalizeDictionary( 927 dictBufferCapacity = ZDICT_finalizeDictionary(
865 dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, 928 dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
866 ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, 929 ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
867 parameters.zParams); 930 parameters.zParams);
868 if (ZDICT_isError(dictBufferCapacity)) { 931 if (ZDICT_isError(dictBufferCapacity)) {
869 DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); 932 DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
870 goto _cleanup; 933 goto _cleanup;
871 } 934 }
872 } 935 }
873 /* Check total compressed size */ 936 /* Check total compressed size */
874 { 937 totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
875 /* Pointers */ 938 ctx->samples, ctx->offsets,
876 ZSTD_CCtx *cctx; 939 ctx->nbTrainSamples, ctx->nbSamples,
877 ZSTD_CDict *cdict; 940 dict, dictBufferCapacity);
878 void *dst;
879 /* Local variables */
880 size_t dstCapacity;
881 size_t i;
882 /* Allocate dst with enough space to compress the maximum sized sample */
883 {
884 size_t maxSampleSize = 0;
885 for (i = 0; i < ctx->nbSamples; ++i) {
886 maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
887 }
888 dstCapacity = ZSTD_compressBound(maxSampleSize);
889 dst = malloc(dstCapacity);
890 }
891 /* Create the cctx and cdict */
892 cctx = ZSTD_createCCtx();
893 cdict = ZSTD_createCDict(dict, dictBufferCapacity,
894 parameters.zParams.compressionLevel);
895 if (!dst || !cctx || !cdict) {
896 goto _compressCleanup;
897 }
898 /* Compress each sample and sum their sizes (or error) */
899 totalCompressedSize = dictBufferCapacity;
900 for (i = 0; i < ctx->nbSamples; ++i) {
901 const size_t size = ZSTD_compress_usingCDict(
902 cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
903 ctx->samplesSizes[i], cdict);
904 if (ZSTD_isError(size)) {
905 totalCompressedSize = ERROR(GENERIC);
906 goto _compressCleanup;
907 }
908 totalCompressedSize += size;
909 }
910 _compressCleanup:
911 ZSTD_freeCCtx(cctx);
912 ZSTD_freeCDict(cdict);
913 if (dst) {
914 free(dst);
915 }
916 }
917 941
918 _cleanup: 942 _cleanup:
919 COVER_best_finish(data->best, totalCompressedSize, parameters, dict, 943 COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
920 dictBufferCapacity); 944 dictBufferCapacity);
921 free(data); 945 free(data);
932 void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, 956 void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
933 const size_t *samplesSizes, unsigned nbSamples, 957 const size_t *samplesSizes, unsigned nbSamples,
934 ZDICT_cover_params_t *parameters) { 958 ZDICT_cover_params_t *parameters) {
935 /* constants */ 959 /* constants */
936 const unsigned nbThreads = parameters->nbThreads; 960 const unsigned nbThreads = parameters->nbThreads;
961 const double splitPoint =
962 parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
937 const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; 963 const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
938 const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; 964 const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
939 const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; 965 const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
940 const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; 966 const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
941 const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps; 967 const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
949 unsigned k; 975 unsigned k;
950 COVER_best_t best; 976 COVER_best_t best;
951 POOL_ctx *pool = NULL; 977 POOL_ctx *pool = NULL;
952 978
953 /* Checks */ 979 /* Checks */
980 if (splitPoint <= 0 || splitPoint > 1) {
981 LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
982 return ERROR(GENERIC);
983 }
954 if (kMinK < kMaxD || kMaxK < kMinK) { 984 if (kMinK < kMaxD || kMaxK < kMinK) {
955 LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); 985 LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
956 return ERROR(GENERIC); 986 return ERROR(GENERIC);
957 } 987 }
958 if (nbSamples == 0) { 988 if (nbSamples == 0) {
979 kIterations); 1009 kIterations);
980 for (d = kMinD; d <= kMaxD; d += 2) { 1010 for (d = kMinD; d <= kMaxD; d += 2) {
981 /* Initialize the context for this value of d */ 1011 /* Initialize the context for this value of d */
982 COVER_ctx_t ctx; 1012 COVER_ctx_t ctx;
983 LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); 1013 LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
984 if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) { 1014 if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
985 LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); 1015 LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
986 COVER_best_destroy(&best); 1016 COVER_best_destroy(&best);
987 POOL_free(pool); 1017 POOL_free(pool);
988 return ERROR(GENERIC); 1018 return ERROR(GENERIC);
989 } 1019 }
1004 data->best = &best; 1034 data->best = &best;
1005 data->dictBufferCapacity = dictBufferCapacity; 1035 data->dictBufferCapacity = dictBufferCapacity;
1006 data->parameters = *parameters; 1036 data->parameters = *parameters;
1007 data->parameters.k = k; 1037 data->parameters.k = k;
1008 data->parameters.d = d; 1038 data->parameters.d = d;
1039 data->parameters.splitPoint = splitPoint;
1009 data->parameters.steps = kSteps; 1040 data->parameters.steps = kSteps;
1010 data->parameters.zParams.notificationLevel = g_displayLevel; 1041 data->parameters.zParams.notificationLevel = g_displayLevel;
1011 /* Check the parameters */ 1042 /* Check the parameters */
1012 if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) { 1043 if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
1013 DISPLAYLEVEL(1, "Cover parameters incorrect\n"); 1044 DISPLAYLEVEL(1, "Cover parameters incorrect\n");