comparison contrib/python-zstandard/zstd/dictBuilder/zdict.c @ 40121:73fef626dae3

zstandard: vendor python-zstandard 0.10.1 This was just released. The upstream source distribution from PyPI was extracted. Unwanted files were removed. The clang-format ignore list was updated to reflect the new source of files. setup.py was updated to pass a new argument to python-zstandard's function for returning an Extension instance. Upstream had to change to use relative paths because Python 3.7's packaging doesn't seem to like absolute paths when defining sources, includes, etc. The default relative path calculation is relative to setup_zstd.py which is different from the directory of Mercurial's setup.py. The project contains a vendored copy of zstandard 1.3.6. The old version was 1.3.4. The API should be backwards compatible and nothing in core should need adjusted. However, there is a new "chunker" API that we may find useful in places where we want to emit compressed chunks of a fixed size. There are a pair of bug fixes in 0.10.0 with regards to compressobj() and decompressobj() when block flushing is used. I actually found these bugs when introducing these APIs in Mercurial! But existing Mercurial code is not affected because we don't perform block flushing. # no-check-commit because 3rd party code has different style guidelines Differential Revision: https://phab.mercurial-scm.org/D4911
author Gregory Szorc <gregory.szorc@gmail.com>
date Mon, 08 Oct 2018 16:27:40 -0700
parents b1fb341d8a61
children 675775c33ab6
comparison
equal deleted inserted replaced
40120:89742f1fa6cb 40121:73fef626dae3
291 break; 291 break;
292 refinedStart = selectedID; 292 refinedStart = selectedID;
293 refinedEnd = refinedStart + selectedCount; 293 refinedEnd = refinedStart + selectedCount;
294 } 294 }
295 295
296 /* evaluate gain based on new ref */ 296 /* evaluate gain based on new dict */
297 start = refinedStart; 297 start = refinedStart;
298 pos = suffix[refinedStart]; 298 pos = suffix[refinedStart];
299 end = start; 299 end = start;
300 memset(lengthList, 0, sizeof(lengthList)); 300 memset(lengthList, 0, sizeof(lengthList));
301 301
339 /* calculate savings */ 339 /* calculate savings */
340 savings[5] = 0; 340 savings[5] = 0;
341 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) 341 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
342 savings[i] = savings[i-1] + (lengthList[i] * (i-3)); 342 savings[i] = savings[i-1] + (lengthList[i] * (i-3));
343 343
344 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", 344 DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
345 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); 345 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
346 346
347 solution.pos = (U32)pos; 347 solution.pos = (U32)pos;
348 solution.length = (U32)maxLength; 348 solution.length = (U32)maxLength;
349 solution.savings = savings[maxLength]; 349 solution.savings = savings[maxLength];
579 } 579 }
580 580
581 581
582 typedef struct 582 typedef struct
583 { 583 {
584 ZSTD_CCtx* ref; /* contains reference to dictionary */ 584 ZSTD_CDict* dict; /* dictionary */
585 ZSTD_CCtx* zc; /* working context */ 585 ZSTD_CCtx* zc; /* working context */
586 void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ 586 void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
587 } EStats_ress_t; 587 } EStats_ress_t;
588 588
589 #define MAXREPOFFSET 1024 589 #define MAXREPOFFSET 1024
595 { 595 {
596 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); 596 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
597 size_t cSize; 597 size_t cSize;
598 598
599 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ 599 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
600 { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); 600 { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
601 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } 601 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
602
602 } 603 }
603 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); 604 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
604 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } 605 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
605 606
606 if (cSize) { /* if == 0; block is not compressible */ 607 if (cSize) { /* if == 0; block is not compressible */
695 short matchLengthNCount[MaxML+1]; 696 short matchLengthNCount[MaxML+1];
696 U32 litLengthCount[MaxLL+1]; 697 U32 litLengthCount[MaxLL+1];
697 short litLengthNCount[MaxLL+1]; 698 short litLengthNCount[MaxLL+1];
698 U32 repOffset[MAXREPOFFSET]; 699 U32 repOffset[MAXREPOFFSET];
699 offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; 700 offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
700 EStats_ress_t esr; 701 EStats_ress_t esr = { NULL, NULL, NULL };
701 ZSTD_parameters params; 702 ZSTD_parameters params;
702 U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; 703 U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
703 size_t pos = 0, errorCode; 704 size_t pos = 0, errorCode;
704 size_t eSize = 0; 705 size_t eSize = 0;
705 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); 706 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
706 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); 707 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
707 BYTE* dstPtr = (BYTE*)dstBuffer; 708 BYTE* dstPtr = (BYTE*)dstBuffer;
708 709
709 /* init */ 710 /* init */
710 DEBUGLOG(4, "ZDICT_analyzeEntropy"); 711 DEBUGLOG(4, "ZDICT_analyzeEntropy");
711 esr.ref = ZSTD_createCCtx();
712 esr.zc = ZSTD_createCCtx();
713 esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
714 if (!esr.ref || !esr.zc || !esr.workPlace) {
715 eSize = ERROR(memory_allocation);
716 DISPLAYLEVEL(1, "Not enough memory \n");
717 goto _cleanup;
718 }
719 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ 712 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
720 for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ 713 for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
721 for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; 714 for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
722 for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1; 715 for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
723 for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1; 716 for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
724 memset(repOffset, 0, sizeof(repOffset)); 717 memset(repOffset, 0, sizeof(repOffset));
725 repOffset[1] = repOffset[4] = repOffset[8] = 1; 718 repOffset[1] = repOffset[4] = repOffset[8] = 1;
726 memset(bestRepOffset, 0, sizeof(bestRepOffset)); 719 memset(bestRepOffset, 0, sizeof(bestRepOffset));
727 if (compressionLevel<=0) compressionLevel = g_compressionLevel_default; 720 if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
728 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); 721 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
729 { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); 722
730 if (ZSTD_isError(beginResult)) { 723 esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
731 DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult)); 724 esr.zc = ZSTD_createCCtx();
732 eSize = ERROR(GENERIC); 725 esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
733 goto _cleanup; 726 if (!esr.dict || !esr.zc || !esr.workPlace) {
734 } } 727 eSize = ERROR(memory_allocation);
728 DISPLAYLEVEL(1, "Not enough memory \n");
729 goto _cleanup;
730 }
735 731
736 /* collect stats on all samples */ 732 /* collect stats on all samples */
737 for (u=0; u<nbFiles; u++) { 733 for (u=0; u<nbFiles; u++) {
738 ZDICT_countEStats(esr, params, 734 ZDICT_countEStats(esr, params,
739 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, 735 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
854 MEM_writeLE32(dstPtr+8, repStartValue[2]); 850 MEM_writeLE32(dstPtr+8, repStartValue[2]);
855 #endif 851 #endif
856 eSize += 12; 852 eSize += 12;
857 853
858 _cleanup: 854 _cleanup:
859 ZSTD_freeCCtx(esr.ref); 855 ZSTD_freeCDict(esr.dict);
860 ZSTD_freeCCtx(esr.zc); 856 ZSTD_freeCCtx(esr.zc);
861 free(esr.workPlace); 857 free(esr.workPlace);
862 858
863 return eSize; 859 return eSize;
864 } 860 }
865 861
866 862
867 863
868 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, 864 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
869 const void* customDictContent, size_t dictContentSize, 865 const void* customDictContent, size_t dictContentSize,
870 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 866 const void* samplesBuffer, const size_t* samplesSizes,
871 ZDICT_params_t params) 867 unsigned nbSamples, ZDICT_params_t params)
872 { 868 {
873 size_t hSize; 869 size_t hSize;
874 #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */ 870 #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
875 BYTE header[HBUFFSIZE]; 871 BYTE header[HBUFFSIZE];
876 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; 872 int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
877 U32 const notificationLevel = params.notificationLevel; 873 U32 const notificationLevel = params.notificationLevel;
878 874
879 /* check conditions */ 875 /* check conditions */
880 DEBUGLOG(4, "ZDICT_finalizeDictionary"); 876 DEBUGLOG(4, "ZDICT_finalizeDictionary");
881 if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); 877 if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
912 return dictSize; 908 return dictSize;
913 } 909 }
914 } 910 }
915 911
916 912
917 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 913 static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
918 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 914 void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
919 ZDICT_params_t params) 915 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
920 { 916 ZDICT_params_t params)
921 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; 917 {
918 int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
922 U32 const notificationLevel = params.notificationLevel; 919 U32 const notificationLevel = params.notificationLevel;
923 size_t hSize = 8; 920 size_t hSize = 8;
924 921
925 /* calculate entropy tables */ 922 /* calculate entropy tables */
926 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 923 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
945 if (hSize + dictContentSize < dictBufferCapacity) 942 if (hSize + dictContentSize < dictBufferCapacity)
946 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); 943 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
947 return MIN(dictBufferCapacity, hSize+dictContentSize); 944 return MIN(dictBufferCapacity, hSize+dictContentSize);
948 } 945 }
949 946
950 947 /* Hidden declaration for dbio.c */
948 size_t ZDICT_trainFromBuffer_unsafe_legacy(
949 void* dictBuffer, size_t maxDictSize,
950 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
951 ZDICT_legacy_params_t params);
951 /*! ZDICT_trainFromBuffer_unsafe_legacy() : 952 /*! ZDICT_trainFromBuffer_unsafe_legacy() :
952 * Warning : `samplesBuffer` must be followed by noisy guard band. 953 * Warning : `samplesBuffer` must be followed by noisy guard band.
953 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() 954 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
954 */ 955 */
955 size_t ZDICT_trainFromBuffer_unsafe_legacy( 956 size_t ZDICT_trainFromBuffer_unsafe_legacy(
989 DISPLAYLEVEL(3, "list %u best segments \n", nb-1); 990 DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
990 for (u=1; u<nb; u++) { 991 for (u=1; u<nb; u++) {
991 U32 const pos = dictList[u].pos; 992 U32 const pos = dictList[u].pos;
992 U32 const length = dictList[u].length; 993 U32 const length = dictList[u].length;
993 U32 const printedLength = MIN(40, length); 994 U32 const printedLength = MIN(40, length);
994 if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) 995 if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
996 free(dictList);
995 return ERROR(GENERIC); /* should never happen */ 997 return ERROR(GENERIC); /* should never happen */
998 }
996 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", 999 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
997 u, length, pos, dictList[u].savings); 1000 u, length, pos, dictList[u].savings);
998 ZDICT_printHex((const char*)samplesBuffer+pos, printedLength); 1001 ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
999 DISPLAYLEVEL(3, "| \n"); 1002 DISPLAYLEVEL(3, "| \n");
1000 } } 1003 } }
1080 1083
1081 1084
1082 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, 1085 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1083 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) 1086 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1084 { 1087 {
1085 ZDICT_cover_params_t params; 1088 ZDICT_fastCover_params_t params;
1086 DEBUGLOG(3, "ZDICT_trainFromBuffer"); 1089 DEBUGLOG(3, "ZDICT_trainFromBuffer");
1087 memset(&params, 0, sizeof(params)); 1090 memset(&params, 0, sizeof(params));
1088 params.d = 8; 1091 params.d = 8;
1089 params.steps = 4; 1092 params.steps = 4;
1090 /* Default to level 6 since no compression level information is available */ 1093 /* Default to level 6 since no compression level information is available */
1091 params.zParams.compressionLevel = 6; 1094 params.zParams.compressionLevel = 3;
1092 #if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1) 1095 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1093 params.zParams.notificationLevel = ZSTD_DEBUG; 1096 params.zParams.notificationLevel = DEBUGLEVEL;
1094 #endif 1097 #endif
1095 return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, 1098 return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1096 samplesBuffer, samplesSizes, nbSamples, 1099 samplesBuffer, samplesSizes, nbSamples,
1097 &params); 1100 &params);
1098 } 1101 }
1099 1102
1100 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 1103 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,