From 330261ca5cea4c6066dd2c42931a5a9dbfee7cc1 Mon Sep 17 00:00:00 2001 From: RaiKoHoff Date: Thu, 27 Feb 2020 18:17:42 +0100 Subject: [PATCH] + upd: use Zufuliu's EOL Detection method (with compiler intrinsic optimizations) --- src/Edit.c | 221 +++++++++++++++++++++++++++++++++++++++++++ src/Notepad3.vcxproj | 3 +- src/TypeDefs.h | 17 ++++ 3 files changed, 240 insertions(+), 1 deletion(-) diff --git a/src/Edit.c b/src/Edit.c index 38836a5d3..7aa980961 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -24,6 +24,11 @@ #include #include +#include +#ifdef __AVX__ +#include +#endif + #include "Styles.h" #include "Dialogs.h" #include "resource.h" @@ -826,6 +831,220 @@ bool EditCopyAppend(HWND hwnd, bool bAppend) // EditDetectEOLMode() - moved here to handle Unicode files correctly // by zufuliu (https://github.com/zufuliu/notepad2) // +//============================================================================= + +// https://docs.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64 +// use __popcnt() or _mm_popcnt_u32() require testing __cpuid(): +/* +* int cpuInfo[4]; +* __cpuid(cpuInfo, 0x00000001); +* const BOOL cpuPOPCNT = cpuInfo[2] & (1 << 23); +*/ +// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel +// Bit Twiddling Hacks copyright 1997-2005 Sean Eron Anderson +#if defined(_MSC_VER) +static __forceinline unsigned int bth_popcount(unsigned int v) { + v = v - ((v >> 1) & 0x55555555U); + v = (v & 0x33333333U) + ((v >> 2) & 0x33333333U); + return (((v + (v >> 4)) & 0x0F0F0F0FU) * 0x01010101U) >> 24; +} +#endif + +#if !defined(_MSC_VER) +#define __popcnt __builtin_popcount +#elif NP2_USE_AVX2 +#define __popcnt _mm_popcnt_u32 +#else +//#define np2_popcount __popcnt +#define __popcnt bth_popcount +#endif + +// ---------------------------------------------------------------------------- + +void EditDetectEOLMode(LPCSTR lpData, size_t cbData, EditFileIOStatus* const status) +{ + if (!lpData || (cbData == 0)) { return; } + + /* '\r' and '\n' is not reused (e.g. as trailing byte in DBCS) by any known encoding, + it's safe to check whole data byte by byte.*/ + + DocLn lineCountCRLF = 0; + DocLn lineCountCR = 0; + DocLn lineCountLF = 0; + + // tools/GenerateTable.py + static const uint8_t eol_table[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, // 00 - 0F + }; + + const uint8_t* ptr = (const uint8_t*)lpData; + // No NULL-terminated requirement for *ptr == '\n' + const uint8_t* const end = ptr + cbData - 1; + +#if NP2_USE_AVX2 +#define LAST_CR_MASK (1U << (sizeof(__m256i) - 1)) + const __m256i vectCR = _mm256_set1_epi8('\r'); + const __m256i vectLF = _mm256_set1_epi8('\n'); + while (ptr + sizeof(__m256i) <= end) { + // unaligned loading: line starts at random position. + const __m256i chunk = _mm256_loadu_si256((__m256i*)ptr); + uint32_t maskCR = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vectCR)); + uint32_t maskLF = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vectLF)); + + ptr += sizeof(__m256i); + if (maskCR) { + if (maskCR & LAST_CR_MASK) { + maskCR &= LAST_CR_MASK - 1; + if (*ptr == '\n') { + // CR+LF across boundary + ++ptr; + ++lineCountCRLF; + } + else { + // clear highest bit (last CR) to avoid using following code: + // maskCR = (maskCR_LF ^ maskLF) | (maskCR & LAST_CR_MASK); + ++lineCountCR; + } + } + + // maskCR and maskLF never have some bit set. after shifting maskCR by 1 bit, + // the bits both set in maskCR and maskLF represents CR+LF; + // the bits only set in maskCR or maskLF represents individual CR or LF. + const uint32_t maskCRLF = (maskCR << 1) & maskLF; // CR+LF + const uint32_t maskCR_LF = (maskCR << 1) ^ maskLF;// CR alone or LF alone + maskLF = maskCR_LF & maskLF; // LF alone + maskCR = maskCR_LF ^ maskLF; // CR alone (with one position offset) + if (maskCRLF) { + lineCountCRLF += __popcnt(maskCRLF); + } + if (maskCR) { + lineCountCR += __popcnt(maskCR); + } + } + if (maskLF) { + lineCountLF += __popcnt(maskLF); + } + } + +#undef LAST_CR_MASK + // end NP2_USE_AVX2 +#elif NP2_USE_SSE2 +#define LAST_CR_MASK (1U << (2*sizeof(__m128i) - 1)) + const __m128i vectCR = _mm_set1_epi8('\r'); + const __m128i vectLF = _mm_set1_epi8('\n'); + while (ptr + 2 * sizeof(__m128i) <= end) { + // unaligned loading: line starts at random position. + __m128i chunk = _mm_loadu_si128((__m128i*)ptr); + uint32_t maskCR = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vectCR)); + uint32_t maskLF = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vectLF)); + chunk = _mm_loadu_si128((__m128i*)(ptr + sizeof(__m128i))); + maskCR |= ((uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vectCR))) << sizeof(__m128i); + maskLF |= ((uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vectLF))) << sizeof(__m128i); + + ptr += 2 * sizeof(__m128i); + if (maskCR) { + if (maskCR & LAST_CR_MASK) { + maskCR &= LAST_CR_MASK - 1; + if (*ptr == '\n') { + // CR+LF across boundary + ++ptr; + ++lineCountCRLF; + } + else { + // clear highest bit (last CR) to avoid using following code: + // maskCR = (maskCR_LF ^ maskLF) | (maskCR & LAST_CR_MASK); + ++lineCountCR; + } + } + + // maskCR and maskLF never have some bit set. after shifting maskCR by 1 bit, + // the bits both set in maskCR and maskLF represents CR+LF; + // the bits only set in maskCR or maskLF represents individual CR or LF. + const uint32_t maskCRLF = (maskCR << 1) & maskLF; // CR+LF + const uint32_t maskCR_LF = (maskCR << 1) ^ maskLF;// CR alone or LF alone + maskLF = maskCR_LF & maskLF; // LF alone + maskCR = maskCR_LF ^ maskLF; // CR alone (with one position offset) + if (maskCRLF) { + lineCountCRLF += __popcnt(maskCRLF); + } + if (maskCR) { + lineCountCR += __popcnt(maskCR); + } + } + if (maskLF) { + lineCountLF += __popcnt(maskLF); + } + } + +#undef LAST_CR_MASK + // end NP2_USE_SSE2 +#endif + + do { + // skip to line end + uint8_t ch; + uint8_t type = 0; + while (ptr < end && ((ch = *ptr++) > '\r' || (type = eol_table[ch]) == 0)) { + // nop + } + switch (type) { + case 1: //'\n' + ++lineCountLF; + break; + case 2: //'\r' + if (*ptr == '\n') { + ++ptr; + ++lineCountCRLF; + } + else { + ++lineCountCR; + } + break; + } + } while (ptr < end); + + if (ptr == end) { + switch (*ptr) { + case '\n': + ++lineCountLF; + break; + case '\r': + ++lineCountCR; + break; + } + } + + // values must kept in same order as SC_EOL_CRLF(0), SC_EOL_CR(1), SC_EOL_LF(2) + DocLn const linesMax = max_ln(max_ln(lineCountCRLF, lineCountCR), lineCountLF); + DocLn linesCount[3] = { 0, 0, 0 }; + linesCount[SC_EOL_CRLF] = lineCountCRLF; + linesCount[SC_EOL_CR] = lineCountCR; + linesCount[SC_EOL_LF] = lineCountLF; + + int iEOLMode = status->iEOLMode; + if (linesMax != linesCount[iEOLMode]) + { + if (linesMax == linesCount[SC_EOL_CRLF]) { + iEOLMode = SC_EOL_CRLF; + } + else if (linesMax == linesCount[SC_EOL_CR]) { + iEOLMode = SC_EOL_CR; + } + else { + iEOLMode = SC_EOL_LF; + } + } + status->iEOLMode = iEOLMode; + + status->bInconsistentEOLs = 1 < ((!!lineCountCRLF) + (!!lineCountCR) + (!!lineCountLF)); + status->eolCount[SC_EOL_CRLF] = lineCountCRLF; + status->eolCount[SC_EOL_CR] = lineCountCR; + status->eolCount[SC_EOL_LF] = lineCountLF; +} + + +#if 0 + void EditDetectEOLMode(LPCSTR lpData, size_t cbData, EditFileIOStatus* const status) { int iEOLMode = Settings.DefaultEOLMode; @@ -889,6 +1108,8 @@ void EditDetectEOLMode(LPCSTR lpData, size_t cbData, EditFileIOStatus* const sta status->eolCount[SC_EOL_LF] = linesCount[SC_EOL_LF]; } +#endif + //============================================================================= // diff --git a/src/Notepad3.vcxproj b/src/Notepad3.vcxproj index 8abe66851..e28cd2e2b 100644 --- a/src/Notepad3.vcxproj +++ b/src/Notepad3.vcxproj @@ -240,7 +240,7 @@ Level4 None true - StreamingSIMDExtensions2 + AdvancedVectorExtensions2 Fast stdcpp17 Speed @@ -318,6 +318,7 @@ AnySuitable true Sync + AdvancedVectorExtensions2 comctl32.lib;imm32.lib;shlwapi.lib;uxtheme.lib;muiload.lib;dwrite.lib;scintilla.lib;%(AdditionalDependencies) diff --git a/src/TypeDefs.h b/src/TypeDefs.h index 6912121f6..4b2e62371 100644 --- a/src/TypeDefs.h +++ b/src/TypeDefs.h @@ -47,6 +47,23 @@ #include "Sci_Position.h" +// Compiler Optimizations +#ifndef NP2_USE_SSE2 + // SSE2 enabled by default +#define NP2_USE_SSE2 1 +#endif + +#if defined(_WIN64) && !defined(NP2_USE_AVX2) +#if defined(__AVX2__) +#define NP2_USE_AVX2 1 +#else +#define NP2_USE_AVX2 0 +#endif +#else +#define NP2_USE_AVX2 0 +#endif // NP2_USE_AVX2 + + // // TODO(rkotten): // SCI_CREATEDOCUMENT (SC_DOCUMENTOPTION_TEXT_LARGE)