diff --git a/Versions/build.txt b/Versions/build.txt index 8c88d58d5..44085c505 100644 --- a/Versions/build.txt +++ b/Versions/build.txt @@ -1 +1 @@ -2399 +2401 diff --git a/res/Notepad3.exe.manifest.conf b/res/Notepad3.exe.manifest.conf index cb40e1bfb..0877fa6d9 100644 --- a/res/Notepad3.exe.manifest.conf +++ b/res/Notepad3.exe.manifest.conf @@ -3,7 +3,7 @@ Notepad3 BETA diff --git a/src/Edit.c b/src/Edit.c index f62fb70cd..981a4211d 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -1119,9 +1119,10 @@ bool EditLoadFile( bool bBOM = false; bool bReverse = false; - bool const bIsUnicodeValid = IsValidUnicode(lpData, cbData, &bBOM, &bReverse); bool const bIsUnicodeAnalyzed = ((Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable) && !bIsForced && !bSkipUTFDetection && !bIsUTF8Sig); + cpi_enc_t const encUnicode = bSkipUTFDetection ? CPI_NONE : GetUnicodeEncoding(lpData, cbData, &bBOM, &bReverse); + if (cbData == 0) { FileVars_Init(NULL, 0, &Globals.fvCurFile); status->iEOLMode = Settings.DefaultEOLMode; @@ -1130,20 +1131,22 @@ bool EditLoadFile( SciCall_SetEOLMode(Settings.DefaultEOLMode); FreeMem(lpData); } - else if (bIsUnicodeForced || (!bIsForced && bIsUnicodeAnalyzed && bIsUnicodeValid)) + else if (bIsUnicodeForced || (!bIsForced && (bIsUnicodeAnalyzed || !Encoding_IsNONE(encUnicode)))) { // === UNICODE === + if (Encoding_IsNONE(encUnicode)) + { + bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData); + bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData); - bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData); - bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData); - - if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) { - bBOM = bBOM_LE; - bReverse = false; - } - else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) { - bBOM = bBOM_BE; - bReverse = true; + if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) { + bBOM = bBOM_LE; + bReverse = false; + } + else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) { + bBOM = bBOM_BE; + bReverse = true; + } } if (bReverse) 
diff --git a/src/Encoding.c b/src/Encoding.c index bd22d83d1..6857ae035 100644 --- a/src/Encoding.c +++ b/src/Encoding.c @@ -664,51 +664,6 @@ bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt) // ============================================================================ -bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse) -{ - size_t const enoughData = 2048LL; - size_t const cb = (len < enoughData) ? len : enoughData; - - if (!pBuffer || cb < 2) { return false; } - - // IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE. - // IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE. - // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags. - // IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags. - // - int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK; - - int iTest = iAllTests; - /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok - - if (iTest == iAllTests) { - iTest = 0; // iTest doesn't seem to have been modified ... 
- } - - bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE); - bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE); - - bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK); - bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK); - bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK); - - //bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES); - - if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse))) - { - if (lpbBOM) { - *lpbBOM = (bHasBOM || bHasRBOM); - } - if (lpbReverse) { - *lpbReverse = (bHasRBOM || bIsReverse); - } - return true; - } - return false; -} -// ============================================================================ - - bool IsValidUTF7(const char* pTest, size_t nLength) { if (!pTest) { return false; } diff --git a/src/Encoding.h b/src/Encoding.h index 6104939d1..608dc41ab 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -121,7 +121,7 @@ inline bool IsUTF8Signature(const char* p) { bool IsValidUTF7(const char* pTest, size_t nLength); bool IsValidUTF8(const char* pTest, size_t nLength); -bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse); + ////////////////////////////////////////////////////// // Google's CED "Compact Encoding Detection" @@ -145,6 +145,7 @@ inline bool IsDBCSCodePage(UINT cp) { } cpi_enc_t Encoding_AnalyzeText(const char* const text, const size_t len, float* confidence_io, const cpi_enc_t encodingHint); +cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse); const char* Encoding_GetTitleInfoA(); const WCHAR* Encoding_GetTitleInfoW(); diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 170221a6b..deb2fae0f 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -552,7 +552,12 @@ constexpr cpi_enc_t _MapStdEncodingString2CPI(const char* encStrg, float* pConfi { bool bBOM = false; bool 
bReverse = false; - if (IsValidUnicode(text, len, &bBOM, &bReverse)) { + cpi_enc_t const cpi = GetUnicodeEncoding(text, len, &bBOM, &bReverse); + if (!Encoding_IsNONE(cpi)) // test the fresh detection result, not the previous guess + { + cpiEncoding = cpi; + } + else { cpiEncoding = bBOM ? (bReverse ? CPI_UNICODEBE : CPI_UNICODE) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE); } } @@ -830,6 +835,57 @@ extern "C" cpi_enc_t Encoding_AnalyzeText // ============================================================================ +cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse) +{ + size_t const enoughData = 2048LL; + size_t const cb = (len < enoughData) ? len : enoughData; + + if (!pBuffer || cb < 2) { return CPI_NONE; } // nothing to analyze: report "no encoding detected" (was 'false', i.e. 0) + + // IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE. + // IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE. + // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags. + // IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags. + // + int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK; + + int iTest = iAllTests; + /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok + + if (iTest == iAllTests) { + iTest = 0; // iTest doesn't seem to have been modified ... 
+ } + + bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE); + bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE); + + bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK); + bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK); + bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK); + + //bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES); + + cpi_enc_t iEncoding = CPI_NONE; + + if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse))) + { + if (lpbBOM) { + *lpbBOM = (bHasBOM || bHasRBOM); + } + if (lpbReverse) { + *lpbReverse = (bHasRBOM || bIsReverse); + } + if (bHasBOM || bHasRBOM) { + iEncoding = bHasBOM ? CPI_UNICODEBOM : CPI_UNICODEBEBOM; + } + else if (bIsUnicode || bIsReverse) { + iEncoding = bIsUnicode ? CPI_UNICODE : CPI_UNICODEBE; + } + } + return iEncoding; +} +// ============================================================================ + //============================================================================= // diff --git a/src/VersionEx.h b/src/VersionEx.h index 53bab1d55..5032bb75d 100644 --- a/src/VersionEx.h +++ b/src/VersionEx.h @@ -7,8 +7,8 @@ #define SAPPNAME "Notepad3" #define VERSION_MAJOR 5 #define VERSION_MINOR 19 -#define VERSION_REV 719 -#define VERSION_BUILD 2399 +#define VERSION_REV 721 +#define VERSION_BUILD 2401 #define SCINTILLA_VER 420 #define ONIGURUMA_REGEX_VER 6.9.3 #define VERSION_PATCH BETA