diff --git a/src/Edit.c b/src/Edit.c index c9ea107a8..5f10baecf 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -1339,15 +1339,10 @@ bool EditLoadFile( // === UNICODE ( UTF-16LE / UTF-16BE ) === // -------------------------------------------------------------------------- - bool const bPureASCIINoBOM = encDetection.bPureASCII && !encDetection.bHasBOM; - bool bIsUnicodeDetected = !IS_ENC_ENFORCED() && Encoding_IsUNICODE(encDetection.unicodeAnalysis) && !bPureASCIINoBOM; - - if (Encoding_IsUNICODE(encDetection.Encoding) || bIsUnicodeDetected) { - - // ---------------------------------------------------------------------- + if (Encoding_IsUNICODE(encDetection.Encoding)) + { status->iEncoding = encDetection.bHasBOM ? (encDetection.bIsReverse ? CPI_UNICODEBEBOM : CPI_UNICODEBOM) : (encDetection.bIsReverse ? CPI_UNICODEBE : CPI_UNICODE); - // ---------------------------------------------------------------------- if (encDetection.bIsReverse) { SwabEx(lpData, lpData, cbData); @@ -1389,7 +1384,7 @@ bool EditLoadFile( EditDetectEOLMode(lpData, cbData, status); } } - else if (!IS_ENC_ENFORCED() && encDetection.bPureASCII) { + else if (!IS_ENC_ENFORCED() && (encDetection.bPureASCII7Bit && !encDetection.bHasUnicodeNullBytes)) { // load ASCII(7-bit) as ANSI/UTF-8 EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory); status->iEncoding = (Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT); diff --git a/src/Encoding.c b/src/Encoding.c index 6a1293c1d..a88c78370 100644 --- a/src/Encoding.c +++ b/src/Encoding.c @@ -701,18 +701,24 @@ int Encoding_GetNameW(const cpi_enc_t iEncoding, LPWSTR buffer, size_t cwch) bool Has_UTF16_LE_BOM(const char* pBuf, size_t cnt) { int iTest = IS_TEXT_UNICODE_SIGNATURE; - /*bool const ok =*/ (void)IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest); - //~return (ok && ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0)); - return ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0); // don't rely on result ok + bool const ok = IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest); + return (ok && ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0)); } // ---------------------------------------------------------------------------- bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt) { int iTest = IS_TEXT_UNICODE_REVERSE_SIGNATURE; - /*bool const ok =*/ (void)IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest); - //~return (ok && ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0)); - return ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0); // don't rely on result ok + bool const ok = IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest); + return (ok && ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0)); +} +// ---------------------------------------------------------------------------- + +bool HasUnicodeNullBytes(const char* pBuf, size_t cnt) +{ + int iTest = IS_TEXT_UNICODE_NULL_BYTES; + bool const ok = IsTextUnicode(pBuf, (int)cnt, &iTest); + return (ok && ((iTest & IS_TEXT_UNICODE_NULL_BYTES) != 0)); } // ---------------------------------------------------------------------------- @@ -723,14 +729,14 @@ bool Has_UTF16_BOM(const char* pBuf, size_t cnt) // ============================================================================ -bool IsValidUTF7(const char* pTest, size_t nLength) +bool IsPureAscii7Bit(const char* pTest, size_t nLength) { if (!pTest) { return false; } char const *pt = pTest; for (size_t i = 0; i < nLength; ++i) { - if ((*pt & 0x80) || !*pt) { + if (*pt & 0x80) { return false; } ++pt; diff --git a/src/Encoding.h b/src/Encoding.h index 65bbb147f..983690857 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -124,15 +124,16 @@ int Encoding_GetNameW(const cpi_enc_t iEncoding, LPWSTR buffer, size_t cwch); bool Has_UTF16_LE_BOM(const char* pBuf, size_t cnt); bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt); bool Has_UTF16_BOM(const char *pBuf, size_t cnt); +bool HasUnicodeNullBytes(const char* pBuf, size_t cnt); - inline bool IsUTF8Signature(const char* p) +inline bool IsUTF8Signature(const char* p) { return ((p[0] == '\xEF') && (p[1] == '\xBB') && (p[2] == '\xBF')); } #define UTF8StringStart(p) (IsUTF8Signature(p)) ? ((p)+3) : (p) -bool IsValidUTF7(const char* pTest, size_t nLength); bool IsValidUTF8(const char* pTest, size_t nLength); +bool IsPureAscii7Bit(const char* pTest, size_t nLength); ////////////////////////////////////////////////////// @@ -192,13 +193,14 @@ typedef struct _enc_det_t { bool bIsReverse; bool bIsUTF8Sig; bool bValidUTF8; - bool bPureASCII; + bool bHasUnicodeNullBytes; + bool bPureASCII7Bit; char encodingStrg[64]; } ENC_DET_T; -#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, "" } +#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, false, "" } ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpData, const size_t cbData, diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 4858665bf..a23f0ff88 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -555,7 +555,7 @@ cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK; int iTest = iAllTests; - /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok + /*bool const ok =*/ IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok in case of multi-flags if (iTest == iAllTests) { return CPI_NONE; // iTest doesn't seem to have been modified ... @@ -1261,7 +1261,7 @@ extern "C" cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv) // GetFileEncoding() // extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpData, const size_t cbData, - cpi_enc_t iAnalyzeHint, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection) + cpi_enc_t iAnalyzeHint, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection) { ENC_DET_T encDetRes = INIT_ENC_DET_T; @@ -1309,27 +1309,26 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint); // --------------------------------------------------------------------------- } - encDetRes.bPureASCII = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsValidUTF7(lpData, cbData); + encDetRes.bHasUnicodeNullBytes = HasUnicodeNullBytes(lpData, cbData); + encDetRes.bPureASCII7Bit = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsPureAscii7Bit(lpData, cbData); if (encDetRes.analyzedEncoding == CPI_NONE) { encDetRes.analyzedEncoding = iAnalyzeHint; encDetRes.confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel); } - else if (encDetRes.bPureASCII) { + else if (encDetRes.bPureASCII7Bit && !encDetRes.bHasUnicodeNullBytes) { encDetRes.analyzedEncoding = (Settings.LoadASCIIasUTF8) ? CPI_UTF8 : CPI_ANSI_DEFAULT; } if (!bSkipUTFDetection) { encDetRes.unicodeAnalysis = GetUnicodeEncoding(lpData, cbData, &(encDetRes.bHasBOM), &(encDetRes.bIsReverse)); - if (Encoding_IsNONE(encDetRes.unicodeAnalysis) && Encoding_IsUNICODE(encDetRes.analyzedEncoding)) { encDetRes.unicodeAnalysis = encDetRes.analyzedEncoding; } - if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis)) - { - // check considten BOM + if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis)) { + // check consistent BOM if (encDetRes.bHasBOM && !bBOM_LE && !bBOM_BE) { encDetRes.unicodeAnalysis = CPI_NONE; } @@ -1347,7 +1346,8 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD if (Encoding_IsValid(encDetRes.analyzedEncoding)) { // no bIsReliable check (forced unreliable detection) encDetRes.forcedEncoding = encDetRes.analyzedEncoding; - } else if (Encoding_IsValid(encDetRes.unicodeAnalysis)) { + } + else if (Encoding_IsValid(encDetRes.unicodeAnalysis)) { encDetRes.forcedEncoding = encDetRes.unicodeAnalysis; } } @@ -1370,23 +1370,38 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD if (IS_ENC_ENFORCED()) { encDetRes.Encoding = encDetRes.forcedEncoding; - } else if (encDetRes.bIsUTF8Sig) { + } + else if (encDetRes.bIsUTF8Sig) { encDetRes.Encoding = CPI_UTF8SIGN; - } else if (bBOM_LE || bBOM_BE) { + } + else if (bBOM_LE || bBOM_BE) { encDetRes.Encoding = bBOM_LE ? CPI_UNICODEBOM : CPI_UNICODEBEBOM; encDetRes.bIsReverse = bBOM_BE; - } else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) { + } + else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis) && encDetRes.bHasUnicodeNullBytes) + { + encDetRes.Encoding = encDetRes.unicodeAnalysis; + } + else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) + { encDetRes.Encoding = encDetRes.analyzedEncoding; - } else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) { + } + else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis)) + { + encDetRes.Encoding = encDetRes.unicodeAnalysis; + } + else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) + { encDetRes.Encoding = Encoding_SrcWeak(CPI_GET); - } else if (Encoding_IsValid(iAnalyzeHint)) { + } + else if (Encoding_IsValid(iAnalyzeHint)) + { encDetRes.Encoding = iAnalyzeHint; } if (!Encoding_IsValid(encDetRes.Encoding)) { encDetRes.Encoding = CPI_PREFERRED_ENCODING; } - return encDetRes; } diff --git a/test/test_files/encoding/UTF-16/ASCII_NO_UTF16.c b/test/test_files/encoding/UTF-16/ASCII_NO_UTF16 (issue #4112).c similarity index 100% rename from test/test_files/encoding/UTF-16/ASCII_NO_UTF16.c rename to test/test_files/encoding/UTF-16/ASCII_NO_UTF16 (issue #4112).c