From 9d3f190236dedf6017757efd15aba450256a1872 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Thu, 26 Nov 2020 12:07:38 +0100 Subject: [PATCH] + rfc: refactoring encoding detection: pure ASCII vs. 7-bit --- src/Edit.c | 10 +++++----- src/Encoding.c | 5 +++-- src/Encoding.h | 5 ++--- src/EncodingDetection.cpp | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Edit.c b/src/Edit.c index ca0b87c15..c170c0ed3 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -471,7 +471,7 @@ bool EditIsRecodingNeeded(WCHAR* pszText, int cchLen) return false; } - DWORD dwFlags = Encoding_GetWCMB_Flags(codepage); + DWORD dwFlags = Encoding_GetWCMBFlagsByCodePage(codepage); if (dwFlags != 0) { dwFlags |= (WC_COMPOSITECHECK | WC_DEFAULTCHAR); } @@ -1166,6 +1166,7 @@ bool EditLoadFile( bSkipUTFDetection, bSkipANSICPDetection, bForceEncDetection); #define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetection.forcedEncoding)) +#define IS_ENC_PURE_ASCII() (encDetection.analyzedEncoding == CPI_ASCII_7BIT) // -------------------------------------------------------------------------- @@ -1238,7 +1239,6 @@ bool EditLoadFile( bool const bAnalysisUTF8 = Encoding_IsUTF8(encDetection.Encoding); bool const bRejectUTF8 = (IS_ENC_ENFORCED() && !bForcedUTF8) || !bValidUTF8 || (!encDetection.bIsUTF8Sig && bSkipUTFDetection); - bool const bIsCP_UTF7 = (Encoding_GetCodePage(encDetection.Encoding) == CP_UTF7); if (bForcedUTF8 || (!bRejectUTF8 && (encDetection.bIsUTF8Sig || bAnalysisUTF8))) { if (encDetection.bIsUTF8Sig) { @@ -1250,8 +1250,8 @@ bool EditLoadFile( status->iEncoding = CPI_UTF8; EditDetectEOLMode(lpData, cbData, status); } - } else if (!IS_ENC_ENFORCED() && (bIsCP_UTF7 && encDetection.bIs7BitOnly)) { - // load UTF-7/ASCII(7-bit) as ANSI/UTF-8 + } else if (!IS_ENC_ENFORCED() && IS_ENC_PURE_ASCII()) { + // load ASCII(7-bit) as ANSI/UTF-8 EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory); status->iEncoding = (Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT); EditDetectEOLMode(lpData, cbData, status); @@ -1459,7 +1459,7 @@ bool EditSaveFile( lpDataWide, (SizeOfMem(lpDataWide) / sizeof(WCHAR))); // dry conversion run - DWORD const dwFlags = Encoding_GetWCMB_Flags(uCodePage); + DWORD const dwFlags = Encoding_GetWCMBFlagsByCodePage(uCodePage); size_t const cbSizeNeeded = (size_t)WideCharToMultiByteEx(uCodePage, dwFlags, lpDataWide, cbDataWide, NULL, 0, NULL, NULL); size_t const cbDataNew = max(cbSizeNeeded, cbDataWide); diff --git a/src/Encoding.c b/src/Encoding.c index 5982c8b92..8cb1801cf 100644 --- a/src/Encoding.c +++ b/src/Encoding.c @@ -53,7 +53,7 @@ const UINT uCodePageMBCS[] = { // ============================================================================ -DWORD Encoding_GetWCMB_Flags(const UINT codePage) +DWORD Encoding_GetWCMBFlagsByCodePage(const UINT codePage) { DWORD flags = WC_NO_BEST_FIT_CHARS; for (int k = 0; k < COUNTOF(uCodePageMBCS); k++) { @@ -721,7 +721,7 @@ bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt) } // ============================================================================ - +#if 0 bool IsValidUTF7(const char* pTest, size_t nLength) { if (!pTest) { @@ -737,6 +737,7 @@ bool IsValidUTF7(const char* pTest, size_t nLength) } return true; } +#endif // ============================================================================ diff --git a/src/Encoding.h b/src/Encoding.h index 37cfcf096..3b95536dc 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -69,7 +69,7 @@ typedef struct _np2encoding { } NP2ENCODING; -DWORD Encoding_GetWCMB_Flags(const UINT codePage); +DWORD Encoding_GetWCMBFlagsByCodePage(const UINT codePage); cpi_enc_t Encoding_Current(cpi_enc_t iEncoding); // getter/setter cpi_enc_t Encoding_Forced(cpi_enc_t iEncoding); // getter/setter @@ -186,7 +186,6 @@ typedef struct _enc_det_t { float confidence; // flags: bool bIsAnalysisReliable; - bool bIs7BitOnly; bool bHasBOM; bool bIsReverse; bool bIsUTF8Sig; @@ -196,7 +195,7 @@ typedef struct _enc_det_t { } ENC_DET_T; -#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, "" } +#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, "" } ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, const size_t cbData, diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 0fcf82a30..3bc35d452 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -626,7 +626,7 @@ constexpr cpi_enc_t _MapStdEncodingString2CPI(const char* encStrg, float* pConfi if (encStrg && (encStrg[0] != '\0')) { // preprocessing: special cases - if (_stricmp(encStrg, "ascii") == 0) { + if (StrCmpICA(encStrg, "ascii") == 0) { cpiEncoding = CPI_ASCII_7BIT; } else { cpiEncoding = Encoding_MatchA(encStrg); @@ -1270,7 +1270,6 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, encDetRes.bHasBOM = (bBOM_LE || bBOM_BE); encDetRes.bIsReverse = bBOM_BE; encDetRes.bIsUTF8Sig = ((cbData >= 3) ? IsUTF8Signature(lpData) : false); - encDetRes.bIs7BitOnly = IsValidUTF7(lpData, cbData); encDetRes.bValidUTF8 = IsValidUTF8(lpData, cbData); if (!IS_ENC_ENFORCED()) { @@ -1298,6 +1297,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint); // --------------------------------------------------------------------------- } + //~encDetRes.bIs7BitOnly = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsValidUTF7(lpData, cbData); if (encDetRes.analyzedEncoding == CPI_NONE) { encDetRes.analyzedEncoding = iAnalyzeHint;