+ rfc: refactoring encoding detection: pure ASCII vs. 7-bit

This commit is contained in:
Rainer Kottenhoff 2020-11-26 12:07:38 +01:00
parent 51d93fa0f9
commit 9d3f190236
4 changed files with 12 additions and 12 deletions

View File

@ -471,7 +471,7 @@ bool EditIsRecodingNeeded(WCHAR* pszText, int cchLen)
return false;
}
DWORD dwFlags = Encoding_GetWCMB_Flags(codepage);
DWORD dwFlags = Encoding_GetWCMBFlagsByCodePage(codepage);
if (dwFlags != 0) {
dwFlags |= (WC_COMPOSITECHECK | WC_DEFAULTCHAR);
}
@ -1166,6 +1166,7 @@ bool EditLoadFile(
bSkipUTFDetection, bSkipANSICPDetection, bForceEncDetection);
// Both helper macros read a variable named `encDetection` that must be in scope at the point of use.
// IS_ENC_ENFORCED(): true when Encoding_IsNONE() reports false for encDetection.forcedEncoding,
// i.e. a forced encoding is set.
#define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetection.forcedEncoding))
// IS_ENC_PURE_ASCII(): true when encDetection.analyzedEncoding equals CPI_ASCII_7BIT.
#define IS_ENC_PURE_ASCII() (encDetection.analyzedEncoding == CPI_ASCII_7BIT)
// --------------------------------------------------------------------------
@ -1238,7 +1239,6 @@ bool EditLoadFile(
bool const bAnalysisUTF8 = Encoding_IsUTF8(encDetection.Encoding);
bool const bRejectUTF8 = (IS_ENC_ENFORCED() && !bForcedUTF8) || !bValidUTF8 || (!encDetection.bIsUTF8Sig && bSkipUTFDetection);
bool const bIsCP_UTF7 = (Encoding_GetCodePage(encDetection.Encoding) == CP_UTF7);
if (bForcedUTF8 || (!bRejectUTF8 && (encDetection.bIsUTF8Sig || bAnalysisUTF8))) {
if (encDetection.bIsUTF8Sig) {
@ -1250,8 +1250,8 @@ bool EditLoadFile(
status->iEncoding = CPI_UTF8;
EditDetectEOLMode(lpData, cbData, status);
}
} else if (!IS_ENC_ENFORCED() && (bIsCP_UTF7 && encDetection.bIs7BitOnly)) {
// load UTF-7/ASCII(7-bit) as ANSI/UTF-8
} else if (!IS_ENC_ENFORCED() && IS_ENC_PURE_ASCII()) {
// load ASCII(7-bit) as ANSI/UTF-8
EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory);
status->iEncoding = (Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT);
EditDetectEOLMode(lpData, cbData, status);
@ -1459,7 +1459,7 @@ bool EditSaveFile(
lpDataWide, (SizeOfMem(lpDataWide) / sizeof(WCHAR)));
// dry conversion run
DWORD const dwFlags = Encoding_GetWCMB_Flags(uCodePage);
DWORD const dwFlags = Encoding_GetWCMBFlagsByCodePage(uCodePage);
size_t const cbSizeNeeded = (size_t)WideCharToMultiByteEx(uCodePage, dwFlags, lpDataWide, cbDataWide, NULL, 0, NULL, NULL);
size_t const cbDataNew = max(cbSizeNeeded, cbDataWide);

View File

@ -53,7 +53,7 @@ const UINT uCodePageMBCS[] = {
// ============================================================================
DWORD Encoding_GetWCMB_Flags(const UINT codePage)
DWORD Encoding_GetWCMBFlagsByCodePage(const UINT codePage)
{
DWORD flags = WC_NO_BEST_FIT_CHARS;
for (int k = 0; k < COUNTOF(uCodePageMBCS); k++) {
@ -721,7 +721,7 @@ bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt)
}
// ============================================================================
#if 0
bool IsValidUTF7(const char* pTest, size_t nLength)
{
if (!pTest) {
@ -737,6 +737,7 @@ bool IsValidUTF7(const char* pTest, size_t nLength)
}
return true;
}
#endif
// ============================================================================

View File

@ -69,7 +69,7 @@ typedef struct _np2encoding {
} NP2ENCODING;
DWORD Encoding_GetWCMB_Flags(const UINT codePage);
DWORD Encoding_GetWCMBFlagsByCodePage(const UINT codePage);
cpi_enc_t Encoding_Current(cpi_enc_t iEncoding); // getter/setter
cpi_enc_t Encoding_Forced(cpi_enc_t iEncoding); // getter/setter
@ -186,7 +186,6 @@ typedef struct _enc_det_t {
float confidence;
// flags:
bool bIsAnalysisReliable;
bool bIs7BitOnly;
bool bHasBOM;
bool bIsReverse;
bool bIsUTF8Sig;
@ -196,7 +195,7 @@ typedef struct _enc_det_t {
} ENC_DET_T;
// Positional (non-designated) initializer list, presumably used as `ENC_DET_T x = INIT_ENC_DET_T;`.
// Values are matched to struct members purely by declaration order, so this list has to be
// updated whenever a member is added to, removed from, or reordered in the struct.
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, "" }
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, "" }
ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, const size_t cbData,

View File

@ -626,7 +626,7 @@ constexpr cpi_enc_t _MapStdEncodingString2CPI(const char* encStrg, float* pConfi
if (encStrg && (encStrg[0] != '\0')) {
// preprocessing: special cases
if (_stricmp(encStrg, "ascii") == 0) {
if (StrCmpICA(encStrg, "ascii") == 0) {
cpiEncoding = CPI_ASCII_7BIT;
} else {
cpiEncoding = Encoding_MatchA(encStrg);
@ -1270,7 +1270,6 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
encDetRes.bHasBOM = (bBOM_LE || bBOM_BE);
encDetRes.bIsReverse = bBOM_BE;
encDetRes.bIsUTF8Sig = ((cbData >= 3) ? IsUTF8Signature(lpData) : false);
encDetRes.bIs7BitOnly = IsValidUTF7(lpData, cbData);
encDetRes.bValidUTF8 = IsValidUTF8(lpData, cbData);
if (!IS_ENC_ENFORCED()) {
@ -1298,6 +1297,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData,
Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint);
// ---------------------------------------------------------------------------
}
//~encDetRes.bIs7BitOnly = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsValidUTF7(lpData, cbData);
if (encDetRes.analyzedEncoding == CPI_NONE) {
encDetRes.analyzedEncoding = iAnalyzeHint;