From 77e7758b7b13a791c0f47c09b41559bab2c3804b Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Sun, 17 Nov 2019 12:05:56 +0100 Subject: [PATCH 1/2] + rfc: refactoring source code (encoding detection) --- src/Edit.c | 528 +++++--------------------------------- src/Edit.h | 19 -- src/Encoding.h | 44 +++- src/EncodingDetection.cpp | 525 +++++++++++++++++++++++++++++++++---- 4 files changed, 578 insertions(+), 538 deletions(-) diff --git a/src/Edit.c b/src/Edit.c index a53b5a1c7..81636c419 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -1045,20 +1045,33 @@ bool EditLoadFile( Globals.dwLastError = GetLastError(); CloseHandle(hFile); + if (cbData == 0) { + FileVars_Init(NULL, 0, &Globals.fvCurFile); + status->iEOLMode = Settings.DefaultEOLMode; + status->iEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; + EditSetNewText(hwnd, "", 0, bClearUndoHistory); + SciCall_SetEOLMode(Settings.DefaultEOLMode); + FreeMem(lpData); + Encoding_SrcCmdLn(CPI_NONE); + Encoding_SrcWeak(CPI_NONE); + return true; + } + bool bReadSuccess = ((readFlag & DECRYPT_FATAL_ERROR) || (readFlag & DECRYPT_FREAD_FAILED)) ? false : true; - // ((readFlag == DECRYPT_SUCCESS) || (readFlag & DECRYPT_NO_ENCRYPTION)) => true; + if ((readFlag & DECRYPT_CANCELED_NO_PASS) || (readFlag & DECRYPT_WRONG_PASS)) { bReadSuccess = (InfoBoxLng(MB_OKCANCEL, L"MsgNoOrWrongPassphrase", IDS_MUI_NOPASS) == IDOK); if (!bReadSuccess) { FreeMem(lpData); + Encoding_SrcCmdLn(CPI_NONE); + Encoding_SrcWeak(CPI_NONE); return true; } else { status->bEncryptedRaw = true; } } - if (!bReadSuccess) { FreeMem(lpData); Encoding_SrcCmdLn(CPI_NONE); @@ -1066,214 +1079,87 @@ bool EditLoadFile( return false; } - // -------------------------------------------------------------------------- - // Encoding Detection + + ENC_DET_T encDetection = Encoding_DetectEncoding(pszFile, lpData, cbData, bSkipUTFDetection, bSkipANSICPDetection, bForceEncDetection); + + #define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetection.forcedEncoding)) + // -------------------------------------------------------------------------- - // assume current code-page or default encoding (if forced) - cpi_enc_t const iAnalyzeFallback = Settings.UseDefaultForFileEncoding ? Settings.DefaultEncoding : CPI_ANSI_DEFAULT; - - // --- 1st check for force encodings --- - LPCWSTR lpszExt = PathFindExtension(pszFile); - bool const bNfoDizDetected = (lpszExt && !(StringCchCompareXI(lpszExt, L".nfo") && StringCchCompareXI(lpszExt, L".diz"))); - - cpi_enc_t iForcedEncoding = Globals.bForceReLoadAsUTF8 ? CPI_UTF8 : - ((Settings.LoadNFOasOEM && bNfoDizDetected) ? Globals.DOSEncoding : Encoding_SrcCmdLn(CPI_GET)); - - #define IS_ENC_ENFORCED() (!Encoding_IsNONE(iForcedEncoding)) - - // --- 2nd Use Encoding Analysis if applicable - - size_t const cbNbytes4Analysis = (cbData < 200000L) ? cbData : 200000L; - - float confidence = 0.0f; - cpi_enc_t iAnalyzedEncoding = iAnalyzeFallback; - - if (!IS_ENC_ENFORCED() || bForceEncDetection) - { - iAnalyzedEncoding = Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &confidence, iAnalyzeFallback); - - if (Flags.bDevDebugMode) { + if (Flags.bDevDebugMode) { #if 1 - SetAdditionalTitleInfo(Encoding_GetTitleInfoW()); + SetAdditionalTitleInfo(Encoding_GetTitleInfoW()); #else - DocPos const iPos = SciCall_PositionFromLine(SciCall_GetFirstVisibleLine()); - int const iXOff = SciCall_GetXOffset(); - SciCall_SetXOffset(0); - SciCall_CallTipShow(iPos, Encoding_GetTitleInfoA()); - SciCall_SetXOffset(iXOff); - Globals.CallTipType = CT_ENC_INFO; + DocPos const iPos = SciCall_PositionFromLine(SciCall_GetFirstVisibleLine()); + int const iXOff = SciCall_GetXOffset(); + SciCall_SetXOffset(0); + SciCall_CallTipShow(iPos, Encoding_GetTitleInfoA()); + SciCall_SetXOffset(iXOff); + Globals.CallTipType = CT_ENC_INFO; #endif + + if (IS_ENC_ENFORCED()) { + WCHAR wchBuf[128] = { L'\0' }; + StringCchPrintf(wchBuf, COUNTOF(wchBuf), L"ForcedEncoding='%s'", g_Encodings[encDetection.forcedEncoding].wchLabel); + SetAdditionalTitleInfo(wchBuf); } - if (bForceEncDetection && !Encoding_IsNONE(iAnalyzedEncoding)) { - iForcedEncoding = (iAnalyzedEncoding == CPI_ASCII_7BIT) ? CPI_ANSI_DEFAULT : iAnalyzedEncoding; // no bIsReliable check (forced unreliable detection) + + if (!Encoding_IsNONE(encDetection.fileVarEncoding) && FileVars_IsValidEncoding(&Globals.fvCurFile)) { + WCHAR wchBuf[128] = { L'\0' }; + StringCchPrintf(wchBuf, COUNTOF(wchBuf), L" - FilEncTag='%s'", + g_Encodings[FileVars_GetEncoding(&Globals.fvCurFile)].wchLabel); + AppendAdditionalTitleInfo(wchBuf); } } - - if (Flags.bDevDebugMode && IS_ENC_ENFORCED()) { - WCHAR wchBuf[128] = { L'\0' }; - StringCchPrintf(wchBuf, COUNTOF(wchBuf), L"ForcedEncoding='%s'", g_Encodings[iForcedEncoding].wchLabel); - SetAdditionalTitleInfo(wchBuf); - } - - // ------------------------------------------------------ - - if (!IS_ENC_ENFORCED()) - { - bool const bIsUnicode = Encoding_IsUTF8(iAnalyzedEncoding) || Encoding_IsUNICODE(iAnalyzedEncoding); - - if (iAnalyzedEncoding == CPI_NONE) - { - iAnalyzedEncoding = iAnalyzeFallback; - confidence = Settings2.AnalyzeReliableConfidenceLevel; - } - else if (iAnalyzedEncoding == CPI_ASCII_7BIT) { - iAnalyzedEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; - confidence = 1.0; - } - else { - if ((bSkipUTFDetection && bIsUnicode) || (bSkipANSICPDetection && !bIsUnicode)) { - iAnalyzedEncoding = CPI_NONE; - confidence = 0.0; - } - } - } - else { - iAnalyzedEncoding = iForcedEncoding; - confidence = 1.0; - } - - bool const bIsReliable = (confidence >= Settings2.AnalyzeReliableConfidenceLevel); - + // -------------------------------------------------------------------------- - // --- 3rd Unicode Checks + bool const bIsUnicodeForced = Encoding_IsUNICODE(encDetection.forcedEncoding); + bool const bIsUnicodeDetected = !IS_ENC_ENFORCED() && (encDetection.bIsUnicodeAnalyzed || !Encoding_IsNONE(encDetection.unicodeEncoding)); - bool const bIsUnicodeForced = Encoding_IsUNICODE(iForcedEncoding); - - // choose best encoding guess - cpi_enc_t const iFileEncWeak = Encoding_SrcWeak(CPI_GET); - - // set Preferred Encoding - cpi_enc_t iPreferredEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; - - if (IS_ENC_ENFORCED()) { - iPreferredEncoding = iForcedEncoding; - } - else if (!Encoding_IsNONE(iFileEncWeak)) { - iPreferredEncoding = iFileEncWeak; - } - else if (!Encoding_IsNONE(iAnalyzedEncoding) && (bIsReliable || !Settings.UseReliableCEDonly)) { - iPreferredEncoding = iAnalyzedEncoding; - } - else if (Encoding_IsNONE(iPreferredEncoding)) { - iPreferredEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; - } - - // -------------------------------------------------------------------------- - - bool const bIsUTF8Sig = ((cbData >= 3) ? IsUTF8Signature(lpData) : false); - - bool bBOM = false; - bool bReverse = false; - bool const bIsUnicodeAnalyzed = ((Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable) && !IS_ENC_ENFORCED() && !bSkipUTFDetection && !bIsUTF8Sig); - - cpi_enc_t const encUnicode = bSkipUTFDetection ? CPI_NONE : GetUnicodeEncoding(lpData, cbData, &bBOM, &bReverse); - - if (cbData == 0) { - FileVars_Init(NULL, 0, &Globals.fvCurFile); - status->iEOLMode = Settings.DefaultEOLMode; - status->iEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : iPreferredEncoding; - EditSetNewText(hwnd, "", 0, bClearUndoHistory); - SciCall_SetEOLMode(Settings.DefaultEOLMode); - FreeMem(lpData); - } - else if (bIsUnicodeForced || (!IS_ENC_ENFORCED() && (bIsUnicodeAnalyzed || !Encoding_IsNONE(encUnicode)))) + if (bIsUnicodeForced || bIsUnicodeDetected) { // === UNICODE === - if (Encoding_IsNONE(encUnicode)) - { - bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData); - bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData); - - if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) { - bBOM = bBOM_LE; - bReverse = false; - } - else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) { - bBOM = bBOM_BE; - bReverse = true; - } - } - - if (bReverse) + if (encDetection.bIsReverse) { SwabEx(lpData, lpData, cbData); - status->iEncoding = (bBOM ? CPI_UNICODEBEBOM : CPI_UNICODEBE); + status->iEncoding = (encDetection.bHasBOM ? CPI_UNICODEBEBOM : CPI_UNICODEBE); } else { - status->iEncoding = (bBOM ? CPI_UNICODEBOM : CPI_UNICODE); + status->iEncoding = (encDetection.bHasBOM ? CPI_UNICODEBOM : CPI_UNICODE); } char* lpDataUTF8 = AllocMem((cbData * 3) + 2, HEAP_ZERO_MEMORY); - ptrdiff_t convCnt = WideCharToMultiByteEx(Encoding_SciCP, 0, (bBOM) ? (LPWSTR)lpData + 1 : (LPWSTR)lpData, - (bBOM) ? (cbData / sizeof(WCHAR)) : (cbData / sizeof(WCHAR) + 1), lpDataUTF8, SizeOfMem(lpDataUTF8), NULL, NULL); + ptrdiff_t convCnt = WideCharToMultiByteEx(Encoding_SciCP, 0, (encDetection.bHasBOM ? (LPWSTR)lpData + 1 : (LPWSTR)lpData), + (encDetection.bHasBOM ? (cbData / sizeof(WCHAR)) : (cbData / sizeof(WCHAR) + 1)), lpDataUTF8, SizeOfMem(lpDataUTF8), NULL, NULL); if (convCnt == 0) { + convCnt = WideCharToMultiByteEx(CP_ACP, 0, (encDetection.bHasBOM ? (LPWSTR)lpData + 1 : (LPWSTR)lpData), + -1, lpDataUTF8, SizeOfMem(lpDataUTF8), NULL, NULL); status->bUnicodeErr = true; - convCnt = WideCharToMultiByteEx(CP_ACP, 0, (bBOM) ? (LPWSTR)lpData + 1 : (LPWSTR)lpData, - (-1), lpDataUTF8, SizeOfMem(lpDataUTF8), NULL, NULL); } - if (convCnt != 0) { - FreeMem(lpData); - FileVars_Init(lpDataUTF8, convCnt - 1, &Globals.fvCurFile); - EditSetNewText(hwnd, lpDataUTF8, convCnt - 1, bClearUndoHistory); - EditDetectEOLMode(lpDataUTF8, convCnt - 1, status); - FreeMem(lpDataUTF8); - } - else { - FreeMem(lpDataUTF8); - FreeMem(lpData); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); - return false; - } + FreeMem(lpData); + FileVars_Init(lpDataUTF8, convCnt - 1, &Globals.fvCurFile); + EditSetNewText(hwnd, lpDataUTF8, convCnt - 1, bClearUndoHistory); + EditDetectEOLMode(lpDataUTF8, convCnt - 1, status); + FreeMem(lpDataUTF8); } else { // === ALL OTHERS === - // force file vars ? - FileVars_Init(lpData, cbData, &Globals.fvCurFile); - cpi_enc_t const iFileVarEncoding = (FileVars_IsValidEncoding(&Globals.fvCurFile) && !Settings.NoEncodingTags) ? - FileVars_GetEncoding(&Globals.fvCurFile) : CPI_NONE; - - if (!IS_ENC_ENFORCED() && !Encoding_IsNONE(iFileVarEncoding)) { - iForcedEncoding = (Globals.fvCurFile.mask & FV_ENCODING) ? iFileVarEncoding : iForcedEncoding; - iPreferredEncoding = IS_ENC_ENFORCED() ? iForcedEncoding : iPreferredEncoding; - } - - if (Flags.bDevDebugMode) { - if (!Encoding_IsNONE(iFileVarEncoding) && FileVars_IsValidEncoding(&Globals.fvCurFile)) { - WCHAR wchBuf[128] = { L'\0' }; - StringCchPrintf(wchBuf, COUNTOF(wchBuf), L" - FilEncTag='%s'", - g_Encodings[FileVars_GetEncoding(&Globals.fvCurFile)].wchLabel); - AppendAdditionalTitleInfo(wchBuf); - } - } - // === UTF-8 ? === bool const bValidUTF8 = IsValidUTF8(lpData, cbData); - bool const bForcedUTF8 = Encoding_IsUTF8(iForcedEncoding); - bool const bAnalysisUTF8 = Encoding_IsUTF8(iAnalyzedEncoding) && bIsReliable; - bool const bSoftHintUTF8 = Encoding_IsUTF8(iAnalyzedEncoding) && Encoding_IsUTF8(iPreferredEncoding); // non-reliable analysis = soft-hint + bool const bForcedUTF8 = Encoding_IsUTF8(encDetection.forcedEncoding); + bool const bAnalysisUTF8 = Encoding_IsUTF8(encDetection.analyzedEncoding) && encDetection.bIsAnalysisReliable; + bool const bSoftHintUTF8 = Encoding_IsUTF8(encDetection.analyzedEncoding) && Encoding_IsUTF8(encDetection.preferredEncoding); // non-reliable analysis = soft-hint - bool const bRejectUTF8 = IS_ENC_ENFORCED() || !bValidUTF8 || (!bIsUTF8Sig && bSkipUTFDetection); + bool const bRejectUTF8 = (IS_ENC_ENFORCED() && !bForcedUTF8) || !bValidUTF8 || (!encDetection.bIsUTF8Sig && bSkipUTFDetection); - if (bForcedUTF8 || (!bRejectUTF8 && (bIsUTF8Sig || bAnalysisUTF8 || bSoftHintUTF8))) // soft-hint = prefer UTF-8 + if (bForcedUTF8 || (!bRejectUTF8 && (encDetection.bIsUTF8Sig || bAnalysisUTF8 || bSoftHintUTF8))) // soft-hint = prefer UTF-8 { - if (bIsUTF8Sig) { + if (encDetection.bIsUTF8Sig) { EditSetNewText(hwnd, UTF8StringStart(lpData), cbData - 3, bClearUndoHistory); status->iEncoding = CPI_UTF8SIGN; EditDetectEOLMode(UTF8StringStart(lpData), cbData - 3, status); @@ -1285,10 +1171,10 @@ bool EditLoadFile( } FreeMem(lpData); } - else { // === ALL OTHER === + else { // === ALL OTHER NON UTF-8 === // ---------------------------------------------------------------------- - status->iEncoding = Encoding_IsValid(iPreferredEncoding) ? iPreferredEncoding : CPI_ANSI_DEFAULT; + status->iEncoding = Encoding_IsValid(encDetection.preferredEncoding) ? encDetection.preferredEncoding : CPI_ANSI_DEFAULT; // ---------------------------------------------------------------------- if (((Encoding_GetCodePage(status->iEncoding) != CP_UTF7) && Encoding_IsEXTERNAL_8BIT(status->iEncoding)) || @@ -1337,7 +1223,7 @@ bool EditLoadFile( Encoding_SrcCmdLn(CPI_NONE); Encoding_SrcWeak(CPI_NONE); - SciCall_SetCharacterCategoryOptimization(Encoding_IsCJK(iAnalyzedEncoding) ? 0x10000 : 0x1000); + SciCall_SetCharacterCategoryOptimization(Encoding_IsCJK(encDetection.analyzedEncoding) ? 0x10000 : 0x1000); if (Flags.bDevDebugMode) { WCHAR wcBuf[128] = { L'\0' }; @@ -8452,292 +8338,6 @@ void EditSetBookmarkList(HWND hwnd, LPCWSTR pszBookMarks) } -//============================================================================= -// -// _SetFileVars() -// -static void _SetFileVars(char* buffer, size_t cch, LPFILEVARS lpfv) -{ - bool bDisableFileVar = false; - - if (!Flags.NoFileVariables) - { - int i; - if (FileVars_ParseInt(buffer, "enable-local-variables", &i) && (!i)) { - bDisableFileVar = true; - } - if (!bDisableFileVar) { - - if (FileVars_ParseInt(buffer, "tab-width", &i)) { - lpfv->iTabWidth = clampi(i, 1, 256); - lpfv->mask |= FV_TABWIDTH; - } - - if (FileVars_ParseInt(buffer, "c-basic-indent", &i)) { - lpfv->iIndentWidth = clampi(i, 0, 256); - lpfv->mask |= FV_INDENTWIDTH; - } - - if (FileVars_ParseInt(buffer, "indent-tabs-mode", &i)) { - lpfv->bTabsAsSpaces = (i) ? false : true; - lpfv->mask |= FV_TABSASSPACES; - } - - if (FileVars_ParseInt(buffer, "c-tab-always-indent", &i)) { - lpfv->bTabIndents = (i) ? true : false; - lpfv->mask |= FV_TABINDENTS; - } - - if (FileVars_ParseInt(buffer, "truncate-lines", &i)) { - lpfv->bWordWrap = (i) ? false : true; - lpfv->mask |= FV_WORDWRAP; - } - - if (FileVars_ParseInt(buffer, "fill-column", &i)) { - lpfv->iLongLinesLimit = clampi(i, 0, LONG_LINES_MARKER_LIMIT); - lpfv->mask |= FV_LONGLINESLIMIT; - } - } - } - - // Unicode Sig - bool const bHasSignature = IsUTF8Signature(buffer) || Has_UTF16_LE_BOM(buffer, cch) || Has_UTF16_BE_BOM(buffer, cch); - - if (!bHasSignature && !Settings.NoEncodingTags && !bDisableFileVar) { - - if (FileVars_ParseStr(buffer, "encoding", lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding))) - lpfv->mask |= FV_ENCODING; - else if (FileVars_ParseStr(buffer, "charset", lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding))) - lpfv->mask |= FV_ENCODING; - else if (FileVars_ParseStr(buffer, "coding", lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding))) - lpfv->mask |= FV_ENCODING; - } - if (lpfv->mask & FV_ENCODING) { - lpfv->iEncoding = Encoding_MatchA(lpfv->tchEncoding); - } - - if (!Flags.NoFileVariables && !bDisableFileVar) { - if (FileVars_ParseStr(buffer, "mode", lpfv->tchMode, COUNTOF(lpfv->tchMode))) - lpfv->mask |= FV_MODE; - } -} - -//============================================================================= -// -// FileVars_Init() -// -bool FileVars_Init(char* lpData, size_t cbData, LPFILEVARS lpfv) -{ - ZeroMemory(lpfv, sizeof(FILEVARS)); - lpfv->bTabIndents = Settings.TabIndents; - lpfv->bTabsAsSpaces = Settings.TabsAsSpaces; - lpfv->bWordWrap = Settings.WordWrap; - lpfv->iTabWidth = Settings.TabWidth; - lpfv->iIndentWidth = Settings.IndentWidth; - lpfv->iLongLinesLimit = Settings.LongLinesLimit; - lpfv->iEncoding = Settings.DefaultEncoding; - - if ((Flags.NoFileVariables && Settings.NoEncodingTags) || !lpData || !cbData) { - return true; - } - - char tmpbuf[LARGE_BUFFER]; - size_t const cch = min_s(cbData + 1, COUNTOF(tmpbuf)); - - StringCchCopyNA(tmpbuf, COUNTOF(tmpbuf), lpData, cch); - _SetFileVars(tmpbuf, cch, lpfv); - - // if no file vars found, look at EOF - if ((lpfv->mask == 0) && (cbData > COUNTOF(tmpbuf))) { - StringCchCopyNA(tmpbuf, COUNTOF(tmpbuf), lpData + cbData - COUNTOF(tmpbuf) + 1, COUNTOF(tmpbuf)); - _SetFileVars(tmpbuf, cch, lpfv); - } - - return true; -} - - -//============================================================================= -// -// FileVars_Apply() -// -bool FileVars_Apply(LPFILEVARS lpfv) { - - int const _iTabWidth = (lpfv->mask & FV_TABWIDTH) ? lpfv->iTabWidth : Settings.TabWidth; - SciCall_SetTabWidth(_iTabWidth); - - int const _iIndentWidth = (lpfv->mask & FV_INDENTWIDTH) ? lpfv->iIndentWidth : ((lpfv->mask & FV_TABWIDTH) ? 0 : Settings.IndentWidth); - SciCall_SetIndent(_iIndentWidth); - - bool const _bTabsAsSpaces = (lpfv->mask & FV_TABSASSPACES) ? lpfv->bTabsAsSpaces : Settings.TabsAsSpaces; - SciCall_SetUseTabs(!_bTabsAsSpaces); - - bool const _bTabIndents = (lpfv->mask & FV_TABINDENTS) ? lpfv->bTabIndents : Settings.TabIndents; - SciCall_SetTabIndents(_bTabIndents); - SciCall_SetBackSpaceUnIndents(Settings.BackspaceUnindents); - - bool const _bWordWrap = (lpfv->mask & FV_WORDWRAP) ? lpfv->bWordWrap : Settings.WordWrap; - int const _iWrapMode = _bWordWrap ? ((Settings.WordWrapMode == 0) ? SC_WRAP_WHITESPACE : SC_WRAP_CHAR) : SC_WRAP_NONE; - SciCall_SetWrapMode(_iWrapMode); - - int const _iLongLinesLimit = (lpfv->mask & FV_LONGLINESLIMIT) ? lpfv->iLongLinesLimit : Settings.LongLinesLimit; - SciCall_SetEdgeColumn(_iLongLinesLimit); - Globals.iWrapCol = _iLongLinesLimit; - - return true; -} - - -//============================================================================= -// -// FileVars_ParseInt() -// -bool FileVars_ParseInt(char* pszData,char* pszName,int* piValue) { - - char *pvStart = StrStrIA(pszData, pszName); - while (pvStart) { - char chPrev = (pvStart > pszData) ? *(pvStart-1) : 0; - if (!IsCharAlphaNumericA(chPrev) && chPrev != '-' && chPrev != '_') { - pvStart += StringCchLenA(pszName,0); - while (*pvStart == ' ') { - pvStart++; - } - if (*pvStart == ':' || *pvStart == '=') { break; } - } - else { - pvStart += StringCchLenA(pszName, 0); - } - pvStart = StrStrIA(pvStart, pszName); // next - } - - if (pvStart) { - - while (*pvStart && StrChrIA(":=\"' \t", *pvStart)) { - pvStart++; - } - char tch[32] = { L'\0' }; - StringCchCopyNA(tch,COUNTOF(tch),pvStart,COUNTOF(tch)); - - char* pvEnd = tch; - while (*pvEnd && IsCharAlphaNumericA(*pvEnd)) { - pvEnd++; - } - *pvEnd = 0; - StrTrimA(tch," \t:=\"'"); - - int itok = sscanf_s(tch,"%i",piValue); - if (itok == 1) { - return true; - } - if (tch[0] == 't') { - *piValue = 1; - return true; - } - if (tch[0] == 'n' || tch[0] == 'f') { - *piValue = 0; - return true; - } - } - return false; -} - - -//============================================================================= -// -// FileVars_ParseStr() -// -bool FileVars_ParseStr(char* pszData,char* pszName,char* pszValue,int cchValue) { - - char *pvStart = StrStrIA(pszData, pszName); - while (pvStart) { - char chPrev = (pvStart > pszData) ? *(pvStart-1) : 0; - if (!IsCharAlphaNumericA(chPrev) && chPrev != '-' && chPrev != '_') { - pvStart += StringCchLenA(pszName,0); - while (*pvStart == ' ') { - pvStart++; - } - if (*pvStart == ':' || *pvStart == '=') { - break; - } - } - else { - pvStart += StringCchLenA(pszName, 0); - } - pvStart = StrStrIA(pvStart, pszName); // next - } - - if (pvStart) { - - bool bQuoted = false; - while (*pvStart && StrChrIA(":=\"' \t",*pvStart)) { - if (*pvStart == '\'' || *pvStart == '"') - bQuoted = true; - pvStart++; - } - - char tch[32] = { L'\0' }; - StringCchCopyNA(tch,COUNTOF(tch),pvStart,COUNTOF(tch)); - - char* pvEnd = tch; - while (*pvEnd && (IsCharAlphaNumericA(*pvEnd) || StrChrIA("+-/_", *pvEnd) || (bQuoted && *pvEnd == ' '))) { - pvEnd++; - } - *pvEnd = 0; - - StrTrimA(tch," \t:=\"'"); - - StringCchCopyNA(pszValue,cchValue,tch,COUNTOF(tch)); - - return true; - } - return false; -} - - -//============================================================================= -// -// FileVars_IsUTF8() -// -bool FileVars_IsUTF8(LPFILEVARS lpfv) { - if (lpfv->mask & FV_ENCODING) { - if (StringCchCompareNIA(lpfv->tchEncoding,COUNTOF(lpfv->tchEncoding),"utf-8",CSTRLEN("utf-8")) == 0 || - StringCchCompareNIA(lpfv->tchEncoding,COUNTOF(lpfv->tchEncoding),"utf8", CSTRLEN("utf8")) == 0) - return true; - } - return false; -} - - -//============================================================================= -// -// FileVars_IsValidEncoding() -// -bool FileVars_IsValidEncoding(LPFILEVARS lpfv) { - CPINFO cpi; - if (lpfv->mask & FV_ENCODING && Encoding_IsValidIdx(lpfv->iEncoding)) { - if ((Encoding_IsINTERNAL(lpfv->iEncoding)) || - (IsValidCodePage(Encoding_GetCodePage(lpfv->iEncoding)) && - GetCPInfo(Encoding_GetCodePage(lpfv->iEncoding),&cpi))) { - return true; - } - } - return false; -} - - -//============================================================================= -// -// FileVars_GetEncoding() -// -cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv) -{ - if (lpfv->mask & FV_ENCODING) { - return(lpfv->iEncoding); - } - return CPI_NONE; -} - - //============================================================================= // // EditBookmarkClick() diff --git a/src/Edit.h b/src/Edit.h index 0144bb0b9..881ff3a6f 100644 --- a/src/Edit.h +++ b/src/Edit.h @@ -121,25 +121,6 @@ void EditMarkAllOccurrences(HWND hwnd, bool bForceClear); void EditHideNotMarkedLineRange(HWND hwnd, bool bHideLines); void EditSelectionMultiSelectAll(); - -#define FV_TABWIDTH 1 -#define FV_INDENTWIDTH 2 -#define FV_TABSASSPACES 4 -#define FV_TABINDENTS 8 -#define FV_WORDWRAP 16 -#define FV_LONGLINESLIMIT 32 -#define FV_ENCODING 64 -#define FV_MODE 128 - -bool FileVars_Init(char* lpData, size_t cbData,LPFILEVARS lpfv); -bool FileVars_Apply(LPFILEVARS lpfv); -bool FileVars_ParseInt(char* pszData,char* pszName,int* piValue); -bool FileVars_ParseStr(char* pszData,char* pszName,char* pszValue,int cchValue); -bool FileVars_IsUTF8(LPFILEVARS lpfv); -bool FileVars_IsValidEncoding(LPFILEVARS lpfv); -cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv); - - // // Folding Functions // diff --git a/src/Encoding.h b/src/Encoding.h index 133d8c51e..4d4ea90d4 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -145,8 +145,48 @@ inline bool IsDBCSCodePage(UINT cp) { return ((cp == 932) || (cp == 936) || (cp == 949) || (cp == 950) || (cp == 951) || (cp == 1361)); } -cpi_enc_t Encoding_AnalyzeText(const char* const text, const size_t len, float* confidence_io, const cpi_enc_t encodingHint); -cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse); +// ---------------------------------------------------------------------------- + +#define FV_TABWIDTH 1 +#define FV_INDENTWIDTH 2 +#define FV_TABSASSPACES 4 +#define FV_TABINDENTS 8 +#define FV_WORDWRAP 16 +#define FV_LONGLINESLIMIT 32 +#define FV_ENCODING 64 +#define FV_MODE 128 + +bool FileVars_Init(const char* lpData, size_t cbData, LPFILEVARS lpfv); +bool FileVars_Apply(LPFILEVARS lpfv); +bool FileVars_ParseInt(char* pszData, char* pszName, int* piValue); +bool FileVars_ParseStr(char* pszData, char* pszName, char* pszValue, int cchValue); +bool FileVars_IsUTF8(LPFILEVARS lpfv); +bool FileVars_IsValidEncoding(LPFILEVARS lpfv); +cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv); + +// ---------------------------------------------------------------------------- + +typedef struct _enc_det_t +{ + cpi_enc_t forcedEncoding; + cpi_enc_t analyzedEncoding; + cpi_enc_t preferredEncoding; + cpi_enc_t unicodeEncoding; + cpi_enc_t fileVarEncoding; + + bool bIsAnalysisReliable; + bool bIsUnicodeAnalyzed; + bool bHasBOM; + bool bIsReverse; + bool bIsUTF8Sig; + +} ENC_DET_T; + + +ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, const size_t cbData, + bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection); + +// ---------------------------------------------------------------------------- const char* Encoding_GetTitleInfoA(); const WCHAR* Encoding_GetTitleInfoW(); diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 78a820034..093f6fbf8 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -27,6 +27,7 @@ #define WIN32_LEAN_AND_MEAN 1 #define NOMINMAX 1 #include +#include #define STRSAFE_NO_CB_FUNCTIONS #define STRSAFE_NO_DEPRECATE // don't allow deprecated functions @@ -39,7 +40,9 @@ extern "C" { #include "TypeDefs.h" +#include "Helpers.h" #include "Encoding.h" +#include "SciCall.h" } // CED - Compact Encoding Detection (by Google) @@ -504,6 +507,56 @@ extern "C" void ChangeEncodingCodePage(const cpi_enc_t cpi, UINT newCP) //============================================================================= +cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse) +{ + cpi_enc_t iEncoding = CPI_NONE; + + size_t const enoughData = 2048LL; + size_t const cb = (len < enoughData) ? len : enoughData; + + if (!pBuffer || cb < 2) { return iEncoding; } + + // IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE. + // IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE. + // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags. + // IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags. + // + int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK; + + int iTest = iAllTests; + /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok + + if (iTest == iAllTests) { + iTest = 0; // iTest doesn't seem to have been modified ... + } + + bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE); + bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE); + + bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK); + bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK); + bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK); + + //bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES); + + if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse))) + { + if (lpbBOM) { + *lpbBOM = (bHasBOM || bHasRBOM); + } + if (lpbReverse) { + *lpbReverse = (bHasRBOM || bIsReverse); + } + if (bHasBOM || bHasRBOM) { + iEncoding = bHasBOM ? CPI_UNICODEBOM : CPI_UNICODEBEBOM; + } + else if (bIsUnicode || bIsReverse) { + iEncoding = bIsUnicode ? CPI_UNICODE : CPI_UNICODEBE; + } + } + return iEncoding; +} +// ============================================================================ constexpr Encoding _MapCPI2CEDEncoding(const cpi_enc_t cpiEncoding) { @@ -682,7 +735,8 @@ inline float max_f(float x, float y) { return (x > y) ? x : y; } // -------------------------------------------------------------------------- -extern "C" cpi_enc_t Encoding_AnalyzeText +//extern "C" cpi_enc_t Encoding_AnalyzeText +cpi_enc_t Encoding_AnalyzeText ( const char* const text, const size_t len, float* confidence_io, const cpi_enc_t encodingHint) @@ -836,58 +890,6 @@ extern "C" cpi_enc_t Encoding_AnalyzeText // ============================================================================ -cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse) -{ - cpi_enc_t iEncoding = CPI_NONE; - - size_t const enoughData = 2048LL; - size_t const cb = (len < enoughData) ? len : enoughData; - - if (!pBuffer || cb < 2) { return iEncoding; } - - // IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE. - // IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE. - // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags. - // IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags. - // - int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK; - - int iTest = iAllTests; - /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok - - if (iTest == iAllTests) { - iTest = 0; // iTest doesn't seem to have been modified ... - } - - bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE); - bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE); - - bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK); - bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK); - bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK); - - //bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES); - - if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse))) - { - if (lpbBOM) { - *lpbBOM = (bHasBOM || bHasRBOM); - } - if (lpbReverse) { - *lpbReverse = (bHasRBOM || bIsReverse); - } - if (bHasBOM || bHasRBOM) { - iEncoding = bHasBOM ? CPI_UNICODEBOM : CPI_UNICODEBEBOM; - } - else if (bIsUnicode || bIsReverse) { - iEncoding = bIsUnicode ? CPI_UNICODE : CPI_UNICODEBE; - } - } - return iEncoding; -} -// ============================================================================ - - //============================================================================= // // _SetEncodingTitleInfo() @@ -936,3 +938,420 @@ static void _SetEncodingTitleInfo(const char* encodingUCD, cpi_enc_t encUCD, flo ::MultiByteToWideChar(CP_UTF7, 0, chEncodingInfo, -1, wchEncodingInfo, ARRAYSIZE(wchEncodingInfo)); } + + +//============================================================================= +// +// _SetFileVars() +// +static void _SetFileVars(char* buffer, size_t cch, LPFILEVARS lpfv) +{ + bool bDisableFileVar = false; + + if (!Flags.NoFileVariables) + { + int i; + if (FileVars_ParseInt(buffer, "enable-local-variables", &i) && (!i)) { + bDisableFileVar = true; + } + if (!bDisableFileVar) { + + if (FileVars_ParseInt(buffer, "tab-width", &i)) { + lpfv->iTabWidth = clampi(i, 1, 256); + lpfv->mask |= FV_TABWIDTH; + } + + if (FileVars_ParseInt(buffer, "c-basic-indent", &i)) { + lpfv->iIndentWidth = clampi(i, 0, 256); + lpfv->mask |= FV_INDENTWIDTH; + } + + if (FileVars_ParseInt(buffer, "indent-tabs-mode", &i)) { + lpfv->bTabsAsSpaces = (i) ? false : true; + lpfv->mask |= FV_TABSASSPACES; + } + + if (FileVars_ParseInt(buffer, "c-tab-always-indent", &i)) { + lpfv->bTabIndents = (i) ? true : false; + lpfv->mask |= FV_TABINDENTS; + } + + if (FileVars_ParseInt(buffer, "truncate-lines", &i)) { + lpfv->bWordWrap = (i) ? false : true; + lpfv->mask |= FV_WORDWRAP; + } + + if (FileVars_ParseInt(buffer, "fill-column", &i)) { + lpfv->iLongLinesLimit = clampi(i, 0, LONG_LINES_MARKER_LIMIT); + lpfv->mask |= FV_LONGLINESLIMIT; + } + } + } + + // Unicode Sig + bool const bHasSignature = IsUTF8Signature(buffer) || Has_UTF16_LE_BOM(buffer, cch) || Has_UTF16_BE_BOM(buffer, cch); + + if (!bHasSignature && !Settings.NoEncodingTags && !bDisableFileVar) { + + if (FileVars_ParseStr(buffer, "encoding", lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding))) + lpfv->mask |= FV_ENCODING; + else if (FileVars_ParseStr(buffer, "charset", lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding))) + lpfv->mask |= FV_ENCODING; + else if (FileVars_ParseStr(buffer, "coding", lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding))) + lpfv->mask |= FV_ENCODING; + } + if (lpfv->mask & FV_ENCODING) { + lpfv->iEncoding = Encoding_MatchA(lpfv->tchEncoding); + } + + if (!Flags.NoFileVariables && !bDisableFileVar) { + if (FileVars_ParseStr(buffer, "mode", lpfv->tchMode, COUNTOF(lpfv->tchMode))) + lpfv->mask |= FV_MODE; + } +} + +//============================================================================= +// +// FileVars_Init() +// +extern "C" bool FileVars_Init(const char* lpData, size_t cbData, LPFILEVARS lpfv) +{ + ZeroMemory(lpfv, sizeof(FILEVARS)); + lpfv->bTabIndents = Settings.TabIndents; + lpfv->bTabsAsSpaces = Settings.TabsAsSpaces; + lpfv->bWordWrap = Settings.WordWrap; + lpfv->iTabWidth = Settings.TabWidth; + lpfv->iIndentWidth = Settings.IndentWidth; + lpfv->iLongLinesLimit = Settings.LongLinesLimit; + lpfv->iEncoding = Settings.DefaultEncoding; + + if ((Flags.NoFileVariables && Settings.NoEncodingTags) || !lpData || !cbData) { + return true; + } + + char tmpbuf[LARGE_BUFFER]; + size_t const cch = min_s(cbData + 1, COUNTOF(tmpbuf)); + + StringCchCopyNA(tmpbuf, COUNTOF(tmpbuf), lpData, cch); + _SetFileVars(tmpbuf, cch, lpfv); + + // if no file vars found, look at EOF + if ((lpfv->mask == 0) && (cbData > COUNTOF(tmpbuf))) { + StringCchCopyNA(tmpbuf, COUNTOF(tmpbuf), lpData + cbData - COUNTOF(tmpbuf) + 1, COUNTOF(tmpbuf)); + _SetFileVars(tmpbuf, cch, lpfv); + } + + return true; +} + + +//============================================================================= +// +// FileVars_Apply() +// +extern "C" bool FileVars_Apply(LPFILEVARS lpfv) { + + int const _iTabWidth = (lpfv->mask & FV_TABWIDTH) ? lpfv->iTabWidth : Settings.TabWidth; + SciCall_SetTabWidth(_iTabWidth); + + int const _iIndentWidth = (lpfv->mask & FV_INDENTWIDTH) ? lpfv->iIndentWidth : ((lpfv->mask & FV_TABWIDTH) ? 0 : Settings.IndentWidth); + SciCall_SetIndent(_iIndentWidth); + + bool const _bTabsAsSpaces = (lpfv->mask & FV_TABSASSPACES) ? lpfv->bTabsAsSpaces : Settings.TabsAsSpaces; + SciCall_SetUseTabs(!_bTabsAsSpaces); + + bool const _bTabIndents = (lpfv->mask & FV_TABINDENTS) ? lpfv->bTabIndents : Settings.TabIndents; + SciCall_SetTabIndents(_bTabIndents); + SciCall_SetBackSpaceUnIndents(Settings.BackspaceUnindents); + + bool const _bWordWrap = (lpfv->mask & FV_WORDWRAP) ? lpfv->bWordWrap : Settings.WordWrap; + int const _iWrapMode = _bWordWrap ? ((Settings.WordWrapMode == 0) ? SC_WRAP_WHITESPACE : SC_WRAP_CHAR) : SC_WRAP_NONE; + SciCall_SetWrapMode(_iWrapMode); + + int const _iLongLinesLimit = (lpfv->mask & FV_LONGLINESLIMIT) ? lpfv->iLongLinesLimit : Settings.LongLinesLimit; + SciCall_SetEdgeColumn(_iLongLinesLimit); + Globals.iWrapCol = _iLongLinesLimit; + + return true; +} + + +//============================================================================= +// +// FileVars_ParseInt() +// +extern "C" bool FileVars_ParseInt(char* pszData, char* pszName, int* piValue) { + + char* pvStart = StrStrIA(pszData, pszName); + while (pvStart) { + char chPrev = (pvStart > pszData) ? *(pvStart - 1) : 0; + if (!IsCharAlphaNumericA(chPrev) && chPrev != '-' && chPrev != '_') { + pvStart += StringCchLenA(pszName, 0); + while (*pvStart == ' ') { + pvStart++; + } + if (*pvStart == ':' || *pvStart == '=') { break; } + } + else { + pvStart += StringCchLenA(pszName, 0); + } + pvStart = StrStrIA(pvStart, pszName); // next + } + + if (pvStart) { + + while (*pvStart && StrChrIA(":=\"' \t", *pvStart)) { + pvStart++; + } + char tch[32] = { L'\0' }; + StringCchCopyNA(tch, COUNTOF(tch), pvStart, COUNTOF(tch)); + + char* pvEnd = tch; + while (*pvEnd && IsCharAlphaNumericA(*pvEnd)) { + pvEnd++; + } + *pvEnd = 0; + StrTrimA(tch, " \t:=\"'"); + + int itok = sscanf_s(tch, "%i", piValue); + if (itok == 1) { + return true; + } + if (tch[0] == 't') { + *piValue = 1; + return true; + } + if (tch[0] == 'n' || tch[0] == 'f') { + *piValue = 0; + return true; + } + } + return false; +} + + +//============================================================================= +// +// FileVars_ParseStr() +// +extern "C" bool FileVars_ParseStr(char* pszData, char* pszName, char* pszValue, int cchValue) { + + char* pvStart = StrStrIA(pszData, pszName); + while (pvStart) { + char chPrev = (pvStart > pszData) ? *(pvStart - 1) : 0; + if (!IsCharAlphaNumericA(chPrev) && chPrev != '-' && chPrev != '_') { + pvStart += StringCchLenA(pszName, 0); + while (*pvStart == ' ') { + pvStart++; + } + if (*pvStart == ':' || *pvStart == '=') { + break; + } + } + else { + pvStart += StringCchLenA(pszName, 0); + } + pvStart = StrStrIA(pvStart, pszName); // next + } + + if (pvStart) { + + bool bQuoted = false; + while (*pvStart && StrChrIA(":=\"' \t", *pvStart)) { + if (*pvStart == '\'' || *pvStart == '"') + bQuoted = true; + pvStart++; + } + + char tch[32] = { L'\0' }; + StringCchCopyNA(tch, COUNTOF(tch), pvStart, COUNTOF(tch)); + + char* pvEnd = tch; + while (*pvEnd && (IsCharAlphaNumericA(*pvEnd) || StrChrIA("+-/_", *pvEnd) || (bQuoted && *pvEnd == ' '))) { + pvEnd++; + } + *pvEnd = 0; + + StrTrimA(tch, " \t:=\"'"); + + StringCchCopyNA(pszValue, cchValue, tch, COUNTOF(tch)); + + return true; + } + return false; +} + + +//============================================================================= +// +// FileVars_IsUTF8() +// +extern "C" bool FileVars_IsUTF8(LPFILEVARS lpfv) { + if (lpfv->mask & FV_ENCODING) { + if (StringCchCompareNIA(lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding), "utf-8", CSTRLEN("utf-8")) == 0 || + StringCchCompareNIA(lpfv->tchEncoding, COUNTOF(lpfv->tchEncoding), "utf8", CSTRLEN("utf8")) == 0) + return true; + } + return false; +} + + +//============================================================================= +// +// FileVars_IsValidEncoding() +// +extern "C" bool FileVars_IsValidEncoding(LPFILEVARS lpfv) { + CPINFO cpi; + if (lpfv->mask & FV_ENCODING && Encoding_IsValidIdx(lpfv->iEncoding)) { + if ((Encoding_IsINTERNAL(lpfv->iEncoding)) || + (IsValidCodePage(Encoding_GetCodePage(lpfv->iEncoding)) && + GetCPInfo(Encoding_GetCodePage(lpfv->iEncoding), &cpi))) { + return true; + } + } + return false; +} + + +//============================================================================= +// +// FileVars_GetEncoding() +// +extern "C" cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv) +{ + if (lpfv->mask & FV_ENCODING) { + return(lpfv->iEncoding); + } + return CPI_NONE; +} + +//============================================================================= +//============================================================================= + + +//============================================================================= +// +// GetFileEncoding() +// +extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, const size_t cbData, + bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection) +{ + // -------------------------------------------------------------------------- + // Encoding Detection + // -------------------------------------------------------------------------- + + ENC_DET_T encDetRes = { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, false, false, false, false, false }; + + // assume current code-page or default encoding (if forced) + cpi_enc_t const iAnalyzeFallback = Settings.UseDefaultForFileEncoding ? Settings.DefaultEncoding : CPI_ANSI_DEFAULT; + + // --- 1st check for force encodings --- + LPCWSTR lpszExt = PathFindExtension(pszFile); + bool const bNfoDizDetected = (lpszExt && !(StringCchCompareXI(lpszExt, L".nfo") && StringCchCompareXI(lpszExt, L".diz"))); + + encDetRes.forcedEncoding = Globals.bForceReLoadAsUTF8 ? CPI_UTF8 : + ((Settings.LoadNFOasOEM && bNfoDizDetected) ? Globals.DOSEncoding : Encoding_SrcCmdLn(CPI_GET)); + +#define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetRes.forcedEncoding)) + + // --- 2nd Use Encoding Analysis if applicable + + size_t const cbNbytes4Analysis = (cbData < 200000L) ? cbData : 200000L; + + float confidence = 0.0f; + encDetRes.analyzedEncoding = iAnalyzeFallback; + + if (!IS_ENC_ENFORCED() || bForceEncDetection) + { + encDetRes.analyzedEncoding = Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &confidence, iAnalyzeFallback); + + if (bForceEncDetection && !Encoding_IsNONE(encDetRes.analyzedEncoding)) { + encDetRes.forcedEncoding = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) ? CPI_ANSI_DEFAULT : encDetRes.analyzedEncoding; // no bIsReliable check (forced unreliable detection) + } + } + + // ------------------------------------------------------ + + if (!IS_ENC_ENFORCED()) + { + bool const bIsUnicode = Encoding_IsUTF8(encDetRes.analyzedEncoding) || Encoding_IsUNICODE(encDetRes.analyzedEncoding); + + if (encDetRes.analyzedEncoding == CPI_NONE) + { + encDetRes.analyzedEncoding = iAnalyzeFallback; + confidence = Settings2.AnalyzeReliableConfidenceLevel; + } + else if (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) { + encDetRes.analyzedEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; + confidence = 1.0; + } + else { + if ((bSkipUTFDetection && bIsUnicode) || (bSkipANSICPDetection && !bIsUnicode)) { + encDetRes.analyzedEncoding = CPI_NONE; + confidence = 0.0; + } + } + } + + encDetRes.bIsAnalysisReliable = (confidence >= Settings2.AnalyzeReliableConfidenceLevel); + + // -------------------------------------------------------------------------- + + // --- 3rd Unicode Checks + + // choose best encoding guess + cpi_enc_t const iFileEncWeak = Encoding_SrcWeak(CPI_GET); + + // set Preferred Encoding + encDetRes.preferredEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; + + if (IS_ENC_ENFORCED()) { + encDetRes.preferredEncoding = encDetRes.forcedEncoding; + } + else if (!Encoding_IsNONE(iFileEncWeak)) { + encDetRes.preferredEncoding = iFileEncWeak; + } + else if (!Encoding_IsNONE(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) { + encDetRes.preferredEncoding = encDetRes.analyzedEncoding; + } + else if (Encoding_IsNONE(encDetRes.preferredEncoding)) { + encDetRes.preferredEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; + } + + // -------------------------------------------------------------------------- + + encDetRes.bIsUTF8Sig = ((cbData >= 3) ? IsUTF8Signature(lpData) : false); + encDetRes.bIsUnicodeAnalyzed = ((Encoding_IsUNICODE(encDetRes.analyzedEncoding) && encDetRes.bIsAnalysisReliable) && !IS_ENC_ENFORCED() && !bSkipUTFDetection && !encDetRes.bIsUTF8Sig); + encDetRes.unicodeEncoding = bSkipUTFDetection ? CPI_NONE : GetUnicodeEncoding(lpData, cbData, &(encDetRes.bHasBOM), &(encDetRes.bIsReverse)); + + if (Encoding_IsNONE(encDetRes.unicodeEncoding)) + { + bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData); + bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData); + + if ((encDetRes.forcedEncoding == CPI_UNICODE) || bBOM_LE) { + encDetRes.bHasBOM = bBOM_LE; + encDetRes.bIsReverse = false; + } + else if ((encDetRes.forcedEncoding == CPI_UNICODEBE) || bBOM_BE) { + encDetRes.bHasBOM = bBOM_BE; + encDetRes.bIsReverse = true; + } + } + + if (!IS_ENC_ENFORCED()) + { + FileVars_Init(lpData, cbData, &Globals.fvCurFile); + + encDetRes.fileVarEncoding = (FileVars_IsValidEncoding(&Globals.fvCurFile) && !Settings.NoEncodingTags) ? + FileVars_GetEncoding(&Globals.fvCurFile) : CPI_NONE; + + // force file vars ? + if (!Encoding_IsNONE(encDetRes.fileVarEncoding) && (Globals.fvCurFile.mask & FV_ENCODING)) { + encDetRes.forcedEncoding = encDetRes.preferredEncoding = encDetRes.fileVarEncoding; + } + } + + return encDetRes; +} + From ec1682eed9f230fa1d84ce91e03c3917da9987a2 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Sun, 17 Nov 2019 19:39:06 +0100 Subject: [PATCH 2/2] + rfc: refactoring: Encoding Detection --- src/Edit.c | 130 +++++++++++------------------- src/Encoding.c | 18 ++--- src/Encoding.h | 14 ++-- src/EncodingDetection.cpp | 163 ++++++++++++++++++++------------------ src/Helpers.c | 8 +- src/Notepad3.c | 64 +++++++-------- src/Notepad3.h | 1 - src/TypeDefs.h | 1 - 8 files changed, 183 insertions(+), 216 deletions(-) diff --git a/src/Edit.c b/src/Edit.c index 81636c419..0aee17035 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -975,8 +975,7 @@ bool EditLoadFile( Globals.dwLastError = GetLastError(); if (hFile == INVALID_HANDLE_VALUE) { - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); + Encoding_Forced(CPI_NONE); return false; } @@ -993,8 +992,7 @@ bool EditLoadFile( // refuse to handle file InfoBoxLng(MB_ICONERROR, NULL, IDS_MUI_ERR_FILE_TOO_LARGE, (liFileSize.QuadPart / 1024LL / 1024LL)); CloseHandle(hFile); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); + Encoding_Forced(CPI_NONE); status->bFileTooBig = true; } return false; @@ -1009,8 +1007,7 @@ bool EditLoadFile( if ((dwFileSizeLimit != 0LL) && ((dwFileSizeLimit * 1024LL * 1024LL) < dwFileSize)) { if (InfoBoxLng(MB_YESNO, L"MsgFileSizeWarning", IDS_MUI_WARN_LOAD_BIG_FILE) != IDYES) { CloseHandle(hFile); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); + Encoding_Forced(CPI_NONE); status->bFileTooBig = true; return false; } @@ -1022,8 +1019,7 @@ bool EditLoadFile( INT_PTR const answer = InfoBoxLng(MB_YESNO, L"MsgFileUnknownExt", IDS_MUI_WARN_UNKNOWN_EXT, PathFindFileName(pszFile)); if (!((IDOK == answer) || (IDYES == answer))) { CloseHandle(hFile); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); + Encoding_Forced(CPI_NONE); status->bUnknownExt = true; return false; } @@ -1034,8 +1030,7 @@ bool EditLoadFile( { Globals.dwLastError = GetLastError(); CloseHandle(hFile); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); + Encoding_Forced(CPI_NONE); status->bFileTooBig = true; return false; } @@ -1051,9 +1046,8 @@ bool EditLoadFile( status->iEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; EditSetNewText(hwnd, "", 0, bClearUndoHistory); SciCall_SetEOLMode(Settings.DefaultEOLMode); + Encoding_Forced(CPI_NONE); FreeMem(lpData); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); return true; } @@ -1063,9 +1057,8 @@ bool EditLoadFile( { bReadSuccess = (InfoBoxLng(MB_OKCANCEL, L"MsgNoOrWrongPassphrase", IDS_MUI_NOPASS) == IDOK); if (!bReadSuccess) { + Encoding_Forced(CPI_NONE); FreeMem(lpData); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); return true; } else { @@ -1073,9 +1066,8 @@ bool EditLoadFile( } } if (!bReadSuccess) { + Encoding_Forced(CPI_NONE); FreeMem(lpData); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); return false; } @@ -1110,26 +1102,28 @@ bool EditLoadFile( g_Encodings[FileVars_GetEncoding(&Globals.fvCurFile)].wchLabel); AppendAdditionalTitleInfo(wchBuf); } + + WCHAR wcBuf[128] = { L'\0' }; + StringCchPrintf(wcBuf, ARRAYSIZE(wcBuf), L" - OS-CP='%s'", g_Encodings[CPI_ANSI_DEFAULT].wchLabel); + AppendAdditionalTitleInfo(wcBuf); } + // -------------------------------------------------------------------------- + // === UNICODE ( UTF-16LE / UTF-16BE ) === // -------------------------------------------------------------------------- - bool const bIsUnicodeForced = Encoding_IsUNICODE(encDetection.forcedEncoding); - bool const bIsUnicodeDetected = !IS_ENC_ENFORCED() && (encDetection.bIsUnicodeAnalyzed || !Encoding_IsNONE(encDetection.unicodeEncoding)); + bool const bIsUnicodeDetected = !IS_ENC_ENFORCED() && Encoding_IsUNICODE(encDetection.unicodeAnalysis); - if (bIsUnicodeForced || bIsUnicodeDetected) + if (Encoding_IsUNICODE(encDetection.Encoding) || bIsUnicodeDetected) { - // === UNICODE === - if (encDetection.bIsReverse) - { - SwabEx(lpData, lpData, cbData); - status->iEncoding = (encDetection.bHasBOM ? CPI_UNICODEBEBOM : CPI_UNICODEBE); - } - else { - status->iEncoding = (encDetection.bHasBOM ? CPI_UNICODEBOM : CPI_UNICODE); - } + // ---------------------------------------------------------------------- + status->iEncoding = encDetection.bHasBOM ? (encDetection.bIsReverse ? CPI_UNICODEBEBOM : CPI_UNICODEBOM) : + (encDetection.bIsReverse ? CPI_UNICODEBE : CPI_UNICODE); + // ---------------------------------------------------------------------- - char* lpDataUTF8 = AllocMem((cbData * 3) + 2, HEAP_ZERO_MEMORY); + if (encDetection.bIsReverse) { SwabEx(lpData, lpData, cbData); } + + char* const lpDataUTF8 = AllocMem((cbData * 3) + 2, HEAP_ZERO_MEMORY); ptrdiff_t convCnt = WideCharToMultiByteEx(Encoding_SciCP, 0, (encDetection.bHasBOM ? (LPWSTR)lpData + 1 : (LPWSTR)lpData), (encDetection.bHasBOM ? (cbData / sizeof(WCHAR)) : (cbData / sizeof(WCHAR) + 1)), lpDataUTF8, SizeOfMem(lpDataUTF8), NULL, NULL); @@ -1140,20 +1134,25 @@ bool EditLoadFile( status->bUnicodeErr = true; } - FreeMem(lpData); FileVars_Init(lpDataUTF8, convCnt - 1, &Globals.fvCurFile); EditSetNewText(hwnd, lpDataUTF8, convCnt - 1, bClearUndoHistory); EditDetectEOLMode(lpDataUTF8, convCnt - 1, status); FreeMem(lpDataUTF8); - } - else { // === ALL OTHERS === + } + else // === ALL OTHERS === + { + // ---------------------------------------------------------------------- + status->iEncoding = encDetection.Encoding; + // ---------------------------------------------------------------------- + + UINT const uCodePage = Encoding_GetCodePage(status->iEncoding); // === UTF-8 ? === bool const bValidUTF8 = IsValidUTF8(lpData, cbData); - bool const bForcedUTF8 = Encoding_IsUTF8(encDetection.forcedEncoding); + bool const bForcedUTF8 = Encoding_IsUTF8(encDetection.forcedEncoding);// ~ don't || encDetection.bIsUTF8Sig here ! bool const bAnalysisUTF8 = Encoding_IsUTF8(encDetection.analyzedEncoding) && encDetection.bIsAnalysisReliable; - bool const bSoftHintUTF8 = Encoding_IsUTF8(encDetection.analyzedEncoding) && Encoding_IsUTF8(encDetection.preferredEncoding); // non-reliable analysis = soft-hint + bool const bSoftHintUTF8 = Encoding_IsUTF8(encDetection.analyzedEncoding) && Encoding_IsUTF8(encDetection.Encoding); // non-reliable analysis = soft-hint bool const bRejectUTF8 = (IS_ENC_ENFORCED() && !bForcedUTF8) || !bValidUTF8 || (!encDetection.bIsUTF8Sig && bSkipUTFDetection); @@ -1169,20 +1168,20 @@ bool EditLoadFile( status->iEncoding = CPI_UTF8; EditDetectEOLMode(lpData, cbData, status); } - FreeMem(lpData); + } + else if ((uCodePage == CP_UTF7) && IsValidUTF7(lpData, cbData)) + { + // load UTF-7/ASCII(7-bit) as ANSI/UTF-8 + EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory); + status->iEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; + EditDetectEOLMode(lpData, cbData, status); } else { // === ALL OTHER NON UTF-8 === - // ---------------------------------------------------------------------- - status->iEncoding = Encoding_IsValid(encDetection.preferredEncoding) ? encDetection.preferredEncoding : CPI_ANSI_DEFAULT; - // ---------------------------------------------------------------------- - - if (((Encoding_GetCodePage(status->iEncoding) != CP_UTF7) && Encoding_IsEXTERNAL_8BIT(status->iEncoding)) || - ((Encoding_GetCodePage(status->iEncoding) == CP_UTF7) && IsValidUTF7(lpData, cbData))) { - - UINT uCodePage = Encoding_GetCodePage(status->iEncoding); - + if (Encoding_IsEXTERNAL_8BIT(status->iEncoding)) + { LPWSTR lpDataWide = AllocMem(cbData * 2 + 16, HEAP_ZERO_MEMORY); + ptrdiff_t const cbDataWide = MultiByteToWideCharEx(uCodePage, 0, lpData, cbData, lpDataWide, (SizeOfMem(lpDataWide) / sizeof(WCHAR))); if (cbDataWide != 0) { @@ -1191,45 +1190,35 @@ bool EditLoadFile( cbData = WideCharToMultiByteEx(Encoding_SciCP, 0, lpDataWide, cbDataWide, lpData, SizeOfMem(lpData), NULL, NULL); if (cbData != 0) { - FreeMem(lpDataWide); EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory); EditDetectEOLMode(lpData, cbData, status); - FreeMem(lpData); + FreeMem(lpDataWide); } else { + Encoding_Forced(CPI_NONE); FreeMem(lpDataWide); FreeMem(lpData); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); return false; } } else { + Encoding_Forced(CPI_NONE); FreeMem(lpDataWide); FreeMem(lpData); - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); return false; } } else { EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory); EditDetectEOLMode(lpData, cbData, status); - FreeMem(lpData); } } } - Encoding_SrcCmdLn(CPI_NONE); - Encoding_SrcWeak(CPI_NONE); - SciCall_SetCharacterCategoryOptimization(Encoding_IsCJK(encDetection.analyzedEncoding) ? 0x10000 : 0x1000); - if (Flags.bDevDebugMode) { - WCHAR wcBuf[128] = { L'\0' }; - StringCchPrintf(wcBuf, ARRAYSIZE(wcBuf), L" - OS-CP='%s'", g_Encodings[CPI_ANSI_DEFAULT].wchLabel); - AppendAdditionalTitleInfo(wcBuf); - } + Encoding_Forced(CPI_NONE); + FreeMem(lpData); return true; } @@ -1315,29 +1304,6 @@ bool EditSaveFile( lpData = AllocMem(cbData + 4, HEAP_ZERO_MEMORY); //fix: +bom cbData = SciCall_GetText((cbData+1), lpData); - // FIXME: move checks in front of disk file access - // Msg if file tag encoding does not correspond to BOM - /*if ((g_Encodings[iEncoding].uFlags & NCP_UNICODE) == 0 && (g_Encodings[iEncoding].uFlags & NCP_UTF8_SIGN) == 0) { - bool bEncodingMismatch = true; - FILEVARS fv; - FileVars_Init(lpData,cbData,&fv); - if (fv.mask & FV_ENCODING) { - int iAltEncoding; - if (FileVars_IsValidEncoding(&fv)) { - iAltEncoding = FileVars_GetEncoding(&fv); - if (iAltEncoding == iEncoding) - bEncodingMismatch = false; - else if ((g_Encodings[iAltEncoding].uFlags & NCP_UTF8) && (g_Encodings[iEncoding].uFlags & NCP_UTF8)) - bEncodingMismatch = false; - } - if (bEncodingMismatch) { - InfoBoxLng(MB_OK,L"MsgEncodingMismatch",IDS_MUI_ENCODINGMISMATCH, - g_Encodings[iAltEncoding].wchLabel, - g_Encodings[iEncoding].wchLabel); - } - } - }*/ - if (Encoding_IsUNICODE(status->iEncoding)) // UTF-16LE/BE_(BOM) { SetEndOfFile(hFile); diff --git a/src/Encoding.c b/src/Encoding.c index 46011841f..6cdefb30c 100644 --- a/src/Encoding.c +++ b/src/Encoding.c @@ -55,17 +55,17 @@ cpi_enc_t Encoding_Current(cpi_enc_t iEncoding) // ============================================================================ -cpi_enc_t Encoding_SrcCmdLn(cpi_enc_t iSrcEncoding) +cpi_enc_t Encoding_Forced(cpi_enc_t iEncoding) { static cpi_enc_t SourceEncoding = CPI_NONE; - if (iSrcEncoding >= 0) { - if (Encoding_IsValid(iSrcEncoding)) - SourceEncoding = iSrcEncoding; + if (iEncoding >= 0) { + if (Encoding_IsValid(iEncoding)) + SourceEncoding = iEncoding; else SourceEncoding = CPI_ANSI_DEFAULT; } - else if (iSrcEncoding == CPI_NONE) { + else if (iEncoding == CPI_NONE) { SourceEncoding = CPI_NONE; } return SourceEncoding; @@ -211,17 +211,17 @@ int Encoding_MapIniSetting(bool bLoad, int iSetting) // ============================================================================ -cpi_enc_t Encoding_MapUnicode(cpi_enc_t iUni) +cpi_enc_t Encoding_MapSignature(cpi_enc_t iUni) { + if (iUni == CPI_UTF8SIGN) { + return CPI_UTF8; + } if (iUni == CPI_UNICODEBOM) { return CPI_UNICODE; } if (iUni == CPI_UNICODEBEBOM) { return CPI_UNICODEBE; } - if (iUni == CPI_UTF8SIGN) { - return CPI_UTF8; - } return iUni; } // ============================================================================ diff --git a/src/Encoding.h b/src/Encoding.h index 4d4ea90d4..fea39d3f9 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -68,19 +68,19 @@ typedef struct _np2encoding { } NP2ENCODING; cpi_enc_t Encoding_Current(cpi_enc_t iEncoding); // getter/setter -cpi_enc_t Encoding_SrcCmdLn(cpi_enc_t iSrcEncoding); // getter/setter +cpi_enc_t Encoding_Forced(cpi_enc_t iEncoding); // getter/setter cpi_enc_t Encoding_SrcWeak(cpi_enc_t iSrcWeakEnc); // getter/setter bool Encoding_HasChanged(cpi_enc_t iOriginalEncoding); // query/setter void Encoding_InitDefaults(); int Encoding_MapIniSetting(bool, int iSetting); -cpi_enc_t Encoding_MapUnicode(cpi_enc_t iUni); void Encoding_SetLabel(cpi_enc_t iEncoding); cpi_enc_t Encoding_MatchW(LPCWSTR pwszTest); cpi_enc_t Encoding_MatchA(const char* pchTest); bool Encoding_IsValid(cpi_enc_t iTestEncoding); cpi_enc_t Encoding_GetByCodePage(const UINT codepage); +cpi_enc_t Encoding_MapSignature(cpi_enc_t iUni); void Encoding_AddToListView(HWND hwnd, cpi_enc_t idSel, bool); bool Encoding_GetFromListView(HWND hwnd, cpi_enc_t* pidEncoding); void Encoding_AddToComboboxEx(HWND hwnd, cpi_enc_t idSel, bool); @@ -168,14 +168,14 @@ cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv); typedef struct _enc_det_t { + cpi_enc_t Encoding; // final detection result + // statistic: cpi_enc_t forcedEncoding; - cpi_enc_t analyzedEncoding; - cpi_enc_t preferredEncoding; - cpi_enc_t unicodeEncoding; cpi_enc_t fileVarEncoding; - + cpi_enc_t analyzedEncoding; + cpi_enc_t unicodeAnalysis; + // flags: bool bIsAnalysisReliable; - bool bIsUnicodeAnalyzed; bool bHasBOM; bool bIsReverse; bool bIsUTF8Sig; diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 093f6fbf8..b2d6de2e9 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -1025,9 +1025,7 @@ extern "C" bool FileVars_Init(const char* lpData, size_t cbData, LPFILEVARS lpfv lpfv->iLongLinesLimit = Settings.LongLinesLimit; lpfv->iEncoding = Settings.DefaultEncoding; - if ((Flags.NoFileVariables && Settings.NoEncodingTags) || !lpData || !cbData) { - return true; - } + if ((Flags.NoFileVariables && Settings.NoEncodingTags) || !lpData || !cbData) { return true; } char tmpbuf[LARGE_BUFFER]; size_t const cch = min_s(cbData + 1, COUNTOF(tmpbuf)); @@ -1236,46 +1234,93 @@ extern "C" cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv) extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, const size_t cbData, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection) { - // -------------------------------------------------------------------------- - // Encoding Detection - // -------------------------------------------------------------------------- - - ENC_DET_T encDetRes = { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, false, false, false, false, false }; + + ENC_DET_T encDetRes = { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, false, false, false, false }; - // assume current code-page or default encoding (if forced) - cpi_enc_t const iAnalyzeFallback = Settings.UseDefaultForFileEncoding ? Settings.DefaultEncoding : CPI_ANSI_DEFAULT; + FileVars_Init(lpData, cbData, &Globals.fvCurFile); + + bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData); + bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData); + encDetRes.bHasBOM = (bBOM_LE || bBOM_BE); + encDetRes.bIsReverse = bBOM_BE; + + encDetRes.bIsUTF8Sig = ((cbData >= 3) ? IsUTF8Signature(lpData) : false); // --- 1st check for force encodings --- LPCWSTR lpszExt = PathFindExtension(pszFile); bool const bNfoDizDetected = (lpszExt && !(StringCchCompareXI(lpszExt, L".nfo") && StringCchCompareXI(lpszExt, L".diz"))); - encDetRes.forcedEncoding = Globals.bForceReLoadAsUTF8 ? CPI_UTF8 : - ((Settings.LoadNFOasOEM && bNfoDizDetected) ? Globals.DOSEncoding : Encoding_SrcCmdLn(CPI_GET)); + #define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetRes.forcedEncoding)) -#define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetRes.forcedEncoding)) + encDetRes.forcedEncoding = (Settings.LoadNFOasOEM && bNfoDizDetected) ? Globals.DOSEncoding : Encoding_Forced(CPI_GET); + + if (!IS_ENC_ENFORCED()) + { + encDetRes.fileVarEncoding = (FileVars_IsValidEncoding(&Globals.fvCurFile)) ? FileVars_GetEncoding(&Globals.fvCurFile) : CPI_NONE; + // force file vars ? + if (Encoding_IsValid(encDetRes.fileVarEncoding) && (Globals.fvCurFile.mask & FV_ENCODING)) { + encDetRes.forcedEncoding = encDetRes.fileVarEncoding; + } + } // --- 2nd Use Encoding Analysis if applicable + cpi_enc_t const iAnalyzeFallback = Settings.UseDefaultForFileEncoding ? Settings.DefaultEncoding : CPI_ANSI_DEFAULT; + size_t const cbNbytes4Analysis = (cbData < 200000L) ? cbData : 200000L; float confidence = 0.0f; - encDetRes.analyzedEncoding = iAnalyzeFallback; if (!IS_ENC_ENFORCED() || bForceEncDetection) { - encDetRes.analyzedEncoding = Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &confidence, iAnalyzeFallback); + if (!bSkipANSICPDetection) + { + encDetRes.analyzedEncoding = Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &confidence, iAnalyzeFallback); + } - if (bForceEncDetection && !Encoding_IsNONE(encDetRes.analyzedEncoding)) { - encDetRes.forcedEncoding = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) ? CPI_ANSI_DEFAULT : encDetRes.analyzedEncoding; // no bIsReliable check (forced unreliable detection) + if (encDetRes.analyzedEncoding == CPI_NONE) + { + encDetRes.analyzedEncoding = iAnalyzeFallback; + confidence = Settings2.AnalyzeReliableConfidenceLevel; + } + + if (!bSkipUTFDetection) + { + encDetRes.unicodeAnalysis = GetUnicodeEncoding(lpData, cbData, &(encDetRes.bHasBOM), &(encDetRes.bIsReverse)); + + if (Encoding_IsNONE(encDetRes.unicodeAnalysis) && Encoding_IsUNICODE(encDetRes.analyzedEncoding)) + { + encDetRes.unicodeAnalysis = encDetRes.analyzedEncoding; + } + + //// check for UTF-32, can't handle + //if (encDetRes.bHasBOM && !bBOM_LE && !bBOM_BE) { + // encDetRes.unicodeAnalysis = CPI_NONE; + //} + //else if (encDetRes.bHasBOM && encDetRes.bIsReverse && !bBOM_BE) { + // encDetRes.unicodeAnalysis = CPI_NONE; + //} + //else if (encDetRes.bHasBOM && !encDetRes.bIsReverse && !bBOM_LE) { + // // must be UTF-32, can't handle + // encDetRes.unicodeAnalysis = CPI_NONE; + //} + } + + if (bForceEncDetection) { + if (Encoding_IsValid(encDetRes.analyzedEncoding)) { + // no bIsReliable check (forced unreliable detection) + encDetRes.forcedEncoding = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) ? CPI_ANSI_DEFAULT : encDetRes.analyzedEncoding; + } + else if (Encoding_IsValid(encDetRes.unicodeAnalysis)) { + encDetRes.forcedEncoding = encDetRes.unicodeAnalysis; + } } } - // ------------------------------------------------------ + //bool const bIsUTF8orUnicodeAnalysis = Encoding_IsUTF8(encDetRes.analyzedEncoding) || Encoding_IsUNICODE(encDetRes.analyzedEncoding); if (!IS_ENC_ENFORCED()) { - bool const bIsUnicode = Encoding_IsUTF8(encDetRes.analyzedEncoding) || Encoding_IsUNICODE(encDetRes.analyzedEncoding); - if (encDetRes.analyzedEncoding == CPI_NONE) { encDetRes.analyzedEncoding = iAnalyzeFallback; @@ -1285,72 +1330,38 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(LPWSTR pszFile, const char* lpData, encDetRes.analyzedEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; confidence = 1.0; } - else { - if ((bSkipUTFDetection && bIsUnicode) || (bSkipANSICPDetection && !bIsUnicode)) { - encDetRes.analyzedEncoding = CPI_NONE; - confidence = 0.0; - } - } } encDetRes.bIsAnalysisReliable = (confidence >= Settings2.AnalyzeReliableConfidenceLevel); // -------------------------------------------------------------------------- - - // --- 3rd Unicode Checks - - // choose best encoding guess - cpi_enc_t const iFileEncWeak = Encoding_SrcWeak(CPI_GET); - - // set Preferred Encoding - encDetRes.preferredEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; - - if (IS_ENC_ENFORCED()) { - encDetRes.preferredEncoding = encDetRes.forcedEncoding; - } - else if (!Encoding_IsNONE(iFileEncWeak)) { - encDetRes.preferredEncoding = iFileEncWeak; - } - else if (!Encoding_IsNONE(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) { - encDetRes.preferredEncoding = encDetRes.analyzedEncoding; - } - else if (Encoding_IsNONE(encDetRes.preferredEncoding)) { - encDetRes.preferredEncoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; - } - + // --- choose best encoding guess ---- // -------------------------------------------------------------------------- - encDetRes.bIsUTF8Sig = ((cbData >= 3) ? IsUTF8Signature(lpData) : false); - encDetRes.bIsUnicodeAnalyzed = ((Encoding_IsUNICODE(encDetRes.analyzedEncoding) && encDetRes.bIsAnalysisReliable) && !IS_ENC_ENFORCED() && !bSkipUTFDetection && !encDetRes.bIsUTF8Sig); - encDetRes.unicodeEncoding = bSkipUTFDetection ? CPI_NONE : GetUnicodeEncoding(lpData, cbData, &(encDetRes.bHasBOM), &(encDetRes.bIsReverse)); - - if (Encoding_IsNONE(encDetRes.unicodeEncoding)) - { - bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData); - bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData); + // init Preferred Encoding + encDetRes.Encoding = Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; - if ((encDetRes.forcedEncoding == CPI_UNICODE) || bBOM_LE) { - encDetRes.bHasBOM = bBOM_LE; - encDetRes.bIsReverse = false; - } - else if ((encDetRes.forcedEncoding == CPI_UNICODEBE) || bBOM_BE) { - encDetRes.bHasBOM = bBOM_BE; - encDetRes.bIsReverse = true; - } + if (IS_ENC_ENFORCED()) + { + encDetRes.Encoding = encDetRes.forcedEncoding; + } + else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) + { + encDetRes.Encoding = encDetRes.analyzedEncoding; + } + else if (encDetRes.bIsUTF8Sig) + { + encDetRes.Encoding = CPI_UTF8SIGN; + } + else if (bBOM_LE || bBOM_BE) { + encDetRes.Encoding = bBOM_LE ? CPI_UNICODEBOM : CPI_UNICODEBEBOM; + encDetRes.bIsReverse = bBOM_BE; + } + else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) { + encDetRes.Encoding = Encoding_SrcWeak(CPI_GET); } - if (!IS_ENC_ENFORCED()) - { - FileVars_Init(lpData, cbData, &Globals.fvCurFile); - - encDetRes.fileVarEncoding = (FileVars_IsValidEncoding(&Globals.fvCurFile) && !Settings.NoEncodingTags) ? - FileVars_GetEncoding(&Globals.fvCurFile) : CPI_NONE; - - // force file vars ? - if (!Encoding_IsNONE(encDetRes.fileVarEncoding) && (Globals.fvCurFile.mask & FV_ENCODING)) { - encDetRes.forcedEncoding = encDetRes.preferredEncoding = encDetRes.fileVarEncoding; - } - } + if (!Encoding_IsValid(encDetRes.Encoding)) { encDetRes.Encoding = CPI_ANSI_DEFAULT; } return encDetRes; } diff --git a/src/Helpers.c b/src/Helpers.c index 5539a97ee..2aada3670 100644 --- a/src/Helpers.c +++ b/src/Helpers.c @@ -1671,14 +1671,14 @@ ptrdiff_t WideCharToMultiByteEx( ptrdiff_t outBufSiz = cbMultiByte; ptrdiff_t bytesConv = 0LL; - static ptrdiff_t const maxBufSize = (INT_MAX - 128); + static ptrdiff_t const maxBufSize = (INT_MAX - 1); BOOL bIsDefCharUse = FALSE; while ((inBufCnt > 0LL) || (inBufCnt == -1LL)) { int const cnt = (inBufCnt > maxBufSize) ? (int)maxBufSize : ((inBufCnt > 0LL) ? (int)inBufCnt : -1); - int const siz = (outBufSiz > maxBufSize) ? (int)maxBufSize : (int)outBufSiz; + int const siz = (outBufSiz > (ptrdiff_t)INT_MAX) ? INT_MAX : (int)outBufSiz; int const bytes = WideCharToMultiByte(CodePage, dwFlags, inPtr, cnt, outPtr, siz, lpDefaultChar, lpUsedDefaultChar); if (bytes == 0) { break; } @@ -1716,12 +1716,12 @@ ptrdiff_t MultiByteToWideCharEx( ptrdiff_t outBufCnt = cchWideChar; ptrdiff_t wcharConv = 0LL; - static ptrdiff_t const maxBufSize = (INT_MAX - 128); + static ptrdiff_t const maxBufSize = (INT_MAX - 1); while ((inBufSiz > 0LL) || (inBufSiz == -1LL)) { int const siz = (inBufSiz > maxBufSize) ? (int)maxBufSize : ((inBufSiz > 0LL) ? (int)inBufSiz : -1); - int const cnt = (outBufCnt > maxBufSize) ? (int)maxBufSize : (int)outBufCnt; + int const cnt = (outBufCnt > (ptrdiff_t)INT_MAX) ? INT_MAX : (int)outBufCnt; int const wchars = MultiByteToWideChar(CodePage, dwFlags, inPtr, siz, outPtr, cnt); if (wchars == 0) { break; } diff --git a/src/Notepad3.c b/src/Notepad3.c index 4545f156e..3178fdde9 100644 --- a/src/Notepad3.c +++ b/src/Notepad3.c @@ -535,10 +535,9 @@ static void CALLBACK MQ_ExecuteNext(HWND hwnd, UINT uMsg, UINT_PTR idEvent, DWOR // // CommandLine Parsing Flags // -static LPWSTR s_lpEncodingArg = NULL; -static LPWSTR s_lpMatchArg = NULL; static LPWSTR s_lpSchemeArg = NULL; static LPWSTR s_lpOrigFileArg = NULL; +static LPWSTR s_lpMatchArg = NULL; static WCHAR s_lpFileArg[MAX_PATH+1]; static cpi_enc_t s_flagSetEncoding = CPI_NONE; @@ -639,7 +638,6 @@ static void _InitGlobals() Globals.flagShellUseSystemMRU = 0; Globals.flagPrintFileAndLeave = 0; - Globals.bForceReLoadAsUTF8 = false; Globals.DOSEncoding = CPI_NONE; Globals.bZeroBasedColumnIndex = false; Globals.bZeroBasedCharacterCount = false; @@ -1217,9 +1215,7 @@ HWND InitInstance(HINSTANCE hInstance,LPCWSTR pszCmdLine,int nCmdShow) } // Source Encoding - if (s_lpEncodingArg) { - Encoding_SrcCmdLn(Encoding_MatchW(s_lpEncodingArg)); - } + Encoding_Forced(s_flagSetEncoding); // Pathname parameter if (s_IsThisAnElevatedRelaunch || (StrIsNotEmpty(s_lpFileArg) /*&& !g_flagNewFromClipboard*/)) @@ -1297,14 +1293,14 @@ HWND InitInstance(HINSTANCE hInstance,LPCWSTR pszCmdLine,int nCmdShow) } } else { - if (Encoding_SrcCmdLn(CPI_GET) != CPI_NONE) { - Encoding_Current(Encoding_SrcCmdLn(CPI_GET)); - Encoding_HasChanged(Encoding_SrcCmdLn(CPI_GET)); + if (Encoding_IsValid(Encoding_Forced(CPI_GET))) { + Encoding_Current(Encoding_Forced(CPI_GET)); + Encoding_HasChanged(Encoding_Forced(CPI_GET)); } } // reset - Encoding_SrcCmdLn(CPI_NONE); + Encoding_Forced(CPI_NONE); s_flagQuietCreate = false; s_flagKeepTitleExcerpt = false; @@ -2821,7 +2817,7 @@ LRESULT MsgCopyData(HWND hwnd, WPARAM wParam, LPARAM lParam) if (params->flagFileSpecified) { bool bOpened = false; - Encoding_SrcCmdLn(params->iSrcEncoding); + Encoding_Forced(params->flagSetEncoding); if (PathIsDirectory(¶ms->wchData)) { WCHAR tchFile[MAX_PATH] = { L'\0' }; @@ -2876,7 +2872,7 @@ LRESULT MsgCopyData(HWND hwnd, WPARAM wParam, LPARAM lParam) } } // reset - Encoding_SrcCmdLn(CPI_NONE); + Encoding_Forced(CPI_NONE); } if (params->flagJumpTo) { @@ -3930,7 +3926,7 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) { if (StrIsNotEmpty(Globals.CurrentFile)) { - cpi_enc_t iNewEncoding = Encoding_MapUnicode(Encoding_Current(CPI_GET)); + cpi_enc_t iNewEncoding = Encoding_MapSignature(Encoding_Current(CPI_GET)); if (IsSaveNeeded(ISN_GET)) { INT_PTR const answer = InfoBoxLng(MB_YESNO | MB_ICONQUESTION, NULL, IDS_MUI_ASK_RECODE); @@ -3942,7 +3938,7 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) if (RecodeDlg(hwnd,&iNewEncoding)) { StringCchCopy(tchMaxPathBuffer,COUNTOF(tchMaxPathBuffer),Globals.CurrentFile); - Encoding_SrcCmdLn(iNewEncoding); + Encoding_Forced(iNewEncoding); FileLoad(true,false,true,false,true, false, tchMaxPathBuffer); } } @@ -5863,7 +5859,7 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) case CMD_RECODEDEFAULT: { if (StrIsNotEmpty(Globals.CurrentFile)) { - Encoding_SrcCmdLn(Encoding_MapUnicode(Settings.DefaultEncoding)); + Encoding_Forced(Settings.DefaultEncoding); StringCchCopy(tchMaxPathBuffer,COUNTOF(tchMaxPathBuffer),Globals.CurrentFile); FileLoad(false,false,true,true,true,false,tchMaxPathBuffer); } @@ -5874,7 +5870,7 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) case CMD_RECODEANSI: { if (StrIsNotEmpty(Globals.CurrentFile)) { - Encoding_SrcCmdLn(CPI_ANSI_DEFAULT); + Encoding_Forced(CPI_ANSI_DEFAULT); StringCchCopy(tchMaxPathBuffer,COUNTOF(tchMaxPathBuffer),Globals.CurrentFile); FileLoad(false,false,true,true,Settings.SkipANSICodePageDetection,false,tchMaxPathBuffer); } @@ -5885,7 +5881,7 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) case CMD_RECODEOEM: { if (StrIsNotEmpty(Globals.CurrentFile)) { - Encoding_SrcCmdLn(CPI_OEM); + Encoding_Forced(CPI_OEM); StringCchCopy(tchMaxPathBuffer,COUNTOF(tchMaxPathBuffer),Globals.CurrentFile); FileLoad(false,false,true,true,true,false,tchMaxPathBuffer); } @@ -5896,7 +5892,7 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) case CMD_RECODEGB18030: { if (StrIsNotEmpty(Globals.CurrentFile)) { - Encoding_SrcCmdLn(Encoding_GetByCodePage(54936)); // GB18030 + Encoding_Forced(Encoding_GetByCodePage(54936)); // GB18030 StringCchCopy(tchMaxPathBuffer, COUNTOF(tchMaxPathBuffer), Globals.CurrentFile); FileLoad(false, false, true, true, true, false, tchMaxPathBuffer); } @@ -5908,10 +5904,9 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) { if (StrIsNotEmpty(Globals.CurrentFile)) { - Globals.bForceReLoadAsUTF8 = true; + Encoding_Forced(CPI_UTF8); StringCchCopy(tchMaxPathBuffer,COUNTOF(tchMaxPathBuffer),Globals.CurrentFile); FileLoad(false, false, true, true, true, false, tchMaxPathBuffer); - Globals.bForceReLoadAsUTF8 = false; } } break; @@ -5921,7 +5916,7 @@ LRESULT MsgCommand(HWND hwnd, UINT umsg, WPARAM wParam, LPARAM lParam) { if (StrIsNotEmpty(Globals.CurrentFile)) { - Globals.bForceReLoadAsUTF8 = false; + Encoding_Forced(CPI_NONE); StringCchCopy(tchMaxPathBuffer, COUNTOF(tchMaxPathBuffer), Globals.CurrentFile); FileLoad(false, false, true, false, false, true, tchMaxPathBuffer); } @@ -7659,6 +7654,8 @@ void ParseCommandLine() bool bContinue = true; bool bIsFileArg = false; + s_flagSetEncoding = CPI_NONE; + while (bContinue && ExtractFirstArgument(lp3, lp1, lp2, (int)len)) { // options if (lp1[1] == L'\0') { @@ -7697,13 +7694,9 @@ void ParseCommandLine() s_flagSetEncoding = CPI_UTF8SIGN; } // maybe parsed encoding - else if (encoding != CPI_NONE) { - if (s_lpEncodingArg) { LocalFree(s_lpEncodingArg); } - s_lpEncodingArg = StrDup(lp1); + else if (Encoding_IsValid(encoding)) { s_flagSetEncoding = encoding; } - - // EOL Mode else if (StringCchCompareXI(lp1, L"CRLF") == 0 || StringCchCompareXI(lp1, L"CR+LF") == 0) { s_flagSetEOLMode = IDM_LINEENDINGS_CRLF - IDM_LINEENDINGS_CRLF + 1; @@ -7872,8 +7865,7 @@ void ParseCommandLine() case L'E': if (ExtractFirstArgument(lp2, lp1, lp2, (int)len)) { - if (s_lpEncodingArg) { LocalFree(s_lpEncodingArg); } - s_lpEncodingArg = StrDup(lp1); + s_flagSetEncoding = Encoding_MatchW(lp1); } break; @@ -9571,12 +9563,13 @@ bool FileLoad(bool bDontSave, bool bNew, bool bReload, EditSetNewText(Globals.hwndEdit,"",0, true); Style_SetDefaultLexer(Globals.hwndEdit); SciCall_SetEOLMode(Settings.DefaultEOLMode); - if (Encoding_SrcCmdLn(CPI_GET) != CPI_NONE) { - fioStatus.iEncoding = Encoding_SrcCmdLn(CPI_GET); + if (Encoding_IsValid(Encoding_Forced(CPI_GET))) { + fioStatus.iEncoding = Encoding_Forced(CPI_GET); Encoding_Current(fioStatus.iEncoding); Encoding_HasChanged(fioStatus.iEncoding); } else { + fioStatus.iEncoding = Settings.DefaultEncoding; Encoding_Current(Settings.DefaultEncoding); Encoding_HasChanged(Settings.DefaultEncoding); } @@ -9593,8 +9586,9 @@ bool FileLoad(bool bDontSave, bool bNew, bool bReload, int idx; if (!bReload && MRU_FindFile(Globals.pFileMRU,szFileName,&idx)) { fioStatus.iEncoding = Globals.pFileMRU->iEncoding[idx]; - if (fioStatus.iEncoding > 0) - Encoding_SrcCmdLn(Encoding_MapUnicode(fioStatus.iEncoding)); + if (Encoding_IsValid(fioStatus.iEncoding)) { + Encoding_SrcWeak(fioStatus.iEncoding); + } } else { fioStatus.iEncoding = Encoding_Current(CPI_GET); @@ -9767,7 +9761,7 @@ bool FileRevert(LPCWSTR szFileName, bool bIgnoreCmdLnEnc) DOCVIEWPOS_T const docView = EditGetCurrentDocView(Globals.hwndEdit); if (bIgnoreCmdLnEnc) { - Encoding_SrcCmdLn(CPI_NONE); // ignore history too + Encoding_Forced(CPI_NONE); // ignore history too } Encoding_SrcWeak(Encoding_Current(CPI_GET)); @@ -9832,7 +9826,7 @@ bool DoElevatedRelaunch(EditFileIOStatus* pFioStatus, bool bAutoSaveOnRelaunch) // remove forced command line encoding from argument list WCHAR wchEncoding[80] = { L'\0' }; wchEncoding[0] = L'/'; - Encoding_GetNameW(Encoding_SrcCmdLn(CPI_GET), &wchEncoding[1], COUNTOF(wchEncoding)-1); + Encoding_GetNameW(Encoding_Forced(CPI_GET), &wchEncoding[1], COUNTOF(wchEncoding)-1); if (StrIsNotEmpty(&wchEncoding[1])) { lpArgs = StrCutI(lpArgs, wchEncoding); } @@ -10312,7 +10306,6 @@ bool ActivatePrevInst() params->iInitialLine = s_iInitialLine; params->iInitialColumn = s_iInitialColumn; - params->iSrcEncoding = (s_lpEncodingArg) ? Encoding_MatchW(s_lpEncodingArg) : CPI_NONE; params->flagSetEncoding = s_flagSetEncoding; params->flagSetEOLMode = s_flagSetEOLMode; params->flagTitleExcerpt = 0; @@ -10390,7 +10383,6 @@ bool ActivatePrevInst() params->iInitialLine = s_iInitialLine; params->iInitialColumn = s_iInitialColumn; - params->iSrcEncoding = (s_lpEncodingArg) ? Encoding_MatchW(s_lpEncodingArg) : CPI_NONE; params->flagSetEncoding = s_flagSetEncoding; params->flagSetEOLMode = s_flagSetEOLMode; diff --git a/src/Notepad3.h b/src/Notepad3.h index 799513700..27293ce45 100644 --- a/src/Notepad3.h +++ b/src/Notepad3.h @@ -37,7 +37,6 @@ typedef struct np3params { int flagJumpTo; int iInitialLine; int iInitialColumn; - cpi_enc_t iSrcEncoding; cpi_enc_t flagSetEncoding; int flagSetEOLMode; int flagTitleExcerpt; diff --git a/src/TypeDefs.h b/src/TypeDefs.h index 44d28e036..85c489063 100644 --- a/src/TypeDefs.h +++ b/src/TypeDefs.h @@ -324,7 +324,6 @@ typedef struct _globals_t int flagShellUseSystemMRU; int flagPrintFileAndLeave; - bool bForceReLoadAsUTF8; bool bZeroBasedColumnIndex; bool bZeroBasedCharacterCount; int iReplacedOccurrences;