Merge pull request #5534 from RaiKoHoff/Dev_Master

fix: Win's Unicode detection is not reliable, …
This commit is contained in:
Pairi Daiza 2026-02-16 18:54:12 +01:00 committed by GitHub
commit 7722617f19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 86 additions and 69 deletions

View File

@ -339,6 +339,8 @@
<LanguageStandard_C>stdc17</LanguageStandard_C>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<Optimization>Disabled</Optimization>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -457,6 +459,8 @@
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<OmitFramePointers>false</OmitFramePointers>
<Optimization>Disabled</Optimization>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -490,6 +494,8 @@
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<OmitFramePointers>false</OmitFramePointers>
<Optimization>Disabled</Optimization>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>

View File

@ -396,9 +396,8 @@
<CETCompat>true</CETCompat>
<AdditionalLibraryDirectories>$(OutputPath)obj;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<ManifestFile />
<GenerateDebugInformation>DebugFull</GenerateDebugInformation>
</Link>
<ClCompile>
</ClCompile>
<ClCompile>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@ -416,6 +415,7 @@
<ExceptionHandling>Sync</ExceptionHandling>
<LanguageStandard_C>stdc17</LanguageStandard_C>
<UseStandardPreprocessor>false</UseStandardPreprocessor>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Manifest>
<EnableDpiAwareness>PerMonitorHighDPIAware</EnableDpiAwareness>
@ -441,8 +441,8 @@
<AdditionalLibraryDirectories>$(OutputPath)obj;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<ManifestFile>
</ManifestFile>
<GenerateDebugInformation>DebugFull</GenerateDebugInformation>
</Link>
<ClCompile />
<ClCompile>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@ -461,6 +461,7 @@
<ExceptionHandling>Sync</ExceptionHandling>
<LanguageStandard_C>stdc17</LanguageStandard_C>
<UseStandardPreprocessor>false</UseStandardPreprocessor>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Manifest>
<EnableDpiAwareness>PerMonitorHighDPIAware</EnableDpiAwareness>

View File

@ -355,10 +355,14 @@
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<LanguageStandard>stdcpp20</LanguageStandard>
<LanguageStandard_C>stdc17</LanguageStandard_C>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@ -374,10 +378,14 @@
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<LanguageStandard>stdcpp20</LanguageStandard>
<LanguageStandard_C>stdc17</LanguageStandard_C>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
@ -393,10 +401,14 @@
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<LanguageStandard>stdcpp20</LanguageStandard>
<LanguageStandard_C>stdc17</LanguageStandard_C>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">

View File

@ -1422,11 +1422,12 @@ bool EditLoadFile(
EditDetectEOLMode(lpData, cbData, status);
}
}
else if (!IS_ENC_ENFORCED() && (encDetection.bPureASCII7Bit && !encDetection.bHasUnicodeNullBytes)) {
else if (!IS_ENC_ENFORCED() && encDetection.bPureASCII7Bit) {
// load ASCII(7-bit) as ANSI/UTF-8
EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory, bReloadFile);
status->iEncoding = (Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT);
EditDetectEOLMode(lpData, cbData, status);
} else { // === ALL OTHER NON UTF-8 ===
status->iEncoding = encDetection.Encoding;
@ -4659,45 +4660,53 @@ void EditUniteDuplicateLines(HWND hwnd, bool bRemoveEmptyLines, bool bRemoveLast
DocLn iCurLine = iStartLine;
while (iCurLine < iEndLine) {
DocPos const iCurLnLen = Sci_GetNetLineLength(iCurLine);
DocPos const iBegCurLine = SciCall_PositionFromLine(iCurLine);
// range-pointer may move during line deletion, so copy current line for const comparison
StringCchCopyNA(pCurrentLine, SizeOfMem(pCurrentLine), SciCall_GetRangePointer(iBegCurLine, iCurLnLen + 1), iCurLnLen);
pCurrentLine[iCurLnLen] = '\0';
DocPos const iBegCurLine = SciCall_PositionFromLine(iCurLine);
DocLn iPrevLine = iCurLine;
DocLn iCompareLine = iCurLine;
bool bFoundDup = false;
while (++iCompareLine <= iEndLine) {
if (iBegCurLine >= 0) {
DocPos const iCmpLnLen = Sci_GetNetLineLength(iCompareLine);
if (bRemoveEmptyLines || (iCmpLnLen > 0)) {
DocPos const iCurLnLen = Sci_GetNetLineLength(iCurLine);
DocPos const iBegCmpLine = SciCall_PositionFromLine(iCompareLine);
const char* const pCompareLine = SciCall_GetRangePointer(iBegCmpLine, iCmpLnLen);
// range-pointer may move during line deletion, so copy current line for const comparison
StringCchCopyNA(pCurrentLine, SizeOfMem(pCurrentLine), SciCall_GetRangePointer(iBegCurLine, iCurLnLen + 1), iCurLnLen);
pCurrentLine[iCurLnLen] = '\0';
if ((iCurLnLen == iCmpLnLen) && IsSameCharSequence(pCurrentLine, pCompareLine, iCmpLnLen)) {
bFoundDup = true;
DocPos const posPrev = SciCall_GetLineEndPosition(iPrevLine);
DocPos const posComp = SciCall_GetLineEndPosition(iCompareLine);
assert(posPrev != posComp);
SciCall_SetTargetRange(posPrev, posComp);
SciCall_ReplaceTarget(0, "");
--iEndLine; // line inbetween removed
--iCompareLine; // don't proceed compare-line
DocLn iPrevLine = iCurLine;
DocLn iCompareLine = iCurLine;
bool bFoundDup = false;
while (++iCompareLine <= iEndLine) {
DocPos const iCmpLnLen = Sci_GetNetLineLength(iCompareLine);
if (bRemoveEmptyLines || (iCmpLnLen > 0)) {
DocPos const iBegCmpLine = SciCall_PositionFromLine(iCompareLine);
const char* const pCompareLine = SciCall_GetRangePointer(iBegCmpLine, iCmpLnLen);
if ((iCurLnLen == iCmpLnLen) && IsSameCharSequence(pCurrentLine, pCompareLine, iCmpLnLen)) {
bFoundDup = true;
DocPos const posPrev = SciCall_GetLineEndPosition(iPrevLine);
DocPos const posComp = SciCall_GetLineEndPosition(iCompareLine);
assert(posPrev != posComp);
SciCall_SetTargetRange(posPrev, posComp);
SciCall_ReplaceTarget(0, "");
--iEndLine; // line inbetween removed
--iCompareLine; // compare-line removed, so stay at same line for next compare
}
else iPrevLine = iCompareLine;
}
else iPrevLine = iCompareLine;
} // while
if (bRemoveLastDup && bFoundDup) {
DocPos const posBeg = SciCall_PositionFromLine(iCurLine);
DocPos const posEnd = SciCall_PositionFromLine(iCurLine + 1);
SciCall_SetTargetRange(posBeg, posEnd);
SciCall_ReplaceTarget(0, "");
--iEndLine; // line removed
}
iPrevLine = iCompareLine;
}
if (bRemoveLastDup && bFoundDup) {
DocPos const posBeg = SciCall_PositionFromLine(iCurLine);
DocPos const posEnd = SciCall_PositionFromLine(iCurLine + 1);
SciCall_SetTargetRange(posBeg, posEnd);
SciCall_ReplaceTarget(0, "");
}
else {
++iCurLine;
else ++iCurLine;
}
else ++iCurLine;
}
EndUndoTransAction();

View File

@ -714,18 +714,11 @@ bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt)
}
// ----------------------------------------------------------------------------
bool HasUnicodeNullBytes(const char* pBuf, size_t cnt)
{
    // Ask IsTextUnicode() for the null-bytes test only; the same flag word
    // is handed in by pointer and inspected again after the call.
    int fTests = IS_TEXT_UNICODE_NULL_BYTES;

    if (!IsTextUnicode(pBuf, (int)cnt, &fTests)) {
        return false; // call reported failure -> no null-byte indication
    }
    // report true only if the null-bytes flag is (still) set in the result
    return ((fTests & IS_TEXT_UNICODE_NULL_BYTES) != 0);
}
// ----------------------------------------------------------------------------
bool Has_UTF16_BOM(const char* pBuf, size_t cnt)
{
    // A UTF-16 BOM of either byte order qualifies; the LE check runs first
    // and the BE check is only consulted when LE did not match.
    if (Has_UTF16_LE_BOM(pBuf, cnt)) {
        return true;
    }
    return Has_UTF16_BE_BOM(pBuf, cnt);
}
// ----------------------------------------------------------------------------
// ============================================================================

View File

@ -124,7 +124,6 @@ int Encoding_GetNameW(const cpi_enc_t iEncoding, LPWSTR buffer, size_t cwch);
bool Has_UTF16_LE_BOM(const char* pBuf, size_t cnt);
bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt);
bool Has_UTF16_BOM(const char *pBuf, size_t cnt);
bool HasUnicodeNullBytes(const char* pBuf, size_t cnt);
inline bool IsUTF8Signature(const char* p)
{
@ -132,8 +131,8 @@ inline bool IsUTF8Signature(const char* p)
}
// Yields (p)+3 when IsUTF8Signature(p) is true, otherwise (p) unchanged,
// i.e. a pointer past a leading 3-byte signature if one is present.
// The complete expansion is parenthesized so the conditional expression cannot
// be split by operators at the use site (e.g. UTF8StringStart(p) + 1).
// Note: the argument is evaluated more than once - avoid side effects in 'p'.
#define UTF8StringStart(p) ((IsUTF8Signature(p)) ? ((p)+3) : (p))
bool IsValidUTF8(const char* pTest, size_t nLength);
bool IsPureAscii7Bit(const char* pTest, size_t nLength);
bool IsValidUTF8(const char* pTest, size_t nLength);
//////////////////////////////////////////////////////
@ -193,14 +192,13 @@ typedef struct _enc_det_t {
bool bIsReverse;
bool bIsUTF8Sig;
bool bValidUTF8;
bool bHasUnicodeNullBytes;
bool bPureASCII7Bit;
char encodingStrg[64];
} ENC_DET_T;
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, false, "" }
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, "" }
ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpData, const size_t cbData,

View File

@ -362,7 +362,7 @@ extern "C" NP2ENCODING g_Encodings[] = {
/* 004 */{ NCP_UNICODE | NCP_RECODE, CP_UTF8, ENC_PARSE_NAM_UTF16LE, IDS_ENC_UTF16LE, L"" }, // CPI_UNICODE 4
/* 005 */{ NCP_UNICODE | NCP_UNICODE_REVERSE | NCP_RECODE, CP_UTF8, ENC_PARSE_NAM_UTF16BE, IDS_ENC_UTF16BE, L"" }, // CPI_UNICODEBE 5
/* 006 */{ NCP_ASCII_7BIT | NCP_UTF8 | NCP_RECODE, CP_UTF8, ENC_PARSE_NAM_UTF8, IDS_ENC_UTF8, L"" }, // CPI_UTF8 6
/* 007 */{ NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, ENC_PARSE_NAM_UTF8SIG, IDS_ENC_UTF8SIG, L"" }, // CPI_UTF8SIGN 7
/* 007 */{ NCP_UTF8 | NCP_UTF8_SIGN, CP_UTF8, ENC_PARSE_NAM_UTF8SIG, IDS_ENC_UTF8SIG, L"" }, // CPI_UTF8SIGN 7
/* 008 */{ NCP_ASCII_7BIT | NCP_EXTERNAL_8BIT | NCP_RECODE, CP_UTF7, ENC_PARSE_NAM_UTF7, IDS_ENC_UTF7, L"" }, // CPI_UTF7 8
/* 009 */{ NCP_ASCII_7BIT | NCP_EXTERNAL_8BIT | NCP_RECODE, 720, ENC_PARSE_NAM_DOS_720, IDS_ENC_DOS_720, L"" },
/* 010 */{ NCP_ASCII_7BIT | NCP_EXTERNAL_8BIT | NCP_RECODE, 28596, ENC_PARSE_NAM_ISO_8859_6, IDS_ENC_ISO_8859_6, L"" },
@ -561,6 +561,7 @@ cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM
return CPI_NONE; // iTest doesn't seem to have been modified ...
}
bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE);
bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE);
@ -570,7 +571,7 @@ cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM
//bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES);
if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse))) {
if ((bHasBOM || bHasRBOM || (bIsUnicode || bIsReverse)) && !bIsIllegal && !(bIsUnicode && bIsReverse)) {
if (lpbBOM) {
*lpbBOM = (bHasBOM || bHasRBOM);
}
@ -1261,14 +1262,13 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
cpi_enc_t iAnalyzeHint, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection)
{
ENC_DET_T encDetRes = INIT_ENC_DET_T;
#define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetRes.forcedEncoding))
FileVars_GetFromData(lpData, cbData, &Globals.fvCurFile);
bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData);
bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData);
#define IS_ENC_ENFORCED() (!Encoding_IsNONE(encDetRes.forcedEncoding))
// --- 1st check for force encodings ---
LPCWSTR lpszExt = Path_FindExtension(hpath);
@ -1306,27 +1306,23 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint);
// ---------------------------------------------------------------------------
}
encDetRes.bHasUnicodeNullBytes = HasUnicodeNullBytes(lpData, cbData);
encDetRes.bPureASCII7Bit = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsPureAscii7Bit(lpData, cbData);
if (encDetRes.analyzedEncoding == CPI_NONE) {
encDetRes.analyzedEncoding = iAnalyzeHint;
encDetRes.confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
}
else if (encDetRes.bPureASCII7Bit && !encDetRes.bHasUnicodeNullBytes) {
encDetRes.analyzedEncoding = (Settings.LoadASCIIasUTF8) ? CPI_UTF8 : CPI_ANSI_DEFAULT;
else if (encDetRes.bPureASCII7Bit && encDetRes.bValidUTF8) {
encDetRes.analyzedEncoding = CPI_UTF8;
}
if (!bSkipUTFDetection) {
encDetRes.unicodeAnalysis = GetUnicodeEncoding(lpData, cbData, &(encDetRes.bHasBOM), &(encDetRes.bIsReverse));
if (Encoding_IsNONE(encDetRes.unicodeAnalysis) && Encoding_IsUNICODE(encDetRes.analyzedEncoding)) {
encDetRes.unicodeAnalysis = encDetRes.analyzedEncoding;
}
if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis)) {
// check consistent BOM
if (encDetRes.bHasBOM && !bBOM_LE && !bBOM_BE) {
if (encDetRes.bHasBOM && !(bBOM_LE || bBOM_BE)) {
encDetRes.unicodeAnalysis = CPI_NONE;
}
else if (encDetRes.bHasBOM && encDetRes.bIsReverse && !bBOM_BE) {
@ -1354,14 +1350,14 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
_SetEncodingTitleInfo(&encDetRes);
}
int const iConfidence = f2int(encDetRes.confidence * 100.0f);
int const iReliableThreshold = f2int(Settings2.AnalyzeReliableConfidenceLevel * 100.0f);
encDetRes.bIsAnalysisReliable = (iConfidence >= iReliableThreshold);
// --------------------------------------------------------------------------
// --- choose best encoding guess ----
// --------------------------------------------------------------------------
int const iConfidence = f2int(encDetRes.confidence * 100.0f);
int const iReliableThreshold = f2int(Settings2.AnalyzeReliableConfidenceLevel * 100.0f);
encDetRes.bIsAnalysisReliable = (iConfidence >= iReliableThreshold);
// init Preferred Encoding
encDetRes.Encoding = CPI_PREFERRED_ENCODING;
@ -1375,17 +1371,13 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
encDetRes.Encoding = bBOM_LE ? CPI_UNICODEBOM : CPI_UNICODEBEBOM;
encDetRes.bIsReverse = bBOM_BE;
}
else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis) && encDetRes.bHasUnicodeNullBytes)
{
encDetRes.Encoding = encDetRes.unicodeAnalysis;
}
else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly))
{
encDetRes.Encoding = encDetRes.analyzedEncoding;
}
else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis))
else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis) && (iConfidence > 66))
{
encDetRes.Encoding = encDetRes.unicodeAnalysis;
encDetRes.Encoding = encDetRes.analyzedEncoding; // (1) rely on analyzed encoding
}
else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET)))
{

View File

@ -109,6 +109,9 @@
<AdditionalIncludeDirectories>ced;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -126,6 +129,9 @@
<AdditionalIncludeDirectories>ced;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>