mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-11 21:03:05 +08:00
+ fix Unicode Detection in case of pure ASCII containing null bytes
This commit is contained in:
parent
9bf6f1e4dd
commit
ae32f27f98
11
src/Edit.c
11
src/Edit.c
@ -1339,15 +1339,10 @@ bool EditLoadFile(
|
||||
// === UNICODE ( UTF-16LE / UTF-16BE ) ===
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
bool const bPureASCIINoBOM = encDetection.bPureASCII && !encDetection.bHasBOM;
|
||||
bool bIsUnicodeDetected = !IS_ENC_ENFORCED() && Encoding_IsUNICODE(encDetection.unicodeAnalysis) && !bPureASCIINoBOM;
|
||||
|
||||
if (Encoding_IsUNICODE(encDetection.Encoding) || bIsUnicodeDetected) {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
if (Encoding_IsUNICODE(encDetection.Encoding))
|
||||
{
|
||||
status->iEncoding = encDetection.bHasBOM ? (encDetection.bIsReverse ? CPI_UNICODEBEBOM : CPI_UNICODEBOM) :
|
||||
(encDetection.bIsReverse ? CPI_UNICODEBE : CPI_UNICODE);
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
if (encDetection.bIsReverse) {
|
||||
SwabEx(lpData, lpData, cbData);
|
||||
@ -1389,7 +1384,7 @@ bool EditLoadFile(
|
||||
EditDetectEOLMode(lpData, cbData, status);
|
||||
}
|
||||
}
|
||||
else if (!IS_ENC_ENFORCED() && encDetection.bPureASCII) {
|
||||
else if (!IS_ENC_ENFORCED() && (encDetection.bPureASCII7Bit && !encDetection.bHasUnicodeNullBytes)) {
|
||||
// load ASCII(7-bit) as ANSI/UTF-8
|
||||
EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory);
|
||||
status->iEncoding = (Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT);
|
||||
|
||||
@ -701,18 +701,24 @@ int Encoding_GetNameW(const cpi_enc_t iEncoding, LPWSTR buffer, size_t cwch)
|
||||
bool Has_UTF16_LE_BOM(const char* pBuf, size_t cnt)
|
||||
{
|
||||
int iTest = IS_TEXT_UNICODE_SIGNATURE;
|
||||
/*bool const ok =*/ (void)IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
|
||||
//~return (ok && ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0));
|
||||
return ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0); // don't rely on result ok
|
||||
bool const ok = IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
|
||||
return (ok && ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0));
|
||||
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt)
|
||||
{
|
||||
int iTest = IS_TEXT_UNICODE_REVERSE_SIGNATURE;
|
||||
/*bool const ok =*/ (void)IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
|
||||
//~return (ok && ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0));
|
||||
return ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0); // don't rely on result ok
|
||||
bool const ok = IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
|
||||
return (ok && ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0));
|
||||
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
bool HasUnicodeNullBytes(const char* pBuf, size_t cnt)
|
||||
{
|
||||
int iTest = IS_TEXT_UNICODE_NULL_BYTES;
|
||||
bool const ok = IsTextUnicode(pBuf, (int)cnt, &iTest);
|
||||
return (ok && ((iTest & IS_TEXT_UNICODE_NULL_BYTES) != 0));
|
||||
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
@ -723,14 +729,14 @@ bool Has_UTF16_BOM(const char* pBuf, size_t cnt)
|
||||
|
||||
// ============================================================================
|
||||
|
||||
bool IsValidUTF7(const char* pTest, size_t nLength)
|
||||
bool IsPureAscii7Bit(const char* pTest, size_t nLength)
|
||||
{
|
||||
if (!pTest) {
|
||||
return false;
|
||||
}
|
||||
char const *pt = pTest;
|
||||
for (size_t i = 0; i < nLength; ++i) {
|
||||
if ((*pt & 0x80) || !*pt) {
|
||||
if (*pt & 0x80) {
|
||||
return false;
|
||||
}
|
||||
++pt;
|
||||
|
||||
@ -124,15 +124,16 @@ int Encoding_GetNameW(const cpi_enc_t iEncoding, LPWSTR buffer, size_t cwch);
|
||||
bool Has_UTF16_LE_BOM(const char* pBuf, size_t cnt);
|
||||
bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt);
|
||||
bool Has_UTF16_BOM(const char *pBuf, size_t cnt);
|
||||
bool HasUnicodeNullBytes(const char* pBuf, size_t cnt);
|
||||
|
||||
inline bool IsUTF8Signature(const char* p)
|
||||
inline bool IsUTF8Signature(const char* p)
|
||||
{
|
||||
return ((p[0] == '\xEF') && (p[1] == '\xBB') && (p[2] == '\xBF'));
|
||||
}
|
||||
#define UTF8StringStart(p) (IsUTF8Signature(p)) ? ((p)+3) : (p)
|
||||
|
||||
bool IsValidUTF7(const char* pTest, size_t nLength);
|
||||
bool IsValidUTF8(const char* pTest, size_t nLength);
|
||||
bool IsPureAscii7Bit(const char* pTest, size_t nLength);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
@ -192,13 +193,14 @@ typedef struct _enc_det_t {
|
||||
bool bIsReverse;
|
||||
bool bIsUTF8Sig;
|
||||
bool bValidUTF8;
|
||||
bool bPureASCII;
|
||||
bool bHasUnicodeNullBytes;
|
||||
bool bPureASCII7Bit;
|
||||
|
||||
char encodingStrg[64];
|
||||
|
||||
} ENC_DET_T;
|
||||
|
||||
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, "" }
|
||||
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, false, "" }
|
||||
|
||||
|
||||
ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpData, const size_t cbData,
|
||||
|
||||
@ -555,7 +555,7 @@ cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM
|
||||
int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK;
|
||||
|
||||
int iTest = iAllTests;
|
||||
/*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok
|
||||
/*bool const ok =*/ IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok in case of multi-flags
|
||||
|
||||
if (iTest == iAllTests) {
|
||||
return CPI_NONE; // iTest doesn't seem to have been modified ...
|
||||
@ -1261,7 +1261,7 @@ extern "C" cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv)
|
||||
// GetFileEncoding()
|
||||
//
|
||||
extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpData, const size_t cbData,
|
||||
cpi_enc_t iAnalyzeHint, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection)
|
||||
cpi_enc_t iAnalyzeHint, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection)
|
||||
{
|
||||
ENC_DET_T encDetRes = INIT_ENC_DET_T;
|
||||
|
||||
@ -1309,27 +1309,26 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
|
||||
Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint);
|
||||
// ---------------------------------------------------------------------------
|
||||
}
|
||||
encDetRes.bPureASCII = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsValidUTF7(lpData, cbData);
|
||||
encDetRes.bHasUnicodeNullBytes = HasUnicodeNullBytes(lpData, cbData);
|
||||
encDetRes.bPureASCII7Bit = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsPureAscii7Bit(lpData, cbData);
|
||||
|
||||
if (encDetRes.analyzedEncoding == CPI_NONE) {
|
||||
encDetRes.analyzedEncoding = iAnalyzeHint;
|
||||
encDetRes.confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
|
||||
}
|
||||
else if (encDetRes.bPureASCII) {
|
||||
else if (encDetRes.bPureASCII7Bit && !encDetRes.bHasUnicodeNullBytes) {
|
||||
encDetRes.analyzedEncoding = (Settings.LoadASCIIasUTF8) ? CPI_UTF8 : CPI_ANSI_DEFAULT;
|
||||
}
|
||||
|
||||
if (!bSkipUTFDetection) {
|
||||
|
||||
encDetRes.unicodeAnalysis = GetUnicodeEncoding(lpData, cbData, &(encDetRes.bHasBOM), &(encDetRes.bIsReverse));
|
||||
|
||||
if (Encoding_IsNONE(encDetRes.unicodeAnalysis) && Encoding_IsUNICODE(encDetRes.analyzedEncoding)) {
|
||||
encDetRes.unicodeAnalysis = encDetRes.analyzedEncoding;
|
||||
}
|
||||
|
||||
if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis))
|
||||
{
|
||||
// check considten BOM
|
||||
if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis)) {
|
||||
// check consistent BOM
|
||||
if (encDetRes.bHasBOM && !bBOM_LE && !bBOM_BE) {
|
||||
encDetRes.unicodeAnalysis = CPI_NONE;
|
||||
}
|
||||
@ -1347,7 +1346,8 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
|
||||
if (Encoding_IsValid(encDetRes.analyzedEncoding)) {
|
||||
// no bIsReliable check (forced unreliable detection)
|
||||
encDetRes.forcedEncoding = encDetRes.analyzedEncoding;
|
||||
} else if (Encoding_IsValid(encDetRes.unicodeAnalysis)) {
|
||||
}
|
||||
else if (Encoding_IsValid(encDetRes.unicodeAnalysis)) {
|
||||
encDetRes.forcedEncoding = encDetRes.unicodeAnalysis;
|
||||
}
|
||||
}
|
||||
@ -1370,23 +1370,38 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
|
||||
|
||||
if (IS_ENC_ENFORCED()) {
|
||||
encDetRes.Encoding = encDetRes.forcedEncoding;
|
||||
} else if (encDetRes.bIsUTF8Sig) {
|
||||
}
|
||||
else if (encDetRes.bIsUTF8Sig) {
|
||||
encDetRes.Encoding = CPI_UTF8SIGN;
|
||||
} else if (bBOM_LE || bBOM_BE) {
|
||||
}
|
||||
else if (bBOM_LE || bBOM_BE) {
|
||||
encDetRes.Encoding = bBOM_LE ? CPI_UNICODEBOM : CPI_UNICODEBEBOM;
|
||||
encDetRes.bIsReverse = bBOM_BE;
|
||||
} else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) {
|
||||
}
|
||||
else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis) && encDetRes.bHasUnicodeNullBytes)
|
||||
{
|
||||
encDetRes.Encoding = encDetRes.unicodeAnalysis;
|
||||
}
|
||||
else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly))
|
||||
{
|
||||
encDetRes.Encoding = encDetRes.analyzedEncoding;
|
||||
} else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) {
|
||||
}
|
||||
else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis))
|
||||
{
|
||||
encDetRes.Encoding = encDetRes.unicodeAnalysis;
|
||||
}
|
||||
else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET)))
|
||||
{
|
||||
encDetRes.Encoding = Encoding_SrcWeak(CPI_GET);
|
||||
} else if (Encoding_IsValid(iAnalyzeHint)) {
|
||||
}
|
||||
else if (Encoding_IsValid(iAnalyzeHint))
|
||||
{
|
||||
encDetRes.Encoding = iAnalyzeHint;
|
||||
}
|
||||
|
||||
if (!Encoding_IsValid(encDetRes.Encoding)) {
|
||||
encDetRes.Encoding = CPI_PREFERRED_ENCODING;
|
||||
}
|
||||
|
||||
return encDetRes;
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user