+ fix Unicode detection in case of pure ASCII containing null bytes

This commit is contained in:
METANEOCORTEX\Kotti 2022-08-28 13:27:14 +02:00
parent 9bf6f1e4dd
commit ae32f27f98
5 changed files with 53 additions and 35 deletions

View File

@ -1339,15 +1339,10 @@ bool EditLoadFile(
// === UNICODE ( UTF-16LE / UTF-16BE ) ===
// --------------------------------------------------------------------------
bool const bPureASCIINoBOM = encDetection.bPureASCII && !encDetection.bHasBOM;
bool bIsUnicodeDetected = !IS_ENC_ENFORCED() && Encoding_IsUNICODE(encDetection.unicodeAnalysis) && !bPureASCIINoBOM;
if (Encoding_IsUNICODE(encDetection.Encoding) || bIsUnicodeDetected) {
// ----------------------------------------------------------------------
if (Encoding_IsUNICODE(encDetection.Encoding))
{
status->iEncoding = encDetection.bHasBOM ? (encDetection.bIsReverse ? CPI_UNICODEBEBOM : CPI_UNICODEBOM) :
(encDetection.bIsReverse ? CPI_UNICODEBE : CPI_UNICODE);
// ----------------------------------------------------------------------
if (encDetection.bIsReverse) {
SwabEx(lpData, lpData, cbData);
@ -1389,7 +1384,7 @@ bool EditLoadFile(
EditDetectEOLMode(lpData, cbData, status);
}
}
else if (!IS_ENC_ENFORCED() && encDetection.bPureASCII) {
else if (!IS_ENC_ENFORCED() && (encDetection.bPureASCII7Bit && !encDetection.bHasUnicodeNullBytes)) {
// load ASCII(7-bit) as ANSI/UTF-8
EditSetNewText(hwnd, lpData, cbData, bClearUndoHistory);
status->iEncoding = (Settings.LoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT);

View File

@ -701,18 +701,24 @@ int Encoding_GetNameW(const cpi_enc_t iEncoding, LPWSTR buffer, size_t cwch)
bool Has_UTF16_LE_BOM(const char* pBuf, size_t cnt)
{
int iTest = IS_TEXT_UNICODE_SIGNATURE;
/*bool const ok =*/ (void)IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
//~return (ok && ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0));
return ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0); // don't rely on result ok
bool const ok = IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
return (ok && ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0));
}
// ----------------------------------------------------------------------------
bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt)
{
int iTest = IS_TEXT_UNICODE_REVERSE_SIGNATURE;
/*bool const ok =*/ (void)IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
//~return (ok && ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0));
return ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0); // don't rely on result ok
bool const ok = IsTextUnicode(pBuf, clampi((int)cnt, 0, 4), &iTest);
return (ok && ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0));
}
// ----------------------------------------------------------------------------
// Runs the Win32 IsTextUnicode() heuristic restricted to the
// IS_TEXT_UNICODE_NULL_BYTES test on the first cnt bytes of pBuf.
// Returns true only if the call succeeds and the flag is still set in the
// result mask; a NULL or empty buffer yields false.
bool HasUnicodeNullBytes(const char* pBuf, size_t cnt)
{
    if (!pBuf || (cnt == 0)) {
        return false; // nothing to analyze
    }
    // IsTextUnicode() takes the buffer length as int: clamp before converting,
    // so an oversized size_t cannot wrap into a negative or truncated length.
    size_t const cbMax = (size_t)0x7FFFFFFF;
    int const cb = (int)((cnt > cbMax) ? cbMax : cnt);

    int iTest = IS_TEXT_UNICODE_NULL_BYTES; // in: test to run, out: tests passed
    bool const ok = IsTextUnicode(pBuf, cb, &iTest);
    return (ok && ((iTest & IS_TEXT_UNICODE_NULL_BYTES) != 0));
}
// ----------------------------------------------------------------------------
@ -723,14 +729,14 @@ bool Has_UTF16_BOM(const char* pBuf, size_t cnt)
// ============================================================================
bool IsValidUTF7(const char* pTest, size_t nLength)
bool IsPureAscii7Bit(const char* pTest, size_t nLength)
{
if (!pTest) {
return false;
}
char const *pt = pTest;
for (size_t i = 0; i < nLength; ++i) {
if ((*pt & 0x80) || !*pt) {
if (*pt & 0x80) {
return false;
}
++pt;

View File

@ -124,15 +124,16 @@ int Encoding_GetNameW(const cpi_enc_t iEncoding, LPWSTR buffer, size_t cwch);
bool Has_UTF16_LE_BOM(const char* pBuf, size_t cnt);
bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt);
bool Has_UTF16_BOM(const char *pBuf, size_t cnt);
bool HasUnicodeNullBytes(const char* pBuf, size_t cnt);
inline bool IsUTF8Signature(const char* p)
inline bool IsUTF8Signature(const char* p)
{
return ((p[0] == '\xEF') && (p[1] == '\xBB') && (p[2] == '\xBF'));
}
#define UTF8StringStart(p) (IsUTF8Signature(p)) ? ((p)+3) : (p)
bool IsValidUTF7(const char* pTest, size_t nLength);
bool IsValidUTF8(const char* pTest, size_t nLength);
bool IsPureAscii7Bit(const char* pTest, size_t nLength);
//////////////////////////////////////////////////////
@ -192,13 +193,14 @@ typedef struct _enc_det_t {
bool bIsReverse;
bool bIsUTF8Sig;
bool bValidUTF8;
bool bPureASCII;
bool bHasUnicodeNullBytes;
bool bPureASCII7Bit;
char encodingStrg[64];
} ENC_DET_T;
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, "" }
#define INIT_ENC_DET_T { CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, CPI_NONE, 0.0f, false, false, false, false, false, false, false, "" }
ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpData, const size_t cbData,

View File

@ -555,7 +555,7 @@ cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM
int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK;
int iTest = iAllTests;
/*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok
/*bool const ok =*/ IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok in case of multi-flags
if (iTest == iAllTests) {
return CPI_NONE; // iTest doesn't seem to have been modified ...
@ -1261,7 +1261,7 @@ extern "C" cpi_enc_t FileVars_GetEncoding(LPFILEVARS lpfv)
// GetFileEncoding()
//
extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpData, const size_t cbData,
cpi_enc_t iAnalyzeHint, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection)
cpi_enc_t iAnalyzeHint, bool bSkipUTFDetection, bool bSkipANSICPDetection, bool bForceEncDetection)
{
ENC_DET_T encDetRes = INIT_ENC_DET_T;
@ -1309,27 +1309,26 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint);
// ---------------------------------------------------------------------------
}
encDetRes.bPureASCII = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsValidUTF7(lpData, cbData);
encDetRes.bHasUnicodeNullBytes = HasUnicodeNullBytes(lpData, cbData);
encDetRes.bPureASCII7Bit = (encDetRes.analyzedEncoding == CPI_ASCII_7BIT) || IsPureAscii7Bit(lpData, cbData);
if (encDetRes.analyzedEncoding == CPI_NONE) {
encDetRes.analyzedEncoding = iAnalyzeHint;
encDetRes.confidence = (1.0f - Settings2.AnalyzeReliableConfidenceLevel);
}
else if (encDetRes.bPureASCII) {
else if (encDetRes.bPureASCII7Bit && !encDetRes.bHasUnicodeNullBytes) {
encDetRes.analyzedEncoding = (Settings.LoadASCIIasUTF8) ? CPI_UTF8 : CPI_ANSI_DEFAULT;
}
if (!bSkipUTFDetection) {
encDetRes.unicodeAnalysis = GetUnicodeEncoding(lpData, cbData, &(encDetRes.bHasBOM), &(encDetRes.bIsReverse));
if (Encoding_IsNONE(encDetRes.unicodeAnalysis) && Encoding_IsUNICODE(encDetRes.analyzedEncoding)) {
encDetRes.unicodeAnalysis = encDetRes.analyzedEncoding;
}
if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis))
{
// check considten BOM
if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis)) {
// check consistent BOM
if (encDetRes.bHasBOM && !bBOM_LE && !bBOM_BE) {
encDetRes.unicodeAnalysis = CPI_NONE;
}
@ -1347,7 +1346,8 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
if (Encoding_IsValid(encDetRes.analyzedEncoding)) {
// no bIsReliable check (forced unreliable detection)
encDetRes.forcedEncoding = encDetRes.analyzedEncoding;
} else if (Encoding_IsValid(encDetRes.unicodeAnalysis)) {
}
else if (Encoding_IsValid(encDetRes.unicodeAnalysis)) {
encDetRes.forcedEncoding = encDetRes.unicodeAnalysis;
}
}
@ -1370,23 +1370,38 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD
if (IS_ENC_ENFORCED()) {
encDetRes.Encoding = encDetRes.forcedEncoding;
} else if (encDetRes.bIsUTF8Sig) {
}
else if (encDetRes.bIsUTF8Sig) {
encDetRes.Encoding = CPI_UTF8SIGN;
} else if (bBOM_LE || bBOM_BE) {
}
else if (bBOM_LE || bBOM_BE) {
encDetRes.Encoding = bBOM_LE ? CPI_UNICODEBOM : CPI_UNICODEBEBOM;
encDetRes.bIsReverse = bBOM_BE;
} else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) {
}
else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis) && encDetRes.bHasUnicodeNullBytes)
{
encDetRes.Encoding = encDetRes.unicodeAnalysis;
}
else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly))
{
encDetRes.Encoding = encDetRes.analyzedEncoding;
} else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) {
}
else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis))
{
encDetRes.Encoding = encDetRes.unicodeAnalysis;
}
else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET)))
{
encDetRes.Encoding = Encoding_SrcWeak(CPI_GET);
} else if (Encoding_IsValid(iAnalyzeHint)) {
}
else if (Encoding_IsValid(iAnalyzeHint))
{
encDetRes.Encoding = iAnalyzeHint;
}
if (!Encoding_IsValid(encDetRes.Encoding)) {
encDetRes.Encoding = CPI_PREFERRED_ENCODING;
}
return encDetRes;
}