diff --git a/src/Dialogs.c b/src/Dialogs.c index 19827c6a3..770ad2eda 100644 --- a/src/Dialogs.c +++ b/src/Dialogs.c @@ -127,7 +127,7 @@ int MsgBoxLng(int iType, UINT uIdMsg, ...) if (uIdMsg == IDS_MUI_ERR_LOADFILE || uIdMsg == IDS_MUI_ERR_SAVEFILE || uIdMsg == IDS_MUI_CREATEINI_FAIL || uIdMsg == IDS_MUI_WRITEINI_FAIL || uIdMsg == IDS_MUI_EXPORT_FAIL) { - LPVOID lpMsgBuf; + LPVOID lpMsgBuf = NULL; WCHAR wcht; FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, @@ -137,10 +137,12 @@ int MsgBoxLng(int iType, UINT uIdMsg, ...) (LPTSTR)&lpMsgBuf, 0, NULL); - StrTrim(lpMsgBuf, L" \a\b\f\n\r\t\v"); - StringCchCat(szText, COUNTOF(szText), L"\n"); - StringCchCat(szText, COUNTOF(szText), lpMsgBuf); - LocalFree(lpMsgBuf); + if (lpMsgBuf) { + StrTrim(lpMsgBuf, L" \a\b\f\n\r\t\v"); + StringCchCat(szText, COUNTOF(szText), L"\n"); + StringCchCat(szText, COUNTOF(szText), lpMsgBuf); + LocalFree(lpMsgBuf); + } wcht = *CharPrev(szText, StrEnd(szText)); if (IsCharAlphaNumeric(wcht) || wcht == '"' || wcht == '\'') StringCchCat(szText, COUNTOF(szText), L"."); @@ -2281,12 +2283,12 @@ INT_PTR CALLBACK SelectDefEncodingDlgProc(HWND hwnd,UINT umsg,WPARAM wParam,LPAR EndDialog(hwnd,IDCANCEL); } else { - bUseDefaultForFileEncoding = (IsDlgButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) == BST_CHECKED) ? 1 : 0; - bSkipUnicodeDetection = (IsDlgButtonChecked(hwnd,IDC_NOUNICODEDETECTION) == BST_CHECKED) ? 1 : 0; - bSkipANSICodePageDetection = (IsDlgButtonChecked(hwnd, IDC_NOANSICPDETECTION) == BST_CHECKED) ? 1 : 0; - bLoadASCIIasUTF8 = (IsDlgButtonChecked(hwnd,IDC_ASCIIASUTF8) == BST_CHECKED) ? 1 : 0; - bLoadNFOasOEM = (IsDlgButtonChecked(hwnd,IDC_NFOASOEM) == BST_CHECKED) ? 1 : 0; - bNoEncodingTags = (IsDlgButtonChecked(hwnd,IDC_ENCODINGFROMFILEVARS) == BST_CHECKED) ? 
1 : 0; + bUseDefaultForFileEncoding = (IsDlgButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) == BST_CHECKED); + bSkipUnicodeDetection = (IsDlgButtonChecked(hwnd,IDC_NOUNICODEDETECTION) == BST_CHECKED); + bSkipANSICodePageDetection = (IsDlgButtonChecked(hwnd, IDC_NOANSICPDETECTION) == BST_CHECKED); + bLoadASCIIasUTF8 = (IsDlgButtonChecked(hwnd,IDC_ASCIIASUTF8) == BST_CHECKED); + bLoadNFOasOEM = (IsDlgButtonChecked(hwnd,IDC_NFOASOEM) == BST_CHECKED); + bNoEncodingTags = (IsDlgButtonChecked(hwnd,IDC_ENCODINGFROMFILEVARS) == BST_CHECKED); EndDialog(hwnd,IDOK); } } diff --git a/src/Edit.c b/src/Edit.c index 1a5d74dd2..03159a3bf 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -96,6 +96,7 @@ extern int g_iDefaultCharSet; extern bool bLoadASCIIasUTF8; extern bool bForceLoadASCIIasUTF8; extern bool bLoadNFOasOEM; +extern bool bNoEncodingTags; extern bool g_bAccelWordNavigation; @@ -1056,13 +1057,10 @@ bool EditLoadFile( if (!Encoding_IsNONE(iForcedEncoding)) { iPreferedEncoding = iForcedEncoding; } - else if (Encoding_IsUNICODE(iAnalyzedEncoding) && !bSkipUTFDetection) { - iPreferedEncoding = iAnalyzedEncoding; - } else if (iFileEncWeak != CPI_NONE) { iPreferedEncoding = iFileEncWeak; } - else if (!Encoding_IsNONE(iAnalyzedEncoding) && bIsReliable ) { + else if (!Encoding_IsNONE(iAnalyzedEncoding) && bIsReliable) { iPreferedEncoding = iAnalyzedEncoding; } else if (Encoding_IsNONE(iPreferedEncoding)) { @@ -1087,7 +1085,8 @@ bool EditLoadFile( // === UNICODE === else if (Encoding_IsUNICODE(iForcedEncoding) || (Encoding_IsNONE(iForcedEncoding) && !bSkipUTFDetection && !bIsUTF8Sig - && (IsUnicode(lpData, cbData, &bBOM, &bReverse) || (Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable)) + && (IsValidUnicode(lpData, cbData, &bBOM, &bReverse) + || (Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable)) ) ) { @@ -1143,16 +1142,16 @@ bool EditLoadFile( FileVars_Init(lpData,cbData,&fvCurFile); // === UTF-8 === - if (Encoding_IsUTF8(iForcedEncoding) || - (Encoding_IsNONE(iForcedEncoding) 
&& !bSkipUTFDetection && !FileVars_IsNonUTF8(&fvCurFile) - && (bIsUTF8Sig - || FileVars_IsUTF8(&fvCurFile) - || (Encoding_IsUTF8(iAnalyzedEncoding) && bIsReliable) - || (!bNfoDizDetected && (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8)) - ) - && (IsUTF8(lpData, cbData) && !UTF8_ContainsInvalidChars(lpData, cbData)) - ) - ) + bool const bHardRulesUTF8 = Encoding_IsUTF8(iForcedEncoding) || (FileVars_IsUTF8(&fvCurFile) && !bNoEncodingTags); + bool const bForcedNonUTF8 = !Encoding_IsNONE(iForcedEncoding) && !Encoding_IsUTF8(iForcedEncoding); + + bool const bValidUTF8 = IsValidUTF8(lpData, cbData); + bool const bAnalysisUTF8 = Encoding_IsUTF8(iAnalyzedEncoding) && bIsReliable; + bool const bSoftHintUTF8 = (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8); + + bool const bRejectUTF8 = bSkipUTFDetection || bForcedNonUTF8 || (FileVars_IsNonUTF8(&fvCurFile) && !bNoEncodingTags); + + if (bHardRulesUTF8 || (!bRejectUTF8 && bValidUTF8 && (bIsUTF8Sig || bAnalysisUTF8 || bSoftHintUTF8))) { EditSetNewText(hwnd,"",0); if (bIsUTF8Sig) { @@ -1181,7 +1180,7 @@ bool EditLoadFile( } if (((Encoding_GetCodePage(*iEncoding) != CP_UTF7) && Encoding_IsEXTERNAL_8BIT(*iEncoding)) || - ((Encoding_GetCodePage(*iEncoding) == CP_UTF7) && IsUTF7(lpData,cbData))) { + ((Encoding_GetCodePage(*iEncoding) == CP_UTF7) && IsValidUTF7(lpData,cbData))) { UINT uCodePage = Encoding_GetCodePage(*iEncoding); diff --git a/src/Encoding.c b/src/Encoding.c index 54241c76b..5b513928f 100644 --- a/src/Encoding.c +++ b/src/Encoding.c @@ -587,7 +587,7 @@ const char* Encoding_GetParseNames(int iEncoding) { // ============================================================================ -bool IsUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse) +bool IsValidUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse) { if (!pBuffer || cb < 2) { return false; } @@ -630,7 +630,7 @@ bool IsUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse) // 
============================================================================ -bool IsUTF7(const char* pTest, size_t nLength) { +bool IsValidUTF7(const char* pTest, size_t nLength) { const char *pt = pTest; for (size_t i = 0; i < nLength; i++) { @@ -648,7 +648,124 @@ bool IsUTF7(const char* pTest, size_t nLength) { //#define _OLD_UTF8_VALIDATOR_ 1 #ifdef _OLD_UTF8_VALIDATOR_ -bool IsUTF8(const char* pTest, size_t nLength) +// ============================================================================ + +/* byte length of UTF-8 sequence based on value of first byte. +for UTF-16 (21-bit space), max. code length is 4, so we only need to look +at 4 upper bits. +*/ +static const size_t utf8_lengths[16] = +{ + 1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */ + 0,0,0,0, /* 1000 to 1011 : not valid */ + 2,2, /* 1100, 1101 : 2 bytes */ + 3, /* 1110 : 3 bytes */ + 4 /* 1111 : 4 bytes */ +}; + +// ---------------------------------------------------------------------------- + +/*++ +Function : +UTF8_mbslen_bytes [INTERNAL] + +Calculates the byte size of a NULL-terminated UTF-8 string. + +Parameters : +char *utf8_string : string to examine + +Return value : +size (in bytes) of a NULL-terminated UTF-8 string. 
+-1 if invalid NULL-terminated UTF-8 string +--*/ +size_t __fastcall UTF8_mbslen_bytes(LPCSTR utf8_string) +{ + size_t length = 0; + size_t code_size; + BYTE byte; + + while (*utf8_string) + { + byte = (BYTE)*utf8_string; + + if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { + length += code_size; + utf8_string += code_size; + } + else { + /* we got an invalid byte value but need to count it, + it will be later ignored during the string conversion */ + //WARN("invalid first byte value 0x%02X in UTF-8 sequence!\n",byte); + length++; + utf8_string++; + } + } + length++; /* include NULL terminator */ + return length; +} +// ---------------------------------------------------------------------------- + +/*++ +Function : +UTF8_mbslen [INTERNAL] + +Calculates the character size of a NULL-terminated UTF-8 string. + +Parameters : +char *utf8_string : string to examine +int byte_length : byte size of string + +Return value : +size (in characters) of a UTF-8 string. +-1 if invalid UTF-8 string +--*/ +size_t __fastcall UTF8_mbslen(LPCSTR utf8_string, size_t byte_length) +{ + size_t wchar_length = 0; + size_t code_size; + BYTE byte; + + while (byte_length > 0) { + byte = (BYTE)*utf8_string; + + /* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value + for first byte is 11110111. Use lookup table to determine sequence + length based on upper 4 bits of first byte */ + if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { + /* 1 sequence == 1 character */ + wchar_length++; + + if (code_size == 4) + wchar_length++; + + utf8_string += code_size; /* increment pointer */ + byte_length -= code_size; /* decrement counter*/ + } + else { + /* + unlike UTF8_mbslen_bytes, we ignore the invalid characters. + we only report the number of valid characters we have encountered + to match the Windows behavior. 
+ */ + //WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n", byte); + utf8_string++; + byte_length--; + } + } + return wchar_length; +} +// ---------------------------------------------------------------------------- + +bool __fastcall UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length) +{ + return ((UTF8_mbslen_bytes(UTF8StringStart(utf8_string)) - 1) != + UTF8_mbslen(UTF8StringStart(utf8_string), IsUTF8Signature(utf8_string) ? (byte_length - 3) : byte_length)); +} + +// ---------------------------------------------------------------------------- + + +bool IsValidUTF8(const char* pTest, size_t nLength) { static int byte_class_table[256] = { /* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */ @@ -705,121 +821,7 @@ bool IsUTF8(const char* pTest, size_t nLength) break; } - return (current == kSTART) ? true : false; -} - -// ============================================================================ - -/* byte length of UTF-8 sequence based on value of first byte. -for UTF-16 (21-bit space), max. code length is 4, so we only need to look -at 4 upper bits. -*/ -static const size_t utf8_lengths[16] = -{ - 1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */ - 0,0,0,0, /* 1000 to 1011 : not valid */ - 2,2, /* 1100, 1101 : 2 bytes */ - 3, /* 1110 : 3 bytes */ - 4 /* 1111 : 4 bytes */ -}; - -// ---------------------------------------------------------------------------- - -/*++ -Function : -UTF8_mbslen_bytes [INTERNAL] - -Calculates the byte size of a NULL-terminated UTF-8 string. - -Parameters : -char *utf8_string : string to examine - -Return value : -size (in bytes) of a NULL-terminated UTF-8 string. 
--1 if invalid NULL-terminated UTF-8 string ---*/ -size_t UTF8_mbslen_bytes(LPCSTR utf8_string) -{ - size_t length = 0; - size_t code_size; - BYTE byte; - - while (*utf8_string) - { - byte = (BYTE)*utf8_string; - - if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { - length += code_size; - utf8_string += code_size; - } - else { - /* we got an invalid byte value but need to count it, - it will be later ignored during the string conversion */ - //WARN("invalid first byte value 0x%02X in UTF-8 sequence!\n",byte); - length++; - utf8_string++; - } - } - length++; /* include NULL terminator */ - return length; -} -// ---------------------------------------------------------------------------- - -/*++ -Function : -UTF8_mbslen [INTERNAL] - -Calculates the character size of a NULL-terminated UTF-8 string. - -Parameters : -char *utf8_string : string to examine -int byte_length : byte size of string - -Return value : -size (in characters) of a UTF-8 string. --1 if invalid UTF-8 string ---*/ -size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length) -{ - size_t wchar_length = 0; - size_t code_size; - BYTE byte; - - while (byte_length > 0) { - byte = (BYTE)*utf8_string; - - /* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value - for first byte is 11110111. Use lookup table to determine sequence - length based on upper 4 bits of first byte */ - if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) { - /* 1 sequence == 1 character */ - wchar_length++; - - if (code_size == 4) - wchar_length++; - - utf8_string += code_size; /* increment pointer */ - byte_length -= code_size; /* decrement counter*/ - } - else { - /* - unlike UTF8_mbslen_bytes, we ignore the invalid characters. - we only report the number of valid characters we have encountered - to match the Windows behavior. 
- */ - //WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n", byte); - utf8_string++; - byte_length--; - } - } - return wchar_length; -} -// ---------------------------------------------------------------------------- - -bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length) -{ - return ((UTF8_mbslen_bytes(UTF8StringStart(utf8_string)) - 1) != - UTF8_mbslen(UTF8StringStart(utf8_string), IsUTF8Signature(utf8_string) ? (byte_length - 3) : byte_length)); + return (current == kSTART) && !UTF8_ContainsInvalidChars(pTest, nLength); } @@ -831,17 +833,13 @@ bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length) // Copyright (c) 2008-2010 Bjoern Hoehrmann // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - -enum { - UTF8_ACCEPT = 0, - UTF8_REJECT = 12, - UTF8_NOTEST = 113 -}; - -static UINT s_State = UTF8_NOTEST; - -bool IsUTF8(const char* pTest, size_t nLength) +bool IsValidUTF8(const char* pTest, size_t nLength) { + enum { + UTF8_ACCEPT = 0, + UTF8_REJECT = 12 + }; + static const unsigned char utf8_dfa[] = { // The first part of the table maps bytes to character classes that // to reduce the size of the transition table and create bitmasks. 
@@ -866,32 +864,16 @@ bool IsUTF8(const char* pTest, size_t nLength) const unsigned char *pt = (const unsigned char *)pTest; const unsigned char *end = pt + nLength; - s_State = UTF8_ACCEPT; + UINT state = UTF8_ACCEPT; while (pt < end && *pt) { - s_State = utf8_dfa[256 + s_State + utf8_dfa[*pt++]]; - if (s_State == UTF8_REJECT) { + state = utf8_dfa[256 + state + utf8_dfa[*pt++]]; + if (state == UTF8_REJECT) { return false; } } - return (s_State == UTF8_ACCEPT); + return (state == UTF8_ACCEPT); } -// ---------------------------------------------------------------------------- - -bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length) -{ - bool result = true; - if (s_State != UTF8_NOTEST) { - result = (s_State == UTF8_REJECT); - } - else { - result = IsUTF8(utf8_string, byte_length); - } - s_State = UTF8_NOTEST; // reset: old way, call IsUTF8() before - return result; -} - - // ---------------------------------------------------------------------------- #endif diff --git a/src/Encoding.h b/src/Encoding.h index 3978b9d23..bf9a71a07 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -128,14 +128,9 @@ const char* Encoding_GetParseNames(int); #define Has_UTF16_LE_BOM(p) (*((UNALIGNED wchar_t*)(p)) == 0xFEFF) #define Has_UTF16_BE_BOM(p) (*((UNALIGNED wchar_t*)(p)) == 0xFFFE) /* reverse */ -bool IsUnicode(const char*, size_t, bool*, bool*); -bool IsUTF8(const char*, size_t); -bool IsUTF7(const char*, size_t); - - -size_t UTF8_mbslen_bytes(LPCSTR utf8_string); -size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length); -bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length); +bool IsValidUnicode(const char*, size_t, bool*, bool*); +bool IsValidUTF7(const char*, size_t); +bool IsValidUTF8(const char*, size_t); // Google's "Compact Encoding Detection" extern NP2ENCODING g_Encodings[]; diff --git a/src/EncodingCED.cpp b/src/EncodingCED.cpp index 6ad478e89..8055477d8 100644 --- a/src/EncodingCED.cpp +++ b/src/EncodingCED.cpp @@ -117,9 +117,9 @@ 
extern "C" { #define ENC_PARSE_NAM_ISO_2022_JP "ISO-2022-jp,iso2022jp," #define ENC_PARSE_NAM_ISO_2022_KR "ISO-2022-kr,iso2022kr,csiso2022kr," #define ENC_PARSE_NAM_X_CHINESE_CNS "x-Chinese-CNS,xchinesecns," -#define ENC_PARSE_NAM_JOHAB "johab,johab," -#define ENC_PARSE_NAM_ISO_8859_10 "ISO-8859-10,iso885910,Windows-28600,Windows28600," -#define ENC_PARSE_NAM_BIG5_HKSCS "big5hkscs,cnbig5hkscs,xxbig5hkscs," +//#define ENC_PARSE_NAM_JOHAB "johab,johab," +//#define ENC_PARSE_NAM_ISO_8859_10 "ISO-8859-10,iso885910,Windows-28600,Windows28600," +//#define ENC_PARSE_NAM_BIG5_HKSCS "big5hkscs,cnbig5hkscs,xxbig5hkscs," //============================================================================= @@ -204,10 +204,10 @@ extern "C" NP2ENCODING g_Encodings[] = { /* 076 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 52936, ENC_PARSE_NAM_HZ_GB2312, IDS_ENC_HZ_GB2312, HZ_GB_2312, L"" }, // Chinese Simplified (HZ-GB2312) /* 077 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 50220, ENC_PARSE_NAM_ISO_2022_JP, IDS_ENC_ISO_2022_JP, KDDI_ISO_2022_JP, L"" }, // Japanese (JIS) /* 078 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 50225, ENC_PARSE_NAM_ISO_2022_KR, IDS_ENC_ISO_2022_KR, ISO_2022_KR, L"" }, // Korean (ISO-2022-KR) - /* 079 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20000, ENC_PARSE_NAM_X_CHINESE_CNS, IDS_ENC_X_CHINESE_CNS, CHINESE_CNS, L"" }, // Chinese Traditional (CNS) - /* 080 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1361, ENC_PARSE_NAM_JOHAB, IDS_ENC_JOHAB, CED_NO_MAPPING, L"" }, // Korean (Johab) - /* 081 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28600, ENC_PARSE_NAM_ISO_8859_10, IDS_ENC_ISO_8859_10, ISO_8859_10, L"" }, // Nordic (ISO 8859-10) - /* 082 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 951, ENC_PARSE_NAM_BIG5_HKSCS, IDS_ENC_BIG5_HKSCS, BIG5_HKSCS, L"" } // Chinese (Hong Kong Supplementary Character Set) + /* 079 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20000, ENC_PARSE_NAM_X_CHINESE_CNS, IDS_ENC_X_CHINESE_CNS, CHINESE_CNS, L"" } // Chinese Traditional (CNS) + ///* 080 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1361, 
ENC_PARSE_NAM_JOHAB, IDS_ENC_JOHAB, CED_NO_MAPPING, L"" }, // Korean (Johab) + ///* 081 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28600, ENC_PARSE_NAM_ISO_8859_10, IDS_ENC_ISO_8859_10, ISO_8859_10, L"" }, // Nordic (ISO 8859-10) + ///* 082 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 951, ENC_PARSE_NAM_BIG5_HKSCS, IDS_ENC_BIG5_HKSCS, BIG5_HKSCS, L"" } // Chinese (Hong Kong Supplementary Character Set) #if 0 @@ -322,6 +322,7 @@ static int __fastcall FindCodePage(const Encoding& encoding) iCodePage = 1250; break; case ISO_8859_4: + case ISO_8859_10: iCodePage = 1257; break; case ISO_8859_5: @@ -410,7 +411,7 @@ static int __fastcall MapEncoding2CPI(const char* const text, const size_t len, { bool bBOM; bool bReverse; - if (IsUnicode(text, len, &bBOM, &bReverse)) { + if (IsValidUnicode(text, len, &bBOM, &bReverse)) { iNP3Encoding = bBOM ? (bReverse ? CPI_UNICODEBEBOM : CPI_UNICODEBOM) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE); } }