+ fix: renewed UTF-8 and Unicode detection

+ rev: reverting some new encodings (needs handling of code pages that are not installed)
This commit is contained in:
Rainer Kottenhoff 2018-08-21 16:22:30 +02:00
parent 26e32752c2
commit 4582aa487d
5 changed files with 171 additions and 192 deletions

View File

@ -127,7 +127,7 @@ int MsgBoxLng(int iType, UINT uIdMsg, ...)
if (uIdMsg == IDS_MUI_ERR_LOADFILE || uIdMsg == IDS_MUI_ERR_SAVEFILE ||
uIdMsg == IDS_MUI_CREATEINI_FAIL || uIdMsg == IDS_MUI_WRITEINI_FAIL ||
uIdMsg == IDS_MUI_EXPORT_FAIL) {
LPVOID lpMsgBuf;
LPVOID lpMsgBuf = NULL;
WCHAR wcht;
FormatMessage(
FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
@ -137,10 +137,12 @@ int MsgBoxLng(int iType, UINT uIdMsg, ...)
(LPTSTR)&lpMsgBuf,
0,
NULL);
StrTrim(lpMsgBuf, L" \a\b\f\n\r\t\v");
StringCchCat(szText, COUNTOF(szText), L"\n");
StringCchCat(szText, COUNTOF(szText), lpMsgBuf);
LocalFree(lpMsgBuf);
if (lpMsgBuf) {
StrTrim(lpMsgBuf, L" \a\b\f\n\r\t\v");
StringCchCat(szText, COUNTOF(szText), L"\n");
StringCchCat(szText, COUNTOF(szText), lpMsgBuf);
LocalFree(lpMsgBuf);
}
wcht = *CharPrev(szText, StrEnd(szText));
if (IsCharAlphaNumeric(wcht) || wcht == '"' || wcht == '\'')
StringCchCat(szText, COUNTOF(szText), L".");
@ -2281,12 +2283,12 @@ INT_PTR CALLBACK SelectDefEncodingDlgProc(HWND hwnd,UINT umsg,WPARAM wParam,LPAR
EndDialog(hwnd,IDCANCEL);
}
else {
bUseDefaultForFileEncoding = (IsDlgButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) == BST_CHECKED) ? 1 : 0;
bSkipUnicodeDetection = (IsDlgButtonChecked(hwnd,IDC_NOUNICODEDETECTION) == BST_CHECKED) ? 1 : 0;
bSkipANSICodePageDetection = (IsDlgButtonChecked(hwnd, IDC_NOANSICPDETECTION) == BST_CHECKED) ? 1 : 0;
bLoadASCIIasUTF8 = (IsDlgButtonChecked(hwnd,IDC_ASCIIASUTF8) == BST_CHECKED) ? 1 : 0;
bLoadNFOasOEM = (IsDlgButtonChecked(hwnd,IDC_NFOASOEM) == BST_CHECKED) ? 1 : 0;
bNoEncodingTags = (IsDlgButtonChecked(hwnd,IDC_ENCODINGFROMFILEVARS) == BST_CHECKED) ? 1 : 0;
bUseDefaultForFileEncoding = (IsDlgButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) == BST_CHECKED);
bSkipUnicodeDetection = (IsDlgButtonChecked(hwnd,IDC_NOUNICODEDETECTION) == BST_CHECKED);
bSkipANSICodePageDetection = (IsDlgButtonChecked(hwnd, IDC_NOANSICPDETECTION) == BST_CHECKED);
bLoadASCIIasUTF8 = (IsDlgButtonChecked(hwnd,IDC_ASCIIASUTF8) == BST_CHECKED);
bLoadNFOasOEM = (IsDlgButtonChecked(hwnd,IDC_NFOASOEM) == BST_CHECKED);
bNoEncodingTags = (IsDlgButtonChecked(hwnd,IDC_ENCODINGFROMFILEVARS) == BST_CHECKED);
EndDialog(hwnd,IDOK);
}
}

View File

@ -96,6 +96,7 @@ extern int g_iDefaultCharSet;
extern bool bLoadASCIIasUTF8;
extern bool bForceLoadASCIIasUTF8;
extern bool bLoadNFOasOEM;
extern bool bNoEncodingTags;
extern bool g_bAccelWordNavigation;
@ -1056,13 +1057,10 @@ bool EditLoadFile(
if (!Encoding_IsNONE(iForcedEncoding)) {
iPreferedEncoding = iForcedEncoding;
}
else if (Encoding_IsUNICODE(iAnalyzedEncoding) && !bSkipUTFDetection) {
iPreferedEncoding = iAnalyzedEncoding;
}
else if (iFileEncWeak != CPI_NONE) {
iPreferedEncoding = iFileEncWeak;
}
else if (!Encoding_IsNONE(iAnalyzedEncoding) && bIsReliable ) {
else if (!Encoding_IsNONE(iAnalyzedEncoding) && bIsReliable) {
iPreferedEncoding = iAnalyzedEncoding;
}
else if (Encoding_IsNONE(iPreferedEncoding)) {
@ -1087,7 +1085,8 @@ bool EditLoadFile(
// === UNICODE ===
else if (Encoding_IsUNICODE(iForcedEncoding) ||
(Encoding_IsNONE(iForcedEncoding) && !bSkipUTFDetection && !bIsUTF8Sig
&& (IsUnicode(lpData, cbData, &bBOM, &bReverse) || (Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable))
&& (IsValidUnicode(lpData, cbData, &bBOM, &bReverse)
|| (Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable))
)
)
{
@ -1143,16 +1142,16 @@ bool EditLoadFile(
FileVars_Init(lpData,cbData,&fvCurFile);
// === UTF-8 ===
if (Encoding_IsUTF8(iForcedEncoding) ||
(Encoding_IsNONE(iForcedEncoding) && !bSkipUTFDetection && !FileVars_IsNonUTF8(&fvCurFile)
&& (bIsUTF8Sig
|| FileVars_IsUTF8(&fvCurFile)
|| (Encoding_IsUTF8(iAnalyzedEncoding) && bIsReliable)
|| (!bNfoDizDetected && (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8))
)
&& (IsUTF8(lpData, cbData) && !UTF8_ContainsInvalidChars(lpData, cbData))
)
)
bool const bHardRulesUTF8 = Encoding_IsUTF8(iForcedEncoding) || (FileVars_IsUTF8(&fvCurFile) && !bNoEncodingTags);
bool const bForcedNonUTF8 = !Encoding_IsNONE(iForcedEncoding) && !Encoding_IsUTF8(iForcedEncoding);
bool const bValidUTF8 = IsValidUTF8(lpData, cbData);
bool const bAnalysisUTF8 = Encoding_IsUTF8(iAnalyzedEncoding) && bIsReliable;
bool const bSoftHintUTF8 = (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8);
bool const bRejectUTF8 = bSkipUTFDetection || bForcedNonUTF8 || (FileVars_IsNonUTF8(&fvCurFile) && !bNoEncodingTags);
if (bHardRulesUTF8 || (!bRejectUTF8 && bValidUTF8 && (bIsUTF8Sig || bAnalysisUTF8 || bSoftHintUTF8)))
{
EditSetNewText(hwnd,"",0);
if (bIsUTF8Sig) {
@ -1181,7 +1180,7 @@ bool EditLoadFile(
}
if (((Encoding_GetCodePage(*iEncoding) != CP_UTF7) && Encoding_IsEXTERNAL_8BIT(*iEncoding)) ||
((Encoding_GetCodePage(*iEncoding) == CP_UTF7) && IsUTF7(lpData,cbData))) {
((Encoding_GetCodePage(*iEncoding) == CP_UTF7) && IsValidUTF7(lpData,cbData))) {
UINT uCodePage = Encoding_GetCodePage(*iEncoding);

View File

@ -587,7 +587,7 @@ const char* Encoding_GetParseNames(int iEncoding) {
// ============================================================================
bool IsUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse)
bool IsValidUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse)
{
if (!pBuffer || cb < 2) { return false; }
@ -630,7 +630,7 @@ bool IsUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse)
// ============================================================================
bool IsUTF7(const char* pTest, size_t nLength) {
bool IsValidUTF7(const char* pTest, size_t nLength) {
const char *pt = pTest;
for (size_t i = 0; i < nLength; i++) {
@ -638,7 +638,6 @@ bool IsUTF7(const char* pTest, size_t nLength) {
return false;
pt++;
}
return true;
}
// ============================================================================
@ -648,7 +647,124 @@ bool IsUTF7(const char* pTest, size_t nLength) {
//#define _OLD_UTF8_VALIDATOR_ 1
#ifdef _OLD_UTF8_VALIDATOR_
bool IsUTF8(const char* pTest, size_t nLength)
// ============================================================================
/* Byte length of a UTF-8 sequence, indexed by the upper 4 bits of its first
   (lead) byte. Sequences longer than 4 bytes cannot be represented in UTF-16
   (21-bit code space), so the upper nibble is enough to classify a lead byte.
   An entry of 0 marks a byte of the form 10xxxxxx, which cannot start a
   sequence. The last entry covers every byte 0xF0..0xFF; the functions using
   this table additionally reject lead bytes above 0xF7.
*/
static const size_t utf8_lengths[16] =
{
1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */
0,0,0,0, /* 1000 to 1011 : not valid */
2,2, /* 1100, 1101 : 2 bytes */
3, /* 1110 : 3 bytes */
4 /* 1111 : 4 bytes */
};
// ----------------------------------------------------------------------------
/*++
Function :
  UTF8_mbslen_bytes [INTERNAL]

  Calculates the byte size of a NUL-terminated UTF-8 string.

Parameters :
  char *utf8_string : NUL-terminated string to examine (must not be NULL)

Return value :
  size (in bytes) of the string, including the NUL terminator.
  Invalid lead bytes are counted as single bytes; no error value is returned.
--*/
size_t __fastcall UTF8_mbslen_bytes(LPCSTR utf8_string)
{
  size_t length = 0;

  while (*utf8_string)
  {
    BYTE const lead = (BYTE)*utf8_string;

    /* expected sequence length taken from the lead byte; an invalid lead
       byte still has to be counted (as a single byte), it will be ignored
       later during the string conversion */
    size_t remaining = (lead <= 0xF7) ? utf8_lengths[lead >> 4] : 0;
    if (remaining == 0) {
      remaining = 1;
    }

    /* consume the sequence byte by byte and stop at the terminator, so a
       truncated sequence at the end of the string never moves the pointer
       beyond the NUL */
    do {
      ++length;
      ++utf8_string;
    } while ((--remaining > 0) && *utf8_string);
  }

  return length + 1; /* include NUL terminator */
}
// ----------------------------------------------------------------------------
/*++
Function :
  UTF8_mbslen [INTERNAL]

  Calculates the number of UTF-16 code units described by a UTF-8 buffer
  of the given byte length.

Parameters :
  char *utf8_string  : buffer to examine (length is given explicitly)
  size_t byte_length : byte size of buffer

Return value :
  number of UTF-16 code units of all sequences accepted by the lead-byte
  check: 1 per 1- to 3-byte sequence, 2 per 4-byte sequence.
  Invalid bytes are skipped and not counted; no error value is returned.
--*/
size_t __fastcall UTF8_mbslen(LPCSTR utf8_string, size_t byte_length)
{
  size_t wchar_length = 0;

  while (byte_length > 0) {
    BYTE const lead = (BYTE)*utf8_string;

    /* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value
       for first byte is 11110111. Use lookup table to determine sequence
       length based on upper 4 bits of first byte */
    size_t const code_size = (lead <= 0xF7) ? utf8_lengths[lead >> 4] : 0;

    if ((code_size != 0) && (code_size <= byte_length)) {
      /* 1 sequence == 1 character; a 4-byte sequence needs two code units */
      wchar_length += (code_size == 4) ? 2 : 1;
      utf8_string += code_size; /* increment pointer */
      byte_length -= code_size; /* decrement counter, cannot underflow here */
    }
    else {
      /*
         invalid lead byte, or a sequence that would extend beyond the end of
         the buffer: skip this byte without counting it. Only the number of
         valid characters encountered is reported (per the original note: to
         match the Windows behavior). Rejecting the truncated sequence here
         keeps the unsigned counter from wrapping around.
      */
      ++utf8_string;
      --byte_length;
    }
  }
  return wchar_length;
}
// ----------------------------------------------------------------------------
/* Returns true if the byte count of the NUL-terminated text (terminator
   excluded) differs from the count UTF8_mbslen reports for the buffer.
   The text start is obtained from UTF8StringStart (presumably skips a UTF-8
   signature - verify); if IsUTF8Signature reports a signature, 3 bytes are
   subtracted from the buffer length.
   NOTE(review): UTF8_mbslen counts one unit per multi-byte sequence (two for
   a 4-byte sequence), so the two counts also differ for well-formed
   non-ASCII text - confirm this is what callers expect. */
bool __fastcall UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
{
  LPCSTR const text = UTF8StringStart(utf8_string);
  size_t const text_length = IsUTF8Signature(utf8_string) ? (byte_length - 3) : byte_length;

  size_t const bytes_without_nul = UTF8_mbslen_bytes(text) - 1;
  size_t const counted_units = UTF8_mbslen(text, text_length);

  return (bytes_without_nul != counted_units);
}
// ----------------------------------------------------------------------------
bool IsValidUTF8(const char* pTest, size_t nLength)
{
static int byte_class_table[256] = {
/* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */
@ -705,121 +821,7 @@ bool IsUTF8(const char* pTest, size_t nLength)
break;
}
return (current == kSTART) ? true : false;
}
// ============================================================================
/* Byte length of a UTF-8 sequence, indexed by the upper 4 bits of its first
   (lead) byte. Sequences longer than 4 bytes cannot be represented in UTF-16
   (21-bit code space), so the upper nibble is enough to classify a lead byte.
   An entry of 0 marks a byte of the form 10xxxxxx, which cannot start a
   sequence. The last entry covers every byte 0xF0..0xFF; the functions using
   this table additionally reject lead bytes above 0xF7.
*/
static const size_t utf8_lengths[16] =
{
1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */
0,0,0,0, /* 1000 to 1011 : not valid */
2,2, /* 1100, 1101 : 2 bytes */
3, /* 1110 : 3 bytes */
4 /* 1111 : 4 bytes */
};
// ----------------------------------------------------------------------------
/*++
Function :
  UTF8_mbslen_bytes [INTERNAL]

  Calculates the byte size of a NUL-terminated UTF-8 string.

Parameters :
  char *utf8_string : NUL-terminated string to examine (must not be NULL)

Return value :
  size (in bytes) of the string, including the NUL terminator.
  Invalid lead bytes are counted as single bytes; no error value is returned.
--*/
size_t UTF8_mbslen_bytes(LPCSTR utf8_string)
{
  size_t length = 0;

  while (*utf8_string)
  {
    BYTE const lead = (BYTE)*utf8_string;

    /* expected sequence length taken from the lead byte; an invalid lead
       byte still has to be counted (as a single byte), it will be ignored
       later during the string conversion */
    size_t remaining = (lead <= 0xF7) ? utf8_lengths[lead >> 4] : 0;
    if (remaining == 0) {
      remaining = 1;
    }

    /* consume the sequence byte by byte and stop at the terminator, so a
       truncated sequence at the end of the string never moves the pointer
       beyond the NUL */
    do {
      ++length;
      ++utf8_string;
    } while ((--remaining > 0) && *utf8_string);
  }

  return length + 1; /* include NUL terminator */
}
// ----------------------------------------------------------------------------
/*++
Function :
  UTF8_mbslen [INTERNAL]

  Calculates the number of UTF-16 code units described by a UTF-8 buffer
  of the given byte length.

Parameters :
  char *utf8_string  : buffer to examine (length is given explicitly)
  size_t byte_length : byte size of buffer

Return value :
  number of UTF-16 code units of all sequences accepted by the lead-byte
  check: 1 per 1- to 3-byte sequence, 2 per 4-byte sequence.
  Invalid bytes are skipped and not counted; no error value is returned.
--*/
size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length)
{
  size_t wchar_length = 0;

  while (byte_length > 0) {
    BYTE const lead = (BYTE)*utf8_string;

    /* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value
       for first byte is 11110111. Use lookup table to determine sequence
       length based on upper 4 bits of first byte */
    size_t const code_size = (lead <= 0xF7) ? utf8_lengths[lead >> 4] : 0;

    if ((code_size != 0) && (code_size <= byte_length)) {
      /* 1 sequence == 1 character; a 4-byte sequence needs two code units */
      wchar_length += (code_size == 4) ? 2 : 1;
      utf8_string += code_size; /* increment pointer */
      byte_length -= code_size; /* decrement counter, cannot underflow here */
    }
    else {
      /*
         invalid lead byte, or a sequence that would extend beyond the end of
         the buffer: skip this byte without counting it. Only the number of
         valid characters encountered is reported (per the original note: to
         match the Windows behavior). Rejecting the truncated sequence here
         keeps the unsigned counter from wrapping around.
      */
      ++utf8_string;
      --byte_length;
    }
  }
  return wchar_length;
}
// ----------------------------------------------------------------------------
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
{
return ((UTF8_mbslen_bytes(UTF8StringStart(utf8_string)) - 1) !=
UTF8_mbslen(UTF8StringStart(utf8_string), IsUTF8Signature(utf8_string) ? (byte_length - 3) : byte_length));
return (current == kSTART) && !UTF8_ContainsInvalidChars(pTest, nLength);
}
@ -831,17 +833,13 @@ bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// States of the UTF-8 validation automaton as kept in s_State:
//   UTF8_ACCEPT - start state; IsUTF8() reports success if the scan ends here
//   UTF8_REJECT - IsUTF8() returns false as soon as this state is reached
//   UTF8_NOTEST - value held while no IsUTF8() result is pending
enum {
UTF8_ACCEPT = 0,
UTF8_REJECT = 12,
UTF8_NOTEST = 113
};
// State left behind by the last IsUTF8() scan; read and reset to UTF8_NOTEST
// by UTF8_ContainsInvalidChars(). File-scope mutable state shared by both.
static UINT s_State = UTF8_NOTEST;
bool IsUTF8(const char* pTest, size_t nLength)
bool IsValidUTF8(const char* pTest, size_t nLength)
{
enum {
UTF8_ACCEPT = 0,
UTF8_REJECT = 12
};
static const unsigned char utf8_dfa[] = {
// The first part of the table maps bytes to character classes
// to reduce the size of the transition table and create bitmasks.
@ -866,32 +864,16 @@ bool IsUTF8(const char* pTest, size_t nLength)
const unsigned char *pt = (const unsigned char *)pTest;
const unsigned char *end = pt + nLength;
s_State = UTF8_ACCEPT;
UINT state = UTF8_ACCEPT;
while (pt < end && *pt) {
s_State = utf8_dfa[256 + s_State + utf8_dfa[*pt++]];
if (s_State == UTF8_REJECT) {
state = utf8_dfa[256 + state + utf8_dfa[*pt++]];
if (state == UTF8_REJECT) {
return false;
}
}
return (s_State == UTF8_ACCEPT);
return (state == UTF8_ACCEPT);
}
// ----------------------------------------------------------------------------
// Reports whether the UTF-8 scan of the text ended in the reject state.
//   utf8_string, byte_length : buffer to validate; only scanned here if no
//                              IsUTF8() result is pending in s_State
// Returns true if the automaton rejected the input, false otherwise.
// Side effect: s_State is reset to UTF8_NOTEST, so the result of a preceding
// IsUTF8() call can be consumed only once.
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
{
  if (s_State == UTF8_NOTEST) {
    // no pending result: run the scan now. Its boolean result means "valid"
    // and must not be used directly as "contains invalid"; the final state
    // is evaluated below exactly as for a pending result.
    (void)IsUTF8(utf8_string, byte_length);
  }

  bool const result = (s_State == UTF8_REJECT);

  s_State = UTF8_NOTEST; // reset: old way, call IsUTF8() before
  return result;
}
// ----------------------------------------------------------------------------
#endif

View File

@ -128,14 +128,9 @@ const char* Encoding_GetParseNames(int);
#define Has_UTF16_LE_BOM(p) (*((UNALIGNED wchar_t*)(p)) == 0xFEFF)
#define Has_UTF16_BE_BOM(p) (*((UNALIGNED wchar_t*)(p)) == 0xFFFE) /* reverse */
bool IsUnicode(const char*, size_t, bool*, bool*);
bool IsUTF8(const char*, size_t);
bool IsUTF7(const char*, size_t);
size_t UTF8_mbslen_bytes(LPCSTR utf8_string);
size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length);
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length);
bool IsValidUnicode(const char*, size_t, bool*, bool*);
bool IsValidUTF7(const char*, size_t);
bool IsValidUTF8(const char*, size_t);
// Google's "Compact Encoding Detection"
extern NP2ENCODING g_Encodings[];

View File

@ -117,9 +117,9 @@ extern "C" {
#define ENC_PARSE_NAM_ISO_2022_JP "ISO-2022-jp,iso2022jp,"
#define ENC_PARSE_NAM_ISO_2022_KR "ISO-2022-kr,iso2022kr,csiso2022kr,"
#define ENC_PARSE_NAM_X_CHINESE_CNS "x-Chinese-CNS,xchinesecns,"
#define ENC_PARSE_NAM_JOHAB "johab,johab,"
#define ENC_PARSE_NAM_ISO_8859_10 "ISO-8859-10,iso885910,Windows-28600,Windows28600,"
#define ENC_PARSE_NAM_BIG5_HKSCS "big5hkscs,cnbig5hkscs,xxbig5hkscs,"
//#define ENC_PARSE_NAM_JOHAB "johab,johab,"
//#define ENC_PARSE_NAM_ISO_8859_10 "ISO-8859-10,iso885910,Windows-28600,Windows28600,"
//#define ENC_PARSE_NAM_BIG5_HKSCS "big5hkscs,cnbig5hkscs,xxbig5hkscs,"
//=============================================================================
@ -204,10 +204,10 @@ extern "C" NP2ENCODING g_Encodings[] = {
/* 076 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 52936, ENC_PARSE_NAM_HZ_GB2312, IDS_ENC_HZ_GB2312, HZ_GB_2312, L"" }, // Chinese Simplified (HZ-GB2312)
/* 077 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 50220, ENC_PARSE_NAM_ISO_2022_JP, IDS_ENC_ISO_2022_JP, KDDI_ISO_2022_JP, L"" }, // Japanese (JIS)
/* 078 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 50225, ENC_PARSE_NAM_ISO_2022_KR, IDS_ENC_ISO_2022_KR, ISO_2022_KR, L"" }, // Korean (ISO-2022-KR)
/* 079 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20000, ENC_PARSE_NAM_X_CHINESE_CNS, IDS_ENC_X_CHINESE_CNS, CHINESE_CNS, L"" }, // Chinese Traditional (CNS)
/* 080 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1361, ENC_PARSE_NAM_JOHAB, IDS_ENC_JOHAB, CED_NO_MAPPING, L"" }, // Korean (Johab)
/* 081 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28600, ENC_PARSE_NAM_ISO_8859_10, IDS_ENC_ISO_8859_10, ISO_8859_10, L"" }, // Nordic (ISO 8859-10)
/* 082 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 951, ENC_PARSE_NAM_BIG5_HKSCS, IDS_ENC_BIG5_HKSCS, BIG5_HKSCS, L"" } // Chinese (Hong Kong Supplementary Character Set)
/* 079 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20000, ENC_PARSE_NAM_X_CHINESE_CNS, IDS_ENC_X_CHINESE_CNS, CHINESE_CNS, L"" } // Chinese Traditional (CNS)
///* 080 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1361, ENC_PARSE_NAM_JOHAB, IDS_ENC_JOHAB, CED_NO_MAPPING, L"" }, // Korean (Johab)
///* 081 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28600, ENC_PARSE_NAM_ISO_8859_10, IDS_ENC_ISO_8859_10, ISO_8859_10, L"" }, // Nordic (ISO 8859-10)
///* 082 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 951, ENC_PARSE_NAM_BIG5_HKSCS, IDS_ENC_BIG5_HKSCS, BIG5_HKSCS, L"" } // Chinese (Hong Kong Supplementary Character Set)
#if 0
@ -322,6 +322,7 @@ static int __fastcall FindCodePage(const Encoding& encoding)
iCodePage = 1250;
break;
case ISO_8859_4:
case ISO_8859_10:
iCodePage = 1257;
break;
case ISO_8859_5:
@ -410,7 +411,7 @@ static int __fastcall MapEncoding2CPI(const char* const text, const size_t len,
{
bool bBOM;
bool bReverse;
if (IsUnicode(text, len, &bBOM, &bReverse)) {
if (IsValidUnicode(text, len, &bBOM, &bReverse)) {
iNP3Encoding = bBOM ? (bReverse ? CPI_UNICODEBEBOM : CPI_UNICODEBOM) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE);
}
}