mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-11 21:03:05 +08:00
+ fix: renewed UTF-8 and Unicode detection
+ rev: reverting some new encodings (needs handling for not installed codepages)
This commit is contained in:
parent
26e32752c2
commit
4582aa487d
@ -127,7 +127,7 @@ int MsgBoxLng(int iType, UINT uIdMsg, ...)
|
||||
if (uIdMsg == IDS_MUI_ERR_LOADFILE || uIdMsg == IDS_MUI_ERR_SAVEFILE ||
|
||||
uIdMsg == IDS_MUI_CREATEINI_FAIL || uIdMsg == IDS_MUI_WRITEINI_FAIL ||
|
||||
uIdMsg == IDS_MUI_EXPORT_FAIL) {
|
||||
LPVOID lpMsgBuf;
|
||||
LPVOID lpMsgBuf = NULL;
|
||||
WCHAR wcht;
|
||||
FormatMessage(
|
||||
FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
@ -137,10 +137,12 @@ int MsgBoxLng(int iType, UINT uIdMsg, ...)
|
||||
(LPTSTR)&lpMsgBuf,
|
||||
0,
|
||||
NULL);
|
||||
StrTrim(lpMsgBuf, L" \a\b\f\n\r\t\v");
|
||||
StringCchCat(szText, COUNTOF(szText), L"\n");
|
||||
StringCchCat(szText, COUNTOF(szText), lpMsgBuf);
|
||||
LocalFree(lpMsgBuf);
|
||||
if (lpMsgBuf) {
|
||||
StrTrim(lpMsgBuf, L" \a\b\f\n\r\t\v");
|
||||
StringCchCat(szText, COUNTOF(szText), L"\n");
|
||||
StringCchCat(szText, COUNTOF(szText), lpMsgBuf);
|
||||
LocalFree(lpMsgBuf);
|
||||
}
|
||||
wcht = *CharPrev(szText, StrEnd(szText));
|
||||
if (IsCharAlphaNumeric(wcht) || wcht == '"' || wcht == '\'')
|
||||
StringCchCat(szText, COUNTOF(szText), L".");
|
||||
@ -2281,12 +2283,12 @@ INT_PTR CALLBACK SelectDefEncodingDlgProc(HWND hwnd,UINT umsg,WPARAM wParam,LPAR
|
||||
EndDialog(hwnd,IDCANCEL);
|
||||
}
|
||||
else {
|
||||
bUseDefaultForFileEncoding = (IsDlgButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) == BST_CHECKED) ? 1 : 0;
|
||||
bSkipUnicodeDetection = (IsDlgButtonChecked(hwnd,IDC_NOUNICODEDETECTION) == BST_CHECKED) ? 1 : 0;
|
||||
bSkipANSICodePageDetection = (IsDlgButtonChecked(hwnd, IDC_NOANSICPDETECTION) == BST_CHECKED) ? 1 : 0;
|
||||
bLoadASCIIasUTF8 = (IsDlgButtonChecked(hwnd,IDC_ASCIIASUTF8) == BST_CHECKED) ? 1 : 0;
|
||||
bLoadNFOasOEM = (IsDlgButtonChecked(hwnd,IDC_NFOASOEM) == BST_CHECKED) ? 1 : 0;
|
||||
bNoEncodingTags = (IsDlgButtonChecked(hwnd,IDC_ENCODINGFROMFILEVARS) == BST_CHECKED) ? 1 : 0;
|
||||
bUseDefaultForFileEncoding = (IsDlgButtonChecked(hwnd, IDC_USEASREADINGFALLBACK) == BST_CHECKED);
|
||||
bSkipUnicodeDetection = (IsDlgButtonChecked(hwnd,IDC_NOUNICODEDETECTION) == BST_CHECKED);
|
||||
bSkipANSICodePageDetection = (IsDlgButtonChecked(hwnd, IDC_NOANSICPDETECTION) == BST_CHECKED);
|
||||
bLoadASCIIasUTF8 = (IsDlgButtonChecked(hwnd,IDC_ASCIIASUTF8) == BST_CHECKED);
|
||||
bLoadNFOasOEM = (IsDlgButtonChecked(hwnd,IDC_NFOASOEM) == BST_CHECKED);
|
||||
bNoEncodingTags = (IsDlgButtonChecked(hwnd,IDC_ENCODINGFROMFILEVARS) == BST_CHECKED);
|
||||
EndDialog(hwnd,IDOK);
|
||||
}
|
||||
}
|
||||
|
||||
31
src/Edit.c
31
src/Edit.c
@ -96,6 +96,7 @@ extern int g_iDefaultCharSet;
|
||||
extern bool bLoadASCIIasUTF8;
|
||||
extern bool bForceLoadASCIIasUTF8;
|
||||
extern bool bLoadNFOasOEM;
|
||||
extern bool bNoEncodingTags;
|
||||
|
||||
extern bool g_bAccelWordNavigation;
|
||||
|
||||
@ -1056,13 +1057,10 @@ bool EditLoadFile(
|
||||
if (!Encoding_IsNONE(iForcedEncoding)) {
|
||||
iPreferedEncoding = iForcedEncoding;
|
||||
}
|
||||
else if (Encoding_IsUNICODE(iAnalyzedEncoding) && !bSkipUTFDetection) {
|
||||
iPreferedEncoding = iAnalyzedEncoding;
|
||||
}
|
||||
else if (iFileEncWeak != CPI_NONE) {
|
||||
iPreferedEncoding = iFileEncWeak;
|
||||
}
|
||||
else if (!Encoding_IsNONE(iAnalyzedEncoding) && bIsReliable ) {
|
||||
else if (!Encoding_IsNONE(iAnalyzedEncoding) && bIsReliable) {
|
||||
iPreferedEncoding = iAnalyzedEncoding;
|
||||
}
|
||||
else if (Encoding_IsNONE(iPreferedEncoding)) {
|
||||
@ -1087,7 +1085,8 @@ bool EditLoadFile(
|
||||
// === UNICODE ===
|
||||
else if (Encoding_IsUNICODE(iForcedEncoding) ||
|
||||
(Encoding_IsNONE(iForcedEncoding) && !bSkipUTFDetection && !bIsUTF8Sig
|
||||
&& (IsUnicode(lpData, cbData, &bBOM, &bReverse) || (Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable))
|
||||
&& (IsValidUnicode(lpData, cbData, &bBOM, &bReverse)
|
||||
|| (Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable))
|
||||
)
|
||||
)
|
||||
{
|
||||
@ -1143,16 +1142,16 @@ bool EditLoadFile(
|
||||
FileVars_Init(lpData,cbData,&fvCurFile);
|
||||
|
||||
// === UTF-8 ===
|
||||
if (Encoding_IsUTF8(iForcedEncoding) ||
|
||||
(Encoding_IsNONE(iForcedEncoding) && !bSkipUTFDetection && !FileVars_IsNonUTF8(&fvCurFile)
|
||||
&& (bIsUTF8Sig
|
||||
|| FileVars_IsUTF8(&fvCurFile)
|
||||
|| (Encoding_IsUTF8(iAnalyzedEncoding) && bIsReliable)
|
||||
|| (!bNfoDizDetected && (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8))
|
||||
)
|
||||
&& (IsUTF8(lpData, cbData) && !UTF8_ContainsInvalidChars(lpData, cbData))
|
||||
)
|
||||
)
|
||||
bool const bHardRulesUTF8 = Encoding_IsUTF8(iForcedEncoding) || (FileVars_IsUTF8(&fvCurFile) && !bNoEncodingTags);
|
||||
bool const bForcedNonUTF8 = !Encoding_IsNONE(iForcedEncoding) && !Encoding_IsUTF8(iForcedEncoding);
|
||||
|
||||
bool const bValidUTF8 = IsValidUTF8(lpData, cbData);
|
||||
bool const bAnalysisUTF8 = Encoding_IsUTF8(iAnalyzedEncoding) && bIsReliable;
|
||||
bool const bSoftHintUTF8 = (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8);
|
||||
|
||||
bool const bRejectUTF8 = bSkipUTFDetection || bForcedNonUTF8 || (FileVars_IsNonUTF8(&fvCurFile) && !bNoEncodingTags);
|
||||
|
||||
if (bHardRulesUTF8 || (!bRejectUTF8 && bValidUTF8 && (bIsUTF8Sig || bAnalysisUTF8 || bSoftHintUTF8)))
|
||||
{
|
||||
EditSetNewText(hwnd,"",0);
|
||||
if (bIsUTF8Sig) {
|
||||
@ -1181,7 +1180,7 @@ bool EditLoadFile(
|
||||
}
|
||||
|
||||
if (((Encoding_GetCodePage(*iEncoding) != CP_UTF7) && Encoding_IsEXTERNAL_8BIT(*iEncoding)) ||
|
||||
((Encoding_GetCodePage(*iEncoding) == CP_UTF7) && IsUTF7(lpData,cbData))) {
|
||||
((Encoding_GetCodePage(*iEncoding) == CP_UTF7) && IsValidUTF7(lpData,cbData))) {
|
||||
|
||||
UINT uCodePage = Encoding_GetCodePage(*iEncoding);
|
||||
|
||||
|
||||
280
src/Encoding.c
280
src/Encoding.c
@ -587,7 +587,7 @@ const char* Encoding_GetParseNames(int iEncoding) {
|
||||
// ============================================================================
|
||||
|
||||
|
||||
bool IsUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse)
|
||||
bool IsValidUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse)
|
||||
{
|
||||
if (!pBuffer || cb < 2) { return false; }
|
||||
|
||||
@ -630,7 +630,7 @@ bool IsUnicode(const char* pBuffer, size_t cb, bool* lpbBOM, bool* lpbReverse)
|
||||
// ============================================================================
|
||||
|
||||
|
||||
bool IsUTF7(const char* pTest, size_t nLength) {
|
||||
bool IsValidUTF7(const char* pTest, size_t nLength) {
|
||||
const char *pt = pTest;
|
||||
|
||||
for (size_t i = 0; i < nLength; i++) {
|
||||
@ -638,7 +638,6 @@ bool IsUTF7(const char* pTest, size_t nLength) {
|
||||
return false;
|
||||
pt++;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
// ============================================================================
|
||||
@ -648,7 +647,124 @@ bool IsUTF7(const char* pTest, size_t nLength) {
|
||||
//#define _OLD_UTF8_VALIDATOR_ 1
|
||||
#ifdef _OLD_UTF8_VALIDATOR_
|
||||
|
||||
bool IsUTF8(const char* pTest, size_t nLength)
|
||||
// ============================================================================
|
||||
|
||||
/* byte length of UTF-8 sequence based on value of first byte.
|
||||
for UTF-16 (21-bit space), max. code length is 4, so we only need to look
|
||||
at 4 upper bits.
|
||||
*/
|
||||
static const size_t utf8_lengths[16] =
|
||||
{
|
||||
1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */
|
||||
0,0,0,0, /* 1000 to 1011 : not valid */
|
||||
2,2, /* 1100, 1101 : 2 bytes */
|
||||
3, /* 1110 : 3 bytes */
|
||||
4 /* 1111 : 4 bytes */
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/*++
|
||||
Function :
|
||||
UTF8_mbslen_bytes [INTERNAL]
|
||||
|
||||
Calculates the byte size of a NULL-terminated UTF-8 string.
|
||||
|
||||
Parameters :
|
||||
char *utf8_string : string to examine
|
||||
|
||||
Return value :
|
||||
size (in bytes) of a NULL-terminated UTF-8 string.
|
||||
-1 if invalid NULL-terminated UTF-8 string
|
||||
--*/
|
||||
size_t __fastcall UTF8_mbslen_bytes(LPCSTR utf8_string)
|
||||
{
|
||||
size_t length = 0;
|
||||
size_t code_size;
|
||||
BYTE byte;
|
||||
|
||||
while (*utf8_string)
|
||||
{
|
||||
byte = (BYTE)*utf8_string;
|
||||
|
||||
if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) {
|
||||
length += code_size;
|
||||
utf8_string += code_size;
|
||||
}
|
||||
else {
|
||||
/* we got an invalid byte value but need to count it,
|
||||
it will be later ignored during the string conversion */
|
||||
//WARN("invalid first byte value 0x%02X in UTF-8 sequence!\n",byte);
|
||||
length++;
|
||||
utf8_string++;
|
||||
}
|
||||
}
|
||||
length++; /* include NULL terminator */
|
||||
return length;
|
||||
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/*++
|
||||
Function :
|
||||
UTF8_mbslen [INTERNAL]
|
||||
|
||||
Calculates the character size of a NULL-terminated UTF-8 string.
|
||||
|
||||
Parameters :
|
||||
char *utf8_string : string to examine
|
||||
int byte_length : byte size of string
|
||||
|
||||
Return value :
|
||||
size (in characters) of a UTF-8 string.
|
||||
-1 if invalid UTF-8 string
|
||||
--*/
|
||||
size_t __fastcall UTF8_mbslen(LPCSTR utf8_string, size_t byte_length)
|
||||
{
|
||||
size_t wchar_length = 0;
|
||||
size_t code_size;
|
||||
BYTE byte;
|
||||
|
||||
while (byte_length > 0) {
|
||||
byte = (BYTE)*utf8_string;
|
||||
|
||||
/* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value
|
||||
for first byte is 11110111. Use lookup table to determine sequence
|
||||
length based on upper 4 bits of first byte */
|
||||
if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) {
|
||||
/* 1 sequence == 1 character */
|
||||
wchar_length++;
|
||||
|
||||
if (code_size == 4)
|
||||
wchar_length++;
|
||||
|
||||
utf8_string += code_size; /* increment pointer */
|
||||
byte_length -= code_size; /* decrement counter*/
|
||||
}
|
||||
else {
|
||||
/*
|
||||
unlike UTF8_mbslen_bytes, we ignore the invalid characters.
|
||||
we only report the number of valid characters we have encountered
|
||||
to match the Windows behavior.
|
||||
*/
|
||||
//WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n", byte);
|
||||
utf8_string++;
|
||||
byte_length--;
|
||||
}
|
||||
}
|
||||
return wchar_length;
|
||||
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
bool __fastcall UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
|
||||
{
|
||||
return ((UTF8_mbslen_bytes(UTF8StringStart(utf8_string)) - 1) !=
|
||||
UTF8_mbslen(UTF8StringStart(utf8_string), IsUTF8Signature(utf8_string) ? (byte_length - 3) : byte_length));
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
bool IsValidUTF8(const char* pTest, size_t nLength)
|
||||
{
|
||||
static int byte_class_table[256] = {
|
||||
/* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */
|
||||
@ -705,121 +821,7 @@ bool IsUTF8(const char* pTest, size_t nLength)
|
||||
break;
|
||||
}
|
||||
|
||||
return (current == kSTART) ? true : false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
||||
/* byte length of UTF-8 sequence based on value of first byte.
|
||||
for UTF-16 (21-bit space), max. code length is 4, so we only need to look
|
||||
at 4 upper bits.
|
||||
*/
|
||||
static const size_t utf8_lengths[16] =
|
||||
{
|
||||
1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */
|
||||
0,0,0,0, /* 1000 to 1011 : not valid */
|
||||
2,2, /* 1100, 1101 : 2 bytes */
|
||||
3, /* 1110 : 3 bytes */
|
||||
4 /* 1111 : 4 bytes */
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/*++
|
||||
Function :
|
||||
UTF8_mbslen_bytes [INTERNAL]
|
||||
|
||||
Calculates the byte size of a NULL-terminated UTF-8 string.
|
||||
|
||||
Parameters :
|
||||
char *utf8_string : string to examine
|
||||
|
||||
Return value :
|
||||
size (in bytes) of a NULL-terminated UTF-8 string.
|
||||
-1 if invalid NULL-terminated UTF-8 string
|
||||
--*/
|
||||
size_t UTF8_mbslen_bytes(LPCSTR utf8_string)
|
||||
{
|
||||
size_t length = 0;
|
||||
size_t code_size;
|
||||
BYTE byte;
|
||||
|
||||
while (*utf8_string)
|
||||
{
|
||||
byte = (BYTE)*utf8_string;
|
||||
|
||||
if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) {
|
||||
length += code_size;
|
||||
utf8_string += code_size;
|
||||
}
|
||||
else {
|
||||
/* we got an invalid byte value but need to count it,
|
||||
it will be later ignored during the string conversion */
|
||||
//WARN("invalid first byte value 0x%02X in UTF-8 sequence!\n",byte);
|
||||
length++;
|
||||
utf8_string++;
|
||||
}
|
||||
}
|
||||
length++; /* include NULL terminator */
|
||||
return length;
|
||||
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/*++
|
||||
Function :
|
||||
UTF8_mbslen [INTERNAL]
|
||||
|
||||
Calculates the character size of a NULL-terminated UTF-8 string.
|
||||
|
||||
Parameters :
|
||||
char *utf8_string : string to examine
|
||||
int byte_length : byte size of string
|
||||
|
||||
Return value :
|
||||
size (in characters) of a UTF-8 string.
|
||||
-1 if invalid UTF-8 string
|
||||
--*/
|
||||
size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length)
|
||||
{
|
||||
size_t wchar_length = 0;
|
||||
size_t code_size;
|
||||
BYTE byte;
|
||||
|
||||
while (byte_length > 0) {
|
||||
byte = (BYTE)*utf8_string;
|
||||
|
||||
/* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value
|
||||
for first byte is 11110111. Use lookup table to determine sequence
|
||||
length based on upper 4 bits of first byte */
|
||||
if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) {
|
||||
/* 1 sequence == 1 character */
|
||||
wchar_length++;
|
||||
|
||||
if (code_size == 4)
|
||||
wchar_length++;
|
||||
|
||||
utf8_string += code_size; /* increment pointer */
|
||||
byte_length -= code_size; /* decrement counter*/
|
||||
}
|
||||
else {
|
||||
/*
|
||||
unlike UTF8_mbslen_bytes, we ignore the invalid characters.
|
||||
we only report the number of valid characters we have encountered
|
||||
to match the Windows behavior.
|
||||
*/
|
||||
//WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n", byte);
|
||||
utf8_string++;
|
||||
byte_length--;
|
||||
}
|
||||
}
|
||||
return wchar_length;
|
||||
}
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
|
||||
{
|
||||
return ((UTF8_mbslen_bytes(UTF8StringStart(utf8_string)) - 1) !=
|
||||
UTF8_mbslen(UTF8StringStart(utf8_string), IsUTF8Signature(utf8_string) ? (byte_length - 3) : byte_length));
|
||||
return (current == kSTART) && !UTF8_ContainsInvalidChars(pTest, nLength);
|
||||
}
|
||||
|
||||
|
||||
@ -831,17 +833,13 @@ bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
|
||||
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||
|
||||
|
||||
enum {
|
||||
UTF8_ACCEPT = 0,
|
||||
UTF8_REJECT = 12,
|
||||
UTF8_NOTEST = 113
|
||||
};
|
||||
|
||||
static UINT s_State = UTF8_NOTEST;
|
||||
|
||||
bool IsUTF8(const char* pTest, size_t nLength)
|
||||
bool IsValidUTF8(const char* pTest, size_t nLength)
|
||||
{
|
||||
enum {
|
||||
UTF8_ACCEPT = 0,
|
||||
UTF8_REJECT = 12
|
||||
};
|
||||
|
||||
static const unsigned char utf8_dfa[] = {
|
||||
// The first part of the table maps bytes to character classes that
|
||||
// to reduce the size of the transition table and create bitmasks.
|
||||
@ -866,32 +864,16 @@ bool IsUTF8(const char* pTest, size_t nLength)
|
||||
const unsigned char *pt = (const unsigned char *)pTest;
|
||||
const unsigned char *end = pt + nLength;
|
||||
|
||||
s_State = UTF8_ACCEPT;
|
||||
UINT state = UTF8_ACCEPT;
|
||||
while (pt < end && *pt) {
|
||||
s_State = utf8_dfa[256 + s_State + utf8_dfa[*pt++]];
|
||||
if (s_State == UTF8_REJECT) {
|
||||
state = utf8_dfa[256 + state + utf8_dfa[*pt++]];
|
||||
if (state == UTF8_REJECT) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return (s_State == UTF8_ACCEPT);
|
||||
return (state == UTF8_ACCEPT);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
|
||||
{
|
||||
bool result = true;
|
||||
if (s_State != UTF8_NOTEST) {
|
||||
result = (s_State == UTF8_REJECT);
|
||||
}
|
||||
else {
|
||||
result = IsUTF8(utf8_string, byte_length);
|
||||
}
|
||||
s_State = UTF8_NOTEST; // reset: old way, call IsUTF8() before
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
|
||||
@ -128,14 +128,9 @@ const char* Encoding_GetParseNames(int);
|
||||
#define Has_UTF16_LE_BOM(p) (*((UNALIGNED wchar_t*)(p)) == 0xFEFF)
|
||||
#define Has_UTF16_BE_BOM(p) (*((UNALIGNED wchar_t*)(p)) == 0xFFFE) /* reverse */
|
||||
|
||||
bool IsUnicode(const char*, size_t, bool*, bool*);
|
||||
bool IsUTF8(const char*, size_t);
|
||||
bool IsUTF7(const char*, size_t);
|
||||
|
||||
|
||||
size_t UTF8_mbslen_bytes(LPCSTR utf8_string);
|
||||
size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length);
|
||||
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length);
|
||||
bool IsValidUnicode(const char*, size_t, bool*, bool*);
|
||||
bool IsValidUTF7(const char*, size_t);
|
||||
bool IsValidUTF8(const char*, size_t);
|
||||
|
||||
// Google's "Compact Encoding Detection"
|
||||
extern NP2ENCODING g_Encodings[];
|
||||
|
||||
@ -117,9 +117,9 @@ extern "C" {
|
||||
#define ENC_PARSE_NAM_ISO_2022_JP "ISO-2022-jp,iso2022jp,"
|
||||
#define ENC_PARSE_NAM_ISO_2022_KR "ISO-2022-kr,iso2022kr,csiso2022kr,"
|
||||
#define ENC_PARSE_NAM_X_CHINESE_CNS "x-Chinese-CNS,xchinesecns,"
|
||||
#define ENC_PARSE_NAM_JOHAB "johab,johab,"
|
||||
#define ENC_PARSE_NAM_ISO_8859_10 "ISO-8859-10,iso885910,Windows-28600,Windows28600,"
|
||||
#define ENC_PARSE_NAM_BIG5_HKSCS "big5hkscs,cnbig5hkscs,xxbig5hkscs,"
|
||||
//#define ENC_PARSE_NAM_JOHAB "johab,johab,"
|
||||
//#define ENC_PARSE_NAM_ISO_8859_10 "ISO-8859-10,iso885910,Windows-28600,Windows28600,"
|
||||
//#define ENC_PARSE_NAM_BIG5_HKSCS "big5hkscs,cnbig5hkscs,xxbig5hkscs,"
|
||||
//=============================================================================
|
||||
|
||||
|
||||
@ -204,10 +204,10 @@ extern "C" NP2ENCODING g_Encodings[] = {
|
||||
/* 076 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 52936, ENC_PARSE_NAM_HZ_GB2312, IDS_ENC_HZ_GB2312, HZ_GB_2312, L"" }, // Chinese Simplified (HZ-GB2312)
|
||||
/* 077 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 50220, ENC_PARSE_NAM_ISO_2022_JP, IDS_ENC_ISO_2022_JP, KDDI_ISO_2022_JP, L"" }, // Japanese (JIS)
|
||||
/* 078 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 50225, ENC_PARSE_NAM_ISO_2022_KR, IDS_ENC_ISO_2022_KR, ISO_2022_KR, L"" }, // Korean (ISO-2022-KR)
|
||||
/* 079 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20000, ENC_PARSE_NAM_X_CHINESE_CNS, IDS_ENC_X_CHINESE_CNS, CHINESE_CNS, L"" }, // Chinese Traditional (CNS)
|
||||
/* 080 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1361, ENC_PARSE_NAM_JOHAB, IDS_ENC_JOHAB, CED_NO_MAPPING, L"" }, // Korean (Johab)
|
||||
/* 081 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28600, ENC_PARSE_NAM_ISO_8859_10, IDS_ENC_ISO_8859_10, ISO_8859_10, L"" }, // Nordic (ISO 8859-10)
|
||||
/* 082 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 951, ENC_PARSE_NAM_BIG5_HKSCS, IDS_ENC_BIG5_HKSCS, BIG5_HKSCS, L"" } // Chinese (Hong Kong Supplementary Character Set)
|
||||
/* 079 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 20000, ENC_PARSE_NAM_X_CHINESE_CNS, IDS_ENC_X_CHINESE_CNS, CHINESE_CNS, L"" } // Chinese Traditional (CNS)
|
||||
///* 080 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 1361, ENC_PARSE_NAM_JOHAB, IDS_ENC_JOHAB, CED_NO_MAPPING, L"" }, // Korean (Johab)
|
||||
///* 081 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 28600, ENC_PARSE_NAM_ISO_8859_10, IDS_ENC_ISO_8859_10, ISO_8859_10, L"" }, // Nordic (ISO 8859-10)
|
||||
///* 082 */{ NCP_EXTERNAL_8BIT | NCP_RECODE, 951, ENC_PARSE_NAM_BIG5_HKSCS, IDS_ENC_BIG5_HKSCS, BIG5_HKSCS, L"" } // Chinese (Hong Kong Supplementary Character Set)
|
||||
|
||||
|
||||
#if 0
|
||||
@ -322,6 +322,7 @@ static int __fastcall FindCodePage(const Encoding& encoding)
|
||||
iCodePage = 1250;
|
||||
break;
|
||||
case ISO_8859_4:
|
||||
case ISO_8859_10:
|
||||
iCodePage = 1257;
|
||||
break;
|
||||
case ISO_8859_5:
|
||||
@ -410,7 +411,7 @@ static int __fastcall MapEncoding2CPI(const char* const text, const size_t len,
|
||||
{
|
||||
bool bBOM;
|
||||
bool bReverse;
|
||||
if (IsUnicode(text, len, &bBOM, &bReverse)) {
|
||||
if (IsValidUnicode(text, len, &bBOM, &bReverse)) {
|
||||
iNP3Encoding = bBOM ? (bReverse ? CPI_UNICODEBEBOM : CPI_UNICODEBOM) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE);
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user