mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-14 21:09:05 +08:00
Merge branch 'Bugfixes4Release_0305' into Enh_0305
This commit is contained in:
commit
598f4d56ec
14
src/Edit.c
14
src/Edit.c
@ -1164,13 +1164,13 @@ BOOL EditLoadFile(
|
||||
|
||||
// === UTF-8 ===
|
||||
if (!bSkipEncodingDetection && (Encoding_IsNONE(iForcedEncoding) || Encoding_IsUTF8(iForcedEncoding)) &&
|
||||
((IsUTF8Signature(lpData) ||
|
||||
FileVars_IsUTF8(&fvCurFile) ||
|
||||
(Encoding_IsUTF8(iForcedEncoding) ||
|
||||
(!bPreferOEM && bLoadASCIIasUTF8) || // from menu "Reload As UTF-8"
|
||||
(IsUTF8(lpData,cbData) &&
|
||||
(((UTF8_mbslen_bytes(UTF8StringStart(lpData)) - 1 != UTF8_mbslen(UTF8StringStart(lpData),IsUTF8Signature(lpData) ? cbData-3 : cbData)) ||
|
||||
(!bPreferOEM && (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8))))))) &&
|
||||
((IsUTF8Signature(lpData) ||
|
||||
FileVars_IsUTF8(&fvCurFile) ||
|
||||
(Encoding_IsUTF8(iForcedEncoding) ||
|
||||
Encoding_IsUTF8(iAnalyzedEncoding) ||
|
||||
(!bPreferOEM && bLoadASCIIasUTF8) || // from menu "Reload As UTF-8"
|
||||
(IsUTF8(lpData,cbData) && ((UTF8_ContainsInvalidChars(lpData, cbData) ||
|
||||
(!bPreferOEM && (Encoding_IsUTF8(iPreferedEncoding) || bLoadASCIIasUTF8))))))) &&
|
||||
!(FileVars_IsNonUTF8(&fvCurFile) && !Encoding_IsUTF8(iForcedEncoding))))
|
||||
{
|
||||
Encoding_SciSetCodePage(hwnd,CPI_UTF8);
|
||||
|
||||
@ -38,7 +38,14 @@
|
||||
|
||||
|
||||
extern HINSTANCE g_hInstance;
|
||||
extern BOOL bLoadASCIIasUTF8;
|
||||
|
||||
//=============================================================================
|
||||
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
const uint32_t limitDblHiByte4ANSI = 5; // [%] of max. double HighByte in file to be assumed as ANSI
|
||||
|
||||
//=============================================================================
|
||||
//
|
||||
@ -291,11 +298,6 @@ BOOL Encoding_HasChanged(int iOriginalEncoding) {
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
|
||||
typedef enum _UTF8_ValidationState
|
||||
{
|
||||
UTF8_INVALID,
|
||||
@ -714,7 +716,7 @@ int Encoding_Analyze(const char* const buffer, const size_t len)
|
||||
else
|
||||
++(item->count);
|
||||
|
||||
dbyte_cnt++;
|
||||
++dbyte_cnt;
|
||||
if ((last_ch > 0xa0) && (ch > 0xa0)) {
|
||||
++dbyte_hihi_cnt;
|
||||
}
|
||||
@ -753,7 +755,7 @@ int Encoding_Analyze(const char* const buffer, const size_t len)
|
||||
}
|
||||
else if (dbyte_cnt == 0) {
|
||||
// No characters outside the scope of ASCII
|
||||
iEncoding = CPI_ANSI_DEFAULT;
|
||||
iEncoding = bLoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT;
|
||||
}
|
||||
else if (is_valid_utf8) {
|
||||
// Only valid UTF-8 sequences
|
||||
@ -773,9 +775,9 @@ int Encoding_Analyze(const char* const buffer, const size_t len)
|
||||
if (probEncoding != CPI_NONE) {
|
||||
iEncoding = probEncoding;
|
||||
}
|
||||
else if (((dbyte_hihi_cnt * 100) / ++dbyte_cnt) < 5) {
|
||||
else if (((dbyte_hihi_cnt * 100) / ++dbyte_cnt) < limitDblHiByte4ANSI) {
|
||||
// mostly a low-byte follows a high-byte
|
||||
iEncoding = CPI_ANSI_DEFAULT;
|
||||
iEncoding = bLoadASCIIasUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1465,13 +1467,13 @@ BOOL IsUTF7(const char* pTest, int nLength) {
|
||||
for UTF-16 (21-bit space), max. code length is 4, so we only need to look
|
||||
at 4 upper bits.
|
||||
*/
|
||||
static const INT utf8_lengths[16] =
|
||||
static const size_t utf8_lengths[16] =
|
||||
{
|
||||
1,1,1,1,1,1,1,1, /* 0000 to 0111 : 1 byte (plain ASCII) */
|
||||
0,0,0,0, /* 1000 to 1011 : not valid */
|
||||
2,2, /* 1100, 1101 : 2 bytes */
|
||||
3, /* 1110 : 3 bytes */
|
||||
4 /* 1111 :4 bytes */
|
||||
4 /* 1111 : 4 bytes */
|
||||
};
|
||||
// ============================================================================
|
||||
|
||||
@ -1489,13 +1491,14 @@ Return value :
|
||||
size (in bytes) of a NULL-terminated UTF-8 string.
|
||||
-1 if invalid NULL-terminated UTF-8 string
|
||||
--*/
|
||||
INT UTF8_mbslen_bytes(LPCSTR utf8_string)
|
||||
size_t UTF8_mbslen_bytes(LPCSTR utf8_string)
|
||||
{
|
||||
INT length = 0;
|
||||
INT code_size;
|
||||
size_t length = 0;
|
||||
size_t code_size;
|
||||
BYTE byte;
|
||||
|
||||
while (*utf8_string) {
|
||||
while (*utf8_string)
|
||||
{
|
||||
byte = (BYTE)*utf8_string;
|
||||
|
||||
if ((byte <= 0xF7) && (0 != (code_size = utf8_lengths[byte >> 4]))) {
|
||||
@ -1530,14 +1533,14 @@ Return value :
|
||||
size (in characters) of a UTF-8 string.
|
||||
-1 if invalid UTF-8 string
|
||||
--*/
|
||||
INT UTF8_mbslen(LPCSTR source, INT byte_length)
|
||||
size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length)
|
||||
{
|
||||
INT wchar_length = 0;
|
||||
INT code_size;
|
||||
size_t wchar_length = 0;
|
||||
size_t code_size;
|
||||
BYTE byte;
|
||||
|
||||
while (byte_length > 0) {
|
||||
byte = (BYTE)*source;
|
||||
byte = (BYTE)*utf8_string;
|
||||
|
||||
/* UTF-16 can't encode 5-byte and 6-byte sequences, so maximum value
|
||||
for first byte is 11110111. Use lookup table to determine sequence
|
||||
@ -1549,7 +1552,7 @@ INT UTF8_mbslen(LPCSTR source, INT byte_length)
|
||||
if (code_size == 4)
|
||||
wchar_length++;
|
||||
|
||||
source += code_size; /* increment pointer */
|
||||
utf8_string += code_size; /* increment pointer */
|
||||
byte_length -= code_size; /* decrement counter*/
|
||||
}
|
||||
else {
|
||||
@ -1558,12 +1561,21 @@ INT UTF8_mbslen(LPCSTR source, INT byte_length)
|
||||
we only report the number of valid characters we have encountered
|
||||
to match the Windows behavior.
|
||||
*/
|
||||
//WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n",
|
||||
// byte);
|
||||
source++;
|
||||
//WARN("invalid byte 0x%02X in UTF-8 sequence, skipping it!\n", byte);
|
||||
utf8_string++;
|
||||
byte_length--;
|
||||
}
|
||||
}
|
||||
return wchar_length;
|
||||
}
|
||||
// ============================================================================
|
||||
|
||||
|
||||
|
||||
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length)
|
||||
{
|
||||
return ((UTF8_mbslen_bytes(UTF8StringStart(utf8_string)) - 1) !=
|
||||
UTF8_mbslen(UTF8StringStart(utf8_string), IsUTF8Signature(utf8_string) ? (byte_length - 3) : byte_length));
|
||||
}
|
||||
// ============================================================================
|
||||
|
||||
|
||||
@ -131,11 +131,12 @@ BOOL IsUnicode(const char*, int, LPBOOL, LPBOOL);
|
||||
BOOL IsUTF8(const char*, int);
|
||||
BOOL IsUTF7(const char*, int);
|
||||
|
||||
#define IsUTF8Signature(p) ((*(p+0) == '\xEF' && *(p+1) == '\xBB' && *(p+2) == '\xBF'))
|
||||
#define UTF8StringStart(p) (IsUTF8Signature(p)) ? (p+3) : (p)
|
||||
#define IsUTF8Signature(p) ((*((p)+0) == '\xEF' && *((p)+1) == '\xBB' && *((p)+2) == '\xBF'))
|
||||
#define UTF8StringStart(p) (IsUTF8Signature(p)) ? ((p)+3) : (p)
|
||||
|
||||
INT UTF8_mbslen_bytes(LPCSTR utf8_string);
|
||||
INT UTF8_mbslen(LPCSTR source, INT byte_length);
|
||||
size_t UTF8_mbslen_bytes(LPCSTR utf8_string);
|
||||
size_t UTF8_mbslen(LPCSTR utf8_string, size_t byte_length);
|
||||
bool UTF8_ContainsInvalidChars(LPCSTR utf8_string, size_t byte_length);
|
||||
|
||||
int Encoding_Analyze(const char* const, const size_t);
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user