mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-11 21:03:05 +08:00
+fix: enhanced Unicode detection
This commit is contained in:
parent
740a22ad9b
commit
ece4ec6ee6
@ -1 +1 @@
|
||||
2399
|
||||
2401
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
<assemblyIdentity
|
||||
name="Notepad3"
|
||||
processorArchitecture="*"
|
||||
version="5.19.719.2399"
|
||||
version="5.19.721.2401"
|
||||
type="win32"
|
||||
/>
|
||||
<description>Notepad3 BETA</description>
|
||||
|
||||
27
src/Edit.c
27
src/Edit.c
@ -1119,9 +1119,10 @@ bool EditLoadFile(
|
||||
|
||||
bool bBOM = false;
|
||||
bool bReverse = false;
|
||||
bool const bIsUnicodeValid = IsValidUnicode(lpData, cbData, &bBOM, &bReverse);
|
||||
bool const bIsUnicodeAnalyzed = ((Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable) && !bIsForced && !bSkipUTFDetection && !bIsUTF8Sig);
|
||||
|
||||
cpi_enc_t const encUnicode = bSkipUTFDetection ? CPI_NONE : GetUnicodeEncoding(lpData, cbData, &bBOM, &bReverse);
|
||||
|
||||
if (cbData == 0) {
|
||||
FileVars_Init(NULL, 0, &Globals.fvCurFile);
|
||||
status->iEOLMode = Settings.DefaultEOLMode;
|
||||
@ -1130,20 +1131,22 @@ bool EditLoadFile(
|
||||
SciCall_SetEOLMode(Settings.DefaultEOLMode);
|
||||
FreeMem(lpData);
|
||||
}
|
||||
else if (bIsUnicodeForced || (!bIsForced && bIsUnicodeAnalyzed && bIsUnicodeValid))
|
||||
else if (bIsUnicodeForced || (!bIsForced && (bIsUnicodeAnalyzed || !Encoding_IsNONE(encUnicode))))
|
||||
{
|
||||
// === UNICODE ===
|
||||
if (Encoding_IsNONE(encUnicode))
|
||||
{
|
||||
bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData);
|
||||
bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData);
|
||||
|
||||
bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData);
|
||||
bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData);
|
||||
|
||||
if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) {
|
||||
bBOM = bBOM_LE;
|
||||
bReverse = false;
|
||||
}
|
||||
else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) {
|
||||
bBOM = bBOM_BE;
|
||||
bReverse = true;
|
||||
if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) {
|
||||
bBOM = bBOM_LE;
|
||||
bReverse = false;
|
||||
}
|
||||
else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) {
|
||||
bBOM = bBOM_BE;
|
||||
bReverse = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (bReverse)
|
||||
|
||||
@ -664,51 +664,6 @@ bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt)
|
||||
// ============================================================================
|
||||
|
||||
|
||||
bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse)
|
||||
{
|
||||
size_t const enoughData = 2048LL;
|
||||
size_t const cb = (len < enoughData) ? len : enoughData;
|
||||
|
||||
if (!pBuffer || cb < 2) { return false; }
|
||||
|
||||
// IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE.
|
||||
// IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE.
|
||||
// IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags.
|
||||
// IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags.
|
||||
//
|
||||
int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK;
|
||||
|
||||
int iTest = iAllTests;
|
||||
/*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok
|
||||
|
||||
if (iTest == iAllTests) {
|
||||
iTest = 0; // iTest doesn't seem to have been modified ...
|
||||
}
|
||||
|
||||
bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE);
|
||||
bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE);
|
||||
|
||||
bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK);
|
||||
bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK);
|
||||
bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK);
|
||||
|
||||
//bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES);
|
||||
|
||||
if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse)))
|
||||
{
|
||||
if (lpbBOM) {
|
||||
*lpbBOM = (bHasBOM || bHasRBOM);
|
||||
}
|
||||
if (lpbReverse) {
|
||||
*lpbReverse = (bHasRBOM || bIsReverse);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// ============================================================================
|
||||
|
||||
|
||||
bool IsValidUTF7(const char* pTest, size_t nLength)
|
||||
{
|
||||
if (!pTest) { return false; }
|
||||
|
||||
@ -121,7 +121,7 @@ inline bool IsUTF8Signature(const char* p) {
|
||||
|
||||
bool IsValidUTF7(const char* pTest, size_t nLength);
|
||||
bool IsValidUTF8(const char* pTest, size_t nLength);
|
||||
bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Google's CED "Compact Encoding Detection"
|
||||
@ -145,6 +145,7 @@ inline bool IsDBCSCodePage(UINT cp) {
|
||||
}
|
||||
|
||||
cpi_enc_t Encoding_AnalyzeText(const char* const text, const size_t len, float* confidence_io, const cpi_enc_t encodingHint);
|
||||
cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse);
|
||||
|
||||
const char* Encoding_GetTitleInfoA();
|
||||
const WCHAR* Encoding_GetTitleInfoW();
|
||||
|
||||
@ -552,7 +552,12 @@ constexpr cpi_enc_t _MapStdEncodingString2CPI(const char* encStrg, float* pConfi
|
||||
{
|
||||
bool bBOM = false;
|
||||
bool bReverse = false;
|
||||
if (IsValidUnicode(text, len, &bBOM, &bReverse)) {
|
||||
cpi_enc_t const cpi = GetUnicodeEncoding(text, len, &bBOM, &bReverse);
|
||||
if (!Encoding_IsNONE(cpiEncoding))
|
||||
{
|
||||
cpiEncoding = cpi;
|
||||
}
|
||||
else {
|
||||
cpiEncoding = bBOM ? (bReverse ? CPI_UNICODEBE : CPI_UNICODE) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE);
|
||||
}
|
||||
}
|
||||
@ -830,6 +835,57 @@ extern "C" cpi_enc_t Encoding_AnalyzeText
|
||||
// ============================================================================
|
||||
|
||||
|
||||
/**
 * Heuristically determine the 16-bit Unicode encoding of a buffer.
 *
 * At most the first 2048 bytes are handed to IsTextUnicode(); its boolean
 * result is ignored and only the returned test flags are evaluated.
 *
 * @param pBuffer     raw bytes to inspect (may be NULL)
 * @param len         number of bytes available in pBuffer
 * @param lpbBOM      optional out: set to true if a (normal or reversed)
 *                    signature flag was reported; written only when an
 *                    encoding is detected
 * @param lpbReverse  optional out: set to true if the reversed signature or
 *                    any reversed-order test flag was reported; written only
 *                    when an encoding is detected
 * @return CPI_UNICODEBOM / CPI_UNICODEBEBOM when a signature was found,
 *         CPI_UNICODE / CPI_UNICODEBE when exactly one byte order passed the
 *         tests without a "not Unicode" flag, otherwise CPI_NONE
 *         (also for a NULL buffer or fewer than 2 bytes).
 */
cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse)
{
  size_t const enoughData = 2048LL;
  size_t const cb = (len < enoughData) ? len : enoughData;

  // Must be CPI_NONE (not 'false'): callers test the result with
  // Encoding_IsNONE(), so a plain 0 could be taken for a real encoding id.
  if (!pBuffer || cb < 2) { return CPI_NONE; }

  // IS_TEXT_UNICODE_UNICODE_MASK     -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE.
  // IS_TEXT_UNICODE_REVERSE_MASK     -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE.
  // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags.
  // IS_TEXT_UNICODE_NOT_ASCII_MASK   -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags.
  //
  int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK;

  int iTest = iAllTests;
  /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok

  if (iTest == iAllTests) {
    iTest = 0; // iTest doesn't seem to have been modified ...
  }

  bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE) != 0;
  bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0;

  bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK) != 0;
  bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK) != 0;
  bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK) != 0;

  cpi_enc_t iEncoding = CPI_NONE;

  if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse)))
  {
    if (lpbBOM) {
      *lpbBOM = (bHasBOM || bHasRBOM);
    }
    if (lpbReverse) {
      *lpbReverse = (bHasRBOM || bIsReverse);
    }
    if (bHasBOM || bHasRBOM) {
      iEncoding = bHasBOM ? CPI_UNICODEBOM : CPI_UNICODEBEBOM;
    }
    else {
      // No signature: the outer condition guarantees exactly one of
      // bIsUnicode / bIsReverse is set here.
      iEncoding = bIsUnicode ? CPI_UNICODE : CPI_UNICODEBE;
    }
  }
  return iEncoding;
}
|
||||
// ============================================================================
|
||||
|
||||
|
||||
//=============================================================================
|
||||
//
|
||||
|
||||
@ -7,8 +7,8 @@
|
||||
#define SAPPNAME "Notepad3"
|
||||
#define VERSION_MAJOR 5
|
||||
#define VERSION_MINOR 19
|
||||
#define VERSION_REV 719
|
||||
#define VERSION_BUILD 2399
|
||||
#define VERSION_REV 721
|
||||
#define VERSION_BUILD 2401
|
||||
#define SCINTILLA_VER 420
|
||||
#define ONIGURUMA_REGEX_VER 6.9.3
|
||||
#define VERSION_PATCH BETA
|
||||
|
||||
Loading…
Reference in New Issue
Block a user