+fix: enhanced Unicode detection

This commit is contained in:
Rainer Kottenhoff 2019-07-21 12:21:55 +02:00
parent 740a22ad9b
commit ece4ec6ee6
7 changed files with 78 additions and 63 deletions

View File

@ -1 +1 @@
2399
2401

View File

@ -3,7 +3,7 @@
<assemblyIdentity
name="Notepad3"
processorArchitecture="*"
version="5.19.719.2399"
version="5.19.721.2401"
type="win32"
/>
<description>Notepad3 BETA</description>

View File

@ -1119,9 +1119,10 @@ bool EditLoadFile(
bool bBOM = false;
bool bReverse = false;
bool const bIsUnicodeValid = IsValidUnicode(lpData, cbData, &bBOM, &bReverse);
bool const bIsUnicodeAnalyzed = ((Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable) && !bIsForced && !bSkipUTFDetection && !bIsUTF8Sig);
cpi_enc_t const encUnicode = bSkipUTFDetection ? CPI_NONE : GetUnicodeEncoding(lpData, cbData, &bBOM, &bReverse);
if (cbData == 0) {
FileVars_Init(NULL, 0, &Globals.fvCurFile);
status->iEOLMode = Settings.DefaultEOLMode;
@ -1130,20 +1131,22 @@ bool EditLoadFile(
SciCall_SetEOLMode(Settings.DefaultEOLMode);
FreeMem(lpData);
}
else if (bIsUnicodeForced || (!bIsForced && bIsUnicodeAnalyzed && bIsUnicodeValid))
else if (bIsUnicodeForced || (!bIsForced && (bIsUnicodeAnalyzed || !Encoding_IsNONE(encUnicode))))
{
// === UNICODE ===
if (Encoding_IsNONE(encUnicode))
{
bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData);
bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData);
bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData);
bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData);
if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) {
bBOM = bBOM_LE;
bReverse = false;
}
else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) {
bBOM = bBOM_BE;
bReverse = true;
if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) {
bBOM = bBOM_LE;
bReverse = false;
}
else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) {
bBOM = bBOM_BE;
bReverse = true;
}
}
if (bReverse)

View File

@ -664,51 +664,6 @@ bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt)
// ============================================================================
//
//  IsValidUnicode()
//
//  Heuristically decides whether a byte buffer holds UTF-16 text by running
//  the Win32 IsTextUnicode() tests on its leading bytes.
//
//  pBuffer     raw bytes to inspect (may be NULL -> result is false)
//  len         number of bytes available in pBuffer
//  lpbBOM      optional out: set to true if a byte order mark was found
//  lpbReverse  optional out: set to true if the reversed-byte-order tests
//              (signature or statistics) fired
//
//  Returns true if the buffer is accepted as UTF-16; the out parameters are
//  written only in that case and are left untouched otherwise.
//
bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse)
{
  // Inspect at most the first 2048 bytes of the buffer.
  size_t const probeLimit = 2048LL;
  size_t const probeLen = (len > probeLimit) ? probeLimit : len;

  // Fewer than two bytes cannot hold a single UTF-16 code unit.
  if (!pBuffer || (probeLen < 2)) {
    return false;
  }

  // Request every test group:
  //   IS_TEXT_UNICODE_UNICODE_MASK     -> ASCII16, STATISTICS, CONTROLS, SIGNATURE
  //   IS_TEXT_UNICODE_REVERSE_MASK     -> the same four tests for reversed byte order
  //   IS_TEXT_UNICODE_NOT_UNICODE_MASK -> ILLEGAL_CHARS, ODD_LENGTH (+ unused flags)
  //   IS_TEXT_UNICODE_NOT_ASCII_MASK   -> NULL_BYTES (+ unused flags)
  int const requestedTests = IS_TEXT_UNICODE_UNICODE_MASK
                           | IS_TEXT_UNICODE_REVERSE_MASK
                           | IS_TEXT_UNICODE_NOT_UNICODE_MASK
                           | IS_TEXT_UNICODE_NOT_ASCII_MASK;
  int testResult = requestedTests;

  // The boolean verdict of IsTextUnicode() is deliberately ignored; only the
  // per-test result bits are evaluated below.
  (void)IsTextUnicode(pBuffer, (int)probeLen, &testResult);

  // If the mask comes back unchanged, treat it as "no test result at all".
  if (testResult == requestedTests) {
    testResult = 0;
  }

  bool const sigNormal     = (testResult & IS_TEXT_UNICODE_SIGNATURE) != 0;
  bool const sigReversed   = (testResult & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0;
  bool const looksNormal   = (testResult & IS_TEXT_UNICODE_UNICODE_MASK) != 0;
  bool const looksReversed = (testResult & IS_TEXT_UNICODE_REVERSE_MASK) != 0;
  bool const looksIllegal  = (testResult & IS_TEXT_UNICODE_NOT_UNICODE_MASK) != 0;

  bool const anySignature = sigNormal || sigReversed;
  // Without a signature, exactly one byte order must have matched and none
  // of the "not Unicode" tests may have fired.
  bool const singleByteOrder = (looksNormal != looksReversed);

  if (!anySignature && !(singleByteOrder && !looksIllegal)) {
    return false;
  }

  if (lpbBOM) {
    *lpbBOM = anySignature;
  }
  if (lpbReverse) {
    *lpbReverse = (sigReversed || looksReversed);
  }
  return true;
}
// ============================================================================
bool IsValidUTF7(const char* pTest, size_t nLength)
{
if (!pTest) { return false; }

View File

@ -121,7 +121,7 @@ inline bool IsUTF8Signature(const char* p) {
bool IsValidUTF7(const char* pTest, size_t nLength);
bool IsValidUTF8(const char* pTest, size_t nLength);
bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse);
//////////////////////////////////////////////////////
// Google's CED "Compact Encoding Detection"
@ -145,6 +145,7 @@ inline bool IsDBCSCodePage(UINT cp) {
}
cpi_enc_t Encoding_AnalyzeText(const char* const text, const size_t len, float* confidence_io, const cpi_enc_t encodingHint);
cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse);
const char* Encoding_GetTitleInfoA();
const WCHAR* Encoding_GetTitleInfoW();

View File

@ -552,7 +552,12 @@ constexpr cpi_enc_t _MapStdEncodingString2CPI(const char* encStrg, float* pConfi
{
bool bBOM = false;
bool bReverse = false;
if (IsValidUnicode(text, len, &bBOM, &bReverse)) {
cpi_enc_t const cpi = GetUnicodeEncoding(text, len, &bBOM, &bReverse);
if (!Encoding_IsNONE(cpiEncoding))
{
cpiEncoding = cpi;
}
else {
cpiEncoding = bBOM ? (bReverse ? CPI_UNICODEBE : CPI_UNICODE) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE);
}
}
@ -830,6 +835,57 @@ extern "C" cpi_enc_t Encoding_AnalyzeText
// ============================================================================
//
//  GetUnicodeEncoding()
//
//  Runs the Win32 IsTextUnicode() tests on the leading bytes of a buffer and
//  maps the outcome to a UTF-16 encoding id.
//
//  pBuffer     raw bytes to inspect (may be NULL -> CPI_NONE)
//  len         number of bytes available in pBuffer
//  lpbBOM      optional out: set to true if a byte order mark was found
//  lpbReverse  optional out: set to true if the reversed-byte-order tests
//              (signature or statistics) fired
//
//  Returns
//    CPI_UNICODEBOM    signature found
//    CPI_UNICODEBEBOM  reversed signature found (and no normal signature)
//    CPI_UNICODE       no signature, only the normal-order tests matched
//    CPI_UNICODEBE     no signature, only the reversed-order tests matched
//    CPI_NONE          nothing detected, or buffer missing / shorter than 2 bytes
//
//  The out parameters are written only when an encoding is detected.
//
cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse)
{
  // Inspect at most the first 2048 bytes of the buffer.
  size_t const enoughData = 2048LL;
  size_t const cb = (len < enoughData) ? len : enoughData;

  // Must yield the same "not detected" sentinel as the regular exit below.
  // Returning the literal `false` (0) would only mean "not detected" if
  // CPI_NONE were defined as 0, which this function must not assume; callers
  // check the result with Encoding_IsNONE().
  if (!pBuffer || cb < 2) { return CPI_NONE; }

  // IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE.
  // IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE.
  // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags.
  // IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags.
  //
  int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK;

  int iTest = iAllTests;
  /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok

  if (iTest == iAllTests) {
    iTest = 0; // iTest doesn't seem to have been modified ...
  }

  bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE);
  bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE);
  bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK);
  bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK);
  bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK);
  //bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES);

  cpi_enc_t iEncoding = CPI_NONE;

  // Accept if a signature is present, or if exactly one byte order matched
  // and none of the "not Unicode" tests fired.
  if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse)))
  {
    if (lpbBOM) {
      *lpbBOM = (bHasBOM || bHasRBOM);
    }
    if (lpbReverse) {
      *lpbReverse = (bHasRBOM || bIsReverse);
    }

    if (bHasBOM || bHasRBOM) {
      // A normal-order signature takes precedence if both bits are reported.
      iEncoding = bHasBOM ? CPI_UNICODEBOM : CPI_UNICODEBEBOM;
    }
    else if (bIsUnicode || bIsReverse) {
      iEncoding = bIsUnicode ? CPI_UNICODE : CPI_UNICODEBE;
    }
  }
  return iEncoding;
}
// ============================================================================
//=============================================================================
//

View File

@ -7,8 +7,8 @@
#define SAPPNAME "Notepad3"
#define VERSION_MAJOR 5
#define VERSION_MINOR 19
#define VERSION_REV 719
#define VERSION_BUILD 2399
#define VERSION_REV 721
#define VERSION_BUILD 2401
#define SCINTILLA_VER 420
#define ONIGURUMA_REGEX_VER 6.9.3
#define VERSION_PATCH BETA