+fix: enhanced Unicode detection

2026-06-11 21:03:05 +08:00 · 2019-07-21 12:21:55 +02:00 · 2019-07-21 12:21:55 +02:00 · ece4ec6ee6
commit ece4ec6ee6
parent 740a22ad9b
7 changed files with 78 additions and 63 deletions
--- a/Versions/build.txt
+++ b/Versions/build.txt
@ -1 +1 @@
-2399
+2401
--- a/res/Notepad3.exe.manifest.conf
+++ b/res/Notepad3.exe.manifest.conf
@ -3,7 +3,7 @@
  <assemblyIdentity
    name="Notepad3"
    processorArchitecture="*"
-    version="5.19.719.2399"
+    version="5.19.721.2401"
    type="win32"
  />
  <description>Notepad3 BETA</description>
--- a/src/Edit.c
+++ b/src/Edit.c
@ -1119,9 +1119,10 @@ bool EditLoadFile(

  bool bBOM = false;
  bool bReverse = false;
-  bool const bIsUnicodeValid = IsValidUnicode(lpData, cbData, &bBOM, &bReverse);
  bool const bIsUnicodeAnalyzed = ((Encoding_IsUNICODE(iAnalyzedEncoding) && bIsReliable) && !bIsForced && !bSkipUTFDetection && !bIsUTF8Sig);

+  cpi_enc_t const encUnicode = bSkipUTFDetection ? CPI_NONE : GetUnicodeEncoding(lpData, cbData, &bBOM, &bReverse);
+
  if (cbData == 0) {
    FileVars_Init(NULL, 0, &Globals.fvCurFile);
    status->iEOLMode = Settings.DefaultEOLMode;
@ -1130,20 +1131,22 @@ bool EditLoadFile(
    SciCall_SetEOLMode(Settings.DefaultEOLMode);
    FreeMem(lpData);
  }
-  else if (bIsUnicodeForced || (!bIsForced && bIsUnicodeAnalyzed && bIsUnicodeValid))
+  else if (bIsUnicodeForced || (!bIsForced && (bIsUnicodeAnalyzed || !Encoding_IsNONE(encUnicode))))
  {
    // ===  UNICODE  ===
+    if (Encoding_IsNONE(encUnicode)) 
+    {
+      bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData);
+      bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData);

-    bool const bBOM_LE = Has_UTF16_LE_BOM(lpData, cbData);
-    bool const bBOM_BE = Has_UTF16_BE_BOM(lpData, cbData);
-
-    if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) {
-      bBOM = bBOM_LE;
-      bReverse = false;
-    }
-    else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) {
-      bBOM = bBOM_BE;
-      bReverse = true;
+      if ((iForcedEncoding == CPI_UNICODE) || bBOM_LE) {
+        bBOM = bBOM_LE;
+        bReverse = false;
+      }
+      else if ((iForcedEncoding == CPI_UNICODEBE) || bBOM_BE) {
+        bBOM = bBOM_BE;
+        bReverse = true;
+      }
    }

    if (bReverse)
--- a/src/Encoding.c
+++ b/src/Encoding.c
@ -664,51 +664,6 @@ bool Has_UTF16_BE_BOM(const char* pBuf, size_t cnt)
 // ============================================================================


-bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse) 
-{
-  size_t const enoughData = 2048LL;
-  size_t const cb = (len < enoughData) ? len : enoughData;
-
-  if (!pBuffer || cb < 2) { return false; }
-
-  // IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE.
-  // IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE.
-  // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags.
-  // IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags.
-  //
-  int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK;
-
-  int iTest = iAllTests;
-  /*bool const ok =*/ (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on result ok
- 
-  if (iTest == iAllTests) {
-    iTest = 0; // iTest doesn't seem to have been modified ...
-  }
-
-  bool const bHasBOM = (iTest & IS_TEXT_UNICODE_SIGNATURE);
-  bool const bHasRBOM = (iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE);
-
-  bool const bIsUnicode = (iTest & IS_TEXT_UNICODE_UNICODE_MASK);
-  bool const bIsReverse = (iTest & IS_TEXT_UNICODE_REVERSE_MASK);
-  bool const bIsIllegal = (iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK);
-
-  //bool const bHasNullBytes = (iTest & IS_TEXT_UNICODE_NULL_BYTES);
-
-  if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse))) 
-  {
-    if (lpbBOM) {
-      *lpbBOM = (bHasBOM || bHasRBOM);
-    }
-    if (lpbReverse) {
-      *lpbReverse = (bHasRBOM || bIsReverse);
-    }
-    return true;
-  }
-  return false;
-}
-// ============================================================================
-
-
 bool IsValidUTF7(const char* pTest, size_t nLength) 
 {
  if (!pTest) { return false; }
--- a/src/Encoding.h
+++ b/src/Encoding.h
@ -121,7 +121,7 @@ inline bool IsUTF8Signature(const char* p) {

 bool IsValidUTF7(const char* pTest, size_t nLength);
 bool IsValidUTF8(const char* pTest, size_t nLength);
-bool IsValidUnicode(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse);
+

 //////////////////////////////////////////////////////
 // Google's   CED       "Compact Encoding Detection" 
@ -145,6 +145,7 @@ inline bool IsDBCSCodePage(UINT cp) {
 }

 cpi_enc_t Encoding_AnalyzeText(const char* const text, const size_t len, float* confidence_io, const cpi_enc_t encodingHint);
+cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse);

 const char*  Encoding_GetTitleInfoA();
 const WCHAR* Encoding_GetTitleInfoW();
--- a/src/EncodingDetection.cpp
+++ b/src/EncodingDetection.cpp
@ -552,7 +552,12 @@ constexpr cpi_enc_t _MapStdEncodingString2CPI(const char* encStrg, float* pConfi
    {
      bool bBOM = false;
      bool bReverse = false;
-      if (IsValidUnicode(text, len, &bBOM, &bReverse)) {
+      cpi_enc_t const cpi = GetUnicodeEncoding(text, len, &bBOM, &bReverse);
+      if (!Encoding_IsNONE(cpiEncoding)) 
+      {
+        cpiEncoding = cpi;
+      }
+      else {
        cpiEncoding = bBOM ? (bReverse ? CPI_UNICODEBE : CPI_UNICODE) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE);
      }
    }
@ -830,6 +835,57 @@ extern "C" cpi_enc_t Encoding_AnalyzeText
 // ============================================================================


+// Classifies the leading bytes of pBuffer (at most 2048) as UTF-16 via IsTextUnicode().
+// Returns CPI_UNICODEBOM / CPI_UNICODEBEBOM when a (reversed) signature is reported,
+// CPI_UNICODE / CPI_UNICODEBE when only the non-signature tests match, else CPI_NONE.
+// lpbBOM / lpbReverse may be NULL; they are written only when an encoding is detected.
+cpi_enc_t GetUnicodeEncoding(const char* pBuffer, const size_t len, bool* lpbBOM, bool* lpbReverse)
+{
+  size_t const enoughData = 2048LL;
+  size_t const cb = (len < enoughData) ? len : enoughData;
+
+  // return type is cpi_enc_t: "nothing detected" must be CPI_NONE, not the bool 'false'
+  if (!pBuffer || cb < 2) { return CPI_NONE; }
+
+  // IS_TEXT_UNICODE_UNICODE_MASK -> IS_TEXT_UNICODE_ASCII16, IS_TEXT_UNICODE_STATISTICS, IS_TEXT_UNICODE_CONTROLS, IS_TEXT_UNICODE_SIGNATURE.
+  // IS_TEXT_UNICODE_REVERSE_MASK -> IS_TEXT_UNICODE_REVERSE_ASCII16, IS_TEXT_UNICODE_REVERSE_STATISTICS, IS_TEXT_UNICODE_REVERSE_CONTROLS, IS_TEXT_UNICODE_REVERSE_SIGNATURE.
+  // IS_TEXT_UNICODE_NOT_UNICODE_MASK -> IS_TEXT_UNICODE_ILLEGAL_CHARS, IS_TEXT_UNICODE_ODD_LENGTH, and two currently unused bit flags.
+  // IS_TEXT_UNICODE_NOT_ASCII_MASK -> IS_TEXT_UNICODE_NULL_BYTES and three currently unused bit flags.
+  //
+  int const iAllTests = IS_TEXT_UNICODE_UNICODE_MASK | IS_TEXT_UNICODE_REVERSE_MASK | IS_TEXT_UNICODE_NOT_UNICODE_MASK | IS_TEXT_UNICODE_NOT_ASCII_MASK;
+
+  int iTest = iAllTests;
+  (void)IsTextUnicode(pBuffer, (int)cb, &iTest); // don't rely on the BOOL result, only on the flags
+
+  if (iTest == iAllTests) {
+    iTest = 0; // iTest doesn't seem to have been modified ...
+  }
+
+  bool const bHasBOM = ((iTest & IS_TEXT_UNICODE_SIGNATURE) != 0);
+  bool const bHasRBOM = ((iTest & IS_TEXT_UNICODE_REVERSE_SIGNATURE) != 0);
+
+  bool const bIsUnicode = ((iTest & IS_TEXT_UNICODE_UNICODE_MASK) != 0);
+  bool const bIsReverse = ((iTest & IS_TEXT_UNICODE_REVERSE_MASK) != 0);
+  bool const bIsIllegal = ((iTest & IS_TEXT_UNICODE_NOT_UNICODE_MASK) != 0);
+
+  cpi_enc_t iEncoding = CPI_NONE;
+
+  if (bHasBOM || bHasRBOM || ((bIsUnicode || bIsReverse) && !bIsIllegal && !(bIsUnicode && bIsReverse)))
+  {
+    if (lpbBOM) { *lpbBOM = (bHasBOM || bHasRBOM); }
+    if (lpbReverse) { *lpbReverse = (bHasRBOM || bIsReverse); }
+    // a reported signature takes precedence over the remaining tests
+    if (bHasBOM || bHasRBOM) {
+      iEncoding = bHasBOM ? CPI_UNICODEBOM : CPI_UNICODEBEBOM;
+    }
+    else if (bIsUnicode || bIsReverse) {
+      iEncoding = bIsUnicode ? CPI_UNICODE : CPI_UNICODEBE;
+    }
+  }
+  return iEncoding;
+}
+// ============================================================================
+

 //=============================================================================
 //
--- a/src/VersionEx.h
+++ b/src/VersionEx.h
@ -7,8 +7,8 @@
 #define SAPPNAME "Notepad3"
 #define VERSION_MAJOR 5
 #define VERSION_MINOR 19
-#define VERSION_REV 719
-#define VERSION_BUILD 2399
+#define VERSION_REV 721
+#define VERSION_BUILD 2401
 #define SCINTILLA_VER 420
 #define ONIGURUMA_REGEX_VER 6.9.3
 #define VERSION_PATCH BETA