From 8bcc76089d479267632f77acc8d499e19dd67999 Mon Sep 17 00:00:00 2001
From: "METANEOCORTEX\\Kotti" <rainer.kottenhoff@gmail.com>
Date: Sat, 2 Mar 2024 14:16:40 +0100
Subject: [PATCH] +chg: Oniguruma syntax flavor for \h and \H (Ruby: match hex
 digit) to "horizontal space" definition of Perl / PCRE

---
 .../scintilla/OnigurumaRegExEngine.cxx        | 48 ++++++++++++++++---
 scintilla/oniguruma/src/utf8.c                | 14 +++---
 2 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx b/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx
index f4f15b3d4..ee8613eb3 100644
--- a/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx
+++ b/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx
@@ -69,6 +69,28 @@ static OnigEncoding s_UsedEncodingsTypes[] = { ONIG_ENCODING_UTF8 };
 // ============================================================================
 // ============================================================================
 
+// https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean/
+
+#define NP3_ONIG_SYNTAX_FLAVOR ONIG_SYNTAX_DEFAULT // default is ONIG_SYNTAX_ONIGURUMA
+
+// ensure that certain special syntax options are excluded from / included in the syntax flavor selected above
+
+const unsigned int RemSynOptions_1[1] = { 0 };  // bits cleared from m_OnigSyntax.op: none (clearing 0 is a no-op placeholder)
+const unsigned int RemSynOptions_2[] = {        // bits cleared from m_OnigSyntax.op2
+  ONIG_SYN_OP2_ESC_H_XDIGIT                 // drop Ruby's \h,\H (hex digit); translateRegExpr() rewrites them as horizontal-space classes
+};
+
+const unsigned int AddSynOptions_1[] = {        // bits set in m_OnigSyntax.op
+  ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END       // \<, \> (word begin / word end)
+};
+const unsigned int AddSynOptions_2[] = {        // bits set in m_OnigSyntax.op2
+  ONIG_SYN_OP2_ESC_U_HEX4                   // \uHHHH
+};
+
+// -----------------------------------------------------------------------------
+
+
+
 // ------------------------------------
 // --- Onigmo Engine Simple Options ---
 // ------------------------------------
@@ -134,13 +156,9 @@ static void SetSimpleOptions(OnigOptionType &onigOptions, EOLmode /*eolMode*/,
   }
 
 }
+
 // ============================================================================
 
-
-#define NP3_ONIG_SYNTAX_FLAVOR (ONIG_SYNTAX_DEFAULT) // default is: ONIG_SYNTAX_ONIGURUMA
-
-// -----------------------------------------------------------------------------
-
 class OnigurumaRegExEngine : public RegexSearchBase
 {
 public:
@@ -161,7 +179,19 @@ public:
     onig_initialize(s_UsedEncodingsTypes, _ARRAYSIZE(s_UsedEncodingsTypes));
     onig_set_default_syntax(NP3_ONIG_SYNTAX_FLAVOR);        // std is: ONIG_SYNTAX_ONIGURUMA
 
-    m_OnigSyntax.op |= ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END; // xcluded from ONIG_SYNTAX_DEFAULT ?
+    for (const auto op1 : RemSynOptions_1) {
+      m_OnigSyntax.op &= ~op1;
+    }
+    for (const auto op2 : RemSynOptions_2) {
+      m_OnigSyntax.op2 &= ~op2;
+    }
+
+    for (const auto op1 : AddSynOptions_1) {
+      m_OnigSyntax.op |= op1;
+    }
+    for (const auto op2 : AddSynOptions_2) {
+      m_OnigSyntax.op2 |= op2;
+    }
 
     onig_region_init(&m_Region);
   }
@@ -557,7 +587,8 @@ void OnigurumaRegExEngine::clear() {
 // ----------------------------------------------------------------------------
 
 
-std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprStr, bool wholeWord, bool wordStart, EndOfLine eolMode, OnigOptionType & /*rxOptions*/)
+std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprStr, bool wholeWord, bool wordStart,
+                                                   EndOfLine eolMode, OnigOptionType & /*rxOptions*/)
 {
   UNREFERENCED_PARAMETER(eolMode);
 
@@ -584,6 +615,9 @@ std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprSt
   //~replaceAll(transRegExpr, R"(\>)", R"((?<=\w)(?!\w))"); // word end
   //~replaceAll(transRegExpr, R"(\(?<=\w)(?!\w))", R"(\\>)"); // esc'd
 
+  replaceAll(transRegExpr, R"(\h)", R"([^\S\n\v\f\r\u2028\u2029])"); // horizontal space
+  replaceAll(transRegExpr, R"(\H)", R"([^\t\p{Zs}])");               // not horizontal space
+
   #if 0
   // EOL modes is controlled by 
   switch (eolMode) {
diff --git a/scintilla/oniguruma/src/utf8.c b/scintilla/oniguruma/src/utf8.c
index 34fb5cee2..40616993f 100644
--- a/scintilla/oniguruma/src/utf8.c
+++ b/scintilla/oniguruma/src/utf8.c
@@ -264,13 +264,6 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag,
 static int
 is_utf8_newline(const UChar *p, const UChar *end)
 {
-#ifdef USE_CRNL_AS_LINE_TERMINATOR
-  if (p + 1 < end) {
-    if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE))  // CRLF
-      return 1;
-  }
-#endif
-
 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
   if (p + 2 < end) {
     if ((*p == 0xe2) && (*(p+1) == 0x80) && ((*(p+2) == 0xa8) || (*(p+2) == 0xa9))) // LS or PS
@@ -282,6 +275,13 @@ is_utf8_newline(const UChar *p, const UChar *end)
   }
 #endif
 
+#ifdef USE_CRNL_AS_LINE_TERMINATOR
+  if (p + 1 < end) {
+    if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE))  // CRLF
+      return 1;
+  }
+#endif
+
   if (p < end) {
 #ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
     if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE) || (*p == END_OF_FILE))