From 8bcc76089d479267632f77acc8d499e19dd67999 Mon Sep 17 00:00:00 2001 From: "METANEOCORTEX\\Kotti" Date: Sat, 2 Mar 2024 14:16:40 +0100 Subject: [PATCH] +chg: Oniguruma syntax flavor for \h and \H (Ruby: match hex digit) to "horizontal space" definition of Perl / PCRE --- .../scintilla/OnigurumaRegExEngine.cxx | 48 ++++++++++++++++--- scintilla/oniguruma/src/utf8.c | 14 +++--- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx b/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx index f4f15b3d4..ee8613eb3 100644 --- a/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx +++ b/scintilla/oniguruma/scintilla/OnigurumaRegExEngine.cxx @@ -69,6 +69,28 @@ static OnigEncoding s_UsedEncodingsTypes[] = { ONIG_ENCODING_UTF8 }; // ============================================================================ // ============================================================================ +// https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean/ + +#define NP3_ONIG_SYNTAX_FLAVOR ONIG_SYNTAX_DEFAULT // default is ONIG_SYNTAX_ONIGURUMA + +// ensure that some special syntax options are excluded/included + +const unsigned int RemSynOptions_1[1] = { 0 }; +const unsigned int RemSynOptions_2[] = { + ONIG_SYN_OP2_ESC_H_XDIGIT // removed so \h / \H can be remapped to horizontal-space classes in translateRegExpr() +}; + +const unsigned int AddSynOptions_1[] = { + ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END // \<. 
\> +}; +const unsigned int AddSynOptions_2[] = { + ONIG_SYN_OP2_ESC_U_HEX4 // \uHHHH +}; + +// ----------------------------------------------------------------------------- + + + // ------------------------------------ // --- Onigmo Engine Simple Options --- // ------------------------------------ @@ -134,13 +156,9 @@ static void SetSimpleOptions(OnigOptionType &onigOptions, EOLmode /*eolMode*/, } } + // ============================================================================ - -#define NP3_ONIG_SYNTAX_FLAVOR (ONIG_SYNTAX_DEFAULT) // default is: ONIG_SYNTAX_ONIGURUMA - -// ----------------------------------------------------------------------------- - class OnigurumaRegExEngine : public RegexSearchBase { public: @@ -161,7 +179,19 @@ public: onig_initialize(s_UsedEncodingsTypes, _ARRAYSIZE(s_UsedEncodingsTypes)); onig_set_default_syntax(NP3_ONIG_SYNTAX_FLAVOR); // std is: ONIG_SYNTAX_ONIGURUMA - m_OnigSyntax.op |= ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END; // xcluded from ONIG_SYNTAX_DEFAULT ? 
+ for (const auto op1 : RemSynOptions_1) { + m_OnigSyntax.op &= ~op1; + } + for (const auto op2 : RemSynOptions_2) { + m_OnigSyntax.op2 &= ~op2; + } + + for (const auto op1 : AddSynOptions_1) { + m_OnigSyntax.op |= op1; + } + for (const auto op2 : AddSynOptions_2) { + m_OnigSyntax.op2 |= op2; + } onig_region_init(&m_Region); } @@ -557,7 +587,8 @@ void OnigurumaRegExEngine::clear() { // ---------------------------------------------------------------------------- -std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprStr, bool wholeWord, bool wordStart, EndOfLine eolMode, OnigOptionType & /*rxOptions*/) +std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprStr, bool wholeWord, bool wordStart, + EndOfLine eolMode, OnigOptionType & /*rxOptions*/) { UNREFERENCED_PARAMETER(eolMode); @@ -584,6 +615,9 @@ std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprSt //~replaceAll(transRegExpr, R"(\>)", R"((?<=\w)(?!\w))"); // word end //~replaceAll(transRegExpr, R"(\(?<=\w)(?!\w))", R"(\\>)"); // esc'd + replaceAll(transRegExpr, R"(\h)", R"([\t\p{Zs}])"); // \h: horizontal space = TAB or Unicode space separator (excludes U+0085 NEL) + replaceAll(transRegExpr, R"(\H)", R"([^\t\p{Zs}])"); // \H: exact complement of \h (not horizontal space) + #if 0 // EOL modes is controlled by switch (eolMode) { diff --git a/scintilla/oniguruma/src/utf8.c b/scintilla/oniguruma/src/utf8.c index 34fb5cee2..40616993f 100644 --- a/scintilla/oniguruma/src/utf8.c +++ b/scintilla/oniguruma/src/utf8.c @@ -264,13 +264,6 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, static int is_utf8_newline(const UChar *p, const UChar *end) { -#ifdef USE_CRNL_AS_LINE_TERMINATOR - if (p + 1 < end) { - if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE)) // CRLF - return 1; - } -#endif - #ifdef USE_UNICODE_ALL_LINE_TERMINATORS if (p + 2 < end) { if ((*p == 0xe2) && (*(p+1) == 0x80) && ((*(p+2) == 0xa8) || (*(p+2) == 0xa9))) // LS or PS @@ -282,6 +275,13 @@ is_utf8_newline(const UChar *p, const UChar *end) } #endif 
+#ifdef USE_CRNL_AS_LINE_TERMINATOR + if (p + 1 < end) { + if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE)) // CRLF + return 1; + } +#endif + if (p < end) { #ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE) || (*p == END_OF_FILE))