+chg: Oniguruma syntax flavor: change \h and \H from Ruby's "hex digit" meaning to the "horizontal space" definition of Perl / PCRE

This commit is contained in:
METANEOCORTEX\Kotti 2024-03-02 14:16:40 +01:00
parent 863e017a07
commit 8bcc76089d
2 changed files with 48 additions and 14 deletions

View File

@ -69,6 +69,28 @@ static OnigEncoding s_UsedEncodingsTypes[] = { ONIG_ENCODING_UTF8 };
// ============================================================================
// ============================================================================
// https://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean/
// Base syntax flavor that is passed to onig_set_default_syntax().
#define NP3_ONIG_SYNTAX_FLAVOR ONIG_SYNTAX_DEFAULT // default is ONIG_SYNTAX_ONIGURUMA
// Syntax option flags to exclude from / include in the engine's syntax settings.
// The "_1" tables are applied to m_OnigSyntax.op and the "_2" tables to
// m_OnigSyntax.op2; removals are applied before additions.
// Placeholder entry: clearing the bits of 0 is a no-op, i.e. nothing is removed from 'op'.
const unsigned int RemSynOptions_1[1] = { 0 };
const unsigned int RemSynOptions_2[] = {
ONIG_SYN_OP2_ESC_H_XDIGIT // drop the hex-digit meaning of \h / \H; translateRegExpr() rewrites them to horizontal-space classes
};
const unsigned int AddSynOptions_1[] = {
ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END // \< and \> (word begin / word end)
};
const unsigned int AddSynOptions_2[] = {
ONIG_SYN_OP2_ESC_U_HEX4 // \uHHHH
};
// -----------------------------------------------------------------------------
// ------------------------------------
// --- Onigmo Engine Simple Options ---
// ------------------------------------
@ -134,13 +156,9 @@ static void SetSimpleOptions(OnigOptionType &onigOptions, EOLmode /*eolMode*/,
}
}
// ============================================================================
#define NP3_ONIG_SYNTAX_FLAVOR (ONIG_SYNTAX_DEFAULT) // default is: ONIG_SYNTAX_ONIGURUMA
// -----------------------------------------------------------------------------
class OnigurumaRegExEngine : public RegexSearchBase
{
public:
@ -161,7 +179,19 @@ public:
onig_initialize(s_UsedEncodingsTypes, _ARRAYSIZE(s_UsedEncodingsTypes));
onig_set_default_syntax(NP3_ONIG_SYNTAX_FLAVOR); // std is: ONIG_SYNTAX_ONIGURUMA
m_OnigSyntax.op |= ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END; // xcluded from ONIG_SYNTAX_DEFAULT ?
for (const auto op1 : RemSynOptions_1) {
m_OnigSyntax.op &= ~op1;
}
for (const auto op2 : RemSynOptions_2) {
m_OnigSyntax.op2 &= ~op2;
}
for (const auto op1 : AddSynOptions_1) {
m_OnigSyntax.op |= op1;
}
for (const auto op2 : AddSynOptions_2) {
m_OnigSyntax.op2 |= op2;
}
onig_region_init(&m_Region);
}
@ -557,7 +587,8 @@ void OnigurumaRegExEngine::clear() {
// ----------------------------------------------------------------------------
std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprStr, bool wholeWord, bool wordStart, EndOfLine eolMode, OnigOptionType & /*rxOptions*/)
std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprStr, bool wholeWord, bool wordStart,
EndOfLine eolMode, OnigOptionType & /*rxOptions*/)
{
UNREFERENCED_PARAMETER(eolMode);
@ -584,6 +615,9 @@ std::string OnigurumaRegExEngine::translateRegExpr(const std::string & regExprSt
//~replaceAll(transRegExpr, R"(\>)", R"((?<=\w)(?!\w))"); // word end
//~replaceAll(transRegExpr, R"(\(?<=\w)(?!\w))", R"(\\>)"); // esc'd
replaceAll(transRegExpr, R"(\h)", R"([^\S\n\v\f\r\u2028\u2029])"); // horizontal space
replaceAll(transRegExpr, R"(\H)", R"([^\t\p{Zs}])"); // not horizontal space
#if 0
// EOL modes is controlled by
switch (eolMode) {

View File

@ -264,13 +264,6 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag,
static int
is_utf8_newline(const UChar *p, const UChar *end)
{
#ifdef USE_CRNL_AS_LINE_TERMINATOR
if (p + 1 < end) {
if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE)) // CRLF
return 1;
}
#endif
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
if (p + 2 < end) {
if ((*p == 0xe2) && (*(p+1) == 0x80) && ((*(p+2) == 0xa8) || (*(p+2) == 0xa9))) // LS or PS
@ -282,6 +275,13 @@ is_utf8_newline(const UChar *p, const UChar *end)
}
#endif
#ifdef USE_CRNL_AS_LINE_TERMINATOR
if (p + 1 < end) {
if ((*p == CARRIAGE_RET) && (*(p+1) == NEWLINE_CODE)) // CRLF
return 1;
}
#endif
if (p < end) {
#ifdef USE_END_OF_FILE_AS_LINE_TERMINATOR
if ((*p == CARRIAGE_RET) || (*p == NEWLINE_CODE) || (*p == END_OF_FILE))