From 5c6346018d98932536710ea5841d073872a20263 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Mon, 1 Mar 2021 17:15:20 +0100 Subject: [PATCH 1/2] + upd: Oniguruma RegEx engine to current (2021-03-01) version [c0a86974] --- scintilla/oniguruma/README.md | 3 +- scintilla/oniguruma/doc/API | 48 +++++++++++++++++++++-------- scintilla/oniguruma/src/oniguruma.h | 2 +- scintilla/oniguruma/src/regexec.c | 11 +++---- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/scintilla/oniguruma/README.md b/scintilla/oniguruma/README.md index dcc153b3e..94135b45c 100644 --- a/scintilla/oniguruma/README.md +++ b/scintilla/oniguruma/README.md @@ -31,8 +31,9 @@ Supported character encodings: Master branch ------------- -* NEW API: ONIG_SYNTAX_PYTHON +* NEW API: ONIG_OPTION_CALLBACK_EACH_MATCH * NEW API: ONIG_OPTION_IGNORECASE_IS_ASCII +* NEW API: ONIG_SYNTAX_PYTHON Version 6.9.6 diff --git a/scintilla/oniguruma/doc/API b/scintilla/oniguruma/doc/API index 2ebf9d89b..8e06855bf 100644 --- a/scintilla/oniguruma/doc/API +++ b/scintilla/oniguruma/doc/API @@ -1,4 +1,4 @@ -Oniguruma API Version 6.9.7 2021/01/18 +Oniguruma API Version 6.9.7 2021/03/01 #include @@ -333,6 +333,9 @@ Oniguruma API Version 6.9.7 2021/01/18 not found: ONIG_MISMATCH (< 0) error: error code (< 0) + * If option ONIG_OPTION_CALLBACK_EACH_MATCH is used, + it will return ONIG_MISMATCH even if there is a match. 
+ arguments 1 reg: regex object 2 str: target string @@ -344,11 +347,12 @@ Oniguruma API Version 6.9.7 2021/01/18 6 region: address for return group match range info (NULL is allowed) 7 option: search time option - ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string - ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string - ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) - ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) - ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) + ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string + ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string + ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) + ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) + ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) + ONIG_OPTION_CALLBACK_EACH_MATCH Call back for all successful matches. (including the case of the same matching start position). # int onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, @@ -374,6 +378,9 @@ Oniguruma API Version 6.9.7 2021/01/18 not match: ONIG_MISMATCH (< 0) error: error code (< 0) + * If option ONIG_OPTION_CALLBACK_EACH_MATCH is used, + it will return ONIG_MISMATCH even if there is a match. 
+ arguments 1 reg: regex object 2 str: target string @@ -382,11 +389,12 @@ Oniguruma API Version 6.9.7 2021/01/18 5 region: address for return group match range info (NULL is allowed) 6 option: search time option - ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string - ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string - ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) - ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) - ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) + ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string + ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string + ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) + ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) + ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) + ONIG_OPTION_CALLBACK_EACH_MATCH Call back for all successful matches. # int onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, @@ -703,6 +711,23 @@ Oniguruma API Version 6.9.7 2021/01/18 1 reg: regex object. +# OnigCallbackEachMatchFunc onig_get_callback_each_match(void) + + Return the current callback function for ONIG_OPTION_CALLBACK_EACH_MATCH. + + +# int onig_set_callback_each_match(OnigCallbackEachMatchFunc func) + + Set the callback function for ONIG_OPTION_CALLBACK_EACH_MATCH. + If NULL is set, the callback will never be executed. + + return value + normal: 0 + + arguments + 1 func: callback function + + # int onig_number_of_capture_histories(regex_t* reg) Return the number of capture history defined in the pattern. 
@@ -714,7 +739,6 @@ Oniguruma API Version 6.9.7 2021/01/18 1 reg: regex object. - # OnigCaptureTreeNode* onig_get_capture_tree(OnigRegion* region) Return the root node of capture history data tree. diff --git a/scintilla/oniguruma/src/oniguruma.h b/scintilla/oniguruma/src/oniguruma.h index 2c5d1e938..99dec1665 100644 --- a/scintilla/oniguruma/src/oniguruma.h +++ b/scintilla/oniguruma/src/oniguruma.h @@ -740,7 +740,7 @@ typedef struct { OnigCaseFoldType case_fold_flag; } OnigCompileInfo; -typedef int (*OnigCallbackEachMatchFunc)(const UChar* str, const UChar* end, const UChar* range, const UChar* match_start, OnigRegion* region, void* user_data); +typedef int (*OnigCallbackEachMatchFunc)(const UChar* str, const UChar* end, const UChar* match_start, OnigRegion* region, void* user_data); /* types for callout */ diff --git a/scintilla/oniguruma/src/regexec.c b/scintilla/oniguruma/src/regexec.c index 941c510c3..de37dd35c 100644 --- a/scintilla/oniguruma/src/regexec.c +++ b/scintilla/oniguruma/src/regexec.c @@ -3095,7 +3095,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_CAPTURE_HISTORY if (reg->capture_history != 0) { - int r; OnigCaptureTreeNode* node; if (IS_NULL(region->history_root)) { @@ -3112,9 +3111,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, node->end = (int )(s - str); stkp = stk_base; - r = make_capture_history_tree(region->history_root, &stkp, + i = make_capture_history_tree(region->history_root, &stkp, stk, (UChar* )str, reg); - if (r < 0) MATCH_AT_ERROR_RETURN(r); + if (i < 0) MATCH_AT_ERROR_RETURN(i); } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API @@ -3126,9 +3125,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (OPTON_CALLBACK_EACH_MATCH(options) && IS_NOT_NULL(CallbackEachMatch)) { - int r = CallbackEachMatch(str, end, in_right_range, sstart, region, - msa->mp->callout_user_data); - if (r < 0) MATCH_AT_ERROR_RETURN(r); + i = CallbackEachMatch(str, end, sstart, region, + 
msa->mp->callout_user_data); + if (i < 0) MATCH_AT_ERROR_RETURN(i); #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE if (! OPTON_FIND_LONGEST(options)) From 2910c10a5f768f125f94e3e28993b81009e44f88 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Mon, 1 Mar 2021 18:56:26 +0100 Subject: [PATCH 2/2] + fix: Hyperlink RegEx pattern fixed to use valid Unicode (Code-Points) (valid of BMP) only --- src/Edit.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/Edit.c b/src/Edit.c index 7c015c305..da5d8487d 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -88,10 +88,18 @@ static const char *const s_pColorRegEx_A = "#([0-9a-fA-F]){8}"; static const char *const s_pUnicodeRegEx = "(\\\\[uU|xX]([0-9a-fA-F]){4}|\\\\[xX]([0-9a-fA-F]){2})+"; // https://mathiasbynens.be/demo/url-regex : @stephenhay -//static const char* pUrlRegEx = "\\b(?:(?:https?|ftp|file)://|www\\.|ftp\\.)[^\\s/$.?#].[^\\s]*"; +//static const char* s_pUrlRegEx = "\\b(?:(?:https?|ftp|file)://|www\\.|ftp\\.)[^\\s/$.?#].[^\\s]*"; + +// using Gruber's Liberal Regex Pattern for All URLs (https://gist.github.com/gruber/249502) +/// => unfortunately too slow to use as scanner +//static const char *const s_pUrlRegEx = "(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)" +// "(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+" +// "(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; + +// pretty fast: static const char *const s_pUrlRegEx = "\\b(?:(?:https?|ftp|file)://|www\\.|ftp\\.)" - "(?:\\([-a-z\\u00a1-\\uffff0-9+&@#/%=~_|$?!:,.]*\\)|[-a-z\\u00a1-\\uffff0-9+&@#/%=~_|$?!:,.])*" - "(?:\\([-a-z\\u00a1-\\uffff0-9+&@#/%=~_|$?!:,.]*\\)|[a-z\\u00a1-\\uffff0-9+&@#/%=~_|$])"; + "(?:\\([-a-zA-Z0-9\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF+&@#/%=~_|$?!:,.]*\\)|[-a-zA-Z0-9\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF+&@#/%=~_|$?!:,.])*" +
"(?:\\([-a-zA-Z0-9\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF+&@#/%=~_|$?!:,.]*\\)|[a-zA-Z0-9\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF+&@#/%=~_|$])"; // ----------------------------------------------------------------------------