From ae9c01699d74ddfc0d10b4b9b3172aafd129e4e7 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Sun, 24 Jan 2021 08:35:24 +0100 Subject: [PATCH] + upd: Oniguruma update (2021-01-24) --- scintilla/oniguruma/README.md | 1 + scintilla/oniguruma/doc/API | 20 ++-- scintilla/oniguruma/src/oniguruma.h | 7 +- scintilla/oniguruma/src/regerror.c | 4 +- scintilla/oniguruma/src/regparse.c | 138 +++++++++++++++++++++++++--- scintilla/oniguruma/src/regposix.c | 3 +- scintilla/oniguruma/src/regsyntax.c | 31 ++++++- 7 files changed, 179 insertions(+), 25 deletions(-) diff --git a/scintilla/oniguruma/README.md b/scintilla/oniguruma/README.md index 056abd61a..dcc153b3e 100644 --- a/scintilla/oniguruma/README.md +++ b/scintilla/oniguruma/README.md @@ -31,6 +31,7 @@ Supported character encodings: Master branch ------------- +* NEW API: ONIG_SYNTAX_PYTHON * NEW API: ONIG_OPTION_IGNORECASE_IS_ASCII diff --git a/scintilla/oniguruma/doc/API b/scintilla/oniguruma/doc/API index 740933741..bd0186196 100644 --- a/scintilla/oniguruma/doc/API +++ b/scintilla/oniguruma/doc/API @@ -1,4 +1,4 @@ -Oniguruma API Version 6.9.7 2020/12/07 +Oniguruma API Version 6.9.7 2021/01/18 #include <oniguruma.h> @@ -82,10 +82,7 @@ Oniguruma API Version 6.9.7 2020/12/07 ONIG_OPTION_EXTEND extended pattern form ONIG_OPTION_FIND_LONGEST find longest match ONIG_OPTION_FIND_NOT_EMPTY ignore empty match - ONIG_OPTION_NEGATE_SINGLELINE - clear ONIG_OPTION_SINGLELINE which is enabled on - ONIG_SYNTAX_POSIX_BASIC, ONIG_SYNTAX_POSIX_EXTENDED, - ONIG_SYNTAX_PERL, ONIG_SYNTAX_PERL_NG, ONIG_SYNTAX_JAVA + ONIG_OPTION_NEGATE_SINGLELINE clear ONIG_OPTION_SINGLELINE which is enabled on ONIG_SYNTAX_POSIX_BASIC/POSIX_EXTENDED/PERL/PERL_NG/PYTHON/JAVA ONIG_OPTION_DONT_CAPTURE_GROUP only named group captured. ONIG_OPTION_CAPTURE_GROUP named and no-named group captured. 
@@ -153,6 +150,7 @@ Oniguruma API Version 6.9.7 2020/12/07 ONIG_SYNTAX_JAVA Java (Sun java.util.regex) ONIG_SYNTAX_PERL Perl ONIG_SYNTAX_PERL_NG Perl + named group + ONIG_SYNTAX_PYTHON Python ONIG_SYNTAX_ONIGURUMA Oniguruma ONIG_SYNTAX_DEFAULT default (== ONIG_SYNTAX_ONIGURUMA) onig_set_default_syntax() @@ -680,7 +678,6 @@ Oniguruma API Version 6.9.7 2020/12/07 # OnigEncoding onig_get_encoding(regex_t* reg) # OnigOptionType onig_get_options(regex_t* reg) -# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg) # OnigSyntaxType* onig_get_syntax(regex_t* reg) Return a value of the regex object. @@ -689,6 +686,15 @@ Oniguruma API Version 6.9.7 2020/12/07 1 reg: regex object. +# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg) + + Return the case_fold_flag of the regex object. + This function is deprecated. + + arguments + 1 reg: regex object. + + # int onig_number_of_captures(regex_t* reg) Return the number of capture group in the pattern. @@ -896,11 +902,13 @@ Oniguruma API Version 6.9.7 2020/12/07 # OnigCaseFoldType onig_get_default_case_fold_flag() Get default case fold flag. + This function is deprecated. # int onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag) Set default case fold flag. + This function is deprecated. 1 case_fold_flag: case fold flag diff --git a/scintilla/oniguruma/src/oniguruma.h b/scintilla/oniguruma/src/oniguruma.h index 6bd233d8a..cb62ff55a 100644 --- a/scintilla/oniguruma/src/oniguruma.h +++ b/scintilla/oniguruma/src/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -441,6 +441,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG; ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPython; ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma; /* predefined syntaxes (see regsyntax.c) */ @@ -454,6 +455,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma; #define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) #define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG) #define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby) +#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython) #define ONIG_SYNTAX_ONIGURUMA (&OnigSyntaxOniguruma) /* default syntax */ @@ -526,6 +528,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (1U<<28) /* (?{...}) (?{{...}}) */ #define ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (1U<<29) /* (*name) (*name{a,..}) */ #define ONIG_SYN_OP2_OPTION_ONIGURUMA (1U<<30) /* (?imxWDSPy) */ +#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME (1U<<31) /* (?P<name>...) (?P=name) */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */ @@ -541,6 +544,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ #define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */ #define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (1U<<11) /* (?<=a+|..) */ +#define ONIG_SYN_PYTHON (1U<<12) /* \UHHHHHHHH */ /* syntax (behavior) in char class [...] 
*/ @@ -623,6 +627,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209 #define ONIGERR_TOO_MANY_CAPTURES -210 #define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212 +#define ONIGERR_UNDEFINED_OPERATOR -213 #define ONIGERR_EMPTY_GROUP_NAME -214 #define ONIGERR_INVALID_GROUP_NAME -215 #define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216 diff --git a/scintilla/oniguruma/src/regerror.c b/scintilla/oniguruma/src/regerror.c index ce69fd18a..18a5bdd22 100644 --- a/scintilla/oniguruma/src/regerror.c +++ b/scintilla/oniguruma/src/regerror.c @@ -2,7 +2,7 @@ regerror.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -146,6 +146,8 @@ onig_error_code_to_format(int code) p = "too big wide-char value"; break; case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE: p = "too long wide-char value"; break; + case ONIGERR_UNDEFINED_OPERATOR: + p = "undefined operator"; break; case ONIGERR_INVALID_CODE_POINT_VALUE: p = "invalid code point value"; break; case ONIGERR_EMPTY_GROUP_NAME: diff --git a/scintilla/oniguruma/src/regparse.c b/scintilla/oniguruma/src/regparse.c index fe04aa795..5da8ef265 100644 --- a/scintilla/oniguruma/src/regparse.c +++ b/scintilla/oniguruma/src/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -5061,6 +5061,7 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) int r; OnigCodePoint code; OnigCodePoint c, c2; + int mindigits, maxdigits; OnigSyntaxType* syn = env->syntax; OnigEncoding enc = env->enc; UChar* prev; @@ -5249,10 +5250,11 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) case 'u': if (PEND) break; - prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + mindigits = maxdigits = 4; + u_hex_digits: + r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ @@ -5263,6 +5265,15 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) } break; + case 'U': + if (PEND) break; + prev = p; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + mindigits = maxdigits = 8; + goto u_hex_digits; + } + break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { @@ -5334,10 +5345,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int r; OnigCodePoint code; OnigCodePoint c; - OnigEncoding enc = env->enc; - OnigSyntaxType* syn = env->syntax; + int mindigits, maxdigits; UChar* prev; - UChar* p = *src; + int allow_num; + OnigEncoding enc; + OnigSyntaxType* syn; + UChar* p; + + enc = env->enc; + syn = env->syntax; + p = *src; + PFETCH_READY; if (tok->code_point_continue != 0) { @@ -5576,12 +5594,20 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case 'Z': + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + goto end_buf; + } + else { if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; tok->type = TK_ANCHOR; tok->u.subtype = ANCR_SEMI_END_BUF; + } break; case 'z': + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) + return ONIGERR_UNDEFINED_OPERATOR; + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; end_buf: tok->type = TK_ANCHOR; @@ -5670,10 +5696,11 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'u': if (PEND) break; - prev = p; + mindigits = maxdigits = 4; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + u_hex_digits: + r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ @@ -5684,6 +5711,15 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; + case 'U': + if (PEND) break; + prev = p; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + mindigits = maxdigits = 8; + goto u_hex_digits; + } + break; + case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': PUNFETCH; @@ -5745,6 +5781,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int back_num; enum REF_NUM num_type; + allow_num = 1; + + backref_start: prev = p; #ifdef USE_BACKREF_WITH_LEVEL @@ -5759,6 +5798,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r < 0) return r; if (num_type != IS_NOT_NUM) { + if (allow_num == 0) return ONIGERR_INVALID_BACKREF; + if (num_type == IS_REL_NUM) { back_num = backref_rel_to_abs(back_num, env); } @@ -5815,12 +5856,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) UChar* name_end; enum REF_NUM num_type; + allow_num = 1; + + call_start: prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { + if (allow_num == 0) return ONIGERR_UNDEFINED_GROUP_REFERENCE; + if (num_type == IS_REL_NUM) { gnum = 
backref_rel_to_abs(gnum, env); if (gnum < 0) { @@ -5977,6 +6023,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '(': if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { + prev = p; PINC; if (! PEND) { c = PPEEK; @@ -6064,11 +6111,35 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; } } + else if (c == 'P' && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { + PINC; /* skip 'P' */ + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + allow_num = 0; + if (c == '=') { + c = '('; + goto backref_start; + } + else if (c == '>') { +#ifdef USE_CALL + c = '('; + goto call_start; +#else + return ONIGERR_UNDEFINED_OPERATOR; +#endif + } + else { + p = prev; + goto lparen_qmark_end2; + } + } } lparen_qmark_end: PUNFETCH; } + lparen_qmark_end2: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_OPEN; break; @@ -7934,12 +8005,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; #endif + case 'P': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == '<') goto named_group1; + + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + /* else fall */ + case 'W': case 'D': case 'S': + case 'y': + if (! 
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + /* else fall */ + #ifdef USE_POSIXLINE_OPTION case 'p': #endif + case 'a': case '-': case 'i': case 'm': case 's': case 'x': - case 'W': case 'D': case 'S': case 'P': - case 'y': { int neg = 0; @@ -7976,10 +8061,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); break; #endif - case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break; - case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break; - case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break; - case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; + case 'W': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); + break; + case 'D': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); + break; + case 'S': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); + break; + case 'P': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); + break; case 'y': /* y{g}, y{w} */ { @@ -8018,8 +8119,15 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, PFETCH(c); if (c != '}') return ONIGERR_UNDEFINED_GROUP_OPTION; - break; } /* case 'y' */ + break; + + case 'a': + if (! 
IS_SYNTAX_BV(env->syntax, ONIG_SYN_PYTHON)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + + OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); + break; default: return ONIGERR_UNDEFINED_GROUP_OPTION; diff --git a/scintilla/oniguruma/src/regposix.c b/scintilla/oniguruma/src/regposix.c index 8408079e0..7534d4d70 100644 --- a/scintilla/oniguruma/src/regposix.c +++ b/scintilla/oniguruma/src/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -120,6 +120,7 @@ onig2posix_error_code(int code) { ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT }, { ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, { ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, + { ONIGERR_UNDEFINED_OPERATOR, REG_BADPAT }, { ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC }, { ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT }, { ONIGERR_INVALID_GROUP_NAME, REG_BADPAT }, diff --git a/scintilla/oniguruma/src/regsyntax.c b/scintilla/oniguruma/src/regsyntax.c index 984aac662..8e1c31320 100644 --- a/scintilla/oniguruma/src/regsyntax.c +++ b/scintilla/oniguruma/src/regsyntax.c @@ -2,7 +2,7 @@ regsyntax.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -240,6 +240,35 @@ OnigSyntaxType OnigSyntaxPerl_NG = { } }; +/* Python 3.9 */ +OnigSyntaxType OnigSyntaxPython = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | + ONIG_SYN_OP_ESC_CONTROL_CHARS | + ONIG_SYN_OP_ESC_C_CONTROL ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | + ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | + ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME | + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | + ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | + ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME | + ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | + ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 ) + , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | + ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_PYTHON ) + , ONIG_OPTION_SINGLELINE + , + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + } +}; + extern int