+ upd: Oniguruma update (2021-01-24)

This commit is contained in:
Rainer Kottenhoff 2021-01-24 08:35:24 +01:00
parent fdd88bf2e5
commit ae9c01699d
7 changed files with 179 additions and 25 deletions

View File

@ -31,6 +31,7 @@ Supported character encodings:
Master branch
-------------
* NEW API: ONIG_SYNTAX_PYTHON
* NEW API: ONIG_OPTION_IGNORECASE_IS_ASCII

View File

@ -1,4 +1,4 @@
Oniguruma API Version 6.9.7 2020/12/07
Oniguruma API Version 6.9.7 2021/01/18
#include <oniguruma.h>
@ -82,10 +82,7 @@ Oniguruma API Version 6.9.7 2020/12/07
ONIG_OPTION_EXTEND extended pattern form
ONIG_OPTION_FIND_LONGEST find longest match
ONIG_OPTION_FIND_NOT_EMPTY ignore empty match
ONIG_OPTION_NEGATE_SINGLELINE
clear ONIG_OPTION_SINGLELINE which is enabled on
ONIG_SYNTAX_POSIX_BASIC, ONIG_SYNTAX_POSIX_EXTENDED,
ONIG_SYNTAX_PERL, ONIG_SYNTAX_PERL_NG, ONIG_SYNTAX_JAVA
ONIG_OPTION_NEGATE_SINGLELINE clear ONIG_OPTION_SINGLELINE which is enabled on ONIG_SYNTAX_POSIX_BASIC/POSIX_EXTENDED/PERL/PERL_NG/PYTHON/JAVA
ONIG_OPTION_DONT_CAPTURE_GROUP only named groups are captured.
ONIG_OPTION_CAPTURE_GROUP both named and unnamed groups are captured.
@ -153,6 +150,7 @@ Oniguruma API Version 6.9.7 2020/12/07
ONIG_SYNTAX_JAVA Java (Sun java.util.regex)
ONIG_SYNTAX_PERL Perl
ONIG_SYNTAX_PERL_NG Perl + named group
ONIG_SYNTAX_PYTHON Python
ONIG_SYNTAX_ONIGURUMA Oniguruma
ONIG_SYNTAX_DEFAULT default (== ONIG_SYNTAX_ONIGURUMA)
onig_set_default_syntax()
@ -680,7 +678,6 @@ Oniguruma API Version 6.9.7 2020/12/07
# OnigEncoding onig_get_encoding(regex_t* reg)
# OnigOptionType onig_get_options(regex_t* reg)
# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg)
# OnigSyntaxType* onig_get_syntax(regex_t* reg)
Return a value of the regex object.
@ -689,6 +686,15 @@ Oniguruma API Version 6.9.7 2020/12/07
1 reg: regex object.
# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg)
Return the case_fold_flag of the regex object.
This function is deprecated.
arguments
1 reg: regex object.
# int onig_number_of_captures(regex_t* reg)
Return the number of capture groups in the pattern.
@ -896,11 +902,13 @@ Oniguruma API Version 6.9.7 2020/12/07
# OnigCaseFoldType onig_get_default_case_fold_flag()
Get default case fold flag.
This function is deprecated.
# int onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
Set default case fold flag.
This function is deprecated.
1 case_fold_flag: case fold flag

View File

@ -4,7 +4,7 @@
oniguruma.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2020 K.Kosako
* Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -441,6 +441,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG;
ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPython;
ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma;
/* predefined syntaxes (see regsyntax.c) */
@ -454,6 +455,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma;
#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl)
#define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG)
#define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby)
#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython)
#define ONIG_SYNTAX_ONIGURUMA (&OnigSyntaxOniguruma)
/* default syntax */
@ -526,6 +528,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (1U<<28) /* (?{...}) (?{{...}}) */
#define ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (1U<<29) /* (*name) (*name{a,..}) */
#define ONIG_SYN_OP2_OPTION_ONIGURUMA (1U<<30) /* (?imxWDSPy) */
#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME (1U<<31) /* (?P<name>...) (?P=name) */
/* syntax (behavior) */
#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */
@ -541,6 +544,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */
#define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */
#define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (1U<<11) /* (?<=a+|..) */
#define ONIG_SYN_PYTHON (1U<<12) /* \UHHHHHHHH */
/* syntax (behavior) in char class [...] */
#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */
@ -623,6 +627,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209
#define ONIGERR_TOO_MANY_CAPTURES -210
#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212
#define ONIGERR_UNDEFINED_OPERATOR -213
#define ONIGERR_EMPTY_GROUP_NAME -214
#define ONIGERR_INVALID_GROUP_NAME -215
#define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216

View File

@ -2,7 +2,7 @@
regerror.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2020 K.Kosako
* Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -146,6 +146,8 @@ onig_error_code_to_format(int code)
p = "too big wide-char value"; break;
case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE:
p = "too long wide-char value"; break;
case ONIGERR_UNDEFINED_OPERATOR:
p = "undefined operator"; break;
case ONIGERR_INVALID_CODE_POINT_VALUE:
p = "invalid code point value"; break;
case ONIGERR_EMPTY_GROUP_NAME:

View File

@ -2,7 +2,7 @@
regparse.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2020 K.Kosako
* Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -5061,6 +5061,7 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
int r;
OnigCodePoint code;
OnigCodePoint c, c2;
int mindigits, maxdigits;
OnigSyntaxType* syn = env->syntax;
OnigEncoding enc = env->enc;
UChar* prev;
@ -5249,10 +5250,11 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
case 'u':
if (PEND) break;
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
mindigits = maxdigits = 4;
u_hex_digits:
r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code);
if (r < 0) return r;
if (p == prev) { /* can't read nothing. */
code = 0; /* but, it's not error */
@ -5263,6 +5265,15 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
}
break;
case 'U':
if (PEND) break;
prev = p;
if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) {
mindigits = maxdigits = 8;
goto u_hex_digits;
}
break;
case '0':
case '1': case '2': case '3': case '4': case '5': case '6': case '7':
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
@ -5334,10 +5345,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
int r;
OnigCodePoint code;
OnigCodePoint c;
OnigEncoding enc = env->enc;
OnigSyntaxType* syn = env->syntax;
int mindigits, maxdigits;
UChar* prev;
UChar* p = *src;
int allow_num;
OnigEncoding enc;
OnigSyntaxType* syn;
UChar* p;
enc = env->enc;
syn = env->syntax;
p = *src;
PFETCH_READY;
if (tok->code_point_continue != 0) {
@ -5576,12 +5594,20 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
case 'Z':
if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) {
goto end_buf;
}
else {
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
tok->type = TK_ANCHOR;
tok->u.subtype = ANCR_SEMI_END_BUF;
}
break;
case 'z':
if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON))
return ONIGERR_UNDEFINED_OPERATOR;
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
end_buf:
tok->type = TK_ANCHOR;
@ -5670,10 +5696,11 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'u':
if (PEND) break;
prev = p;
mindigits = maxdigits = 4;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
u_hex_digits:
r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code);
if (r < 0) return r;
if (p == prev) { /* can't read nothing. */
code = 0; /* but, it's not error */
@ -5684,6 +5711,15 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
break;
case 'U':
if (PEND) break;
prev = p;
if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) {
mindigits = maxdigits = 8;
goto u_hex_digits;
}
break;
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
PUNFETCH;
@ -5745,6 +5781,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
int back_num;
enum REF_NUM num_type;
allow_num = 1;
backref_start:
prev = p;
#ifdef USE_BACKREF_WITH_LEVEL
@ -5759,6 +5798,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (r < 0) return r;
if (num_type != IS_NOT_NUM) {
if (allow_num == 0) return ONIGERR_INVALID_BACKREF;
if (num_type == IS_REL_NUM) {
back_num = backref_rel_to_abs(back_num, env);
}
@ -5815,12 +5856,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
UChar* name_end;
enum REF_NUM num_type;
allow_num = 1;
call_start:
prev = p;
r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env,
&gnum, &num_type, TRUE);
if (r < 0) return r;
if (num_type != IS_NOT_NUM) {
if (allow_num == 0) return ONIGERR_UNDEFINED_GROUP_REFERENCE;
if (num_type == IS_REL_NUM) {
gnum = backref_rel_to_abs(gnum, env);
if (gnum < 0) {
@ -5977,6 +6023,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
case '(':
if (!PEND && PPEEK_IS('?') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
prev = p;
PINC;
if (! PEND) {
c = PPEEK;
@ -6064,11 +6111,35 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
}
}
else if (c == 'P' &&
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) {
PINC; /* skip 'P' */
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
PFETCH(c);
allow_num = 0;
if (c == '=') {
c = '(';
goto backref_start;
}
else if (c == '>') {
#ifdef USE_CALL
c = '(';
goto call_start;
#else
return ONIGERR_UNDEFINED_OPERATOR;
#endif
}
else {
p = prev;
goto lparen_qmark_end2;
}
}
}
lparen_qmark_end:
PUNFETCH;
}
lparen_qmark_end2:
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
tok->type = TK_SUBEXP_OPEN;
break;
@ -7934,12 +8005,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
break;
#endif
case 'P':
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) {
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
PFETCH(c);
if (c == '<') goto named_group1;
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
/* else fall */
case 'W': case 'D': case 'S':
case 'y':
if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
return ONIGERR_UNDEFINED_GROUP_OPTION;
/* else fall */
#ifdef USE_POSIXLINE_OPTION
case 'p':
#endif
case 'a':
case '-': case 'i': case 'm': case 's': case 'x':
case 'W': case 'D': case 'S': case 'P':
case 'y':
{
int neg = 0;
@ -7976,10 +8061,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
break;
#endif
case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break;
case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break;
case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break;
case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break;
case 'W':
if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
return ONIGERR_UNDEFINED_GROUP_OPTION;
OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg);
break;
case 'D':
if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
return ONIGERR_UNDEFINED_GROUP_OPTION;
OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg);
break;
case 'S':
if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
return ONIGERR_UNDEFINED_GROUP_OPTION;
OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg);
break;
case 'P':
if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
return ONIGERR_UNDEFINED_GROUP_OPTION;
OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg);
break;
case 'y': /* y{g}, y{w} */
{
@ -8018,8 +8119,15 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
PFETCH(c);
if (c != '}')
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
} /* case 'y' */
break;
case 'a':
if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_PYTHON))
return ONIGERR_UNDEFINED_GROUP_OPTION;
OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg);
break;
default:
return ONIGERR_UNDEFINED_GROUP_OPTION;

View File

@ -2,7 +2,7 @@
regposix.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2020 K.Kosako
* Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -120,6 +120,7 @@ onig2posix_error_code(int code)
{ ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT },
{ ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
{ ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
{ ONIGERR_UNDEFINED_OPERATOR, REG_BADPAT },
{ ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC },
{ ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT },
{ ONIGERR_INVALID_GROUP_NAME, REG_BADPAT },

View File

@ -2,7 +2,7 @@
regsyntax.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2020 K.Kosako
* Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -240,6 +240,35 @@ OnigSyntaxType OnigSyntaxPerl_NG = {
}
};
/*
 * Python 3.9 flavored syntax.
 *
 * The initializer is positional. The groups below are, in order: the
 * ONIG_SYN_OP_* operator flags, the ONIG_SYN_OP2_* operator flags, the
 * syntax behavior flags, the option enabled by this syntax, and the meta
 * character table.
 */
OnigSyntaxType OnigSyntaxPython = {
  /* ONIG_SYN_OP_* flags: the GNU regex set plus the extras listed here,
     with ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out of the result */
  ((SYN_GNU_REGEX_OP |
    ONIG_SYN_OP_QMARK_NON_GREEDY |
    ONIG_SYN_OP_ESC_OCTAL3 |
    ONIG_SYN_OP_ESC_X_HEX2 |
    ONIG_SYN_OP_ESC_CONTROL_CHARS |
    ONIG_SYN_OP_ESC_C_CONTROL)
   & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END),

  /* ONIG_SYN_OP2_* flags; ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME is the one
     that turns on (?P<name>...) and (?P=name) */
  (ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
   ONIG_SYN_OP2_OPTION_PERL |
   ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
   ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME |
   ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
   ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
   ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME |
   ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
   ONIG_SYN_OP2_ESC_V_VTAB |
   ONIG_SYN_OP2_ESC_U_HEX4),

  /* syntax behavior flags; ONIG_SYN_PYTHON is what the parser tests for
     its Python-only rules (e.g. \UHHHHHHHH) */
  (SYN_GNU_REGEX_BV |
   ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH |
   ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
   ONIG_SYN_PYTHON),

  /* option enabled by this syntax */
  ONIG_OPTION_SINGLELINE,

  /* meta character table: only the escape character is set */
  {
    (OnigCodePoint)'\\',                        /* esc */
    (OnigCodePoint)ONIG_INEFFECTIVE_META_CHAR,  /* anychar '.' */
    (OnigCodePoint)ONIG_INEFFECTIVE_META_CHAR,  /* anytime '*' */
    (OnigCodePoint)ONIG_INEFFECTIVE_META_CHAR,  /* zero or one time '?' */
    (OnigCodePoint)ONIG_INEFFECTIVE_META_CHAR,  /* one or more time '+' */
    (OnigCodePoint)ONIG_INEFFECTIVE_META_CHAR   /* anychar anytime */
  }
};
extern int