Merge pull request #4586 from RaiKoHoff/Dev_Master

Update Oniguruma current dev
This commit is contained in:
Rainer Kottenhoff 2023-02-24 18:02:51 +01:00 committed by GitHub
commit 51a51ab148
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 149 additions and 40 deletions

View File

@ -289,7 +289,7 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default syntax)
* (?i) option has no effect on word types (\w, \p{Word}). However, if the word types are used within a character class, it is valid. But, this would only be a concern when word types are used with the (?W) option.
/(?CIL).../, /(?CIL:...)/ whole opiton
/(?CIL).../, /(?CIL:...)/ whole option
This option must be placed in a position that
affects the entire regular expression.

View File

@ -546,6 +546,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (1U<<11) /* (?<=a+|..) */
#define ONIG_SYN_PYTHON (1U<<12) /* \UHHHHHHHH */
#define ONIG_SYN_WHOLE_OPTIONS (1U<<13) /* (?Ie) */
#define ONIG_SYN_BRE_ANCHOR_AT_EDGE_OF_SUBEXP (1U<<14) /* \(^abc$\) */
/* syntax (behavior) in char class [...] */
#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */

View File

@ -887,7 +887,7 @@ add_op(regex_t* reg, int opcode)
return 0;
}
static int compile_length_tree(Node* node, regex_t* reg);
static int compile_length_tree(Node* node, regex_t* reg, ParseEnv* env);
static int compile_tree(Node* node, regex_t* reg, ParseEnv* env);
@ -1322,12 +1322,12 @@ is_anychar_infinite_greedy(QuantNode* qn)
#define CKN_ON (ckn > 0)
static int
compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
compile_length_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env)
{
int len, mod_tlen;
int infinite = IS_INFINITE_REPEAT(qn->upper);
enum BodyEmptyType emptiness = qn->emptiness;
int tlen = compile_length_tree(ND_QUANT_BODY(qn), reg);
int tlen = compile_length_tree(ND_QUANT_BODY(qn), reg, env);
if (tlen < 0) return tlen;
if (tlen == 0) return 0;
@ -1401,7 +1401,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env)
int i, r, mod_tlen;
int infinite = IS_INFINITE_REPEAT(qn->upper);
enum BodyEmptyType emptiness = qn->emptiness;
int tlen = compile_length_tree(ND_QUANT_BODY(qn), reg);
int tlen = compile_length_tree(ND_QUANT_BODY(qn), reg, env);
if (tlen < 0) return tlen;
if (tlen == 0) return 0;
@ -1566,11 +1566,11 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env)
}
static int
compile_length_option_node(BagNode* node, regex_t* reg)
compile_length_option_node(BagNode* node, regex_t* reg, ParseEnv* env)
{
int tlen;
tlen = compile_length_tree(ND_BAG_BODY(node), reg);
tlen = compile_length_tree(ND_BAG_BODY(node), reg, env);
return tlen;
}
@ -1586,16 +1586,16 @@ compile_option_node(BagNode* node, regex_t* reg, ParseEnv* env)
}
static int
compile_length_bag_node(BagNode* node, regex_t* reg)
compile_length_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
{
int len;
int tlen;
if (node->type == BAG_OPTION)
return compile_length_option_node(node, reg);
return compile_length_option_node(node, reg, env);
if (ND_BAG_BODY(node)) {
tlen = compile_length_tree(ND_BAG_BODY(node), reg);
tlen = compile_length_tree(ND_BAG_BODY(node), reg, env);
if (tlen < 0) return tlen;
}
else
@ -1644,7 +1644,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg)
QuantNode* qn;
qn = QUANT_(ND_BAG_BODY(node));
tlen = compile_length_tree(ND_QUANT_BODY(qn), reg);
tlen = compile_length_tree(ND_QUANT_BODY(qn), reg, env);
if (tlen < 0) return tlen;
v = onig_positive_int_multiply(qn->lower, tlen);
@ -1662,12 +1662,12 @@ compile_length_bag_node(BagNode* node, regex_t* reg)
Node* Then = node->te.Then;
Node* Else = node->te.Else;
len = compile_length_tree(cond, reg);
len = compile_length_tree(cond, reg, env);
if (len < 0) return len;
len += OPSIZE_PUSH + OPSIZE_MARK + OPSIZE_CUT_TO_MARK;
if (IS_NOT_NULL(Then)) {
tlen = compile_length_tree(Then, reg);
tlen = compile_length_tree(Then, reg, env);
if (tlen < 0) return tlen;
len += tlen;
}
@ -1675,7 +1675,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg)
len += OPSIZE_JUMP + OPSIZE_CUT_TO_MARK;
if (IS_NOT_NULL(Else)) {
tlen = compile_length_tree(Else, reg);
tlen = compile_length_tree(Else, reg, env);
if (tlen < 0) return tlen;
len += tlen;
}
@ -1712,7 +1712,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env)
COP(reg)->call.addr = (int )node->m.called_addr;
if (node->m.regnum == 0) {
len = compile_length_tree(ND_BAG_BODY(node), reg);
len = compile_length_tree(ND_BAG_BODY(node), reg, env);
len += OPSIZE_RETURN;
r = add_op(reg, OP_JUMP);
@ -1726,7 +1726,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env)
return r;
}
else {
len = compile_length_tree(ND_BAG_BODY(node), reg);
len = compile_length_tree(ND_BAG_BODY(node), reg, env);
len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN);
if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum))
len += (ND_IS_RECURSION(node)
@ -1795,7 +1795,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
r = compile_tree_n_times(ND_QUANT_BODY(qn), qn->lower, reg, env);
if (r != 0) return r;
len = compile_length_tree(ND_QUANT_BODY(qn), reg);
len = compile_length_tree(ND_QUANT_BODY(qn), reg, env);
if (len < 0) return len;
r = add_op(reg, OP_PUSH);
@ -1844,10 +1844,10 @@ compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
COP(reg)->mark.id = mid;
COP(reg)->mark.save_pos = 0;
cond_len = compile_length_tree(cond, reg);
cond_len = compile_length_tree(cond, reg, env);
if (cond_len < 0) return cond_len;
if (IS_NOT_NULL(Then)) {
then_len = compile_length_tree(Then, reg);
then_len = compile_length_tree(Then, reg, env);
if (then_len < 0) return then_len;
}
else
@ -1872,7 +1872,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
}
if (IS_NOT_NULL(Else)) {
else_len = compile_length_tree(Else, reg);
else_len = compile_length_tree(Else, reg, env);
if (else_len < 0) return else_len;
}
else
@ -1902,13 +1902,13 @@ compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
}
static int
compile_length_anchor_node(AnchorNode* node, regex_t* reg)
compile_length_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
{
int len;
int tlen = 0;
if (IS_NOT_NULL(ND_ANCHOR_BODY(node))) {
tlen = compile_length_tree(ND_ANCHOR_BODY(node), reg);
tlen = compile_length_tree(ND_ANCHOR_BODY(node), reg, env);
if (tlen < 0) return tlen;
}
@ -1926,11 +1926,14 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg)
len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_UPDATE_VAR + OPSIZE_FAIL + OPSIZE_JUMP + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_CUT_TO_MARK + OPSIZE_UPDATE_VAR;
if (IS_NOT_NULL(node->lead_node)) {
int llen = compile_length_tree(node->lead_node, reg);
int llen = compile_length_tree(node->lead_node, reg, env);
if (llen < 0) return llen;
len += OPSIZE_MOVE + llen;
}
if ((env->flags & PE_FLAG_HAS_ABSENT_STOPPER) != 0)
len += OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR;
}
break;
case ANCR_LOOK_BEHIND_NOT:
@ -1940,11 +1943,14 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg)
len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_POP;
if (IS_NOT_NULL(node->lead_node)) {
int llen = compile_length_tree(node->lead_node, reg);
int llen = compile_length_tree(node->lead_node, reg, env);
if (llen < 0) return llen;
len += OPSIZE_MOVE + llen;
}
if ((env->flags & PE_FLAG_HAS_ABSENT_STOPPER) != 0)
len += OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR;
}
break;
@ -1999,7 +2005,7 @@ compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
COP(reg)->cut_to_mark.restore_pos = FALSE;
}
else {
MemNumType mid1, mid2;
MemNumType mid1, mid2, mid3;
OnigLen diff;
if (IS_NOT_NULL(node->lead_node)) {
@ -2046,6 +2052,14 @@ compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
r = add_op(reg, OP_FAIL);
if (r != 0) return r;
if ((env->flags & PE_FLAG_HAS_ABSENT_STOPPER) != 0) {
ID_ENTRY(env, mid3);
r = add_op(reg, OP_SAVE_VAL);
if (r != 0) return r;
COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
COP(reg)->save_val.id = mid3;
}
r = add_op(reg, OP_STEP_BACK_START);
if (r != 0) return r;
@ -2064,6 +2078,14 @@ compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
r = compile_tree(ND_ANCHOR_BODY(node), reg, env);
if (r != 0) return r;
if ((env->flags & PE_FLAG_HAS_ABSENT_STOPPER) != 0) {
r = add_op(reg, OP_UPDATE_VAR);
if (r != 0) return r;
COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
COP(reg)->update_var.id = mid3;
COP(reg)->update_var.clear = FALSE;
}
r = add_op(reg, OP_CHECK_POSITION);
if (r != 0) return r;
COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
@ -2090,7 +2112,7 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
int r;
int len;
len = compile_length_tree(ND_ANCHOR_BODY(node), reg);
len = compile_length_tree(ND_ANCHOR_BODY(node), reg, env);
if (node->char_min_len == node->char_max_len) {
MemNumType mid;
@ -2122,7 +2144,7 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
r = add_op(reg, OP_POP);
}
else {
MemNumType mid1, mid2;
MemNumType mid1, mid2, mid3;
OnigLen diff;
ID_ENTRY(env, mid1);
@ -2143,13 +2165,16 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
r = add_op(reg, OP_PUSH);
if (r != 0) return r;
COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + len + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL;
if ((env->flags & PE_FLAG_HAS_ABSENT_STOPPER) != 0)
COP(reg)->push.addr += OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR;
if (IS_NOT_NULL(node->lead_node)) {
int clen;
MinMaxCharLen ci;
clen = compile_length_tree(node->lead_node, reg);
clen = compile_length_tree(node->lead_node, reg, env);
COP(reg)->push.addr += OPSIZE_MOVE + clen;
r = node_char_len(node->lead_node, reg, &ci, env);
@ -2162,6 +2187,14 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
if (r != 0) return r;
}
if ((env->flags & PE_FLAG_HAS_ABSENT_STOPPER) != 0) {
ID_ENTRY(env, mid3);
r = add_op(reg, OP_SAVE_VAL);
if (r != 0) return r;
COP(reg)->save_val.type = SAVE_RIGHT_RANGE;
COP(reg)->save_val.id = mid3;
}
r = add_op(reg, OP_STEP_BACK_START);
if (r != 0) return r;
@ -2180,6 +2213,14 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
r = compile_tree(ND_ANCHOR_BODY(node), reg, env);
if (r != 0) return r;
if ((env->flags & PE_FLAG_HAS_ABSENT_STOPPER) != 0) {
r = add_op(reg, OP_UPDATE_VAR);
if (r != 0) return r;
COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK;
COP(reg)->update_var.id = mid3;
COP(reg)->update_var.clear = FALSE;
}
r = add_op(reg, OP_CHECK_POSITION);
if (r != 0) return r;
COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE;
@ -2292,7 +2333,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
case ANCR_PREC_READ_NOT:
{
len = compile_length_tree(ND_ANCHOR_BODY(node), reg);
len = compile_length_tree(ND_ANCHOR_BODY(node), reg, env);
if (len < 0) return len;
ID_ENTRY(env, mid);
@ -2434,7 +2475,7 @@ compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
}
static int
compile_length_tree(Node* node, regex_t* reg)
compile_length_tree(Node* node, regex_t* reg, ParseEnv* env)
{
int len, r;
@ -2442,7 +2483,7 @@ compile_length_tree(Node* node, regex_t* reg)
case ND_LIST:
len = 0;
do {
r = compile_length_tree(ND_CAR(node), reg);
r = compile_length_tree(ND_CAR(node), reg, env);
if (r < 0) return r;
len += r;
} while (IS_NOT_NULL(node = ND_CDR(node)));
@ -2455,7 +2496,7 @@ compile_length_tree(Node* node, regex_t* reg)
n = r = 0;
do {
r += compile_length_tree(ND_CAR(node), reg);
r += compile_length_tree(ND_CAR(node), reg, env);
n++;
} while (IS_NOT_NULL(node = ND_CDR(node)));
r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1);
@ -2488,15 +2529,15 @@ compile_length_tree(Node* node, regex_t* reg)
#endif
case ND_QUANT:
r = compile_length_quantifier_node(QUANT_(node), reg);
r = compile_length_quantifier_node(QUANT_(node), reg, env);
break;
case ND_BAG:
r = compile_length_bag_node(BAG_(node), reg);
r = compile_length_bag_node(BAG_(node), reg, env);
break;
case ND_ANCHOR:
r = compile_length_anchor_node(ANCHOR_(node), reg);
r = compile_length_anchor_node(ANCHOR_(node), reg, env);
break;
case ND_GIMMICK:
@ -2528,7 +2569,7 @@ compile_tree(Node* node, regex_t* reg, ParseEnv* env)
Node* x = node;
len = 0;
do {
len += compile_length_tree(ND_CAR(x), reg);
len += compile_length_tree(ND_CAR(x), reg, env);
if (IS_NOT_NULL(ND_CDR(x))) {
len += OPSIZE_PUSH + OPSIZE_JUMP;
}
@ -2536,7 +2577,7 @@ compile_tree(Node* node, regex_t* reg, ParseEnv* env)
pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */
do {
len = compile_length_tree(ND_CAR(node), reg);
len = compile_length_tree(ND_CAR(node), reg, env);
if (IS_NOT_NULL(ND_CDR(node))) {
enum OpCode push = ND_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
r = add_op(reg, push);

View File

@ -486,6 +486,7 @@ onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
#define PFETCH_READY UChar* pfetch_prev
#define PEND (p < end ? 0 : 1)
#define PUNFETCH p = pfetch_prev
#define PPREV pfetch_prev
#define PINC do { \
pfetch_prev = p; \
p += ONIGENC_MBC_ENC_LEN(enc, p); \
@ -5103,6 +5104,63 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
return NULL_UCHARP;
}
static int
is_head_of_bre_subexp(UChar* p, UChar* end, OnigEncoding enc, ParseEnv* env)
{
UChar* start;
OnigCodePoint code;
start = env->pattern;
if (p > start) {
p = onigenc_get_prev_char_head(enc, start, p);
if (p > start) {
code = ONIGENC_MBC_TO_CODE(enc, p, end);
if (code == '(' ||
(code == '|' &&
IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_VBAR_ALT))) {
p = onigenc_get_prev_char_head(enc, start, p);
code = ONIGENC_MBC_TO_CODE(enc, p, end);
if (IS_MC_ESC_CODE(code, env->syntax)) {
int count = 0;
while (p > start) {
p = onigenc_get_prev_char_head(enc, start, p);
code = ONIGENC_MBC_TO_CODE(enc, p, end);
if (! IS_MC_ESC_CODE(code, env->syntax)) break;
count++;
}
return (count % 2 == 0);
}
}
}
return FALSE;
}
else {
return TRUE;
}
}
static int
is_end_of_bre_subexp(UChar* p, UChar* end, OnigEncoding enc, ParseEnv* env)
{
OnigCodePoint code;
if (p == end) return TRUE;
code = ONIGENC_MBC_TO_CODE(enc, p, end);
if (IS_MC_ESC_CODE(code, env->syntax)) {
p += ONIGENC_MBC_ENC_LEN(enc, p);
if (p < end) {
code = ONIGENC_MBC_TO_CODE(enc, p, end);
if (code == ')' ||
(code == '|' &&
IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_VBAR_ALT)))
return TRUE;
}
}
return FALSE;
}
static int
is_posix_bracket_start(UChar* from, UChar* to, OnigEncoding enc)
{
@ -6235,6 +6293,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env)
case '^':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
if (IS_SYNTAX_BV(syn, ONIG_SYN_BRE_ANCHOR_AT_EDGE_OF_SUBEXP)) {
if (! is_head_of_bre_subexp(PPREV, end, enc, env)) break;
}
tok->type = TK_ANCHOR;
tok->u.subtype = (OPTON_SINGLELINE(env->options)
? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE);
@ -6242,6 +6303,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env)
case '$':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
if (IS_SYNTAX_BV(syn, ONIG_SYN_BRE_ANCHOR_AT_EDGE_OF_SUBEXP)) {
if (! is_end_of_bre_subexp(p, end, enc, env)) break;
}
tok->type = TK_ANCHOR;
tok->u.subtype = (OPTON_SINGLELINE(env->options)
? ANCR_SEMI_END_BUF : ANCR_END_LINE);
@ -7810,6 +7874,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
PINC;
r = make_range_clear(np, env);
if (r != 0) return r;
env->flags |= PE_FLAG_HAS_ABSENT_STOPPER;
goto end;
}
}
@ -7831,7 +7896,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
if (ND_TYPE(top) != ND_ALT || IS_NULL(ND_CDR(top))) {
expr = NULL_NODE;
is_range_cutter = 1;
/* return ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN; */
env->flags |= PE_FLAG_HAS_ABSENT_STOPPER;
}
else {
absent = ND_CAR(top);

View File

@ -456,6 +456,7 @@ typedef struct {
#define PE_FLAG_HAS_CALL_ZERO (1<<0)
#define PE_FLAG_HAS_WHOLE_OPTIONS (1<<1)
#define PE_FLAG_HAS_ABSENT_STOPPER (1<<2)
extern int onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map));

View File

@ -49,7 +49,7 @@ OnigSyntaxType OnigSyntaxPosixBasic = {
( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP |
ONIG_SYN_OP_ESC_BRACE_INTERVAL )
, 0
, 0
, ( ONIG_SYN_BRE_ANCHOR_AT_EDGE_OF_SUBEXP )
, ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE )
,
{
@ -113,7 +113,8 @@ OnigSyntaxType OnigSyntaxGrep = {
ONIG_SYN_OP_ESC_W_WORD | ONIG_SYN_OP_ESC_B_WORD_BOUND |
ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | ONIG_SYN_OP_DECIMAL_BACKREF )
, 0
, ( ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC | ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC )
, ( ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC | ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC |
ONIG_SYN_BRE_ANCHOR_AT_EDGE_OF_SUBEXP )
, ONIG_OPTION_NONE
,
{