From f591ebfe7f0290ea5ce4f2cbcdecd646f5874a4c Mon Sep 17 00:00:00 2001 From: RaiKoHoff Date: Wed, 8 Jan 2020 12:42:49 +0100 Subject: [PATCH] + update Oniguruma to current (2020-01-08) develoment version (6.9.4 R3) --- oniguruma/.gitignore | 5 +- oniguruma/HISTORY | 12 +- oniguruma/README.md | 11 +- oniguruma/doc/SYNTAX.md | 9 +- oniguruma/src/make_unicode_fold_data.py | 146 +++ oniguruma/src/mktable.c | 2 +- oniguruma/src/oniguruma.h | 1 + oniguruma/src/regcomp.c | 1289 ++++++++++------------- oniguruma/src/regenc.c | 66 +- oniguruma/src/regenc.h | 2 + oniguruma/src/regexec.c | 295 ++---- oniguruma/src/regint.h | 83 +- oniguruma/src/regparse.c | 261 +++-- oniguruma/src/regparse.h | 60 +- oniguruma/src/regsyntax.c | 7 +- oniguruma/src/st.h | 5 + oniguruma/src/unicode.c | 253 +++-- 17 files changed, 1265 insertions(+), 1242 deletions(-) diff --git a/oniguruma/.gitignore b/oniguruma/.gitignore index d872d4bf1..668306c71 100644 --- a/oniguruma/.gitignore +++ b/oniguruma/.gitignore @@ -47,6 +47,7 @@ m4/*.m4 /test/testcu /test/testp /test/test_regset +/test/test_syntax /test/kofu-utf8.txt # sample/ @@ -67,8 +68,8 @@ m4/*.m4 /sample/log* /harnesses/utf16*.dict -/harnesses/*-libfuzzer -/harnesses/main-* +/harnesses/fuzzer-* +/harnesses/read-* /harnesses/libfuzzer-onig /harnesses/libfuzzer-onig-full /harnesses/slow-unit-* diff --git a/oniguruma/HISTORY b/oniguruma/HISTORY index fa15dd26c..f4d4f6744 100644 --- a/oniguruma/HISTORY +++ b/oniguruma/HISTORY @@ -1,13 +1,15 @@ History -2019/MM/DD: Version 6.9.4 +2019/11/29: Version 6.9.4 +2019/11/22: Release Candidate 3 for Version 6.9.4 + +2019/11/20: fix a problem found by libFuzzer test 2019/11/14: Release Candidate 2 for Version 6.9.4 - 2019/11/12: fix integer overflow by nested quantifier -2019/11/11: fix #164: Integer overflow related to reg->dmax in search_in_range() -2019/11/07: fix #163: heap-buffer-overflow in gb18030_mbc_enc_len() -2019/11/06: fix #162: heap-buffer-overflow in fetch_interval_quantifier() +2019/11/11: fix CVE-2019-19012: Integer overflow related to reg->dmax in search_in_range() +2019/11/07: fix CVE-2019-19203: heap-buffer-overflow in gb18030_mbc_enc_len() +2019/11/06: fix CVE-2019-19204: heap-buffer-overflow in fetch_interval_quantifier() 2019/11/06: add HAVE_INTTYPES_H into config.h.windows.in and config.h.win{32,64} 2019/11/06: add HAVE_STDINT_H into config.h.win{32,64} 2019/11/05: Release Candidate 1 for Version 6.9.4 diff --git a/oniguruma/README.md b/oniguruma/README.md index 519b1e093..b02a86b05 100644 --- a/oniguruma/README.md +++ b/oniguruma/README.md @@ -27,11 +27,20 @@ Supported character encodings: * doc/SYNTAX.md: contributed by seanofw +Master branch +------------- + +* Fixed behavior of isolated options in Perl and Java syntaxes. /...(?i).../ + + Version 6.9.4 ------------- * NEW API: RegSet (set of regexes) -* Fixed CVE-2019-19012 (Issue #164) +* Fixed CVE-2019-19012 +* Fixed CVE-2019-19203 (Does not affect UTF-8, UTF-16 and UTF-32 encodings) +* Fixed CVE-2019-19204 (Affects only PosixBasic, Emacs and Grep syntaxes) +* Fixed CVE-2019-19246 * Fixed some problems (found by libFuzzer test) diff --git a/oniguruma/doc/SYNTAX.md b/oniguruma/doc/SYNTAX.md index 69ecf3aad..5e861a78f 100644 --- a/oniguruma/doc/SYNTAX.md +++ b/oniguruma/doc/SYNTAX.md @@ -1,7 +1,7 @@ # Oniguruma syntax (operator) configuration -_Documented for Oniguruma 6.9.3 (2019/08/08)_ +_Documented for Oniguruma 6.9.5 (2019/12/16)_ ---------- @@ -910,6 +910,13 @@ If this flag is set, then intervals of a fixed size will ignore a lazy (non-gree little as possible" is meaningless for a fixed-size interval. If this flag is clear, then `r{n}?` will mean the same as `r{n}`, and the useless `?` will be discarded. +### 10. ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (`..(?i)..`) + +_Set in: Perl, Java_ + +If this flag is set, then an isolated option doesn't break the branch and affects until the end of the group (or end of the pattern). +If this flag is not set, then an isolated option is interpreted as the starting point of a new branch. /a(?i)b|c/ ==> /a(?i:b|c)/ + ### 20. ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (add `\n` to `[^...]`) _Set in: Grep_ diff --git a/oniguruma/src/make_unicode_fold_data.py b/oniguruma/src/make_unicode_fold_data.py index 55d5b88c5..e2a92c37d 100644 --- a/oniguruma/src/make_unicode_fold_data.py +++ b/oniguruma/src/make_unicode_fold_data.py @@ -55,6 +55,11 @@ def form3bytes(x): x2 = (x>>16) & 0xff return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0) +def enc_len(code, encode): + u = unichr(code) + s = u.encode(encode) + return len(s) + def check_version_info(s): m = VERSION_REG.match(s) if m is not None: @@ -324,6 +329,138 @@ def output_gperf_source(): with open(GPERF_FOLD_KEY_FILES[i-1], 'w') as f: output_gperf_fold_key(f, i) +def unfolds_byte_length_check(encode): + l = UNFOLDS.items() + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + for unfold, e in sl: + key_len = enc_len(unfold, encode) + fold_len = sum(map(lambda c: enc_len(c, encode), e.fold)) + if key_len > fold_len: + sfolds = ' '.join(map(lambda c: "0x%06x" % c, e.fold)) + s = "%s byte length: %d > %d: 0x%06x => %s" % (encode, key_len, fold_len, unfold, sfolds) + print >> sys.stderr, s + +def double_fold_check(): + l = UNFOLDS.items() + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + for unfold, e in sl: + for f in e.fold: + #print >> sys.stderr, ("check 0x%06x" % f) + e2 = UNFOLDS.get(f) + if e2 is not None: + s = "double folds: 0x%06x => %s, 0x%06x => %s" % (unfold, e.fold, f, e2.fold) + print >> sys.stderr, s + +def unfold_is_multi_code_folds_head_check(): + l = UNFOLDS.items() + l2 = filter(lambda (k,e):e.fold_len == 2, l) + l3 = filter(lambda (k,e):e.fold_len == 3, l) + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + for unfold, _ in sl: + for k, e in l2: + if e.fold[0] == unfold: + s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold) + print >> sys.stderr, s + for k, e in l3: + if e.fold[0] == unfold: + s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold) + print >> sys.stderr, s + +def make_one_folds(l): + h = {} + for unfold, e in l: + if e.fold_len != 1: + continue + fold = e.fold[0] + unfolds = h.get(fold) + if unfolds is None: + unfolds = [unfold] + h[fold] = unfolds + else: + unfolds.append(unfold) + + return h + +def make_foldn_heads(l, fold_len, one_folds): + h = {} + for unfold, e in l: + if e.fold_len != fold_len: + continue + unfolds = one_folds.get(e.fold[0]) + h[e.fold[0]] = (e, unfolds) + + return h + +def fold2_expansion_num(e, one_folds): + n = len(e.unfolds) + n0 = 1 + u0 = one_folds.get(e.fold[0]) + if u0 is not None: + n0 += len(u0) + n1 = 1 + u1 = one_folds.get(e.fold[1]) + if u1 is not None: + n1 += len(u1) + n += (n0 * n1) + return n + +def fold3_expansion_num(e, one_folds): + n = len(e.unfolds) + n0 = 1 + u0 = one_folds.get(e.fold[0]) + if u0 is not None: + n0 += len(u0) + n1 = 1 + u1 = one_folds.get(e.fold[1]) + if u1 is not None: + n1 += len(u1) + n2 = 1 + u2 = one_folds.get(e.fold[2]) + if u2 is not None: + n2 += len(u2) + n += (n0 * n1 * n2) + return n + +def get_all_folds_expansion_num(x, one_folds, fold2_heads, fold3_heads): + e = UNFOLDS[x] + n = 0 + if e.fold_len == 1: + n1 = len(e.unfolds) + 1 # +1: fold + fx = e.fold[0] + r = fold2_heads.get(fx) + n2 = n3 = 0 + if r is not None: + e2, _ = r + n2 = fold2_expansion_num(e2, one_folds) + r = fold3_heads.get(fx) + if r is not None: + e3, _ = r + n3 = fold3_expansion_num(e3, one_folds) + n = max(n1, n2, n3) + elif e.fold_len == 2: + n = fold2_expansion_num(e, one_folds) + elif e.fold_len == 3: + n = fold3_expansion_num(e, one_folds) + else: + raise RuntimeError("Invalid fold_len %d" % (e.fold_len)) + + return n + +def get_all_folds_expansion_max_num(): + l = UNFOLDS.items() + one_folds = make_one_folds(l) + fold2_heads = make_foldn_heads(l, 2, one_folds) + fold3_heads = make_foldn_heads(l, 3, one_folds) + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + nmax = 0 + max_unfold = None + for unfold, e in sl: + n = get_all_folds_expansion_num(unfold, one_folds, fold2_heads, fold3_heads) + if nmax < n: + nmax = n + max_unfold = unfold + + return (nmax, max_unfold) ## main ## with open(SOURCE_FILE, 'r') as f: @@ -335,3 +472,12 @@ out_comment = True output_fold_source(sys.stdout, out_comment) output_gperf_source() + +#unfolds_byte_length_check('utf-8') +#unfolds_byte_length_check('utf-16') +double_fold_check() +unfold_is_multi_code_folds_head_check() + +#max_num, max_code = get_all_folds_expansion_max_num() +#max_num -= 1 # remove self +#print >> sys.stderr, "max expansion: 0x%06x: %d" % (max_code, max_num) diff --git a/oniguruma/src/mktable.c b/oniguruma/src/mktable.c index 3363762cd..4ea0fb856 100644 --- a/oniguruma/src/mktable.c +++ b/oniguruma/src/mktable.c @@ -1103,7 +1103,7 @@ static int IsAscii(int enc ARG_UNUSED, int c) static int IsNewline(int enc ARG_UNUSED, int c) { - if (c == 0x0a) return 1; + if (c == NEWLINE_CODE) return 1; return 0; } diff --git a/oniguruma/src/oniguruma.h b/oniguruma/src/oniguruma.h index 38c30d5b8..d0d815bb8 100644 --- a/oniguruma/src/oniguruma.h +++ b/oniguruma/src/oniguruma.h @@ -531,6 +531,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1U<<7) /* see doc/RE */ #define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1U<<8) /* (?)(?) */ #define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ +#define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */ /* syntax (behavior) in char class [...] */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */ diff --git a/oniguruma/src/regcomp.c b/oniguruma/src/regcomp.c index b84fae209..9426ee57a 100644 --- a/oniguruma/src/regcomp.c +++ b/oniguruma/src/regcomp.c @@ -3,7 +3,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,6 +32,11 @@ #define OPS_INIT_SIZE 8 +typedef struct { + OnigLen min; + OnigLen max; +} MinMaxLen; + OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; #if 0 @@ -100,7 +105,7 @@ int_stack_pop(int_stack* s) #ifdef ONIG_DEBUG if (s->n <= 0) { - fprintf(stderr, "int_stack_pop: fail empty. %p\n", s); + fprintf(DBGFP, "int_stack_pop: fail empty. %p\n", s); return 0; } #endif @@ -229,13 +234,13 @@ ops_free(regex_t* reg) if (! is_in_string_pool(reg, op->exact_len_n.s)) xfree(op->exact_len_n.s); break; - case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC: + case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: if (! is_in_string_pool(reg, op->exact_n.s)) xfree(op->exact_n.s); break; case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4: case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2: - case OP_STR_MB2N3: case OP_STR_1_IC: + case OP_STR_MB2N3: break; case OP_CCLASS_NOT: case OP_CCLASS: @@ -303,9 +308,6 @@ ops_calc_size_of_string_pool(regex_t* reg) total += op->exact_len_n.len * op->exact_len_n.n; break; case OP_STR_N: - case OP_STR_N_IC: - total += op->exact_n.n; - break; case OP_STR_MB2N: total += op->exact_n.n * 2; break; @@ -358,7 +360,6 @@ ops_make_string_pool(regex_t* reg) curr += len; break; case OP_STR_N: - case OP_STR_N_IC: len = op->exact_n.n; copy: xmemcpy(curr, op->exact_n.s, len); @@ -490,42 +491,29 @@ node_str_node_cat(Node* node, Node* add) { int r; + if (NODE_STATUS(node) != NODE_STATUS(add)) + return ONIGERR_TYPE_BUG; + if (STR_(node)->flag != STR_(add)->flag) return ONIGERR_TYPE_BUG; r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); if (r != 0) return r; - if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) - STR_(node)->case_min_len += STR_(add)->case_min_len; - - return 0; -} - -static int -node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len) -{ - int r; - - if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) - return ONIGERR_TYPE_BUG; - - r = onig_node_str_cat(node, s, end); - if (r != 0) return r; - - STR_(node)->case_min_len += case_min_len; return 0; } static void -node_conv_to_str_node(Node* node, int flag) +node_conv_to_str_node(Node* node, Node* ref_node) { + xmemset(node, 0, sizeof(*node)); NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->flag = flag; + NODE_STATUS(node) = NODE_STATUS(ref_node); + + STR_(node)->flag = STR_(ref_node)->flag; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; STR_(node)->capacity = 0; - STR_(node)->case_min_len = 0; } static OnigLen @@ -555,7 +543,7 @@ bitset_is_empty(BitSetRef bs) { int i; - for (i = 0; i < (int )BITSET_SIZE; i++) { + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { if (bs[i] != 0) return 0; } return 1; @@ -603,6 +591,298 @@ unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node) } #endif /* USE_CALL */ +enum CharLenReturnType { + CHAR_LEN_NORMAL = 0, /* fixed or variable */ + CHAR_LEN_TOP_ALT_FIXED = 1 +}; + + +static int +mml_is_equal(MinMaxLen* a, MinMaxLen* b) +{ + return a->min == b->min && a->max == b->max; +} + +static int +mml_fixed(MinMaxLen* c) +{ + return (c->min == c->max && c->min != INFINITE_LEN); +} + +static void +mml_set(MinMaxLen* l, OnigLen len) +{ + l->min = len; + l->max = len; +} + +static void +mml_set_min_max(MinMaxLen* l, OnigLen min, OnigLen max) +{ + l->min = min; + l->max = max; +} + +static void +mml_clear(MinMaxLen* l) +{ + l->min = l->max = 0; +} + +static void +mml_copy(MinMaxLen* to, MinMaxLen* from) +{ + to->min = from->min; + to->max = from->max; +} + +static void +mml_add(MinMaxLen* to, MinMaxLen* add) +{ + to->min = distance_add(to->min, add->min); + to->max = distance_add(to->max, add->max); +} + +static void +mml_multiply(MinMaxLen* to, int m) +{ + to->min = distance_multiply(to->min, m); + to->max = distance_multiply(to->max, m); +} + +static void +mml_repeat_range_multiply(MinMaxLen* to, int mlow, int mhigh) +{ + to->min = distance_multiply(to->min, mlow); + if (IS_INFINITE_REPEAT(mhigh)) + to->max = INFINITE_LEN; + else + to->max = distance_multiply(to->max, mhigh); +} + +static void +mml_alt_merge(MinMaxLen* to, MinMaxLen* alt) +{ + if (to->min > alt->min) to->min = alt->min; + if (to->max < alt->max) to->max = alt->max; +} + +/* fixed size pattern node only */ +static int +node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, + int level) +{ + MinMaxLen tci; + int r = CHAR_LEN_NORMAL; + + level++; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + int first = TRUE; + do { + r = node_char_len1(NODE_CAR(node), reg, &tci, env, level); + if (r < 0) break; + if (first == TRUE) { + *ci = tci; + first = FALSE; + } + else + mml_add(ci, &tci); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + } + break; + + case NODE_ALT: + { + int fixed; + + r = node_char_len1(NODE_CAR(node), reg, ci, env, level); + if (r < 0) break; + + fixed = TRUE; + while (IS_NOT_NULL(node = NODE_CDR(node))) { + r = node_char_len1(NODE_CAR(node), reg, &tci, env, level); + if (r < 0) break; + if (! mml_fixed(&tci)) + fixed = FALSE; + mml_alt_merge(ci, &tci); + } + if (r < 0) break; + + r = CHAR_LEN_NORMAL; + if (mml_fixed(ci)) break; + + if (fixed == TRUE && level == 1) { + r = CHAR_LEN_TOP_ALT_FIXED; + } + } + break; + + case NODE_STRING: + { + OnigLen clen; + StrNode* sn = STR_(node); + UChar *s = sn->s; + + if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { + r = ONIGERR_PARSER_BUG; + break; + } + + clen = 0; + while (s < sn->end) { + s += enclen(reg->enc, s); + clen = distance_add(clen, 1); + } + mml_set(ci, clen); + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->lower == qn->upper) { + if (qn->upper == 0) { + mml_set(ci, 0); + } + else { + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + if (r < 0) break; + mml_multiply(ci, qn->lower); + } + } + else { + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + if (r < 0) break; + mml_repeat_range_multiply(ci, qn->lower, qn->upper); + } + } + break; + +#ifdef USE_CALL + case NODE_CALL: + if (NODE_IS_RECURSION(node)) + mml_set_min_max(ci, 0, INFINITE_LEN); + else + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + break; +#endif + + case NODE_CTYPE: + case NODE_CCLASS: + mml_set(ci, 1); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + switch (en->type) { + case BAG_MEMORY: + if (NODE_IS_FIXED_CLEN(node)) { + mml_set_min_max(ci, en->min_char_len, en->max_char_len); + } + else { + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + if (r < 0) break; + + en->min_char_len = ci->min; + en->max_char_len = ci->max; + NODE_STATUS_ADD(node, FIXED_CLEN); + } + break; + case BAG_OPTION: + case BAG_STOP_BACKTRACK: + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + break; + case BAG_IF_ELSE: + { + MinMaxLen eci; + + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + if (r < 0) break; + + if (IS_NOT_NULL(en->te.Then)) { + r = node_char_len1(en->te.Then, reg, &tci, env, level); + if (r < 0) break; + mml_add(ci, &tci); + } + + if (IS_NOT_NULL(en->te.Else)) { + r = node_char_len1(en->te.Else, reg, &eci, env, level); + if (r < 0) break; + } + else { + mml_set(&eci, 0); + } + + mml_alt_merge(ci, &eci); + } + break; + default: /* never come here */ + r = ONIGERR_PARSER_BUG; + break; + } + } + break; + + case NODE_ANCHOR: + case NODE_GIMMICK: + zero: + mml_set(ci, 0); + break; + + case NODE_BACKREF: + if (NODE_IS_CHECKER(node)) + goto zero; + + if (NODE_IS_RECURSION(node)) { +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + mml_set_min_max(ci, 0, INFINITE_LEN); + break; + } +#endif + + mml_set_min_max(ci, 0, 0); + break; + } + + { + int i; + int* backs; + MemEnv* mem_env = SCANENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + + backs = BACKREFS_P(br); + r = node_char_len1(mem_env[backs[0]].mem_node, reg, ci, env, level); + if (r < 0) break; + + for (i = 1; i < br->back_num; i++) { + r = node_char_len1(mem_env[backs[i]].mem_node, reg, &tci, env, level); + if (r < 0) break; + mml_alt_merge(ci, &tci); + } + } + break; + + default: /* never come here */ + r = ONIGERR_PARSER_BUG; + break; + } + + return r; +} + +static int +node_char_len(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env) +{ + return node_char_len1(node, reg, ci, env, 0); +} + static int add_op(regex_t* reg, int opcode) @@ -627,7 +907,7 @@ static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); #define IS_NEED_STR_LEN_OP(op) \ ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\ - (op) == OP_STR_MB3N || (op) == OP_STR_MBN || (op) == OP_STR_N_IC) + (op) == OP_STR_MB3N || (op) == OP_STR_MBN) static int select_str_opcode(int mb_len, int str_len) @@ -712,16 +992,16 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; if (emptiness != BODY_IS_NOT_EMPTY) { - if (emptiness == BODY_IS_EMPTY_POSSIBILITY) + if (emptiness == BODY_MAY_BE_EMPTY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) { + else if (emptiness == BODY_MAY_BE_EMPTY_MEM) { if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); else r = add_op(reg, OP_EMPTY_CHECK_END); } #ifdef USE_CALL - else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) + else if (emptiness == BODY_MAY_BE_EMPTY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); #endif @@ -795,12 +1075,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg) else if (IS_NEED_STR_LEN_OP(op)) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); - - if (op == OP_STR_N_IC) - COP(reg)->exact_n.n = byte_len; - else - COP(reg)->exact_n.n = str_len; - + COP(reg)->exact_n.n = str_len; COP(reg)->exact_n.s = p; } else { @@ -823,8 +1098,6 @@ compile_length_string_node(Node* node, regex_t* reg) if (sn->end <= sn->s) return 0; - if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1; - p = prev = sn->s; prev_len = enclen(enc, p); p += prev_len; @@ -861,40 +1134,6 @@ compile_length_string_crude_node(StrNode* sn, regex_t* reg) reg); } -static int -compile_ambig_string_node(Node* node, regex_t* reg) -{ - int r; - int len; - int byte_len; - UChar* p; - StrNode* sn; - OnigEncoding enc = reg->enc; - - sn = STR_(node); - len = enclen(enc, sn->s); - byte_len = (int )(sn->end - sn->s); - if (len == byte_len) { - r = add_op(reg, OP_STR_1_IC); - if (r != 0) return r; - - xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); - xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len); - } - else { - r = add_op(reg, OP_STR_N_IC); - if (r != 0) return r; - - p = onigenc_strdup(enc, sn->s, sn->end); - CHECK_NULL_RETURN_MEMERR(p); - - COP(reg)->exact_n.s = p; - COP(reg)->exact_n.n = byte_len; - } - - return 0; -} - static int compile_string_node(Node* node, regex_t* reg) { @@ -908,9 +1147,6 @@ compile_string_node(Node* node, regex_t* reg) return 0; end = sn->end; - if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) { - return compile_ambig_string_node(node, reg); - } p = prev = sn->s; prev_len = enclen(enc, p); @@ -1181,8 +1417,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env); if (r != 0) return r; if (IS_NOT_NULL(qn->next_head_exact)) { - r = add_op(reg, - IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ? + r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ? OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT); if (r != 0) return r; @@ -1190,8 +1425,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) return 0; } else { - r = add_op(reg, - IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ? + r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ? OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR); return r; } @@ -1338,11 +1572,8 @@ static int compile_length_option_node(BagNode* node, regex_t* reg) { int tlen; - OnigOptionType prev = reg->options; - reg->options = node->o.options; tlen = compile_length_tree(NODE_BAG_BODY(node), reg); - reg->options = prev; return tlen; } @@ -1351,11 +1582,8 @@ static int compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env) { int r; - OnigOptionType prev = reg->options; - reg->options = node->o.options; r = compile_tree(NODE_BAG_BODY(node), reg, env); - reg->options = prev; return r; } @@ -1467,8 +1695,6 @@ compile_length_bag_node(BagNode* node, regex_t* reg) return len; } -static int get_char_len_node(Node* node, regex_t* reg, int* len); - static int compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) { @@ -1482,7 +1708,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP; - NODE_STATUS_ADD(node, ADDR_FIXED); + NODE_STATUS_ADD(node, FIXED_ADDR); COP(reg)->call.addr = (int )node->m.called_addr; if (node->m.regnum == 0) { @@ -1745,7 +1971,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY; #ifdef USE_UNICODE_WORD_BREAK - if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_TEXT_SEGMENT_WORD)) + if (NODE_IS_TEXT_SEGMENT_WORD(node)) type = WORD_BOUNDARY; #endif @@ -1776,44 +2002,23 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) break; case ANCR_LOOK_BEHIND: - { - int n; - r = add_op(reg, OP_LOOK_BEHIND); - if (r != 0) return r; - if (node->char_len < 0) { - r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); - if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - } - else - n = node->char_len; - - COP(reg)->look_behind.len = n; - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); - } + r = add_op(reg, OP_LOOK_BEHIND); + if (r != 0) return r; + COP(reg)->look_behind.len = node->char_len; + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); break; case ANCR_LOOK_BEHIND_NOT: - { - int n; + len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); + r = add_op(reg, OP_LOOK_BEHIND_NOT_START); + if (r != 0) return r; - len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); - r = add_op(reg, OP_LOOK_BEHIND_NOT_START); - if (r != 0) return r; - COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; + COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; + COP(reg)->look_behind_not_start.len = node->char_len; - if (node->char_len < 0) { - r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); - if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - } - else - n = node->char_len; - - COP(reg)->look_behind_not_start.len = n; - - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); - if (r != 0) return r; - r = add_op(reg, OP_LOOK_BEHIND_NOT_END); - } + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + r = add_op(reg, OP_LOOK_BEHIND_NOT_END); break; default: @@ -1835,10 +2040,10 @@ compile_gimmick_node(GimmickNode* node, regex_t* reg) break; case GIMMICK_SAVE: - r = add_op(reg, OP_PUSH_SAVE_VAL); + r = add_op(reg, OP_SAVE_VAL); if (r != 0) return r; - COP(reg)->push_save_val.type = node->detail_type; - COP(reg)->push_save_val.id = node->id; + COP(reg)->save_val.type = node->detail_type; + COP(reg)->save_val.id = node->id; break; case GIMMICK_UPDATE_VAR: @@ -1889,7 +2094,7 @@ compile_length_gimmick_node(GimmickNode* node, regex_t* reg) break; case GIMMICK_SAVE: - len = OPSIZE_PUSH_SAVE_VAL; + len = OPSIZE_SAVE_VAL; break; case GIMMICK_UPDATE_VAR: @@ -2056,8 +2261,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) switch (CTYPE_(node)->ctype) { case CTYPE_ANYCHAR: - r = add_op(reg, IS_MULTILINE(CTYPE_OPTION(node, reg)) ? - OP_ANYCHAR_ML : OP_ANYCHAR); + r = add_op(reg, NODE_IS_MULTILINE(node) ? OP_ANYCHAR_ML : OP_ANYCHAR); break; case ONIGENC_CTYPE_WORD: @@ -2099,7 +2303,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) else { #ifdef USE_BACKREF_WITH_LEVEL if (NODE_IS_NEST_LEVEL(node)) { - if ((reg->options & ONIG_OPTION_IGNORECASE) != 0) + if (NODE_IS_IGNORECASE(node)) r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC); else r = add_op(reg, OP_BACKREF_WITH_LEVEL); @@ -2112,7 +2316,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) #endif if (br->back_num == 1) { n = br->back_static[0]; - if (IS_IGNORECASE(reg->options)) { + if (NODE_IS_IGNORECASE(node)) { r = add_op(reg, OP_BACKREF_N_IC); if (r != 0) return r; COP(reg)->backref_n.n1 = n; @@ -2133,7 +2337,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) int num; int* p; - r = add_op(reg, IS_IGNORECASE(reg->options) ? + r = add_op(reg, NODE_IS_IGNORECASE(node) ? OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI); if (r != 0) return r; @@ -2184,7 +2388,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) default: #ifdef ONIG_DEBUG - fprintf(stderr, "compile_tree: undefined node type %d\n", NODE_TYPE(node)); + fprintf(DBGFP, "compile_tree: undefined node type %d\n", NODE_TYPE(node)); #endif break; } @@ -2193,7 +2397,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) } static int -noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) +make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) { int r = 0; Node* node = *plink; @@ -2202,7 +2406,7 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) case NODE_LIST: case NODE_ALT: do { - r = noname_disable_map(&(NODE_CAR(node)), map, counter); + r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -2210,7 +2414,7 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) { Node** ptarget = &(NODE_BODY(node)); Node* old = *ptarget; - r = noname_disable_map(ptarget, map, counter); + r = make_named_capture_number_map(ptarget, map, counter); if (r != 0) return r; if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { r = onig_reduce_nested_quantifier(node); @@ -2226,35 +2430,35 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) (*counter)++; map[en->m.regnum].new_val = *counter; en->m.regnum = *counter; - r = noname_disable_map(&(NODE_BODY(node)), map, counter); + r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); } else { *plink = NODE_BODY(node); NODE_BODY(node) = NULL_NODE; onig_node_free(node); - r = noname_disable_map(plink, map, counter); + r = make_named_capture_number_map(plink, map, counter); } } else if (en->type == BAG_IF_ELSE) { - r = noname_disable_map(&(NODE_BAG_BODY(en)), map, counter); + r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = noname_disable_map(&(en->te.Then), map, counter); + r = make_named_capture_number_map(&(en->te.Then), map, counter); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = noname_disable_map(&(en->te.Else), map, counter); + r = make_named_capture_number_map(&(en->te.Else), map, counter); if (r != 0) return r; } } else - r = noname_disable_map(&(NODE_BODY(node)), map, counter); + r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); } break; case NODE_ANCHOR: if (IS_NOT_NULL(NODE_BODY(node))) - r = noname_disable_map(&(NODE_BODY(node)), map, counter); + r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); break; default: @@ -2265,7 +2469,7 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) } static int -renumber_node_backref(Node* node, GroupNumRemap* map) +renumber_backref_node(Node* node, GroupNumMap* map) { int i, pos, n, old_num; int *backs; @@ -2293,7 +2497,7 @@ renumber_node_backref(Node* node, GroupNumRemap* map) } static int -renumber_by_map(Node* node, GroupNumRemap* map) +renumber_backref_traverse(Node* node, GroupNumMap* map) { int r = 0; @@ -2301,28 +2505,28 @@ renumber_by_map(Node* node, GroupNumRemap* map) case NODE_LIST: case NODE_ALT: do { - r = renumber_by_map(NODE_CAR(node), map); + r = renumber_backref_traverse(NODE_CAR(node), map); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - r = renumber_by_map(NODE_BODY(node), map); + r = renumber_backref_traverse(NODE_BODY(node), map); break; case NODE_BAG: { BagNode* en = BAG_(node); - r = renumber_by_map(NODE_BODY(node), map); + r = renumber_backref_traverse(NODE_BODY(node), map); if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = renumber_by_map(en->te.Then, map); + r = renumber_backref_traverse(en->te.Then, map); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = renumber_by_map(en->te.Else, map); + r = renumber_backref_traverse(en->te.Else, map); if (r != 0) return r; } } @@ -2330,12 +2534,12 @@ renumber_by_map(Node* node, GroupNumRemap* map) break; case NODE_BACKREF: - r = renumber_node_backref(node, map); + r = renumber_backref_node(node, map); break; case NODE_ANCHOR: if (IS_NOT_NULL(NODE_BODY(node))) - r = renumber_by_map(NODE_BODY(node), map); + r = renumber_backref_traverse(NODE_BODY(node), map); break; default: @@ -2404,18 +2608,18 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) { int r, i, pos, counter; MemStatusType loc; - GroupNumRemap* map; + GroupNumMap* map; - map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1)); + map = (GroupNumMap* )xalloca(sizeof(GroupNumMap) * (env->num_mem + 1)); CHECK_NULL_RETURN_MEMERR(map); for (i = 1; i <= env->num_mem; i++) { map[i].new_val = 0; } counter = 0; - r = noname_disable_map(root, map, &counter); + r = make_named_capture_number_map(root, map, &counter); if (r != 0) return r; - r = renumber_by_map(*root, map); + r = renumber_backref_traverse(*root, map); if (r != 0) return r; for (i = 1, pos = 1; i <= env->num_mem; i++) { @@ -2449,7 +2653,7 @@ fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg) AbsAddrType* paddr; for (i = 0; i < uslist->num; i++) { - if (! NODE_IS_ADDR_FIXED(uslist->us[i].target)) + if (! NODE_IS_FIXED_ADDR(uslist->us[i].target)) return ONIGERR_PARSER_BUG; en = BAG_(uslist->us[i].target); @@ -2463,173 +2667,6 @@ fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg) } #endif - -#define GET_CHAR_LEN_VARLEN -1 -#define GET_CHAR_LEN_TOP_ALT_VARLEN -2 - -/* fixed size pattern node only */ -static int -get_char_len_node1(Node* node, regex_t* reg, int* len, int level) -{ - int tlen; - int r = 0; - - level++; - *len = 0; - switch (NODE_TYPE(node)) { - case NODE_LIST: - do { - r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level); - if (r == 0) - *len = distance_add(*len, tlen); - } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); - break; - - case NODE_ALT: - { - int tlen2; - int varlen = 0; - - r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level); - while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))) { - r = get_char_len_node1(NODE_CAR(node), reg, &tlen2, level); - if (r == 0) { - if (tlen != tlen2) - varlen = 1; - } - } - if (r == 0) { - if (varlen != 0) { - if (level == 1) - r = GET_CHAR_LEN_TOP_ALT_VARLEN; - else - r = GET_CHAR_LEN_VARLEN; - } - else - *len = tlen; - } - } - break; - - case NODE_STRING: - { - StrNode* sn = STR_(node); - UChar *s = sn->s; - - while (s < sn->end) { - s += enclen(reg->enc, s); - (*len)++; - } - } - break; - - case NODE_QUANT: - { - QuantNode* qn = QUANT_(node); - - if (qn->lower == qn->upper) { - if (qn->upper == 0) { - *len = 0; - } - else { - r = get_char_len_node1(NODE_BODY(node), reg, &tlen, level); - if (r == 0) - *len = distance_multiply(tlen, qn->lower); - } - } - else - r = GET_CHAR_LEN_VARLEN; - } - break; - -#ifdef USE_CALL - case NODE_CALL: - if (! NODE_IS_RECURSION(node)) - r = get_char_len_node1(NODE_BODY(node), reg, len, level); - else - r = GET_CHAR_LEN_VARLEN; - break; -#endif - - case NODE_CTYPE: - case NODE_CCLASS: - *len = 1; - break; - - case NODE_BAG: - { - BagNode* en = BAG_(node); - - switch (en->type) { - case BAG_MEMORY: -#ifdef USE_CALL - if (NODE_IS_CLEN_FIXED(node)) - *len = en->char_len; - else { - r = get_char_len_node1(NODE_BODY(node), reg, len, level); - if (r == 0) { - en->char_len = *len; - NODE_STATUS_ADD(node, CLEN_FIXED); - } - } - break; -#endif - case BAG_OPTION: - case BAG_STOP_BACKTRACK: - r = get_char_len_node1(NODE_BODY(node), reg, len, level); - break; - case BAG_IF_ELSE: - { - int clen, elen; - - r = get_char_len_node1(NODE_BODY(node), reg, &clen, level); - if (r == 0) { - if (IS_NOT_NULL(en->te.Then)) { - r = get_char_len_node1(en->te.Then, reg, &tlen, level); - if (r != 0) break; - } - else tlen = 0; - if (IS_NOT_NULL(en->te.Else)) { - r = get_char_len_node1(en->te.Else, reg, &elen, level); - if (r != 0) break; - } - else elen = 0; - - if (clen + tlen != elen) { - r = GET_CHAR_LEN_VARLEN; - } - else { - *len = elen; - } - } - } - break; - } - } - break; - - case NODE_ANCHOR: - case NODE_GIMMICK: - break; - - case NODE_BACKREF: - if (NODE_IS_CHECKER(node)) - break; - /* fall */ - default: - r = GET_CHAR_LEN_VARLEN; - break; - } - - return r; -} - -static int -get_char_len_node(Node* node, regex_t* reg, int* len) -{ - return get_char_len_node1(node, reg, len, 0); -} - /* x is not included y ==> 1 : 0 */ static int is_exclusive(Node* x, Node* y, regex_t* reg) @@ -2805,14 +2842,9 @@ is_exclusive(Node* x, Node* y, regex_t* reg) len = NODE_STRING_LEN(x); if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y); - if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) { - /* tiny version */ - return 0; - } - else { - for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { - if (*p != *q) return 1; - } + + for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { + if (*p != *q) return 1; } } break; @@ -2865,7 +2897,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; if (exact == 0 || - ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) { + ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) { n = node; } } @@ -2977,7 +3009,7 @@ check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask) } static OnigLen -tree_min_len(Node* node, ScanEnv* env) +node_min_byte_len(Node* node, ScanEnv* env) { OnigLen len; OnigLen tmin; @@ -2993,9 +3025,9 @@ tree_min_len(Node* node, ScanEnv* env) if (NODE_IS_RECURSION(node)) break; backs = BACKREFS_P(br); - len = tree_min_len(mem_env[backs[0]].mem_node, env); + len = node_min_byte_len(mem_env[backs[0]].mem_node, env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].mem_node, env); + tmin = node_min_byte_len(mem_env[backs[i]].mem_node, env); if (len > tmin) len = tmin; } } @@ -3006,18 +3038,18 @@ tree_min_len(Node* node, ScanEnv* env) { Node* t = NODE_BODY(node); if (NODE_IS_RECURSION(node)) { - if (NODE_IS_MIN_FIXED(t)) + if (NODE_IS_FIXED_MIN(t)) len = BAG_(t)->min_len; } else - len = tree_min_len(t, env); + len = node_min_byte_len(t, env); } break; #endif case NODE_LIST: do { - tmin = tree_min_len(NODE_CAR(node), env); + tmin = node_min_byte_len(NODE_CAR(node), env); len = distance_add(len, tmin); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -3028,7 +3060,7 @@ tree_min_len(Node* node, ScanEnv* env) y = node; do { x = NODE_CAR(y); - tmin = tree_min_len(x, env); + tmin = node_min_byte_len(x, env); if (y == node) len = tmin; else if (len > tmin) len = tmin; } while (IS_NOT_NULL(y = NODE_CDR(y))); @@ -3052,7 +3084,7 @@ tree_min_len(Node* node, ScanEnv* env) QuantNode* qn = QUANT_(node); if (qn->lower > 0) { - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); len = distance_multiply(len, qn->lower); } } @@ -3063,35 +3095,35 @@ tree_min_len(Node* node, ScanEnv* env) BagNode* en = BAG_(node); switch (en->type) { case BAG_MEMORY: - if (NODE_IS_MIN_FIXED(node)) + if (NODE_IS_FIXED_MIN(node)) len = en->min_len; else { if (NODE_IS_MARK1(node)) len = 0; /* recursive */ else { NODE_STATUS_ADD(node, MARK1); - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); NODE_STATUS_REMOVE(node, MARK1); en->min_len = len; - NODE_STATUS_ADD(node, MIN_FIXED); + NODE_STATUS_ADD(node, FIXED_MIN); } } break; case BAG_OPTION: case BAG_STOP_BACKTRACK: - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); break; case BAG_IF_ELSE: { OnigLen elen; - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); if (IS_NOT_NULL(en->te.Then)) - len += tree_min_len(en->te.Then, env); + len += node_min_byte_len(en->te.Then, env); if (IS_NOT_NULL(en->te.Else)) - elen = tree_min_len(en->te.Else, env); + elen = node_min_byte_len(en->te.Else, env); else elen = 0; if (elen < len) len = elen; @@ -3119,7 +3151,7 @@ tree_min_len(Node* node, ScanEnv* env) } static OnigLen -tree_max_len(Node* node, ScanEnv* env) +node_max_byte_len(Node* node, ScanEnv* env) { OnigLen len; OnigLen tmax; @@ -3128,14 +3160,14 @@ tree_max_len(Node* node, ScanEnv* env) switch (NODE_TYPE(node)) { case NODE_LIST: do { - tmax = tree_max_len(NODE_CAR(node), env); + tmax = node_max_byte_len(NODE_CAR(node), env); len = distance_add(len, tmax); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_ALT: do { - tmax = tree_max_len(NODE_CAR(node), env); + tmax = node_max_byte_len(NODE_CAR(node), env); if (len < tmax) len = tmax; } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -3159,12 +3191,16 @@ tree_max_len(Node* node, ScanEnv* env) MemEnv* mem_env = SCANENV_MEMENV(env); BackRefNode* br = BACKREF_(node); if (NODE_IS_RECURSION(node)) { - len = INFINITE_LEN; +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + len = INFINITE_LEN; + } +#endif break; } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - tmax = tree_max_len(mem_env[backs[i]].mem_node, env); + tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); if (len < tmax) len = tmax; } } @@ -3173,7 +3209,7 @@ tree_max_len(Node* node, ScanEnv* env) #ifdef USE_CALL case NODE_CALL: if (! NODE_IS_RECURSION(node)) - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); else len = INFINITE_LEN; break; @@ -3184,7 +3220,7 @@ tree_max_len(Node* node, ScanEnv* env) QuantNode* qn = QUANT_(node); if (qn->upper != 0) { - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); if (len != 0) { if (! IS_INFINITE_REPEAT(qn->upper)) len = distance_multiply(len, qn->upper); @@ -3200,37 +3236,37 @@ tree_max_len(Node* node, ScanEnv* env) BagNode* en = BAG_(node); switch (en->type) { case BAG_MEMORY: - if (NODE_IS_MAX_FIXED(node)) + if (NODE_IS_FIXED_MAX(node)) len = en->max_len; else { if (NODE_IS_MARK1(node)) len = INFINITE_LEN; else { NODE_STATUS_ADD(node, MARK1); - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); NODE_STATUS_REMOVE(node, MARK1); en->max_len = len; - NODE_STATUS_ADD(node, MAX_FIXED); + NODE_STATUS_ADD(node, FIXED_MAX); } } break; case BAG_OPTION: case BAG_STOP_BACKTRACK: - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); break; case BAG_IF_ELSE: { OnigLen tlen, elen; - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); if (IS_NOT_NULL(en->te.Then)) { - tlen = tree_max_len(en->te.Then, env); + tlen = node_max_byte_len(en->te.Then, env); len = distance_add(len, tlen); } if (IS_NOT_NULL(en->te.Else)) - elen = tree_max_len(en->te.Else, env); + elen = node_max_byte_len(en->te.Else, env); else elen = 0; if (elen > len) len = elen; @@ -3538,7 +3574,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret; r |= ret; if (head != 0) { - min = tree_min_len(NODE_CAR(x), env); + min = node_min_byte_len(NODE_CAR(x), env); if (min != 0) head = 0; } } while (IS_NOT_NULL(x = NODE_CDR(x))); @@ -3603,7 +3639,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if (IS_NOT_NULL(en->te.Then)) { OnigLen min; if (head != 0) { - min = tree_min_len(NODE_BODY(node), env); + min = node_min_byte_len(NODE_BODY(node), env); } else min = 0; @@ -3889,7 +3925,9 @@ reduce_string_list(Node* node) next_node = NODE_CDR(node); curr = NODE_CAR(node); if (NODE_TYPE(curr) == NODE_STRING) { - if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) { + if (IS_NULL(prev) + || STR_(curr)->flag != STR_(prev)->flag + || NODE_STATUS(curr) != NODE_STATUS(prev)) { prev = curr; prev_node = node; } @@ -3980,7 +4018,7 @@ divide_look_behind_alternatives(Node* node) np = node; while (IS_NOT_NULL(np = NODE_CDR(np))) { - insert_node = onig_node_new_anchor(anc_type, an->ascii_mode); + insert_node = onig_node_copy(head); CHECK_NULL_RETURN_MEMERR(insert_node); NODE_BODY(insert_node) = NODE_CAR(np); NODE_CAR(np) = insert_node; @@ -3995,22 +4033,46 @@ divide_look_behind_alternatives(Node* node) return 0; } +static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); + static int -tune_look_behind(Node* node, regex_t* reg, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) { - int r, len; + int r; + int state1; + MinMaxLen ci; AnchorNode* an = ANCHOR_(node); - r = get_char_len_node(NODE_ANCHOR_BODY(an), reg, &len); - if (r == 0) - an->char_len = len; - else if (r == GET_CHAR_LEN_VARLEN) - r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) { - if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) - r = divide_look_behind_alternatives(node); - else - r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + if (an->type == ANCR_LOOK_BEHIND_NOT) + state1 = state | IN_NOT | IN_LOOK_BEHIND; + else + state1 = state | IN_LOOK_BEHIND; + + /* Execute tune_tree(body) before call node_char_len(). + Because case-fold expansion must be done before node_char_len(). + */ + r = tune_tree(NODE_ANCHOR_BODY(an), reg, state1, env); + if (r != 0) return r; + + r = node_char_len(NODE_ANCHOR_BODY(an), reg, &ci, env); + if (r >= 0) { + if (r == CHAR_LEN_TOP_ALT_FIXED) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) { + r = divide_look_behind_alternatives(node); + if (r == 0) + r = tune_tree(node, reg, state, env); + } + else + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else { /* CHAR_LEN_NORMAL */ + if (mml_fixed(&ci)) { + an->char_len = ci.min; + } + else { + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + } } return r; @@ -4096,45 +4158,6 @@ get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* r return 0; } -static int -conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag, - UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len) -{ - UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - UChar *sbuf, *ebuf, *sp; - int i, n, len, sbuf_size; - - *rs = NULL; - sbuf_size = (int )(end - s) * 2; - sbuf = (UChar* )xmalloc(sbuf_size); - CHECK_NULL_RETURN_MEMERR(sbuf); - ebuf = sbuf + sbuf_size; - - n = 0; - sp = sbuf; - p = s; - while (p < end) { - len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf); - for (i = 0; i < len; i++) { - if (sp >= ebuf) { - sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); - CHECK_NULL_RETURN_MEMERR(sbuf); - sp = sbuf + sbuf_size; - sbuf_size *= 2; - ebuf = sbuf + sbuf_size; - } - - *sp++ = buf[i]; - } - n++; - } - - *rs = sbuf; - *rend = sp; - *rcase_min_len = n; - return 0; -} - static int make_code_list_to_string(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]) @@ -4187,7 +4210,7 @@ unravel_cf_node_add(Node** rlist, Node* add) static int unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, - unsigned int flag, int case_min_len) + unsigned int flag) { int r; Node *sn, *list; @@ -4196,17 +4219,13 @@ unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, sn = *rsn; if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) { - if (NODE_STRING_IS_CASE_FOLD_MATCH(sn)) - r = node_str_cat_case_fold(sn, s, end, case_min_len); - else - r = onig_node_str_cat(sn, s, end); + r = onig_node_str_cat(sn, s, end); } else { sn = onig_node_new_str(s, end); CHECK_NULL_RETURN_MEMERR(sn); STR_(sn)->flag = flag; - STR_(sn)->case_min_len = case_min_len; r = unravel_cf_node_add(&list, sn); } @@ -4217,25 +4236,6 @@ unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, return r; } -static int -unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc, - OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) -{ - int r; - int case_min_len; - UChar *rs, *rend; - - r = conv_string_case_fold(enc, case_fold_flag, s, end, - &rs, &rend, &case_min_len); - if (r != 0) return r; - - r = unravel_cf_string_add(rlist, rsn, rs, rend, - NODE_STRING_CASE_FOLD_MATCH, case_min_len); - xfree(rs); - - return r; -} - static int unravel_cf_string_alt_or_cc_add(Node** rlist, int n, OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc, @@ -4310,7 +4310,7 @@ unravel_cf_look_behind_add(Node** rlist, Node** rsn, } if (found == 0) { - r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0); + r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */); } else { Node* node; @@ -4350,8 +4350,8 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0; + NODE_STATUS_REMOVE(node, IGNORECASE); snode = STR_(node); - start = snode->s; end = snode->end; if (start >= end) return 0; @@ -4372,29 +4372,33 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) one_len = enclen(enc, p); if (n == 0) { q = p + one_len; - r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0); + r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */); if (r != 0) goto err; } else { if (in_look_behind != 0) { q = p + one_len; + if (items[0].byte_len != one_len) { + r = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, q, + items); + if (r < 0) goto err; + n = r; + } r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len); if (r != 0) goto err; } else { get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len); + if (min_len != max_len) { + r = ONIGERR_PARSER_BUG; + goto err; + } + q = p + max_len; - if (one_len == max_len && min_len == max_len) { - r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, - reg->case_fold_flag, p, q); - if (r != 0) goto err; - sn = NULL_NODE; - } - else { - r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag, - p, q); - if (r != 0) goto err; - } + r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, + reg->case_fold_flag, p, q); + if (r != 0) goto err; + sn = NULL_NODE; } } @@ -4429,7 +4433,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { - int r = BODY_IS_EMPTY_POSSIBILITY; + int r = BODY_MAY_BE_EMPTY; switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4446,7 +4450,7 @@ quantifiers_memory_node_info(Node* node) #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */ + return BODY_MAY_BE_EMPTY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4468,9 +4472,9 @@ quantifiers_memory_node_info(Node* node) switch (en->type) { case BAG_MEMORY: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_POSSIBILITY_REC; + return BODY_MAY_BE_EMPTY_REC; } - return BODY_IS_EMPTY_POSSIBILITY_MEM; + return BODY_MAY_BE_EMPTY_MEM; break; case BAG_OPTION: @@ -4525,7 +4529,7 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state) if (env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && - ! ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_CAPTURE_GROUP)) { + ! OPTON_CAPTURE_GROUP(env->options)) { return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; } @@ -4936,8 +4940,6 @@ tune_called_state(Node* node, int state) #endif /* USE_CALL */ -static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); - #ifdef __GNUC__ __inline #endif @@ -4948,7 +4950,7 @@ tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) #define ALLOWED_TYPE_IN_LB \ ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \ | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \ - | NODE_BIT_CALL | NODE_BIT_GIMMICK) + | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK) #define ALLOWED_BAG_IN_LB ( 1< 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); - if (r != 0) return r; - r = tune_look_behind(node, reg, env); + r = tune_look_behind(node, reg, state, env); } break; @@ -4994,10 +4994,7 @@ tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), - env); - if (r != 0) return r; - r = tune_look_behind(node, reg, env); + r = tune_look_behind(node, reg, state, env); } break; @@ -5016,7 +5013,6 @@ static int tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) { int r; - OnigLen d; QuantNode* qn = QUANT_(node); Node* body = NODE_BODY(node); @@ -5028,12 +5024,12 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { - d = tree_min_len(body, env); + OnigLen d = node_min_byte_len(body, env); if (d == 0) { #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT qn->emptiness = quantifiers_memory_node_info(body); #else - qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; + qn->emptiness = BODY_MAY_BE_EMPTY; #endif } } @@ -5055,7 +5051,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { int i, n = qn->lower; - node_conv_to_str_node(node, STR_(body)->flag); + node_conv_to_str_node(node, body); for (i = 0; i < n; i++) { r = node_str_node_cat(node, body); if (r != 0) return r; @@ -5116,7 +5112,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case NODE_STRING: - if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { r = unravel_case_fold_string(node, reg, state); } break; @@ -5137,8 +5133,9 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) } #endif #else - /* More precisely, it checks whether alt/repeat exists before - the subject capture node.*/ + /* More precisely, it should be checked whether alt/repeat exists before + the subject capture node, and then this backreference position + exists before (or in) the capture node. */ MEM_STATUS_ON(env->backtrack_mem, p[i]); #endif } @@ -5299,14 +5296,8 @@ set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand, #endif typedef struct { - OnigLen min; /* min byte length */ - OnigLen max; /* max byte length */ -} MinMax; - -typedef struct { - MinMax mm; + MinMaxLen mm; OnigEncoding enc; - OnigOptionType options; OnigCaseFoldType case_fold_flag; ScanEnv* scan_env; } OptEnv; @@ -5317,23 +5308,22 @@ typedef struct { } OptAnc; typedef struct { - MinMax mm; /* position */ + MinMaxLen mm; /* position */ OptAnc anc; int reach_end; - int case_fold; int len; UChar s[OPT_EXACT_MAXLEN]; } OptStr; typedef struct { - MinMax mm; /* position */ + MinMaxLen mm; /* position */ OptAnc anc; int value; /* weighted value */ UChar map[CHAR_MAP_SIZE]; } OptMap; typedef struct { - MinMax len; + MinMaxLen len; OptAnc anc; OptStr sb; /* boundary */ OptStr sm; /* middle */ @@ -5367,7 +5357,7 @@ map_position_value(OnigEncoding enc, int i) } static int -distance_value(MinMax* mm) +distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ static const short int dist_vals[] = { @@ -5396,7 +5386,7 @@ distance_value(MinMax* mm) } static int -comp_distance_value(MinMax* d1, MinMax* d2, int v1, int v2) +comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2) { if (v2 <= 0) return -1; if (v1 <= 0) return 1; @@ -5412,46 +5402,6 @@ comp_distance_value(MinMax* d1, MinMax* d2, int v1, int v2) return 0; } -static int -is_equal_mml(MinMax* a, MinMax* b) -{ - return a->min == b->min && a->max == b->max; -} - -static void -set_mml(MinMax* l, OnigLen min, OnigLen max) -{ - l->min = min; - l->max = max; -} - -static void -clear_mml(MinMax* l) -{ - l->min = l->max = 0; -} - -static void -copy_mml(MinMax* to, MinMax* from) -{ - to->min = from->min; - to->max = from->max; -} - -static void -add_mml(MinMax* to, MinMax* from) -{ - to->min = distance_add(to->min, from->min); - to->max = distance_add(to->max, from->max); -} - -static void -alt_merge_mml(MinMax* to, MinMax* from) -{ - if (to->min > from->min) to->min = from->min; - if (to->max < from->max) to->max = from->max; -} - static void copy_opt_env(OptEnv* to, OptEnv* from) { @@ -5543,12 +5493,11 @@ is_full_opt_exact(OptStr* e) static void clear_opt_exact(OptStr* e) { - clear_mml(&e->mm); + mml_clear(&e->mm); clear_opt_anc_info(&e->anc); - e->reach_end = 0; - e->case_fold = 0; - e->len = 0; - e->s[0] = '\0'; + e->reach_end = 0; + e->len = 0; + e->s[0] = '\0'; } static void @@ -5564,14 +5513,6 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) UChar *p, *end; OptAnc tanc; - if (add->case_fold != 0) { - if (! to->case_fold) { - if (to->len > 1 || to->len >= add->len) return 0; /* avoid */ - - to->case_fold = 1; - } - } - r = 0; p = add->s; end = p + add->len; @@ -5624,7 +5565,7 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) return ; } - if (! is_equal_mml(&to->mm, &add->mm)) { + if (! mml_is_equal(&to->mm, &add->mm)) { clear_opt_exact(to); return ; } @@ -5644,8 +5585,6 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) to->reach_end = 0; } to->len = i; - if (add->case_fold != 0) - to->case_fold = 1; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! to->reach_end) to->anc.right = 0; @@ -5675,8 +5614,8 @@ select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt) if (alt->len > 1) va += 5; } - if (now->case_fold == 0) vn *= 2; - if (alt->case_fold == 0) va *= 2; + vn *= 2; + va *= 2; if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_exact(now, alt); @@ -5725,28 +5664,6 @@ add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc) } } -static int -add_char_amb_opt_map(OptMap* map, UChar* p, UChar* end, - OnigEncoding enc, OnigCaseFoldType fold_flag) -{ - OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - int i, n; - - add_char_opt_map(map, p[0], enc); - - fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(fold_flag); - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, fold_flag, p, end, items); - if (n < 0) return n; - - for (i = 0; i < n; i++) { - ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf); - add_char_opt_map(map, buf[0], enc); - } - - return 0; -} - static void select_opt_map(OptMap* now, OptMap* alt) { @@ -5775,12 +5692,7 @@ comp_opt_exact_or_map(OptStr* e, OptMap* m) if (m->value <= 0) return -1; - if (e->case_fold != 0) { - case_value = 1; - } - else - case_value = 3; - + case_value = 3; ae = COMP_EM_BASE * e->len * case_value; am = COMP_EM_BASE * 5 * 2 / m->value; return comp_distance_value(&e->mm, &m->mm, ae, am); @@ -5791,14 +5703,14 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) { int i, val; - /* if (! is_equal_mml(&to->mm, &add->mm)) return ; */ + /* if (! mml_is_equal(&to->mm, &add->mm)) return ; */ if (to->value == 0) return ; if (add->value == 0 || to->mm.max < add->mm.min) { clear_opt_map(to); return ; } - alt_merge_mml(&to->mm, &add->mm); + mml_alt_merge(&to->mm, &add->mm); val = 0; for (i = 0; i < CHAR_MAP_SIZE; i++) { @@ -5814,17 +5726,17 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) } static void -set_bound_node_opt_info(OptNode* opt, MinMax* plen) +set_bound_node_opt_info(OptNode* opt, MinMaxLen* plen) { - copy_mml(&(opt->sb.mm), plen); - copy_mml(&(opt->spr.mm), plen); - copy_mml(&(opt->map.mm), plen); + mml_copy(&(opt->sb.mm), plen); + mml_copy(&(opt->spr.mm), plen); + mml_copy(&(opt->map.mm), plen); } static void clear_node_opt_info(OptNode* opt) { - clear_mml(&opt->len); + mml_clear(&opt->len); clear_opt_anc_info(&opt->anc); clear_opt_exact(&opt->sb); clear_opt_exact(&opt->sm); @@ -5889,7 +5801,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) } select_opt_map(&to->map, &add->map); - add_mml(&to->len, &add->len); + mml_add(&to->len, &add->len); } static void @@ -5901,7 +5813,7 @@ alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env) alt_merge_opt_exact(&to->spr, &add->spr, env); alt_merge_opt_map(env->enc, &to->map, &add->map); - alt_merge_mml(&to->len, &add->len); + mml_alt_merge(&to->len, &add->len); } @@ -5930,7 +5842,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) do { r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mm, &xo.len); + mml_add(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); } } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); @@ -5956,29 +5868,11 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) StrNode* sn = STR_(node); int slen = (int )(sn->end - sn->s); - if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) { - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - if (slen > 0) { - add_char_opt_map(&opt->map, *(sn->s), enc); - } - set_mml(&opt->len, slen, slen); - } - else { - int max, min; - - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - opt->sb.case_fold = 1; - - if (slen > 0) { - r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, - enc, env->case_fold_flag); - if (r != 0) break; - } - - max = slen; - min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc); - set_mml(&opt->len, min, max); + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + if (slen > 0) { + add_char_opt_map(&opt->map, *(sn->s), enc); } + mml_set_min_max(&opt->len, slen, slen); } break; @@ -5993,7 +5887,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) OnigLen min = ONIGENC_MBC_MINLEN(enc); OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc); - set_mml(&opt->len, min, max); + mml_set_min_max(&opt->len, min, max); } else { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { @@ -6002,7 +5896,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) add_char_opt_map(&opt->map, (UChar )i, enc); } } - set_mml(&opt->len, 1, 1); + mml_set_min_max(&opt->len, 1, 1); } } break; @@ -6046,7 +5940,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) else { min = ONIGENC_MBC_MINLEN(enc); } - set_mml(&opt->len, min, max); + mml_set_min_max(&opt->len, min, max); } break; @@ -6087,37 +5981,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) case NODE_BACKREF: if (! NODE_IS_CHECKER(node)) { - int* backs; - OnigLen min, max, tmin, tmax; - MemEnv* mem_env = SCANENV_MEMENV(env->scan_env); - BackRefNode* br = BACKREF_(node); + OnigLen min, max; - if (NODE_IS_RECURSION(node)) { - set_mml(&opt->len, 0, INFINITE_LEN); - break; - } - backs = BACKREFS_P(br); - min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env); - max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env); - for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env); - tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env); - if (min > tmin) min = tmin; - if (max < tmax) max = tmax; - } - set_mml(&opt->len, min, max); + min = node_min_byte_len(node, env->scan_env); + max = node_max_byte_len(node, env->scan_env); + mml_set_min_max(&opt->len, min, max); } break; #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) - set_mml(&opt->len, 0, INFINITE_LEN); + mml_set_min_max(&opt->len, 0, INFINITE_LEN); else { - OnigOptionType save = env->options; - env->options = BAG_(NODE_BODY(node))->o.options; r = optimize_nodes(NODE_BODY(node), opt, env); - env->options = save; } break; #endif @@ -6153,7 +6030,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) if (IS_INFINITE_REPEAT(qn->upper)) { if (env->mm.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { - if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) + if (NODE_IS_MULTILINE(NODE_QUANT_BODY(qn))) add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML); else add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF); @@ -6166,7 +6043,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) } min = distance_multiply(xo.len.min, qn->lower); - set_mml(&opt->len, min, max); + mml_set_min_max(&opt->len, min, max); } break; @@ -6175,14 +6052,9 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) BagNode* en = BAG_(node); switch (en->type) { + case BAG_STOP_BACKTRACK: case BAG_OPTION: - { - OnigOptionType save = env->options; - - env->options = en->o.options; - r = optimize_nodes(NODE_BODY(node), opt, env); - env->options = save; - } + r = optimize_nodes(NODE_BODY(node), opt, env); break; case BAG_MEMORY: @@ -6193,9 +6065,9 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) min = 0; max = INFINITE_LEN; - if (NODE_IS_MIN_FIXED(node)) min = en->min_len; - if (NODE_IS_MAX_FIXED(node)) max = en->max_len; - set_mml(&opt->len, min, max); + if (NODE_IS_FIXED_MIN(node)) min = en->min_len; + if (NODE_IS_FIXED_MAX(node)) max = en->max_len; + mml_set_min_max(&opt->len, min, max); } else #endif @@ -6208,10 +6080,6 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) } break; - case BAG_STOP_BACKTRACK: - r = optimize_nodes(NODE_BODY(node), opt, env); - break; - case BAG_IF_ELSE: { OptEnv nenv; @@ -6219,7 +6087,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) copy_opt_env(&nenv, env); r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mm, &xo.len); + mml_add(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); if (IS_NOT_NULL(en->te.Then)) { r = optimize_nodes(en->te.Then, &xo, &nenv); @@ -6245,7 +6113,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) default: #ifdef ONIG_DEBUG - fprintf(stderr, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node)); + fprintf(DBGFP, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node)); #endif r = ONIGERR_TYPE_BUG; break; @@ -6258,6 +6126,7 @@ static int set_optimize_exact(regex_t* reg, OptStr* e) { int r; + int allow_reverse; if (e->len == 0) return 0; @@ -6266,40 +6135,28 @@ set_optimize_exact(regex_t* reg, OptStr* e) xmemcpy(reg->exact, e->s, e->len); reg->exact_end = reg->exact + e->len; - if (e->case_fold) { - reg->optimize = OPTIMIZE_STR_CASE_FOLD; + allow_reverse = + ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); + + if (e->len >= 2 || (e->len >= 1 && allow_reverse)) { + r = set_sunday_quick_search_or_bmh_skip_table(reg, 0, + reg->exact, reg->exact_end, + reg->map, &(reg->map_offset)); + if (r != 0) return r; + + reg->optimize = (allow_reverse != 0 + ? OPTIMIZE_STR_FAST + : OPTIMIZE_STR_FAST_STEP_FORWARD); } else { - int allow_reverse; - - allow_reverse = - ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); - - if (e->len >= 2 || (e->len >= 1 && allow_reverse)) { - r = set_sunday_quick_search_or_bmh_skip_table(reg, 0, - reg->exact, reg->exact_end, - reg->map, &(reg->map_offset)); - if (r != 0) return r; - - reg->optimize = (allow_reverse != 0 - ? OPTIMIZE_STR_FAST - : OPTIMIZE_STR_FAST_STEP_FORWARD); - } - else { - reg->optimize = OPTIMIZE_STR; - } + reg->optimize = OPTIMIZE_STR; } reg->dist_min = e->mm.min; reg->dist_max = e->mm.max; if (reg->dist_min != INFINITE_LEN) { - int n; - if (e->case_fold != 0) - n = 1; - else - n = (int )(reg->exact_end - reg->exact); - + int n = (int )(reg->exact_end - reg->exact); reg->threshold_len = reg->dist_min + n; } @@ -6342,10 +6199,9 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) OptEnv env; env.enc = reg->enc; - env.options = reg->options; env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; - clear_mml(&env.mm); + mml_clear(&env.mm); r = optimize_nodes(node, &opt, &env); if (r != 0) return r; @@ -6387,7 +6243,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) } #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) - print_optimize_info(stderr, reg); + print_optimize_info(DBGFP, reg); #endif return r; } @@ -6414,8 +6270,6 @@ clear_optimize_info(regex_t* reg) static void print_enc_string(FILE* fp, OnigEncoding enc, const UChar *s, const UChar *end) { - fprintf(fp, "\nPATTERN: /"); - if (ONIGENC_MBC_MINLEN(enc) > 1) { const UChar *p; OnigCodePoint code; @@ -6537,7 +6391,8 @@ print_optimize_info(FILE* f, regex_t* reg) for (p = reg->exact; p < reg->exact_end; p++) { fputc(*p, f); } - fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact)); + fprintf(f, "]: length: %ld, dmin: %u, dmax: %u\n", + (reg->exact_end - reg->exact), reg->dist_min, reg->dist_max); } else if (reg->optimize & OPTIMIZE_MAP) { int c, i, n = 0; @@ -6545,7 +6400,8 @@ print_optimize_info(FILE* f, regex_t* reg) for (i = 0; i < CHAR_MAP_SIZE; i++) if (reg->map[i]) n++; - fprintf(f, "map: n=%d\n", n); + fprintf(f, "map: n=%d, dmin: %u, dmax: %u\n", + n, reg->dist_min, reg->dist_max); if (n > 0) { c = 0; fputc('[', f); @@ -6680,7 +6536,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #ifdef ONIG_DEBUG - print_enc_string(stderr, reg->enc, pattern, pattern_end); + fprintf(DBGFP, "\nPATTERN: /"); + print_enc_string(DBGFP, reg->enc, pattern, pattern_end); #endif if (reg->ops_alloc == 0) { @@ -6708,7 +6565,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && - ! ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { + ! OPTON_CAPTURE_GROUP(reg->options)) { if (scan_env.num_named != scan_env.num_mem) r = disable_noname_group_capture(&root, reg, &scan_env); else @@ -6741,10 +6598,10 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, #endif #ifdef ONIG_DEBUG_PARSE - fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); - fprintf(stderr, "TREE (parsed)\n"); - print_tree(stderr, root); - fprintf(stderr, "\n"); + fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); + fprintf(DBGFP, "TREE (parsed)\n"); + print_tree(DBGFP, root); + fprintf(DBGFP, "\n"); #endif r = tune_tree(root, reg, 0, &scan_env); @@ -6758,13 +6615,13 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #ifdef ONIG_DEBUG_PARSE - fprintf(stderr, "TREE (after tune)\n"); - print_tree(stderr, root); - fprintf(stderr, "\n"); + fprintf(DBGFP, "TREE (after tune)\n"); + print_tree(DBGFP, root); + fprintf(DBGFP, "\n"); #endif - reg->capture_history = scan_env.cap_history; - reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; + reg->capture_history = scan_env.cap_history; + reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; #ifdef USE_CALLOUT if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) { @@ -6847,8 +6704,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, onig_node_free(root); #ifdef ONIG_DEBUG_COMPILE - onig_print_names(stderr, reg); - onig_print_compiled_byte_code_list(stderr, reg); + onig_print_names(DBGFP, reg); + onig_print_compiled_byte_code_list(DBGFP, reg); #endif #ifdef USE_DIRECT_THREADED_CODE @@ -6920,20 +6777,18 @@ onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_fl else option |= syntax->options; - (reg)->enc = enc; - (reg)->options = option; - (reg)->syntax = syntax; - (reg)->optimize = 0; - (reg)->exact = (UChar* )NULL; - (reg)->extp = (RegexExt* )NULL; - - (reg)->ops = (Operation* )NULL; - (reg)->ops_curr = (Operation* )NULL; - (reg)->ops_used = 0; - (reg)->ops_alloc = 0; - (reg)->name_table = (void* )NULL; - - (reg)->case_fold_flag = case_fold_flag; + (reg)->enc = enc; + (reg)->options = option; + (reg)->syntax = syntax; + (reg)->optimize = 0; + (reg)->exact = (UChar* )NULL; + (reg)->extp = (RegexExt* )NULL; + (reg)->ops = (Operation* )NULL; + (reg)->ops_curr = (Operation* )NULL; + (reg)->ops_used = 0; + (reg)->ops_alloc = 0; + (reg)->name_table = (void* )NULL; + (reg)->case_fold_flag = case_fold_flag; return 0; } @@ -7171,8 +7026,8 @@ print_indent_tree(FILE* f, Node* node, int indent) if (NODE_STRING_IS_CRUDE(node)) mode = "-crude"; - else if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) - mode = "-case_fold_match"; + else if (NODE_IS_IGNORECASE(node)) + mode = "-ignorecase"; else mode = ""; diff --git a/oniguruma/src/regenc.c b/oniguruma/src/regenc.c index e4e514634..ff51a41eb 100644 --- a/oniguruma/src/regenc.c +++ b/oniguruma/src/regenc.c @@ -30,6 +30,9 @@ #include "regint.h" +#define LARGE_S 0x53 +#define SMALL_S 0x73 + OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT; #define INITED_LIST_SIZE 20 @@ -550,7 +553,7 @@ static int ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED, OnigApplyAllCaseFoldFunc f, void* arg) { - static OnigCodePoint ss[] = { 0x73, 0x73 }; + static OnigCodePoint ss[] = { SMALL_S, SMALL_S }; return (*f)((OnigCodePoint )0xdf, ss, 2, arg); } @@ -589,35 +592,48 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { - if (0x41 <= *p && *p <= 0x5a) { + int i, j, n; + static OnigUChar sa[] = { LARGE_S, SMALL_S }; + + if (0x41 <= *p && *p <= 0x5a) { /* A - Z */ + if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1 + && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ + ss_combination: + items[0].byte_len = 2; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )0xdf; + + n = 1; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (sa[i] == *p && sa[j] == *(p+1)) + continue; + + items[n].byte_len = 2; + items[n].code_len = 2; + items[n].code[0] = (OnigCodePoint )sa[i]; + items[n].code[1] = (OnigCodePoint )sa[j]; + n++; + } + } + return 4; + } + items[0].byte_len = 1; items[0].code_len = 1; items[0].code[0] = (OnigCodePoint )(*p + 0x20); - if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == 0x53 || *(p+1) == 0x73)) { - /* SS */ - items[1].byte_len = 2; - items[1].code_len = 1; - items[1].code[0] = (OnigCodePoint )0xdf; - return 2; - } - else - return 1; + return 1; } - else if (0x61 <= *p && *p <= 0x7a) { + else if (0x61 <= *p && *p <= 0x7a) { /* a - z */ + if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1 + && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { + goto ss_combination; + } + items[0].byte_len = 1; items[0].code_len = 1; items[0].code[0] = (OnigCodePoint )(*p - 0x20); - if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == 0x73 || *(p+1) == 0x53)) { - /* ss */ - items[1].byte_len = 2; - items[1].code_len = 1; - items[1].code[0] = (OnigCodePoint )0xdf; - return 2; - } - else - return 1; + return 1; } else if (*p == 0xdf && ess_tsett_flag != 0) { items[0].byte_len = 1; @@ -677,7 +693,7 @@ extern int onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end) { if (p < end) { - if (*p == 0x0a) return 1; + if (*p == NEWLINE_CODE) return 1; } return 0; } @@ -906,7 +922,7 @@ onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end) { OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end); - if (code > 127) return 0; + if (code > ASCII_LIMIT) return 0; return ONIGENC_IS_ASCII_CODE_WORD(code); } diff --git a/oniguruma/src/regenc.h b/oniguruma/src/regenc.h index fb2e20447..2b5fb5a3b 100644 --- a/oniguruma/src/regenc.h +++ b/oniguruma/src/regenc.h @@ -75,6 +75,8 @@ typedef struct { #define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val) #define MAX_CODE_POINT (~((OnigCodePoint )0)) +#define ASCII_LIMIT 127 +#define NEWLINE_CODE 0x0a #define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p) diff --git a/oniguruma/src/regexec.c b/oniguruma/src/regexec.c index ce498c611..5efe3a50a 100644 --- a/oniguruma/src/regexec.c +++ b/oniguruma/src/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -177,8 +177,6 @@ static OpInfoType OpInfo[] = { { OP_STR_MB2N, "str_mb2-n"}, { OP_STR_MB3N, "str_mb3n"}, { OP_STR_MBN, "str_mbn"}, - { OP_STR_1_IC, "str_1-ic"}, - { OP_STR_N_IC, "str_n-ic"}, { OP_CCLASS, "cclass"}, { OP_CCLASS_MB, "cclass-mb"}, { OP_CCLASS_MIX, "cclass-mix"}, @@ -254,7 +252,7 @@ static OpInfoType OpInfo[] = { { OP_LOOK_BEHIND, "look-behind"}, { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start"}, { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end"}, - { OP_PUSH_SAVE_VAL, "push-save-val"}, + { OP_SAVE_VAL, "save-val"}, { OP_UPDATE_VAR, "update-var"}, #ifdef USE_CALL { OP_CALL, "call"}, @@ -377,14 +375,6 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, while (n-- > 0) { fputc(*q++, f); } } break; - case OP_STR_1_IC: - len = enclen(enc, p->exact.s); - p_string(f, len, p->exact.s); - break; - case OP_STR_N_IC: - len = p->exact_n.n; - p_len_string(f, len, 1, p->exact_n.s); - break; case OP_CCLASS: case OP_CCLASS_NOT: @@ -564,12 +554,12 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; #endif - case OP_PUSH_SAVE_VAL: + case OP_SAVE_VAL: { SaveType type; - type = p->push_save_val.type; - mem = p->push_save_val.id; + type = p->save_val.type; + mem = p->save_val.id; fprintf(f, ":%d:%d", type, mem); } break; @@ -638,7 +628,7 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; default: - fprintf(stderr, "print_compiled_byte_code: undefined code %d\n", opcode); + fprintf(DBGFP, "print_compiled_byte_code: undefined code %d\n", opcode); break; } } @@ -1808,26 +1798,6 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while (0) -#define STACK_GET_SAVE_VAL_TYPE_LAST_ID_FROM(stype, sid, sval, stk_from) do { \ - int level = 0;\ - StackType *k = (stk_from);\ - while (k > stk_base) {\ - STACK_BASE_CHECK(k, "STACK_GET_SAVE_VAL_TYPE_LAST_ID_FROM"); \ - if (k->type == STK_SAVE_VAL && k->u.val.type == (stype)\ - && k->u.val.id == (sid)) {\ - if (level == 0) {\ - (sval) = k->u.val.v;\ - break;\ - }\ - }\ - else if (k->type == STK_CALL_FRAME)\ - level--;\ - else if (k->type == STK_RETURN)\ - level++;\ - k--;\ - }\ -} while (0) - #define STACK_PUSH_CALLOUT_CONTENTS(anum, func) do {\ STACK_ENSURE(1);\ stk->type = STK_CALLOUT;\ @@ -1849,7 +1819,7 @@ stack_double(int is_alloca, char** arg_alloc_base, #ifdef ONIG_DEBUG #define STACK_BASE_CHECK(p, at) \ if ((p) < stk_base) {\ - fprintf(stderr, "at %s\n", at);\ + fprintf(DBGFP, "at %s\n", at);\ MATCH_AT_ERROR_RETURN(ONIGERR_STACK_BUG);\ } #else @@ -2544,7 +2514,7 @@ typedef struct { int len, spos;\ spos = IS_NOT_NULL(s) ? (int )(s - str) : -1;\ xp = p - (offset);\ - fprintf(stderr, "%7u: %7ld: %4d> \"",\ + fprintf(DBGFP, "%7u: %7ld: %4d> \"",\ counter, GET_STACK_INDEX(stk), spos);\ counter++;\ bp = buf;\ @@ -2560,21 +2530,23 @@ typedef struct { xmemcpy(bp, "\"", 1); bp += 1;\ }\ *bp = 0;\ - fputs((char* )buf, stderr);\ - for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr);\ + fputs((char* )buf, DBGFP);\ + for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', DBGFP);\ if (xp == FinishCode)\ - fprintf(stderr, "----: finish");\ + fprintf(DBGFP, "----: finish");\ else {\ - fprintf(stderr, "%4d: ", (int )(xp - reg->ops));\ - print_compiled_byte_code(stderr, reg, (int )(xp - reg->ops), reg->ops, encode);\ + fprintf(DBGFP, "%4d: ", (int )(xp - reg->ops));\ + print_compiled_byte_code(DBGFP, reg, (int )(xp - reg->ops), reg->ops, encode); \ }\ - fprintf(stderr, "\n");\ + fprintf(DBGFP, "\n");\ } while(0); #else #define MATCH_DEBUG_OUT(offset) #endif -#define MATCH_AT_ERROR_RETURN(err_code) best_len = err_code; goto match_at_end +#define MATCH_AT_ERROR_RETURN(err_code) do {\ + best_len = err_code; goto match_at_end;\ +} while(0) /* match data(str - end) from position (sstart). */ @@ -2607,8 +2579,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_STR_MB2N, &&L_STR_MB3N, &&L_STR_MBN, - &&L_STR_1_IC, - &&L_STR_N_IC, &&L_CCLASS, &&L_CCLASS_MB, &&L_CCLASS_MIX, @@ -2684,7 +2654,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_LOOK_BEHIND, &&L_LOOK_BEHIND_NOT_START, &&L_LOOK_BEHIND_NOT_END, - &&L_PUSH_SAVE_VAL, + &&L_SAVE_VAL, &&L_UPDATE_VAR, #ifdef USE_CALL &&L_CALL, @@ -2760,9 +2730,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "match_at: str: %p, end: %p, start: %p, sprev: %p\n", + fprintf(DBGFP, "match_at: str: %p, end: %p, start: %p, sprev: %p\n", str, end, sstart, sprev); - fprintf(stderr, "size: %d, start offset: %d\n", + fprintf(DBGFP, "size: %d, start offset: %d\n", (int )(end - str), (int )(sstart - str)); #endif @@ -2781,7 +2751,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (n > best_len) { OnigRegion* region; #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE - if (IS_FIND_LONGEST(option)) { + if (OPTON_FIND_LONGEST(option)) { if (n > msa->best_len) { msa->best_len = n; msa->best_s = (UChar* )sstart; @@ -2796,7 +2766,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (keep > s) keep = s; #ifdef USE_POSIX_API_REGION_OPTION - if (IS_POSIX_REGION(msa->options)) { + if (OPTON_POSIX_REGION(msa->options)) { posix_regmatch_t* rmt = (posix_regmatch_t* )region; rmt[0].rm_so = (regoff_t )(keep - str); @@ -2850,7 +2820,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API_REGION_OPTION - } /* else IS_POSIX_REGION() */ + } /* else OPTON_POSIX_REGION() */ #endif } /* if (region) */ } /* n > best_len */ @@ -2860,12 +2830,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif SOP_OUT; - if (IS_FIND_CONDITION(option)) { - if (IS_FIND_NOT_EMPTY(option) && s == sstart) { + if (OPTON_FIND_CONDITION(option)) { + if (OPTON_FIND_NOT_EMPTY(option) && s == sstart) { best_len = ONIG_MISMATCH; goto fail; /* for retry */ } - if (IS_FIND_LONGEST(option) && DATA_ENSURE_CHECK1) { + if (OPTON_FIND_LONGEST(option) && DATA_ENSURE_CHECK1) { goto fail; /* for retry */ } } @@ -2881,27 +2851,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(STR_1_IC) - { - int len; - UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - - DATA_ENSURE(1); - len = ONIGENC_MBC_CASE_FOLD(encode, - /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ - case_fold_flag, - &s, end, lowbuf); - DATA_ENSURE(0); - q = lowbuf; - ps = p->exact.s; - while (len-- > 0) { - if (*ps != *q) goto fail; - ps++; q++; - } - } - INC_OP; - NEXT_OUT; - CASE_OP(STR_2) DATA_ENSURE(2); ps = p->exact.s; @@ -2969,34 +2918,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(STR_N_IC) - { - int len; - UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - - tlen = p->exact_n.n; - ps = p->exact_n.s; - endp = ps + tlen; - while (ps < endp) { - sprev = s; - DATA_ENSURE(1); - len = ONIGENC_MBC_CASE_FOLD(encode, - /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ - case_fold_flag, - &s, end, lowbuf); - DATA_ENSURE(0); - q = lowbuf; - while (len-- > 0) { - if (ps >= endp) goto fail; - if (*ps != *q) goto fail; - ps++; q++; - } - } - } - - INC_OP; - JUMP_OUT; - CASE_OP(STR_MB2N1) DATA_ENSURE(2); ps = p->exact.s; @@ -3420,7 +3341,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(BEGIN_LINE) if (ON_STR_BEGIN(s)) { - if (IS_NOTBOL(msa->options)) goto fail; + if (OPTON_NOTBOL(msa->options)) goto fail; INC_OP; JUMP_OUT; } @@ -3435,7 +3356,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (IS_NOTEOL(msa->options)) goto fail; + if (OPTON_NOTEOL(msa->options)) goto fail; INC_OP; JUMP_OUT; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3459,7 +3380,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (IS_NOTEOL(msa->options)) goto fail; + if (OPTON_NOTEOL(msa->options)) goto fail; INC_OP; JUMP_OUT; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3746,7 +3667,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; if (is_empty) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "EMPTY_CHECK_END: skip id:%d, s:%p\n", (int )mem, s); + fprintf(DBGFP, "EMPTY_CHECK_END: skip id:%d, s:%p\n", (int )mem, s); #endif empty_check_found: /* empty loop founded, skip next instruction */ @@ -3779,7 +3700,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; if (is_empty) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "EMPTY_CHECK_END_MEM: skip id:%d, s:%p\n", (int)mem, s); + fprintf(DBGFP, "EMPTY_CHECK_END_MEM: skip id:%d, s:%p\n", (int)mem, s); #endif if (is_empty == -1) goto fail; goto empty_check_found; @@ -3802,7 +3723,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; if (is_empty) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "EMPTY_CHECK_END_MEM_PUSH: skip id:%d, s:%p\n", + fprintf(DBGFP, "EMPTY_CHECK_END_MEM_PUSH: skip id:%d, s:%p\n", (int )mem, s); #endif if (is_empty == -1) goto fail; @@ -4010,12 +3931,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, JUMP_OUT; #endif - CASE_OP(PUSH_SAVE_VAL) + CASE_OP(SAVE_VAL) { SaveType type; - type = p->push_save_val.type; - mem = p->push_save_val.id; /* mem: save id */ + type = p->save_val.type; + mem = p->save_val.id; /* mem: save id */ switch ((enum SaveType )type) { case SAVE_KEEP: STACK_PUSH_SAVE_VAL(mem, type, s); @@ -4167,6 +4088,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, return best_len; } + +#ifdef USE_REGSET + typedef struct { regex_t* reg; OnigRegion* region; @@ -4433,7 +4357,7 @@ onig_regset_search_with_param(OnigRegSet* set, if (set->n == 0) return ONIG_MISMATCH; - if (IS_POSIX_REGION(option)) + if (OPTON_POSIX_REGION(option)) return ONIGERR_INVALID_ARGUMENT; r = 0; @@ -4457,7 +4381,7 @@ onig_regset_search_with_param(OnigRegSet* set, return ONIGERR_INVALID_ARGUMENT; } - if (ONIG_IS_OPTION_ON(option, ONIG_OPTION_CHECK_VALIDITY_OF_STRING)) { + if (OPTON_CHECK_VALIDITY_OF_STRING(option)) { if (! ONIGENC_IS_VALID_MBC_STRING(enc, str, end)) { r = ONIGERR_INVALID_WIDE_CHAR_VALUE; goto finish_no_msa; @@ -4567,7 +4491,7 @@ onig_regset_search_with_param(OnigRegSet* set, for (i = 0; i < set->n; i++) { if (IS_NOT_NULL(msas)) MATCH_ARG_FREE(msas[i]); - if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + if (OPTON_FIND_NOT_EMPTY(set->rs[i].reg->options) && IS_NOT_NULL(set->rs[i].region)) { onig_region_clear(set->rs[i].region); } @@ -4586,7 +4510,7 @@ onig_regset_search_with_param(OnigRegSet* set, for (i = 0; i < set->n; i++) { if (IS_NOT_NULL(msas)) MATCH_ARG_FREE(msas[i]); - if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + if (OPTON_FIND_NOT_EMPTY(set->rs[i].reg->options) && IS_NOT_NULL(set->rs[i].region)) { onig_region_clear(set->rs[i].region); } @@ -4625,6 +4549,9 @@ onig_regset_search(OnigRegSet* set, const UChar* str, const UChar* end, return r; } +#endif /* USE_REGSET */ + + static UChar* slow_search(OnigEncoding enc, UChar* target, UChar* target_end, const UChar* text, const UChar* text_end, UChar* text_range) @@ -4656,48 +4583,6 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, return (UChar* )NULL; } -static int -str_lower_case_match(OnigEncoding enc, int case_fold_flag, - const UChar* t, const UChar* tend, - const UChar* p, const UChar* end) -{ - int lowlen; - UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - - while (t < tend) { - if (p >= end) return 0; - lowlen = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, lowbuf); - q = lowbuf; - while (lowlen > 0) { - if (t >= tend) return 0; - if (*t++ != *q++) return 0; - lowlen--; - } - } - - return 1; -} - -static UChar* -slow_search_ic(OnigEncoding enc, int case_fold_flag, - UChar* target, UChar* target_end, - const UChar* text, const UChar* text_end, UChar* text_range) -{ - UChar *s; - - s = (UChar* )text; - - while (s < text_range) { - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - s, text_end)) - return s; - - s += enclen(enc, s); - } - - return (UChar* )NULL; -} - static UChar* slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, const UChar* text, const UChar* adjust_text, @@ -4730,33 +4615,6 @@ slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, return (UChar* )NULL; } -static UChar* -slow_search_backward_ic(OnigEncoding enc, int case_fold_flag, - UChar* target, UChar* target_end, - const UChar* text, const UChar* adjust_text, - const UChar* text_end, const UChar* text_start) -{ - UChar *s; - - s = (UChar* )text_end; - s -= (target_end - target); - if (s > text_start) - s = (UChar* )text_start; - else - s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); - - while (s >= text) { - if (str_lower_case_match(enc, case_fold_flag, - target, target_end, s, text_end)) - return s; - - s = (UChar* )onigenc_get_prev_char_head(enc, adjust_text, s); - } - - return (UChar* )NULL; -} - - static UChar* sunday_quick_search_step_forward(regex_t* reg, const UChar* target, const UChar* target_end, @@ -4770,8 +4628,9 @@ sunday_quick_search_step_forward(regex_t* reg, OnigEncoding enc; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, - "sunday_quick_search_step_forward: text: %p, text_end: %p, text_range: %p\n", text, text_end, text_range); + fprintf(DBGFP, + "sunday_quick_search_step_forward: text: %p, text_end: %p, text_range: %p\n", + text, text_end, text_range); #endif enc = reg->enc; @@ -4894,7 +4753,7 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, MATCH_ARG_INIT(msa, reg, option, region, at, mp); if (region #ifdef USE_POSIX_API_REGION_OPTION - && !IS_POSIX_REGION(option) + && !OPTON_POSIX_REGION(option) #endif ) { r = onig_region_resize_clear(region, reg->num_mem + 1); @@ -4903,7 +4762,7 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, r = 0; if (r == 0) { - if (ONIG_IS_OPTION_ON(option, ONIG_OPTION_CHECK_VALIDITY_OF_STRING)) { + if (OPTON_CHECK_VALIDITY_OF_STRING(option)) { if (! ONIGENC_IS_VALID_MBC_STRING(reg->enc, str, end)) { r = ONIGERR_INVALID_WIDE_CHAR_VALUE; goto end; @@ -4926,7 +4785,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar *p, *pprev = (UChar* )NULL; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "forward_search: str: %p, end: %p, start: %p, range: %p\n", + fprintf(DBGFP, "forward_search: str: %p, end: %p, start: %p, range: %p\n", str, end, start, range); #endif @@ -4949,10 +4808,6 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, case OPTIMIZE_STR: p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range); break; - case OPTIMIZE_STR_CASE_FOLD: - p = slow_search_ic(reg->enc, reg->case_fold_flag, - reg->exact, reg->exact_end, p, end, range); - break; case OPTIMIZE_STR_FAST: p = sunday_quick_search(reg, reg->exact, reg->exact_end, p, end, range); @@ -5047,7 +4902,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, + fprintf(DBGFP, "forward_search success: low: %d, high: %d, dmin: %u, dmax: %u\n", (int )(*low - str), (int )(*high - str), reg->dist_min, reg->dist_max); @@ -5075,12 +4930,6 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, range, adjrange, end, p); break; - case OPTIMIZE_STR_CASE_FOLD: - p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, - reg->exact, reg->exact_end, - range, adjrange, end, p); - break; - case OPTIMIZE_STR_FAST: case OPTIMIZE_STR_FAST_STEP_FORWARD: goto exact_method; @@ -5150,7 +4999,7 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, } #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search: low: %d, high: %d\n", + fprintf(DBGFP, "backward_search: low: %d, high: %d\n", (int )(*low - str), (int )(*high - str)); #endif return 1; /* success */ @@ -5158,7 +5007,7 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, fail: #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search: fail.\n"); + fprintf(DBGFP, "backward_search: fail.\n"); #endif return 0; /* fail */ } @@ -5202,7 +5051,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, const UChar *orig_start = start; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, + fprintf(DBGFP, "onig_search (entry point): str: %p, end: %d, start: %d, range: %d\n", str, (int )(end - str), (int )(start - str), (int )(range - str)); #endif @@ -5211,7 +5060,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (region #ifdef USE_POSIX_API_REGION_OPTION - && !IS_POSIX_REGION(option) + && ! OPTON_POSIX_REGION(option) #endif ) { r = onig_region_resize_clear(region, reg->num_mem + 1); @@ -5220,7 +5069,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (start > end || start < str) goto mismatch_no_msa; - if (ONIG_IS_OPTION_ON(option, ONIG_OPTION_CHECK_VALIDITY_OF_STRING)) { + if (OPTON_CHECK_VALIDITY_OF_STRING(option)) { if (! ONIGENC_IS_VALID_MBC_STRING(reg->enc, str, end)) { r = ONIGERR_INVALID_WIDE_CHAR_VALUE; goto finish_no_msa; @@ -5233,7 +5082,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, r = match_at(reg, str, end, (upper_range), s, prev, &msa); \ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ - if (! IS_FIND_LONGEST(reg->options)) {\ + if (! OPTON_FIND_LONGEST(reg->options)) {\ goto match;\ }\ }\ @@ -5350,7 +5199,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, static const UChar* address_for_empty_string = (UChar* )""; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "onig_search: empty string.\n"); + fprintf(DBGFP, "onig_search: empty string.\n"); #endif if (reg->threshold_len == 0) { @@ -5366,7 +5215,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, } #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "onig_search(apply anchor): end: %d, start: %d, range: %d\n", + fprintf(DBGFP, "onig_search(apply anchor): end: %d, start: %d, range: %d\n", (int )(end - str), (int )(start - str), (int )(range - str)); #endif @@ -5509,7 +5358,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, mismatch: #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE - if (IS_FIND_LONGEST(reg->options)) { + if (OPTON_FIND_LONGEST(reg->options)) { if (msa.best_len >= 0) { s = msa.best_s; goto match; @@ -5523,9 +5372,9 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, /* If result is mismatch and no FIND_NOT_EMPTY option, then the region is not set in match_at(). */ - if (IS_FIND_NOT_EMPTY(reg->options) && region + if (OPTON_FIND_NOT_EMPTY(reg->options) && region #ifdef USE_POSIX_API_REGION_OPTION - && !IS_POSIX_REGION(option) + && !OPTON_POSIX_REGION(option) #endif ) { onig_region_clear(region); @@ -5533,7 +5382,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, #ifdef ONIG_DEBUG if (r != ONIG_MISMATCH) - fprintf(stderr, "onig_search: error %d\n", r); + fprintf(DBGFP, "onig_search: error %d\n", r); #endif return r; @@ -5542,7 +5391,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, finish_no_msa: #ifdef ONIG_DEBUG if (r != ONIG_MISMATCH) - fprintf(stderr, "onig_search: error %d\n", r); + fprintf(DBGFP, "onig_search: error %d\n", r); #endif return r; @@ -5578,7 +5427,7 @@ onig_scan(regex_t* reg, const UChar* str, const UChar* end, int rs; const UChar* start; - if (ONIG_IS_OPTION_ON(option, ONIG_OPTION_CHECK_VALIDITY_OF_STRING)) { + if (OPTON_CHECK_VALIDITY_OF_STRING(option)) { if (! ONIGENC_IS_VALID_MBC_STRING(reg->enc, str, end)) return ONIGERR_INVALID_WIDE_CHAR_VALUE; @@ -5669,6 +5518,8 @@ onig_copy_encoding(OnigEncoding to, OnigEncoding from) *to = *from; } +#ifdef USE_REGSET + extern int onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) { @@ -5759,7 +5610,7 @@ onig_regset_add(OnigRegSet* set, regex_t* reg) { OnigRegion* region; - if (IS_FIND_LONGEST(reg->options)) + if (OPTON_FIND_LONGEST(reg->options)) return ONIGERR_INVALID_ARGUMENT; if (set->n != 0 && reg->enc != set->enc) @@ -5805,7 +5656,7 @@ onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) set->n--; } else { - if (IS_FIND_LONGEST(reg->options)) + if (OPTON_FIND_LONGEST(reg->options)) return ONIGERR_INVALID_ARGUMENT; if (set->n > 1 && reg->enc != set->enc) @@ -5864,6 +5715,8 @@ onig_regset_get_region(OnigRegSet* set, int at) return set->rs[at].region; } +#endif /* USE_REGSET */ + #ifdef USE_DIRECT_THREADED_CODE extern int @@ -6385,6 +6238,8 @@ onig_builtin_cmp(OnigCalloutArgs* args, void* user_data ARG_UNUSED) } +#ifndef ONIGURUMA_UNSUPPORTED_PRINT + #include static FILE* OutFp; @@ -6483,4 +6338,6 @@ onig_setup_builtin_monitors_by_ascii_encoded_name(void* fp /* FILE* */) return ONIG_NORMAL; } +#endif /* ONIGURUMA_UNSUPPORTED_PRINT */ + #endif /* USE_CALLOUT */ diff --git a/oniguruma/src/regint.h b/oniguruma/src/regint.h index 02f64ea78..0e32c7377 100644 --- a/oniguruma/src/regint.h +++ b/oniguruma/src/regint.h @@ -5,7 +5,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -45,6 +45,7 @@ defined(ONIG_DEBUG_STATISTICS) #ifndef ONIG_DEBUG #define ONIG_DEBUG +#define DBGFP stderr #endif #endif @@ -56,6 +57,7 @@ /* config */ /* spec. config */ +#define USE_REGSET #define USE_CALL #define USE_CALLOUT #define USE_BACKREF_WITH_LEVEL /* \k, \k */ @@ -119,6 +121,9 @@ /* */ #define onig_st_is_member st_is_member + +#ifndef ONIGURUMA_SYS_UEFI + #define xmemset memset #define xmemcpy memcpy #define xmemmove memmove @@ -176,6 +181,19 @@ typedef unsigned int uintptr_t; #endif #endif +/* strend hash */ +typedef void hash_table_type; + +#ifdef _WIN32 +# include +typedef ULONG_PTR hash_data_type; +#else +typedef unsigned long hash_data_type; +#endif + +#endif /* ONIGURUMA_SYS_UEFI */ + + #ifdef MIN #undef MIN #endif @@ -237,7 +255,6 @@ enum OptimizeType { OPTIMIZE_STR, /* Slow Search */ OPTIMIZE_STR_FAST, /* Sunday quick search / BMH */ OPTIMIZE_STR_FAST_STEP_FORWARD, /* Sunday quick search / BMH */ - OPTIMIZE_STR_CASE_FOLD, /* Slow Search (ignore case) */ OPTIMIZE_MAP /* char map */ }; @@ -290,32 +307,20 @@ typedef unsigned int MemStatusType; (IS_CODE_DIGIT_ASCII(enc,code) ? DIGITVAL(code) \ : (ONIGENC_IS_CODE_UPPER(enc,code) ? (code) - 'A' + 10 : (code) - 'a' + 10)) -#define IS_SINGLELINE(option) ((option) & ONIG_OPTION_SINGLELINE) -#define IS_MULTILINE(option) ((option) & ONIG_OPTION_MULTILINE) -#define IS_IGNORECASE(option) ((option) & ONIG_OPTION_IGNORECASE) -#define IS_EXTEND(option) ((option) & ONIG_OPTION_EXTEND) -#define IS_FIND_LONGEST(option) ((option) & ONIG_OPTION_FIND_LONGEST) -#define IS_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY) -#define IS_FIND_CONDITION(option) ((option) & \ +#define OPTON_FIND_LONGEST(option) ((option) & ONIG_OPTION_FIND_LONGEST) +#define OPTON_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY) +#define OPTON_FIND_CONDITION(option) ((option) & \ (ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY)) -#define IS_NOTBOL(option) ((option) & ONIG_OPTION_NOTBOL) -#define IS_NOTEOL(option) ((option) & ONIG_OPTION_NOTEOL) -#define IS_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION) - -#define IS_WORD_ASCII(option) \ - ((option) & (ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) -#define IS_DIGIT_ASCII(option) \ - ((option) & (ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) -#define IS_SPACE_ASCII(option) \ - ((option) & (ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) -#define IS_POSIX_ASCII(option) ((option) & ONIG_OPTION_POSIX_IS_ASCII) - -#define IS_ASCII_MODE_CTYPE_OPTION(ctype, options) \ - ((ctype) >= 0 && \ - (((ctype) < ONIGENC_CTYPE_ASCII && IS_POSIX_ASCII(options)) ||\ - ((ctype) == ONIGENC_CTYPE_WORD && IS_WORD_ASCII(options)) ||\ - ((ctype) == ONIGENC_CTYPE_DIGIT && IS_DIGIT_ASCII(options)) ||\ - ((ctype) == ONIGENC_CTYPE_SPACE && IS_SPACE_ASCII(options)))) +#define OPTON_NEGATE_SINGLELINE(option) ((option) & \ + ONIG_OPTION_NEGATE_SINGLELINE) +#define OPTON_DONT_CAPTURE_GROUP(option) ((option) & \ + ONIG_OPTION_DONT_CAPTURE_GROUP) +#define OPTON_CAPTURE_GROUP(option) ((option) & ONIG_OPTION_CAPTURE_GROUP) +#define OPTON_NOTBOL(option) ((option) & ONIG_OPTION_NOTBOL) +#define OPTON_NOTEOL(option) ((option) & ONIG_OPTION_NOTEOL) +#define OPTON_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION) +#define OPTON_CHECK_VALIDITY_OF_STRING(option) ((option) & \ + ONIG_OPTION_CHECK_VALIDITY_OF_STRING) #define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) @@ -327,17 +332,17 @@ typedef unsigned int MemStatusType; #define BITS_PER_BYTE 8 #define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE) #define BITS_IN_ROOM 32 /* 4 * BITS_PER_BYTE */ -#define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) +#define BITSET_REAL_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) typedef uint32_t Bits; -typedef Bits BitSet[BITSET_SIZE]; +typedef Bits BitSet[BITSET_REAL_SIZE]; typedef Bits* BitSetRef; #define SIZE_BITSET sizeof(BitSet) #define BITSET_CLEAR(bs) do {\ int i;\ - for (i = 0; i < (int )BITSET_SIZE; i++) { (bs)[i] = 0; } \ + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { (bs)[i] = 0; } \ } while (0) #define BS_ROOM(bs,pos) (bs)[(unsigned int )(pos) >> 5] @@ -475,8 +480,6 @@ enum OpCode { OP_STR_MB2N, /* mb-length = 2 */ OP_STR_MB3N, /* mb-length = 3 */ OP_STR_MBN, /* other length */ - OP_STR_1_IC, /* single byte, N = 1, ignore case */ - OP_STR_N_IC, /* single byte, ignore case */ OP_CCLASS, OP_CCLASS_MB, OP_CCLASS_MIX, @@ -552,7 +555,7 @@ enum OpCode { OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ OP_LOOK_BEHIND_NOT_START, /* (? */ @@ -650,7 +653,7 @@ typedef int ModeType; #define OPSIZE_LOOK_BEHIND_NOT_END 1 #define OPSIZE_CALL 1 #define OPSIZE_RETURN 1 -#define OPSIZE_PUSH_SAVE_VAL 1 +#define OPSIZE_SAVE_VAL 1 #define OPSIZE_UPDATE_VAR 1 #ifdef USE_CALLOUT @@ -810,7 +813,7 @@ typedef struct { struct { SaveType type; MemNumType id; - } push_save_val; + } save_val; struct { UpdateVarType type; MemNumType id; @@ -999,16 +1002,6 @@ extern OnigCalloutFunc onig_get_callout_start_func(regex_t* reg, int callout_num #endif /* USE_CALLOUT */ -/* strend hash */ -typedef void hash_table_type; - -#ifdef _WIN32 -# include -typedef ULONG_PTR hash_data_type; -#else -typedef unsigned long hash_data_type; -#endif - extern hash_table_type* onig_st_init_strend_table_with_size P_((int size)); extern int onig_st_lookup_strend P_((hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type *value)); extern int onig_st_insert_strend P_((hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type value)); diff --git a/oniguruma/src/regparse.c b/oniguruma/src/regparse.c index 36fc8b250..d69a2f726 100644 --- a/oniguruma/src/regparse.c +++ b/oniguruma/src/regparse.c @@ -3,7 +3,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -46,6 +46,26 @@ #define IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c) \ ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_') +#define OPTON_SINGLELINE(option) ((option) & ONIG_OPTION_SINGLELINE) +#define OPTON_MULTILINE(option) ((option) & ONIG_OPTION_MULTILINE) +#define OPTON_IGNORECASE(option) ((option) & ONIG_OPTION_IGNORECASE) +#define OPTON_EXTEND(option) ((option) & ONIG_OPTION_EXTEND) +#define OPTON_WORD_ASCII(option) \ + ((option) & (ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) +#define OPTON_DIGIT_ASCII(option) \ + ((option) & (ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) +#define OPTON_SPACE_ASCII(option) \ + ((option) & (ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) +#define OPTON_POSIX_ASCII(option) ((option) & ONIG_OPTION_POSIX_IS_ASCII) +#define OPTON_TEXT_SEGMENT_WORD(option) ((option) & ONIG_OPTION_TEXT_SEGMENT_WORD) + +#define OPTON_IS_ASCII_MODE_CTYPE(ctype, options) \ + ((ctype) >= 0 && \ + (((ctype) < ONIGENC_CTYPE_ASCII && OPTON_POSIX_ASCII(options)) ||\ + ((ctype) == ONIGENC_CTYPE_WORD && OPTON_WORD_ASCII(options)) ||\ + ((ctype) == ONIGENC_CTYPE_DIGIT && OPTON_DIGIT_ASCII(options)) ||\ + ((ctype) == ONIGENC_CTYPE_SPACE && OPTON_SPACE_ASCII(options)))) + OnigSyntaxType OnigSyntaxOniguruma = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | @@ -296,7 +316,7 @@ backref_rel_to_abs(int rel_no, ScanEnv* env) #define BITSET_IS_EMPTY(bs,empty) do {\ int i;\ empty = 1;\ - for (i = 0; i < (int )BITSET_SIZE; i++) {\ + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) {\ if ((bs)[i] != 0) {\ empty = 0; break;\ }\ @@ -316,35 +336,35 @@ static void bitset_invert(BitSetRef bs) { int i; - for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); } + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { bs[i] = ~(bs[i]); } } static void bitset_invert_to(BitSetRef from, BitSetRef to) { int i; - for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); } + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { to[i] = ~(from[i]); } } static void bitset_and(BitSetRef dest, BitSetRef bs) { int i; - for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; } + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] &= bs[i]; } } static void bitset_or(BitSetRef dest, BitSetRef bs) { int i; - for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; } + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] |= bs[i]; } } static void bitset_copy(BitSetRef dest, BitSetRef bs) { int i; - for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; } + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] = bs[i]; } } extern int @@ -776,7 +796,7 @@ onig_foreach_name(regex_t* reg, } static int -i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map) +i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumMap* map) { int i; @@ -793,7 +813,7 @@ i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map) } extern int -onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) +onig_renumber_name_table(regex_t* reg, GroupNumMap* map) { NameTable* t = (NameTable* )reg->name_table; @@ -1143,12 +1163,12 @@ onig_name_to_backref_number(regex_t* reg, const UChar* name, extern int onig_noname_group_capture_is_active(regex_t* reg) { - if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) + if (OPTON_DONT_CAPTURE_GROUP(reg->options)) return 0; if (onig_number_of_names(reg) > 0 && IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && - !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { + ! OPTON_CAPTURE_GROUP(reg->options)) { return 0; } @@ -1604,12 +1624,11 @@ onig_set_callout_of_name(OnigEncoding enc, OnigCalloutType callout_type, fe->arg_types[i] = arg_types[i]; } for (i = arg_num - opt_arg_num, j = 0; i < arg_num; i++, j++) { + if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT; if (fe->arg_types[i] == ONIG_TYPE_STRING) { OnigValue* val; UChar* ds; - if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT; - val = opt_defaults + j; ds = onigenc_strdup(enc, val->s.start, val->s.end); CHECK_NULL_RETURN_MEMERR(ds); @@ -2138,6 +2157,18 @@ node_new(void) return node; } +extern Node* +onig_node_copy(Node* from) +{ + Node* copy; + + copy = node_new(); + CHECK_NULL_RETURN(copy); + xmemcpy(copy, from, sizeof(*copy)); + + return copy; +} + static void initialize_cclass(CClassNode* cc) @@ -2167,30 +2198,20 @@ node_new_ctype(int type, int not, OnigOptionType options) NODE_SET_TYPE(node, NODE_CTYPE); CTYPE_(node)->ctype = type; CTYPE_(node)->not = not; - CTYPE_(node)->options = options; - CTYPE_(node)->ascii_mode = IS_ASCII_MODE_CTYPE_OPTION(type, options); + CTYPE_(node)->ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(type, options); return node; } static Node* -node_new_anychar(void) +node_new_anychar(OnigOptionType options) { - Node* node = node_new_ctype(CTYPE_ANYCHAR, FALSE, ONIG_OPTION_NONE); - return node; -} - -static Node* -node_new_anychar_with_fixed_option(OnigOptionType option) -{ - CtypeNode* ct; Node* node; - node = node_new_anychar(); + node = node_new_ctype(CTYPE_ANYCHAR, FALSE, options); CHECK_NULL_RETURN(node); - ct = CTYPE_(node); - ct->options = option; - NODE_STATUS_ADD(node, FIXED_OPTION); + if (OPTON_MULTILINE(options)) + NODE_STATUS_ADD(node, MULTILINE); return node; } @@ -2199,18 +2220,18 @@ node_new_no_newline(Node** node, ScanEnv* env) { Node* n; - n = node_new_anychar_with_fixed_option(ONIG_OPTION_NONE); + n = node_new_anychar(ONIG_OPTION_NONE); CHECK_NULL_RETURN_MEMERR(n); *node = n; return 0; } static int -node_new_true_anychar(Node** node, ScanEnv* env) +node_new_true_anychar(Node** node) { Node* n; - n = node_new_anychar_with_fixed_option(ONIG_OPTION_MULTILINE); + n = node_new_anychar(ONIG_OPTION_MULTILINE); CHECK_NULL_RETURN_MEMERR(n); *node = n; return 0; @@ -2292,16 +2313,39 @@ make_alt(int n, Node* ns[]) return make_list_or_alt(NODE_ALT, n, ns); } -extern Node* -onig_node_new_anchor(int type, int ascii_mode) +static Node* +node_new_anchor(int type) { - Node* node = node_new(); + Node* node; + + node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_ANCHOR); ANCHOR_(node)->type = type; - ANCHOR_(node)->char_len = -1; + ANCHOR_(node)->char_len = INFINITE_LEN; + ANCHOR_(node)->ascii_mode = 0; + return node; +} + +static Node* +node_new_anchor_with_options(int type, OnigOptionType options) +{ + int ascii_mode; + Node* node; + + node = node_new_anchor(type); + CHECK_NULL_RETURN(node); + + ascii_mode = OPTON_WORD_ASCII(options) && IS_WORD_ANCHOR_TYPE(type) ? 1 : 0; ANCHOR_(node)->ascii_mode = ascii_mode; + + if (type == ANCR_TEXT_SEGMENT_BOUNDARY || + type == ANCR_NO_TEXT_SEGMENT_BOUNDARY) { + if (OPTON_TEXT_SEGMENT_WORD(options)) + NODE_STATUS_ADD(node, TEXT_SEGMENT_WORD); + } + return node; } @@ -2313,8 +2357,9 @@ node_new_backref(int back_num, int* backrefs, int by_name, ScanEnv* env) { int i; - Node* node = node_new(); + Node* node; + node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_BACKREF); @@ -2323,6 +2368,9 @@ node_new_backref(int back_num, int* backrefs, int by_name, if (by_name != 0) NODE_STATUS_ADD(node, BY_NAME); + if (OPTON_IGNORECASE(env->options)) + NODE_STATUS_ADD(node, IGNORECASE); + #ifdef USE_BACKREF_WITH_LEVEL if (exist_level != 0) { NODE_STATUS_ADD(node, NEST_LEVEL); @@ -2693,10 +2741,10 @@ make_text_segment(Node** node, ScanEnv* env) ns[1] = NULL_NODE; r = ONIGERR_MEMORY; - ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, FALSE); + ns[0] = node_new_anchor_with_options(ANCR_NO_TEXT_SEGMENT_BOUNDARY, env->options); if (IS_NULL(ns[0])) goto err; - r = node_new_true_anychar(&ns[1], env); + r = node_new_true_anychar(&ns[1]); if (r != 0) goto err1; x = make_list(2, ns); @@ -2711,7 +2759,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = NULL_NODE; ns[1] = x; - r = node_new_true_anychar(&ns[0], env); + r = node_new_true_anychar(&ns[0]); if (r != 0) goto err1; x = make_list(2, ns); @@ -3060,7 +3108,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE); if (IS_NULL(quant)) goto err0; - r = node_new_true_anychar(&body, env); + r = node_new_true_anychar(&body); if (r != 0) { onig_node_free(quant); goto err; @@ -3095,7 +3143,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, id2 = GIMMICK_(ns[1])->id; - r = node_new_true_anychar(&ns[3], env); + r = node_new_true_anychar(&ns[3]); if (r != 0) goto err; possessive = 1; @@ -3195,7 +3243,6 @@ onig_node_str_clear(Node* node) STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; STR_(node)->capacity = 0; - STR_(node)->case_min_len = 0; } static Node* @@ -3209,7 +3256,6 @@ node_new_str(const UChar* s, const UChar* end) STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; STR_(node)->capacity = 0; - STR_(node)->case_min_len = 0; if (onig_node_str_cat(node, s, end)) { onig_node_free(node); @@ -3225,9 +3271,22 @@ onig_node_new_str(const UChar* s, const UChar* end) } static Node* -node_new_str_crude(UChar* s, UChar* end) +node_new_str_with_options(const UChar* s, const UChar* end, + OnigOptionType options) { - Node* node = node_new_str(s, end); + Node* node; + node = node_new_str(s, end); + + if (OPTON_IGNORECASE(options)) + NODE_STATUS_ADD(node, IGNORECASE); + + return node; +} + +static Node* +node_new_str_crude(UChar* s, UChar* end, OnigOptionType options) +{ + Node* node = node_new_str_with_options(s, end, options); CHECK_NULL_RETURN(node); NODE_STRING_SET_CRUDE(node); return node; @@ -3240,14 +3299,14 @@ node_new_empty(void) } static Node* -node_new_str_crude_char(UChar c) +node_new_str_crude_char(UChar c, OnigOptionType options) { int i; UChar p[1]; Node* node; p[0] = c; - node = node_new_str_crude(p, p + 1); + node = node_new_str_crude(p, p + 1, options); /* clear buf tail */ for (i = 1; i < NODE_STRING_BUF_SIZE; i++) @@ -3270,12 +3329,13 @@ str_node_split_last_char(Node* node, OnigEncoding enc) if (p && p > sn->s) { /* can be split. */ rn = node_new_str(p, sn->end); CHECK_NULL_RETURN(rn); - if (NODE_STRING_IS_CRUDE(node)) - NODE_STRING_SET_CRUDE(rn); sn->end = (UChar* )p; + STR_(rn)->flag = sn->flag; + NODE_STATUS(rn) = NODE_STATUS(node); } } + return rn; } @@ -4001,10 +4061,10 @@ node_new_general_newline(Node** node, ScanEnv* env) dlen = ONIGENC_CODE_TO_MBC(env->enc, 0x0d, buf); if (dlen < 0) return dlen; - alen = ONIGENC_CODE_TO_MBC(env->enc, 0x0a, buf + dlen); + alen = ONIGENC_CODE_TO_MBC(env->enc, NEWLINE_CODE, buf + dlen); if (alen < 0) return alen; - crnl = node_new_str_crude(buf, buf + dlen + alen); + crnl = node_new_str_crude(buf, buf + dlen + alen, ONIG_OPTION_NONE); CHECK_NULL_RETURN_MEMERR(crnl); ncc = node_new_cclass(); @@ -4012,10 +4072,10 @@ node_new_general_newline(Node** node, ScanEnv* env) cc = CCLASS_(ncc); if (dlen == 1) { - bitset_set_range(cc->bs, 0x0a, 0x0d); + bitset_set_range(cc->bs, NEWLINE_CODE, 0x0d); } else { - r = add_code_range(&(cc->mbuf), env, 0x0a, 0x0d); + r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, 0x0d); if (r != 0) { err1: onig_node_free(ncc); @@ -5485,7 +5545,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (c == MC_ANYCHAR(syn)) goto any_char; else if (c == MC_ANYTIME(syn)) - goto anytime; + goto any_time; else if (c == MC_ZERO_OR_ONE_TIME(syn)) goto zero_or_one_time; else if (c == MC_ONE_OR_MORE_TIME(syn)) @@ -5509,7 +5569,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '*': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; #ifdef USE_VARIABLE_META_CHARS - anytime: + any_time: #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 0; @@ -5665,14 +5725,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '^': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; tok->type = TK_ANCHOR; - tok->u.subtype = (IS_SINGLELINE(env->options) + tok->u.subtype = (OPTON_SINGLELINE(env->options) ? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE); break; case '$': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; tok->type = TK_ANCHOR; - tok->u.subtype = (IS_SINGLELINE(env->options) + tok->u.subtype = (OPTON_SINGLELINE(env->options) ? ANCR_SEMI_END_BUF : ANCR_END_LINE); break; @@ -5687,7 +5747,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case '#': - if (IS_EXTEND(env->options)) { + if (OPTON_EXTEND(env->options)) { while (!PEND) { PFETCH(c); if (ONIGENC_IS_CODE_NEWLINE(enc, c)) @@ -5699,7 +5759,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case ' ': case '\t': case '\n': case '\r': case '\f': - if (IS_EXTEND(env->options)) + if (OPTON_EXTEND(env->options)) goto start; break; @@ -5885,8 +5945,6 @@ add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not, static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { -#define ASCII_LIMIT 127 - int c, r; int ascii_mode; int is_single; @@ -5895,7 +5953,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) OnigCodePoint sb_out; OnigEncoding enc = env->enc; - ascii_mode = IS_ASCII_MODE_CTYPE_OPTION(ctype, env->options); + ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(ctype, env->options); r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); if (r == 0) { @@ -6579,8 +6637,6 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) BITSET_IS_EMPTY(cc->bs, is_empty); if (is_empty == 0) { -#define NEWLINE_CODE 0x0a - if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) BITSET_SET_BIT(cc->bs, NEWLINE_CODE); @@ -7096,10 +7152,10 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; case '=': - *np = onig_node_new_anchor(ANCR_PREC_READ, FALSE); + *np = node_new_anchor(ANCR_PREC_READ); break; case '!': /* preceding read */ - *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, FALSE); + *np = node_new_anchor(ANCR_PREC_READ_NOT); break; case '>': /* (?>...) stop backtrack */ *np = node_new_bag(BAG_STOP_BACKTRACK); @@ -7117,9 +7173,9 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; PFETCH(c); if (c == '=') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, FALSE); + *np = node_new_anchor(ANCR_LOOK_BEHIND); else if (c == '!') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, FALSE); + *np = node_new_anchor(ANCR_LOOK_BEHIND_NOT); else { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; @@ -7132,7 +7188,9 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, named_group1: list_capture = 0; +#ifdef USE_CAPTURE_HISTORY named_group2: +#endif name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, &num_type, FALSE); @@ -7613,7 +7671,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, } #endif else { - if (ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) + if (OPTON_DONT_CAPTURE_GROUP(env->options)) goto group; *np = node_new_memory(0); @@ -7884,7 +7942,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) else { len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { - csnode = onig_node_new_str(buf, buf + len); + csnode = node_new_str(buf, buf + len); if (IS_NULL(csnode)) goto err_free_ns; NODE_STRING_SET_CASE_EXPANDED(csnode); @@ -7923,6 +7981,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, Node** tp; unsigned int parse_depth; + retry: group = 0; *np = NULL; if (tok->type == (enum TokenSyms )term) @@ -7956,19 +8015,28 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, } } else if (r == 2) { /* option only */ - Node* target; - OnigOptionType prev = env->options; - - env->options = BAG_(*np)->o.options; - r = fetch_token(tok, src, end, env); - if (r < 0) return r; - r = parse_alts(&target, tok, term, src, end, env, FALSE); - env->options = prev; - if (r < 0) { - onig_node_free(target); - return r; + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH)) { + env->options = BAG_(*np)->o.options; + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + onig_node_free(*np); + goto retry; + } + else { + Node* target; + OnigOptionType prev = env->options; + + env->options = BAG_(*np)->o.options; + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + r = parse_alts(&target, tok, term, src, end, env, FALSE); + env->options = prev; + if (r < 0) { + onig_node_free(target); + return r; + } + NODE_BODY(*np) = target; } - NODE_BODY(*np) = target; return tok->type; } break; @@ -7984,7 +8052,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_STRING: tk_byte: { - *np = node_new_str(tok->backp, *src); + *np = node_new_str_with_options(tok->backp, *src, env->options); CHECK_NULL_RETURN_MEMERR(*np); while (1) { @@ -8005,7 +8073,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_CRUDE_BYTE: tk_crude_byte: { - *np = node_new_str_crude_char(tok->u.byte); + *np = node_new_str_crude_char(tok->u.byte, env->options); CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { @@ -8042,9 +8110,9 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); if (len < 0) return len; #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - *np = node_new_str_crude(buf, buf + len); + *np = node_new_str_crude(buf, buf + len, env->options); #else - *np = node_new_str(buf, buf + len); + *np = node_new_str_with_options(buf, buf + len, env->options); #endif CHECK_NULL_RETURN_MEMERR(*np); } @@ -8062,7 +8130,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_NULL(qend)) { nextp = qend = end; } - *np = node_new_str(qstart, qend); + *np = node_new_str_with_options(qstart, qend, env->options); CHECK_NULL_RETURN_MEMERR(*np); *src = nextp; } @@ -8110,7 +8178,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r != 0) return r; cc = CCLASS_(*np); - if (IS_IGNORECASE(env->options)) { + if (OPTON_IGNORECASE(env->options)) { IApplyCaseFoldArg iarg; iarg.env = env; @@ -8137,12 +8205,12 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; case TK_ANYCHAR: - *np = node_new_anychar(); + *np = node_new_anychar(env->options); CHECK_NULL_RETURN_MEMERR(*np); break; case TK_ANYCHAR_ANYTIME: - *np = node_new_anychar(); + *np = node_new_anychar(env->options); CHECK_NULL_RETURN_MEMERR(*np); qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); CHECK_NULL_RETURN_MEMERR(qn); @@ -8180,12 +8248,8 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, #endif case TK_ANCHOR: - { - int ascii_mode = - IS_WORD_ASCII(env->options) && IS_WORD_ANCHOR_TYPE(tok->u.anchor) ? 1 : 0; - *np = onig_node_new_anchor(tok->u.anchor, ascii_mode); - CHECK_NULL_RETURN_MEMERR(*np); - } + *np = node_new_anchor_with_options(tok->u.anchor, env->options); + CHECK_NULL_RETURN_MEMERR(*np); break; case TK_REPEAT: @@ -8219,7 +8283,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; case TK_TRUE_ANYCHAR: - r = node_new_true_anychar(np, env); + r = node_new_true_anychar(np); if (r < 0) return r; break; @@ -8365,9 +8429,11 @@ parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, { int r; Node *node, **headp; + OnigOptionType save_options; *top = NULL; INC_PARSE_DEPTH(env->parse_depth); + save_options = env->options; r = parse_branch(&node, tok, term, src, end, env, group_head); if (r < 0) { @@ -8416,6 +8482,7 @@ parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_PARSER_BUG; } + env->options = save_options; DEC_PARSE_DEPTH(env->parse_depth); return r; } diff --git a/oniguruma/src/regparse.h b/oniguruma/src/regparse.h index eabf680f0..964401653 100644 --- a/oniguruma/src/regparse.h +++ b/oniguruma/src/regparse.h @@ -33,7 +33,7 @@ #include "regint.h" #define NODE_STRING_MARGIN 16 -#define NODE_STRING_BUF_SIZE 20 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 6 /* node type */ @@ -68,10 +68,10 @@ enum GimmickType { }; enum BodyEmptyType { - BODY_IS_NOT_EMPTY = 0, - BODY_IS_EMPTY_POSSIBILITY = 1, - BODY_IS_EMPTY_POSSIBILITY_MEM = 2, - BODY_IS_EMPTY_POSSIBILITY_REC = 3 + BODY_IS_NOT_EMPTY = 0, + BODY_MAY_BE_EMPTY = 1, + BODY_MAY_BE_EMPTY_MEM = 2, + BODY_MAY_BE_EMPTY_REC = 3 }; struct _Node; @@ -86,7 +86,6 @@ typedef struct { unsigned int flag; UChar buf[NODE_STRING_BUF_SIZE]; int capacity; /* (allocated size - 1) or 0: use buf[] */ - int case_min_len; } StrNode; typedef struct { @@ -140,7 +139,8 @@ typedef struct { /* for multiple call reference */ OnigLen min_len; /* min length (byte) */ OnigLen max_len; /* max length (byte) */ - int char_len; /* character length */ + OnigLen min_char_len; + OnigLen max_char_len; int opt_count; /* referenced count in optimize_nodes() */ } BagNode; @@ -190,7 +190,7 @@ typedef struct { struct _Node* body; int type; - int char_len; + OnigLen char_len; int ascii_mode; } AnchorNode; @@ -210,7 +210,6 @@ typedef struct { int ctype; int not; - OnigOptionType options; int ascii_mode; } CtypeNode; @@ -288,42 +287,35 @@ typedef struct _Node { #define NODE_IS_ANYCHAR(node) \ (NODE_TYPE(node) == NODE_CTYPE && CTYPE_(node)->ctype == CTYPE_ANYCHAR) -#define CTYPE_OPTION(node, reg) \ - (NODE_IS_FIXED_OPTION(node) ? CTYPE_(node)->options : reg->options) - #define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) #define NODE_STRING_CRUDE (1<<0) #define NODE_STRING_CASE_EXPANDED (1<<1) -#define NODE_STRING_CASE_FOLD_MATCH (1<<2) #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) #define NODE_STRING_SET_CRUDE(node) (node)->u.str.flag |= NODE_STRING_CRUDE #define NODE_STRING_CLEAR_CRUDE(node) (node)->u.str.flag &= ~NODE_STRING_CRUDE #define NODE_STRING_SET_CASE_EXPANDED(node) (node)->u.str.flag |= NODE_STRING_CASE_EXPANDED -#define NODE_STRING_SET_CASE_FOLD_MATCH(node) (node)->u.str.flag |= NODE_STRING_CASE_FOLD_MATCH #define NODE_STRING_IS_CRUDE(node) \ (((node)->u.str.flag & NODE_STRING_CRUDE) != 0) #define NODE_STRING_IS_CASE_EXPANDED(node) \ (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0) -#define NODE_STRING_IS_CASE_FOLD_MATCH(node) \ - (((node)->u.str.flag & NODE_STRING_CASE_FOLD_MATCH) != 0) #define BACKREFS_P(br) \ (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static) /* node status bits */ -#define NODE_ST_MIN_FIXED (1<<0) -#define NODE_ST_MAX_FIXED (1<<1) -#define NODE_ST_CLEN_FIXED (1<<2) +#define NODE_ST_FIXED_MIN (1<<0) +#define NODE_ST_FIXED_MAX (1<<1) +#define NODE_ST_FIXED_CLEN (1<<2) #define NODE_ST_MARK1 (1<<3) #define NODE_ST_MARK2 (1<<4) #define NODE_ST_STRICT_REAL_REPEAT (1<<5) #define NODE_ST_RECURSION (1<<6) #define NODE_ST_CALLED (1<<7) -#define NODE_ST_ADDR_FIXED (1<<8) +#define NODE_ST_FIXED_ADDR (1<<8) #define NODE_ST_NAMED_GROUP (1<<9) #define NODE_ST_IN_REAL_REPEAT (1<<10) /* STK_REPEAT is nested in stack. */ #define NODE_ST_IN_ZERO_REPEAT (1<<11) /* (....){0} */ @@ -333,10 +325,12 @@ typedef struct _Node { #define NODE_ST_BY_NAME (1<<15) /* backref by name */ #define NODE_ST_BACKREF (1<<16) #define NODE_ST_CHECKER (1<<17) -#define NODE_ST_FIXED_OPTION (1<<18) -#define NODE_ST_PROHIBIT_RECURSION (1<<19) -#define NODE_ST_SUPER (1<<20) -#define NODE_ST_EMPTY_STATUS_CHECK (1<<21) +#define NODE_ST_PROHIBIT_RECURSION (1<<18) +#define NODE_ST_SUPER (1<<19) +#define NODE_ST_EMPTY_STATUS_CHECK (1<<20) +#define NODE_ST_IGNORECASE (1<<21) +#define NODE_ST_MULTILINE (1<<22) +#define NODE_ST_TEXT_SEGMENT_WORD (1<<23) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -350,17 +344,16 @@ typedef struct _Node { #define NODE_IS_RECURSION(node) ((NODE_STATUS(node) & NODE_ST_RECURSION) != 0) #define NODE_IS_IN_ZERO_REPEAT(node) ((NODE_STATUS(node) & NODE_ST_IN_ZERO_REPEAT) != 0) #define NODE_IS_NAMED_GROUP(node) ((NODE_STATUS(node) & NODE_ST_NAMED_GROUP) != 0) -#define NODE_IS_ADDR_FIXED(node) ((NODE_STATUS(node) & NODE_ST_ADDR_FIXED) != 0) -#define NODE_IS_CLEN_FIXED(node) ((NODE_STATUS(node) & NODE_ST_CLEN_FIXED) != 0) -#define NODE_IS_MIN_FIXED(node) ((NODE_STATUS(node) & NODE_ST_MIN_FIXED) != 0) -#define NODE_IS_MAX_FIXED(node) ((NODE_STATUS(node) & NODE_ST_MAX_FIXED) != 0) +#define NODE_IS_FIXED_ADDR(node) ((NODE_STATUS(node) & NODE_ST_FIXED_ADDR) != 0) +#define NODE_IS_FIXED_CLEN(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN) != 0) +#define NODE_IS_FIXED_MIN(node) ((NODE_STATUS(node) & NODE_ST_FIXED_MIN) != 0) +#define NODE_IS_FIXED_MAX(node) ((NODE_STATUS(node) & NODE_ST_FIXED_MAX) != 0) #define NODE_IS_MARK1(node) ((NODE_STATUS(node) & NODE_ST_MARK1) != 0) #define NODE_IS_MARK2(node) ((NODE_STATUS(node) & NODE_ST_MARK2) != 0) #define NODE_IS_NEST_LEVEL(node) ((NODE_STATUS(node) & NODE_ST_NEST_LEVEL) != 0) #define NODE_IS_BY_NAME(node) ((NODE_STATUS(node) & NODE_ST_BY_NAME) != 0) #define NODE_IS_BACKREF(node) ((NODE_STATUS(node) & NODE_ST_BACKREF) != 0) #define NODE_IS_CHECKER(node) ((NODE_STATUS(node) & NODE_ST_CHECKER) != 0) -#define NODE_IS_FIXED_OPTION(node) ((NODE_STATUS(node) & NODE_ST_FIXED_OPTION) != 0) #define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0) #define NODE_IS_PROHIBIT_RECURSION(node) \ ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) @@ -368,6 +361,9 @@ typedef struct _Node { ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) #define NODE_IS_EMPTY_STATUS_CHECK(node) \ ((NODE_STATUS(node) & NODE_ST_EMPTY_STATUS_CHECK) != 0) +#define NODE_IS_IGNORECASE(node) ((NODE_STATUS(node) & NODE_ST_IGNORECASE) != 0) +#define NODE_IS_MULTILINE(node) ((NODE_STATUS(node) & NODE_ST_MULTILINE) != 0) +#define NODE_IS_TEXT_SEGMENT_WORD(node) ((NODE_STATUS(node) & NODE_ST_TEXT_SEGMENT_WORD) != 0) #define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) @@ -431,19 +427,19 @@ typedef struct { typedef struct { int new_val; -} GroupNumRemap; +} GroupNumMap; -extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); +extern int onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); extern int onig_reduce_nested_quantifier P_((Node* pnode)); +extern Node* onig_node_copy(Node* from); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); extern void onig_node_free P_((Node* node)); extern Node* onig_node_new_bag P_((enum BagType type)); -extern Node* onig_node_new_anchor P_((int type, int ascii_mode)); extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); extern Node* onig_node_new_alt P_((Node* left, Node* right)); diff --git a/oniguruma/src/regsyntax.c b/oniguruma/src/regsyntax.c index 860ebee29..428e21bfd 100644 --- a/oniguruma/src/regsyntax.c +++ b/oniguruma/src/regsyntax.c @@ -153,7 +153,8 @@ OnigSyntaxType OnigSyntaxJava = { ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 | ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY ) - , ( SYN_GNU_REGEX_BV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) + , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | + ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) , ONIG_OPTION_SINGLELINE , { @@ -186,7 +187,7 @@ OnigSyntaxType OnigSyntaxPerl = { ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE | ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT ) - , SYN_GNU_REGEX_BV + , SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH , ONIG_OPTION_SINGLELINE , { @@ -224,7 +225,7 @@ OnigSyntaxType OnigSyntaxPerl_NG = { ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE | ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT | ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL ) - , ( SYN_GNU_REGEX_BV | + , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME ) , ONIG_OPTION_SINGLELINE diff --git a/oniguruma/src/st.h b/oniguruma/src/st.h index 6c6ec64d3..3b57bc134 100644 --- a/oniguruma/src/st.h +++ b/oniguruma/src/st.h @@ -6,12 +6,17 @@ #define ST_INCLUDED +#ifndef ONIGURUMA_SYS_UEFI + #ifdef _WIN32 # include typedef ULONG_PTR st_data_t; #else typedef unsigned long st_data_t; #endif + +#endif /* ONIGURUMA_SYS_UEFI */ + #define ST_DATA_T_DEFINED typedef struct st_table st_table; diff --git a/oniguruma/src/unicode.c b/oniguruma/src/unicode.c index af05f643e..fdc1ce46f 100644 --- a/oniguruma/src/unicode.c +++ b/oniguruma/src/unicode.c @@ -279,9 +279,12 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { - int n, m, i, j, k, len; - OnigCodePoint code, codes[3]; - const struct ByUnfoldKey* buk; + int n, m, i, j, k, len, lens[3]; + int index; + int fn, ncs[3]; + OnigCodePoint cs[3][4]; + OnigCodePoint code, codes[3], orig_codes[3]; + const struct ByUnfoldKey* buk1; n = 0; @@ -317,38 +320,161 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } #endif - buk = onigenc_unicode_unfold_key(code); - if (buk != 0) { - if (buk->fold_len == 1) { + orig_codes[0] = code; + lens[0] = len; + p += len; + + buk1 = onigenc_unicode_unfold_key(orig_codes[0]); + if (buk1 != 0 && buk1->fold_len == 1) { + codes[0] = *FOLDS1_FOLD(buk1->index); + } + else + codes[0] = orig_codes[0]; + + if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0) + goto fold1; + + if (p < end) { + const struct ByUnfoldKey* buk; + + code = ONIGENC_MBC_TO_CODE(enc, p, end); + orig_codes[1] = code; + len = enclen(enc, p); + lens[1] = lens[0] + len; + buk = onigenc_unicode_unfold_key(orig_codes[1]); + if (buk != 0 && buk->fold_len == 1) { + codes[1] = *FOLDS1_FOLD(buk->index); + } + else + codes[1] = orig_codes[1]; + + p += len; + if (p < end) { + code = ONIGENC_MBC_TO_CODE(enc, p, end); + orig_codes[2] = code; + len = enclen(enc, p); + lens[2] = lens[1] + len; + buk = onigenc_unicode_unfold_key(orig_codes[2]); + if (buk != 0 && buk->fold_len == 1) { + codes[2] = *FOLDS1_FOLD(buk->index); + } + else + codes[2] = orig_codes[2]; + + index = onigenc_unicode_fold3_key(codes); + if (index >= 0) { + m = FOLDS3_UNFOLDS_NUM(index); + for (i = 0; i < m; i++) { + items[n].byte_len = lens[2]; + items[n].code_len = 1; + items[n].code[0] = FOLDS3_UNFOLDS(index)[i]; + n++; + } + + for (fn = 0; fn < 3; fn++) { + int sindex; + cs[fn][0] = FOLDS3_FOLD(index)[fn]; + ncs[fn] = 1; + sindex = onigenc_unicode_fold1_key(&cs[fn][0]); + if (sindex >= 0) { + int m = FOLDS1_UNFOLDS_NUM(sindex); + for (i = 0; i < m; i++) { + cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i]; + } + ncs[fn] += m; + } + } + + for (i = 0; i < ncs[0]; i++) { + for (j = 0; j < ncs[1]; j++) { + for (k = 0; k < ncs[2]; k++) { + items[n].byte_len = lens[2]; + items[n].code_len = 3; + items[n].code[0] = cs[0][i]; + items[n].code[1] = cs[1][j]; + items[n].code[2] = cs[2][k]; + if (items[n].code[0] == orig_codes[0] && + items[n].code[1] == orig_codes[1] && + items[n].code[2] == orig_codes[2]) + continue; + n++; + } + } + } + + return n; + } + } + + index = onigenc_unicode_fold2_key(codes); + if (index >= 0) { + m = FOLDS2_UNFOLDS_NUM(index); + for (i = 0; i < m; i++) { + items[n].byte_len = lens[1]; + items[n].code_len = 1; + items[n].code[0] = FOLDS2_UNFOLDS(index)[i]; + n++; + } + + for (fn = 0; fn < 2; fn++) { + int sindex; + cs[fn][0] = FOLDS2_FOLD(index)[fn]; + ncs[fn] = 1; + sindex = onigenc_unicode_fold1_key(&cs[fn][0]); + if (sindex >= 0) { + int m = FOLDS1_UNFOLDS_NUM(sindex); + for (i = 0; i < m; i++) { + cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i]; + } + ncs[fn] += m; + } + } + + for (i = 0; i < ncs[0]; i++) { + for (j = 0; j < ncs[1]; j++) { + items[n].byte_len = lens[1]; + items[n].code_len = 2; + items[n].code[0] = cs[0][i]; + items[n].code[1] = cs[1][j]; + if (items[n].code[0] == orig_codes[0] && + items[n].code[1] == orig_codes[1]) + continue; + n++; + } + } + + return n; + } + } + + fold1: + if (buk1 != 0) { + if (buk1->fold_len == 1) { int un; - items[0].byte_len = len; + items[0].byte_len = lens[0]; items[0].code_len = 1; - items[0].code[0] = *FOLDS1_FOLD(buk->index); + items[0].code[0] = *FOLDS1_FOLD(buk1->index); n++; - un = FOLDS1_UNFOLDS_NUM(buk->index); + un = FOLDS1_UNFOLDS_NUM(buk1->index); for (i = 0; i < un; i++) { - OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i]; - if (unfold != code) { - items[n].byte_len = len; + OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i]; + if (unfold != orig_codes[0]) { + items[n].byte_len = lens[0]; items[n].code_len = 1; items[n].code[0] = unfold; n++; } } - code = items[0].code[0]; /* for multi-code to unfold search. */ } else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - OnigCodePoint cs[3][4]; - int fn, ncs[3]; - - if (buk->fold_len == 2) { - m = FOLDS2_UNFOLDS_NUM(buk->index); + if (buk1->fold_len == 2) { + m = FOLDS2_UNFOLDS_NUM(buk1->index); for (i = 0; i < m; i++) { - OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i]; - if (unfold == code) continue; + OnigCodePoint unfold = FOLDS2_UNFOLDS(buk1->index)[i]; + if (unfold == orig_codes[0]) continue; - items[n].byte_len = len; + items[n].byte_len = lens[0]; items[n].code_len = 1; items[n].code[0] = unfold; n++; @@ -356,7 +482,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 2; fn++) { int index; - cs[fn][0] = FOLDS2_FOLD(buk->index)[fn]; + cs[fn][0] = FOLDS2_FOLD(buk1->index)[fn]; ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { @@ -370,7 +496,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (i = 0; i < ncs[0]; i++) { for (j = 0; j < ncs[1]; j++) { - items[n].byte_len = len; + items[n].byte_len = lens[0]; items[n].code_len = 2; items[n].code[0] = cs[0][i]; items[n].code[1] = cs[1][j]; @@ -379,12 +505,12 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } else { /* fold_len == 3 */ - m = FOLDS3_UNFOLDS_NUM(buk->index); + m = FOLDS3_UNFOLDS_NUM(buk1->index); for (i = 0; i < m; i++) { - OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i]; - if (unfold == code) continue; + OnigCodePoint unfold = FOLDS3_UNFOLDS(buk1->index)[i]; + if (unfold == orig_codes[0]) continue; - items[n].byte_len = len; + items[n].byte_len = lens[0]; items[n].code_len = 1; items[n].code[0] = unfold; n++; @@ -392,7 +518,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 3; fn++) { int index; - cs[fn][0] = FOLDS3_FOLD(buk->index)[fn]; + cs[fn][0] = FOLDS3_FOLD(buk1->index)[fn]; ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { @@ -407,7 +533,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (i = 0; i < ncs[0]; i++) { for (j = 0; j < ncs[1]; j++) { for (k = 0; k < ncs[2]; k++) { - items[n].byte_len = len; + items[n].byte_len = lens[0]; items[n].code_len = 3; items[n].code[0] = cs[0][i]; items[n].code[1] = cs[1][j]; @@ -417,17 +543,14 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } } - - /* multi char folded code is not head of another folded multi char */ - return n; } } else { - int index = onigenc_unicode_fold1_key(&code); + int index = onigenc_unicode_fold1_key(orig_codes); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { - items[n].byte_len = len; + items[n].byte_len = lens[0]; items[n].code_len = 1; items[n].code[0] = FOLDS1_UNFOLDS(index)[i]; n++; @@ -435,64 +558,6 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } - if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0) - return n; - - p += len; - if (p < end) { - int clen; - int index; - - codes[0] = code; - code = ONIGENC_MBC_TO_CODE(enc, p, end); - - buk = onigenc_unicode_unfold_key(code); - if (buk != 0 && buk->fold_len == 1) { - codes[1] = *FOLDS1_FOLD(buk->index); - } - else - codes[1] = code; - - clen = enclen(enc, p); - len += clen; - - index = onigenc_unicode_fold2_key(codes); - if (index >= 0) { - m = FOLDS2_UNFOLDS_NUM(index); - for (i = 0; i < m; i++) { - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = FOLDS2_UNFOLDS(index)[i]; - n++; - } - } - - p += clen; - if (p < end) { - code = ONIGENC_MBC_TO_CODE(enc, p, end); - buk = onigenc_unicode_unfold_key(code); - if (buk != 0 && buk->fold_len == 1) { - codes[2] = *FOLDS1_FOLD(buk->index); - } - else - codes[2] = code; - - clen = enclen(enc, p); - len += clen; - - index = onigenc_unicode_fold3_key(codes); - if (index >= 0) { - m = FOLDS3_UNFOLDS_NUM(index); - for (i = 0; i < m; i++) { - items[n].byte_len = len; - items[n].code_len = 1; - items[n].code[0] = FOLDS3_UNFOLDS(index)[i]; - n++; - } - } - } - } - return n; } @@ -931,7 +996,7 @@ onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev, #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER if (! ONIGENC_IS_UNICODE_ENCODING(enc)) { - return from != 0x000d || to != 0x000a; + return from != 0x000d || to != NEWLINE_CODE; } btype = unicode_egcb_is_break_2code(from, to); @@ -974,7 +1039,7 @@ onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev, return 1; #else - return from != 0x000d || to != 0x000a; + return from != 0x000d || to != NEWLINE_CODE; #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */ }