From bf5840cd873839d5b73fcd330c7658816519f429 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Mon, 4 Nov 2019 10:18:15 +0100 Subject: [PATCH] + upd: Oniguruma update (utf-8 version) --- oniguruma/HISTORY | 16 +- oniguruma/doc/RE | 7 +- oniguruma/doc/UNICODE_PROPERTIES | 2 +- oniguruma/src/ascii.c | 2 +- oniguruma/src/config.h | 3 + oniguruma/src/gperf_fold_key_conv.py | 4 +- oniguruma/src/gperf_unfold_key_conv.py | 4 +- oniguruma/src/make_unicode_property_data.py | 66 +- oniguruma/src/mktable.c | 1 + oniguruma/src/onig_init.c | 1 + oniguruma/src/oniggnu.h | 1 + oniguruma/src/oniguruma.h | 1 + oniguruma/src/regcomp.c | 1020 ++++++++++--------- oniguruma/src/regenc.c | 1 + oniguruma/src/regenc.h | 1 + oniguruma/src/regerror.c | 1 + oniguruma/src/regexec.c | 475 +++++---- oniguruma/src/regext.c | 1 + oniguruma/src/reggnu.c | 1 + oniguruma/src/regint.h | 106 +- oniguruma/src/regparse.c | 191 ++-- oniguruma/src/regparse.h | 27 +- oniguruma/src/regsyntax.c | 1 + oniguruma/src/regtrav.c | 1 + oniguruma/src/regversion.c | 1 + oniguruma/src/st.c | 1 + oniguruma/src/st.h | 2 +- oniguruma/src/unicode.c | 1 + oniguruma/src/unicode_fold1_key.c | 4 +- oniguruma/src/unicode_fold2_key.c | 4 +- oniguruma/src/unicode_fold3_key.c | 4 +- oniguruma/src/unicode_property_data.c | 3 +- oniguruma/src/unicode_property_data_posix.c | 2 +- oniguruma/src/unicode_unfold_key.c | 4 +- oniguruma/src/utf8.c | 2 +- oniguruma/windows/testc.c | 2 +- 36 files changed, 1032 insertions(+), 932 deletions(-) diff --git a/oniguruma/HISTORY b/oniguruma/HISTORY index 0380cb4f9..1ef5f9325 100644 --- a/oniguruma/HISTORY +++ b/oniguruma/HISTORY @@ -1,8 +1,22 @@ History +2019/MM/DD: Version 6.9.4 + +2019/10/31: Update Unicode Emoji version to 12.1 (Nothing data changed) +2019/10/29: implement USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR configuration +2019/10/18: re-implement case fold conversion +2019/10/04: fix #156: Heap buffer overflow in match_at() with case-insensitive match +2019/09/30: NEW API: add onig_regset_replace() +2019/09/30: change Unicode VERSION value format +2019/09/20: NEW API: add regset functions +2019/09/20: add data ensure check before peek string value in OP_PUSH_IF_PEEK_NEXT +2019/09/20: fix loose code in encode-harness.c +2019/08/13: fix heap-buffer-overflow +2019/08/13: Add a macro to disable direct threading in the match engine (PR#149) + 2019/08/06: Version 6.9.3 (secirity fix release) -2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE +2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC 2019/07/29: add STK_PREC_READ_START/END stack type 2019/07/29: Fix #147: Stack Exhaustion Problem caused by some parsing functions 2019/07/11: add a dictionary file for libfuzzer diff --git a/oniguruma/doc/RE b/oniguruma/doc/RE index d64292650..599d2a6a0 100644 --- a/oniguruma/doc/RE +++ b/oniguruma/doc/RE @@ -1,4 +1,4 @@ -Oniguruma Regular Expressions Version 6.9.2 2019/08/08 +Oniguruma Regular Expressions Version 6.9.4 2019/10/31 syntax: ONIG_SYNTAX_ONIGURUMA (default) @@ -289,6 +289,11 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) In negative look-behind, capturing group isn't allowed, but non-capturing group (?:) is allowed. + * In look-behind and negative look-behind, support for + ignore-case option is limited. Only supports conversion + between single characters. (Does not support conversion + of multiple characters in Unicode) + (?>subexp) atomic group no backtracks in subexp. diff --git a/oniguruma/doc/UNICODE_PROPERTIES b/oniguruma/doc/UNICODE_PROPERTIES index ff2a6cec0..24c203156 100644 --- a/oniguruma/doc/UNICODE_PROPERTIES +++ b/oniguruma/doc/UNICODE_PROPERTIES @@ -1,4 +1,4 @@ -Unicode Properties (from Unicode Version: 12.1.0) +Unicode Properties (Unicode Version: 12.1.0, Emoji: 12.1) 15: ASCII_Hex_Digit 16: Adlam diff --git a/oniguruma/src/ascii.c b/oniguruma/src/ascii.c index cad63d281..4bd353d72 100644 --- a/oniguruma/src/ascii.c +++ b/oniguruma/src/ascii.c @@ -1,6 +1,6 @@ -// encoding: UTF8 /********************************************************************** ascii.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/config.h b/oniguruma/src/config.h index c3b911e6f..5dec5b12d 100644 --- a/oniguruma/src/config.h +++ b/oniguruma/src/config.h @@ -1,4 +1,6 @@ #pragma once +/* encoding: UTF8 */ + #ifndef _ONIGURUMA_CONFIG_H_ #define _ONIGURUMA_CONFIG_H_ @@ -37,6 +39,7 @@ #endif #define HAVE_DECL_SYS_NERR 1 #define STDC_HEADERS 1 +#define HAVE_STDINT_H 1 #define HAVE_STDLIB_H 1 #define HAVE_STRING_H 1 #define HAVE_LIMITS_H 1 diff --git a/oniguruma/src/gperf_fold_key_conv.py b/oniguruma/src/gperf_fold_key_conv.py index f45318677..c633100df 100644 --- a/oniguruma/src/gperf_fold_key_conv.py +++ b/oniguruma/src/gperf_fold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_RETURN_TYPE = re.compile('^const\s+short\s+int\s*\*') REG_FOLD_KEY = re.compile('unicode_fold(\d)_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".*?",\s*(-?\d+)\s*\}') -REG_IF_LEN = re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -34,7 +34,7 @@ def parse_line(s, key_len): if r != s: return r r = re.sub(REG_ENTRY, '\\1', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(codes);', s) if r != s: return r diff --git a/oniguruma/src/gperf_unfold_key_conv.py b/oniguruma/src/gperf_unfold_key_conv.py index 3cf4836ff..d999d4e97 100644 --- a/oniguruma/src/gperf_unfold_key_conv.py +++ b/oniguruma/src/gperf_unfold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_UNFOLD_KEY = re.compile('onigenc_unicode_unfold_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".+?",\s*/\*(.+?)\*/\s*(-?\d+),\s*(\d)\}') REG_EMPTY_ENTRY = re.compile('\{"",\s*(-?\d+),\s*(\d)\}') -REG_IF_LEN = re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -32,7 +32,7 @@ def parse_line(s): if r != s: return r r = re.sub(REG_EMPTY_ENTRY, '{0xffffffff, \\1, \\2}', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(&code);', s) if r != s: return r diff --git a/oniguruma/src/make_unicode_property_data.py b/oniguruma/src/make_unicode_property_data.py index f167a9733..9776628f5 100644 --- a/oniguruma/src/make_unicode_property_data.py +++ b/oniguruma/src/make_unicode_property_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_property_data.py -# Copyright (c) 2016-2018 K.Kosako +# Copyright (c) 2016-2019 K.Kosako import sys import re @@ -22,9 +22,12 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") +UNICODE_VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") +EMOJI_VERSION_REG = re.compile("(?i)#\s*Version:\s*(\d+)\.(\d+)") VERSION_INFO = [-1, -1, -1] +EMOJI_VERSION_INFO = [-1, -1] + DIC = { } KDIC = { } PropIndex = { } @@ -40,14 +43,6 @@ def fix_block_name(name): s = re.sub(r'[- ]+', '_', name) return 'In_' + s -def check_version_info(s): - m = VERSION_REG.match(s) - if m is not None: - VERSION_INFO[0] = int(m.group(1)) - VERSION_INFO[1] = int(m.group(2)) - VERSION_INFO[2] = int(m.group(3)) - - def print_ranges(ranges): for (start, end) in ranges: print "0x%06x, 0x%06x" % (start, end) @@ -234,7 +229,8 @@ def parse_unicode_data_file(f): normalize_ranges_in_dic(dic) return dic, assigned -def parse_properties(path, klass, prop_prefix = None): +def parse_properties(path, klass, prop_prefix = None, version_reg = None): + version_match = None with open(path, 'r') as f: dic = { } prop = None @@ -244,9 +240,10 @@ def parse_properties(path, klass, prop_prefix = None): if len(s) == 0: continue - if s[0] == '#': - if VERSION_INFO[0] < 0: - check_version_info(s) + if s[0] == '#' and version_reg is not None and version_match is None: + version_match = version_reg.match(s) + if version_match is not None: + continue m = PR_LINE_REG.match(s) if m: @@ -267,7 +264,7 @@ def parse_properties(path, klass, prop_prefix = None): props.append(prop) normalize_ranges_in_dic(dic) - return (dic, props) + return (dic, props, version_match) def parse_property_aliases(path): a = { } @@ -415,11 +412,11 @@ def entry_and_print_prop_and_index(name, index): nname = normalize_prop_name(name) print_prop_and_index(nname, index) -def parse_and_merge_properties(path, klass): - dic, props = parse_properties(path, klass) +def parse_and_merge_properties(path, klass, prop_prefix = None, version_reg = None): + dic, props, ver_m = parse_properties(path, klass, prop_prefix, version_reg) merge_dic(DIC, dic) merge_props(PROPS, props) - return dic, props + return dic, props, ver_m ### main ### argv = sys.argv @@ -448,11 +445,21 @@ with open('UnicodeData.txt', 'r') as f: PROPS = DIC.keys() PROPS = list_sub(PROPS, POSIX_LIST) -parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property') -dic, props = parse_and_merge_properties('Scripts.txt', 'Script') +_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG) +if ver_m is not None: + VERSION_INFO[0] = int(ver_m.group(1)) + VERSION_INFO[1] = int(ver_m.group(2)) + VERSION_INFO[2] = int(ver_m.group(3)) + +dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script') DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic)) + parse_and_merge_properties('PropList.txt', 'Binary Property') -parse_and_merge_properties('emoji-data.txt', 'Emoji Property') + +_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG) +if ver_m is not None: + EMOJI_VERSION_INFO[0] = int(ver_m.group(1)) + EMOJI_VERSION_INFO[1] = int(ver_m.group(2)) PROPS.append('Unknown') KDIC['Unknown'] = 'Script' @@ -465,9 +472,9 @@ dic, BLOCKS = parse_blocks('Blocks.txt') merge_dic(DIC, dic) if INCLUDE_GRAPHEME_CLUSTER_DATA: - dic, props = parse_properties('GraphemeBreakProperty.txt', - 'GraphemeBreak Property', - GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) + dic, props, _ = parse_properties('GraphemeBreakProperty.txt', + 'GraphemeBreak Property', + GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) merge_dic(DIC, dic) merge_props(PROPS, props) #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other' @@ -535,9 +542,11 @@ sys.stdout.write(s) if OUTPUT_LIST_MODE: UPF = open("UNICODE_PROPERTIES", "w") if VERSION_INFO[0] < 0: - raise RuntimeError("Version is not found") + raise RuntimeError("Unicode Version is not found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") - print >> UPF, "Unicode Properties (from Unicode Version: %d.%d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print >> UPF, "Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) print >> UPF, '' index = -1 @@ -573,9 +582,12 @@ print '%%' print '' if not(POSIX_ONLY): if VERSION_INFO[0] < 0: - raise RuntimeError("Version is not found") + raise RuntimeError("Unicode Version is not found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") print "#define UNICODE_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print "#define UNICODE_EMOJI_VERSION %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) print '' print "#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10) diff --git a/oniguruma/src/mktable.c b/oniguruma/src/mktable.c index 318bac0d9..016f25c6e 100644 --- a/oniguruma/src/mktable.c +++ b/oniguruma/src/mktable.c @@ -1,5 +1,6 @@ /********************************************************************** mktable.c + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/onig_init.c b/oniguruma/src/onig_init.c index c660e7df7..3cb01d52f 100644 --- a/oniguruma/src/onig_init.c +++ b/oniguruma/src/onig_init.c @@ -1,5 +1,6 @@ /********************************************************************** onig_init.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2016-2019 K.Kosako diff --git a/oniguruma/src/oniggnu.h b/oniguruma/src/oniggnu.h index 96d90855a..479e6d5f5 100644 --- a/oniguruma/src/oniggnu.h +++ b/oniguruma/src/oniggnu.h @@ -2,6 +2,7 @@ #define ONIGGNU_H /********************************************************************** oniggnu.h - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/oniguruma.h b/oniguruma/src/oniguruma.h index 00f0dbb53..9cc317a00 100644 --- a/oniguruma/src/oniguruma.h +++ b/oniguruma/src/oniguruma.h @@ -2,6 +2,7 @@ #define ONIGURUMA_H /********************************************************************** oniguruma.h - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/regcomp.c b/oniguruma/src/regcomp.c index fb78899ca..65d0d9618 100644 --- a/oniguruma/src/regcomp.c +++ b/oniguruma/src/regcomp.c @@ -1,5 +1,6 @@ /********************************************************************** regcomp.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako @@ -452,6 +453,20 @@ swap_node(Node* a, Node* b) } } +static int +node_list_len(Node* list) +{ + int len; + + len = 1; + while (IS_NOT_NULL(NODE_CDR(list))) { + list = NODE_CDR(list); + len++; + } + + return len; +} + static Node* node_list_add(Node* list, Node* x) { @@ -470,6 +485,49 @@ node_list_add(Node* list, Node* x) return n; } +static int +node_str_node_cat(Node* node, Node* add) +{ + int r; + + if (STR_(node)->flag != STR_(add)->flag) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); + if (r != 0) return r; + + if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + STR_(node)->case_min_len += STR_(add)->case_min_len; + + return 0; +} + +static int +node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len) +{ + int r; + + if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, s, end); + if (r != 0) return r; + + STR_(node)->case_min_len += case_min_len; + return 0; +} + +static void +node_conv_to_str_node(Node* node, int flag) +{ + NODE_SET_TYPE(node, NODE_STRING); + STR_(node)->flag = flag; + STR_(node)->s = STR_(node)->buf; + STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; +} + static OnigLen distance_add(OnigLen d1, OnigLen d2) { @@ -572,47 +630,40 @@ static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) static int -select_str_opcode(int mb_len, int str_len, int ignore_case) +select_str_opcode(int mb_len, int str_len) { int op; - if (ignore_case) { + switch (mb_len) { + case 1: switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; + case 1: op = OP_EXACT1; break; + case 2: op = OP_EXACT2; break; + case 3: op = OP_EXACT3; break; + case 4: op = OP_EXACT4; break; + case 5: op = OP_EXACT5; break; + default: op = OP_EXACTN; break; } - } - else { - switch (mb_len) { - case 1: - switch (str_len) { - case 1: op = OP_EXACT1; break; - case 2: op = OP_EXACT2; break; - case 3: op = OP_EXACT3; break; - case 4: op = OP_EXACT4; break; - case 5: op = OP_EXACT5; break; - default: op = OP_EXACTN; break; - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; - - case 3: - op = OP_EXACTMB3N; - break; - - default: - op = OP_EXACTMBN; - break; + case 2: + switch (str_len) { + case 1: op = OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; } + break; + + case 3: + op = OP_EXACTMB3N; + break; + + default: + op = OP_EXACTMBN; + break; } + return op; } @@ -639,22 +690,22 @@ is_strict_real_node(Node* node) } static int -compile_tree_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) { int r; - int saved_num_null_check; + int saved_num_empty_check; int emptiness; Node* body; body = NODE_BODY((Node* )qn); emptiness = qn->emptiness; - saved_num_null_check = reg->num_null_check; + saved_num_empty_check = reg->num_empty_check; if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; - COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ - reg->num_null_check++; + COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */ + reg->num_empty_check++; } r = compile_tree(body, reg, env); @@ -669,11 +720,13 @@ compile_tree_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) else r = add_op(reg, OP_EMPTY_CHECK_END); } +#ifdef USE_CALL else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); +#endif if (r != 0) return r; - COP(reg)->empty_check_end.mem = saved_num_null_check; /* NULL CHECK ID */ + COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */ } return r; } @@ -710,14 +763,13 @@ compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) static int add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len, - regex_t* reg ARG_UNUSED, int ignore_case) + regex_t* reg ARG_UNUSED) { return 1; } static int -add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) +add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg) { int op; int r; @@ -725,7 +777,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, UChar* p; UChar* end; - op = select_str_opcode(mb_len, str_len, ignore_case); + op = select_str_opcode(mb_len, str_len); r = add_op(reg, op); if (r != 0) return r; @@ -762,7 +814,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig; + int rlen, r, len, prev_len, slen; UChar *p, *prev; StrNode* sn; OnigEncoding enc = reg->enc; @@ -771,10 +823,7 @@ compile_length_string_node(Node* node, regex_t* reg) if (sn->end <= sn->s) return 0; - ambig = NODE_STRING_IS_CASE_FOLD_MATCH(node); - if (ambig != 0) { - return 1; - } + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1; p = prev = sn->s; prev_len = enclen(enc, p); @@ -788,7 +837,7 @@ compile_length_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; prev = p; slen = 1; @@ -797,19 +846,19 @@ compile_length_string_node(Node* node, regex_t* reg) p += len; } - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; return rlen; } static int -compile_length_string_raw_node(StrNode* sn, regex_t* reg) +compile_length_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s), - reg, 0); + reg); } static int @@ -824,7 +873,7 @@ compile_ambig_string_node(Node* node, regex_t* reg) sn = STR_(node); len = enclen(enc, sn->s); - byte_len = (int)(sn->end - sn->s); + byte_len = (int )(sn->end - sn->s); if (len == byte_len) { r = add_op(reg, OP_EXACT1_IC); if (r != 0) return r; @@ -849,7 +898,7 @@ compile_ambig_string_node(Node* node, regex_t* reg) static int compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig; + int r, len, prev_len, slen; UChar *p, *prev, *end; StrNode* sn; OnigEncoding enc = reg->enc; @@ -859,8 +908,7 @@ compile_string_node(Node* node, regex_t* reg) return 0; end = sn->end; - ambig = NODE_STRING_IS_CASE_FOLD_MATCH(node); - if (ambig != 0) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) { return compile_ambig_string_node(node, reg); } @@ -875,7 +923,7 @@ compile_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string(prev, prev_len, slen, reg, ambig); + r = add_compile_string(prev, prev_len, slen, reg); if (r != 0) return r; prev = p; @@ -886,16 +934,16 @@ compile_string_node(Node* node, regex_t* reg) p += len; } - return add_compile_string(prev, prev_len, slen, reg, ambig); + return add_compile_string(prev, prev_len, slen, reg); } static int -compile_string_raw_node(StrNode* sn, regex_t* reg) +compile_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; - return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg, 0); + return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg); } static void* @@ -959,15 +1007,27 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) return 0; } +static void +set_addr_in_repeat_range(regex_t* reg) +{ + int i; + + for (i = 0; i < reg->num_repeat; i++) { + RepeatRange* p = reg->repeat_range + i; + int offset = p->u.offset; + p->u.pcode = reg->ops + offset; + } +} + static int -entry_repeat_range(regex_t* reg, int id, int lower, int upper) +entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index) { #define REPEAT_RANGE_ALLOC 4 - OnigRepeatRange* p; + RepeatRange* p; if (reg->repeat_range_alloc == 0) { - p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC); + p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; @@ -975,7 +1035,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) else if (reg->repeat_range_alloc <= id) { int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; - p = (OnigRepeatRange* )xrealloc(reg->repeat_range, sizeof(OnigRepeatRange) * n); + p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -984,8 +1044,9 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) p = reg->repeat_range; } - p[id].lower = lower; - p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); + p[id].lower = lower; + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); + p[id].u.offset = ops_index; return 0; } @@ -1002,22 +1063,14 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, COP(reg)->repeat.id = num_repeat; COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC; - r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); + r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper, + COP_CURR_OFFSET(reg) + OPSIZE_REPEAT); if (r != 0) return r; - r = compile_tree_empty_check(qn, reg, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - if ( -#ifdef USE_CALL - NODE_IS_IN_MULTI_ENTRY(qn) || -#endif - NODE_IS_IN_REAL_REPEAT(qn)) { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); - } - else { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); - } + r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); if (r != 0) return r; COP(reg)->repeat_inc.id = num_repeat; @@ -1184,7 +1237,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(qn, reg, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1); @@ -1197,7 +1250,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(qn, reg, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT); @@ -1207,7 +1260,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; - r = compile_tree_empty_check(qn, reg, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; addr = -(mod_tlen + (int )OPSIZE_PUSH); @@ -1222,7 +1275,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->jump.addr = mod_tlen + SIZE_INC; - r = compile_tree_empty_check(qn, reg, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1420,10 +1473,11 @@ static int compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) { int r; - int len; #ifdef USE_CALL if (NODE_IS_CALLED(node)) { + int len; + r = add_op(reg, OP_CALL); if (r != 0) return r; @@ -1893,8 +1947,8 @@ compile_length_tree(Node* node, regex_t* reg) break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_length_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_length_string_crude_node(STR_(node), reg); else r = compile_length_string_node(node, reg); break; @@ -1986,8 +2040,8 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_string_crude_node(STR_(node), reg); else r = compile_string_node(node, reg); break; @@ -2810,7 +2864,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; if (exact == 0 || - ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_RAW(node)) { + ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) { n = node; } } @@ -3807,6 +3861,96 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) #endif +static void +remove_from_list(Node* prev, Node* a) +{ + if (NODE_CDR(prev) != a) return ; + + NODE_CDR(prev) = NODE_CDR(a); + NODE_CDR(a) = NULL_NODE; +} + +static int +reduce_string_list(Node* node) +{ + int r = 0; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + Node* prev; + Node* curr; + Node* prev_node; + Node* next_node; + + prev = NULL_NODE; + do { + next_node = NODE_CDR(node); + curr = NODE_CAR(node); + if (NODE_TYPE(curr) == NODE_STRING) { + if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) { + prev = curr; + prev_node = node; + } + else { + r = node_str_node_cat(prev, curr); + if (r != 0) return r; + remove_from_list(prev_node, node); + onig_node_free(node); + } + } + else { + prev = NULL_NODE; + prev_node = node; + } + + node = next_node; + } while (r == 0 && IS_NOT_NULL(node)); + } + break; + + case NODE_ALT: + do { + r = reduce_string_list(NODE_CAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (IS_NULL(NODE_BODY(node))) + break; + /* fall */ + case NODE_QUANT: + r = reduce_string_list(NODE_BODY(node)); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + r = reduce_string_list(NODE_BODY(node)); + if (r != 0) return r; + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = reduce_string_list(en->te.Then); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = reduce_string_list(en->te.Else); + if (r != 0) return r; + } + } + } + break; + + default: + break; + } + + return r; +} + + #define IN_ALT (1<<0) #define IN_NOT (1<<1) #define IN_REAL_REPEAT (1<<2) @@ -3919,23 +4063,57 @@ next_setup(Node* node, Node* next_node, regex_t* reg) static int -update_string_node_case_fold(regex_t* reg, Node *node) +is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[]) { - UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - UChar *sbuf, *ebuf, *sp; - int r, i, len, sbuf_size; - StrNode* sn = STR_(node); + int i; - end = sn->end; - sbuf_size = (int )(end - sn->s) * 2; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->code_len != 1) return 0; + } + + return 1; +} + +static int +get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax) +{ + int i, len, minlen, maxlen; + + minlen = INT_MAX; + maxlen = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + + len = item->byte_len; + if (len < minlen) minlen = len; + if (len > maxlen) maxlen = len; + } + + *rmin = minlen; + *rmax = maxlen; + return 0; +} + +static int +conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag, + UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len) +{ + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + UChar *sbuf, *ebuf, *sp; + int i, n, len, sbuf_size; + + *rs = NULL; + sbuf_size = (int )(end - s) * 2; sbuf = (UChar* )xmalloc(sbuf_size); CHECK_NULL_RETURN_MEMERR(sbuf); ebuf = sbuf + sbuf_size; + n = 0; sp = sbuf; - p = sn->s; + p = s; while (p < end) { - len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); + len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf); for (i = 0; i < len; i++) { if (sp >= ebuf) { sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); @@ -3947,355 +4125,302 @@ update_string_node_case_fold(regex_t* reg, Node *node) *sp++ = buf[i]; } + n++; } - r = onig_node_str_set(node, sbuf, sp); - if (r != 0) { - xfree(sbuf); - return r; - } - - xfree(sbuf); + *rs = sbuf; + *rend = sp; + *rcase_min_len = n; return 0; } static int -case_fold_remaining_string(Node** rnode, UChar *s, UChar *end, regex_t* reg) +make_code_list_to_string(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) { - int r; - Node *node; - - node = onig_node_new_str(s, end); - if (IS_NULL(node)) return ONIGERR_MEMORY; - - r = update_string_node_case_fold(reg, node); - if (r != 0) { - onig_node_free(node); - return r; - } - - NODE_STRING_SET_CASE_EXPANDED(node); - NODE_STRING_SET_CASE_FOLD_MATCH(node); - NODE_STRING_SET_DONT_GET_OPT_INFO(node); - *rnode = node; - return 0; -} - -static int -string_case_expand_to_alts(int item_num, OnigCaseFoldCodeItem items[], UChar* p, - UChar* end, int plen, regex_t* reg, Node **rnode) -{ - int r, i, j; - int len; - int varlen; - Node *anode, *var_anode, *snode, *xnode, *an, *rem_node; + int r, i, len; + Node* node; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - *rnode = var_anode = NULL_NODE; - - varlen = 0; - for (i = 0; i < item_num; i++) { - if (items[i].byte_len != plen) { - varlen = 1; - break; - } - } - - if (varlen != 0) { - *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(var_anode)) return ONIGERR_MEMORY; - - xnode = onig_node_new_list(NULL, NULL); - if (IS_NULL(xnode)) goto mem_err; - NODE_CAR(var_anode) = xnode; - - anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) goto mem_err; - NODE_CAR(xnode) = anode; - } - else { - *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) return ONIGERR_MEMORY; - } - - snode = onig_node_new_str(p, p + plen); - if (IS_NULL(snode)) goto mem_err; - - NODE_CAR(anode) = snode; - - for (i = 0; i < item_num; i++) { - snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; - - for (j = 0; j < items[i].code_len; j++) { - len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf); - if (len < 0) { - r = len; - goto mem_err2; - } - - r = onig_node_str_cat(snode, buf, buf + len); - if (r != 0) goto mem_err2; - } - - an = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(an)) { - goto mem_err2; - } - - if (items[i].byte_len != plen) { - UChar *q = p + items[i].byte_len; - - if (q < end) { - r = case_fold_remaining_string(&rem_node, q, end, reg); - if (r != 0) { - onig_node_free(an); - goto mem_err2; - } - - xnode = node_list_add(NULL_NODE, snode); - if (IS_NULL(xnode)) goto mem_err3; - if (IS_NULL(node_list_add(xnode, rem_node))) { - onig_node_free(xnode); - snode = NULL_NODE; - goto mem_err3; - } - - NODE_CAR(an) = xnode; - } - else { - NODE_CAR(an) = snode; - } - - NODE_CDR(var_anode) = an; - var_anode = an; - } - else { - NODE_CAR(an) = snode; - NODE_CDR(anode) = an; - anode = an; - } - } - - return varlen; - - mem_err3: - onig_node_free(an); - onig_node_free(rem_node); - - mem_err2: - onig_node_free(snode); - - mem_err: - onig_node_free(*rnode); - return ONIGERR_MEMORY; -} - -static int -is_good_case_fold_items_for_search(OnigEncoding enc, int slen, - int n, OnigCaseFoldCodeItem items[]) -{ - int i, len; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + *rnode = NULL_NODE; + node = onig_node_new_str(NULL, NULL); + CHECK_NULL_RETURN_MEMERR(node); for (i = 0; i < n; i++) { - OnigCaseFoldCodeItem* item = items + i; + len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); + if (len < 0) { + r = len; + goto err; + } - if (item->code_len != 1) return 0; - if (item->byte_len != slen) return 0; - len = ONIGENC_CODE_TO_MBC(enc, item->code[0], buf); - if (len != slen) return 0; + r = onig_node_str_cat(node, buf, buf + len); + if (r != 0) goto err; } - return 1; + *rnode = node; + return 0; + + err: + onig_node_free(node); + return r; } -#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 +static int +unravel_cf_node_add(Node** rlist, Node* add) +{ + Node *list; + + list = *rlist; + if (IS_NULL(list)) { + list = onig_node_new_list(add, NULL); + CHECK_NULL_RETURN_MEMERR(list); + *rlist = list; + } + else { + Node* r = node_list_add(list, add); + CHECK_NULL_RETURN_MEMERR(r); + } + + return 0; +} static int -expand_case_fold_string(Node* node, regex_t* reg, int state) +unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, + unsigned int flag, int case_min_len) { - int r, n, len, alt_num; - int fold_len; - int prev_fold, prev_is_good, is_good, is_in_look_behind; - UChar *start, *end, *p; - UChar* foldp; - Node *top, *root, *snode, *prev_node; + int r; + Node *sn, *list; + + list = *rlist; + sn = *rsn; + + if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(sn)) + r = node_str_cat_case_fold(sn, s, end, case_min_len); + else + r = onig_node_str_cat(sn, s, end); + } + else { + sn = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(sn); + + STR_(sn)->flag = flag; + STR_(sn)->case_min_len = case_min_len; + r = unravel_cf_node_add(&list, sn); + } + + if (r == 0) { + *rlist = list; + *rsn = sn; + } + return r; +} + +static int +unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r; + int case_min_len; + UChar *rs, *rend; + + r = conv_string_case_fold(enc, case_fold_flag, s, end, + &rs, &rend, &case_min_len); + if (r != 0) return r; + + r = unravel_cf_string_add(rlist, rsn, rs, rend, + NODE_STRING_CASE_FOLD_MATCH, case_min_len); + xfree(rs); + + return r; +} + +static int +unravel_cf_string_alt_or_cc_add(Node** rlist, int n, + OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r, i; + Node* node; + + if (is_all_code_len_1_items(n, items)) { + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ + + codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + codes[i+1] = item->code[0]; + } + r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes); + if (r != 0) return r; + } + else { + Node *snode, *alt, *curr; + + snode = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(snode); + node = curr = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(curr)) { + onig_node_free(snode); + return ONIGERR_MEMORY; + } + + r = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + r = make_code_list_to_string(&snode, enc, item->code_len, item->code); + if (r != 0) { + onig_node_free(node); + return r; + } + + alt = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(alt)) { + onig_node_free(snode); + onig_node_free(node); + return ONIGERR_MEMORY; + } + + NODE_CDR(curr) = alt; + curr = alt; + } + } + + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + return r; +} + +static int +unravel_cf_look_behind_add(Node** rlist, Node** rsn, + int n, OnigCaseFoldCodeItem items[], OnigEncoding enc, + UChar* s, int one_len) +{ + int r, i, found; + + found = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + found = 1; + } + } + } + + if (found == 0) { + r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0); + } + else { + Node* node; + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ + + found = 0; + codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + codes[found++] = item->code[0]; + } + } + } + r = onig_new_cclass_with_code_list(&node, enc, found, codes); + if (r != 0) return r; + + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + + *rsn = NULL_NODE; + } + + return r; +} + +static int +unravel_case_fold_string(Node* node, regex_t* reg, int state) +{ + int r, n, one_len, min_len, max_len, in_look_behind; + UChar *start, *end, *p, *q; + StrNode* snode; + Node *sn, *list; + OnigEncoding enc; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - StrNode* sn; if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0; - sn = STR_(node); + snode = STR_(node); - start = sn->s; - end = sn->end; + start = snode->s; + end = snode->end; if (start >= end) return 0; - is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + in_look_behind = (state & IN_LOOK_BEHIND) != 0; + enc = reg->enc; - r = 0; - top = root = prev_node = snode = NULL_NODE; - alt_num = 1; + list = sn = NULL_NODE; p = start; while (p < end) { - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag, - p, end, items); + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, + items); if (n < 0) { r = n; goto err; } - len = enclen(reg->enc, p); - is_good = is_good_case_fold_items_for_search(reg->enc, len, n, items); - - if (is_in_look_behind || IS_NOT_NULL(snode) || - /* expand single char: ex. /(?i:a)/ */ - (is_good && !(p == start && p + len >= end))) { - if (IS_NULL(snode)) { - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top = root = node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; - if (IS_NOT_NULL(root)) { - if (IS_NULL(node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - - prev_fold = -1; /* -1: new */ - prev_is_good = 0; /* escape compiler warning */ - } - else { - prev_fold = NODE_STRING_IS_CASE_FOLD_MATCH(snode); - prev_is_good = NODE_STRING_IS_GOOD_AMBIG(snode); - } - - if (n != 0) { - foldp = p; - fold_len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, - &foldp, end, buf); - foldp = buf; - } - else { - foldp = p; fold_len = len; - } - - if ((prev_fold == 0 && n != 0) || - (prev_fold > 0 && (n == 0 || prev_is_good != is_good))) { - if (IS_NULL(root) /* && IS_NOT_NULL(prev_node) */) { - top = root = node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(foldp, foldp + fold_len); - if (IS_NULL(snode)) goto mem_err; - if (IS_NULL(node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - else { - r = onig_node_str_cat(snode, foldp, foldp + fold_len); + one_len = enclen(enc, p); + if (n == 0) { + q = p + one_len; + r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0); + if (r != 0) goto err; + } + else { + if (in_look_behind != 0) { + q = p + one_len; + r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len); if (r != 0) goto err; } - - if (n != 0) { - NODE_STRING_SET_CASE_EXPANDED(snode); - NODE_STRING_SET_CASE_FOLD_MATCH(snode); - } - if (is_good != 0) NODE_STRING_SET_GOOD_AMBIG(snode); - } - else { - alt_num *= (n + 1); - if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; - - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top = root = node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - r = string_case_expand_to_alts(n, items, p, end, len, reg, &prev_node); - if (r < 0) goto mem_err; - if (r == 1) { - if (IS_NULL(root)) { - top = prev_node; + else { + get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len); + q = p + max_len; + if (one_len == max_len && min_len == max_len) { + r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, + reg->case_fold_flag, p, q); + if (r != 0) goto err; + sn = NULL_NODE; } else { - if (IS_NULL(node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } - } - - root = NODE_CAR(prev_node); - } - else { /* r == 0 */ - if (IS_NOT_NULL(root)) { - if (IS_NULL(node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } + r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag, + p, q); + if (r != 0) goto err; } } } - p += len; + p = q; } - if (p < end) { - Node* rem_node; - - r = case_fold_remaining_string(&rem_node, p, end, reg); - if (r != 0) goto mem_err; - - if (IS_NOT_NULL(prev_node) && IS_NULL(root)) { - top = root = node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(rem_node); - onig_node_free(prev_node); - goto mem_err; - } - } - - if (IS_NULL(root)) { - prev_node = rem_node; + if (IS_NOT_NULL(list)) { + if (node_list_len(list) == 1) { + swap_node(node, NODE_CAR(list)); } else { - if (IS_NULL(node_list_add(root, rem_node))) { - onig_node_free(rem_node); - goto mem_err; - } + swap_node(node, list); } + onig_node_free(list); + } + else { + swap_node(node, sn); + onig_node_free(sn); } - - /* ending */ - if (IS_NULL(top)) - top = prev_node; - - swap_node(node, top); - onig_node_free(top); return 0; - mem_err: - r = ONIGERR_MEMORY; - err: - onig_node_free(top); + if (IS_NOT_NULL(list)) + onig_node_free(list); + else if (IS_NOT_NULL(sn)) + onig_node_free(sn); + return r; } @@ -4926,13 +5051,12 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); - StrNode* sn = STR_(body); if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { int i, n = qn->lower; - onig_node_conv_to_str_node(node, STR_(body)->flag); + node_conv_to_str_node(node, STR_(body)->flag); for (i = 0; i < n; i++) { - r = onig_node_str_cat(node, sn->s, sn->end); + r = node_str_node_cat(node, body); if (r != 0) return r; } onig_node_free(body); @@ -4991,8 +5115,8 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case NODE_STRING: - if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg, state); + if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) { + r = unravel_case_fold_string(node, reg, state); } break; @@ -5190,7 +5314,6 @@ typedef struct { OptAnc anc; int reach_end; int case_fold; - int good_case_fold; int len; UChar s[OPT_EXACT_MAXLEN]; } OptStr; @@ -5417,7 +5540,6 @@ clear_opt_exact(OptStr* e) clear_opt_anc_info(&e->anc); e->reach_end = 0; e->case_fold = 0; - e->good_case_fold = 0; e->len = 0; e->s[0] = '\0'; } @@ -5441,11 +5563,6 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) to->case_fold = 1; } - else { - if (to->good_case_fold != 0) { - if (add->good_case_fold == 0) return 0; - } - } } r = 0; @@ -5522,8 +5639,6 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) to->len = i; if (add->case_fold != 0) to->case_fold = 1; - if (add->good_case_fold == 0) - to->good_case_fold = 0; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! to->reach_end) to->anc.right = 0; @@ -5556,9 +5671,6 @@ select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt) if (now->case_fold == 0) vn *= 2; if (alt->case_fold == 0) va *= 2; - if (now->good_case_fold != 0) vn *= 4; - if (alt->good_case_fold != 0) va *= 4; - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) copy_opt_exact(now, alt); } @@ -5657,10 +5769,7 @@ comp_opt_exact_or_map(OptStr* e, OptMap* m) if (m->value <= 0) return -1; if (e->case_fold != 0) { - if (e->good_case_fold != 0) - case_value = 2; - else - case_value = 1; + case_value = 1; } else case_value = 3; @@ -5839,7 +5948,6 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) { StrNode* sn = STR_(node); int slen = (int )(sn->end - sn->s); - /* int is_raw = NODE_STRING_IS_RAW(node); */ if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) { concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); @@ -5849,28 +5957,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) set_mml(&opt->len, slen, slen); } else { - int max; + int max, min; - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) { - int n = onigenc_strlen(enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(enc) * n; - } - else { - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - opt->sb.case_fold = 1; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - opt->sb.good_case_fold = 1; + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + opt->sb.case_fold = 1; - if (slen > 0) { - r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, - enc, env->case_fold_flag); - if (r != 0) break; - } - - max = slen; + if (slen > 0) { + r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, + enc, env->case_fold_flag); + if (r != 0) break; } - set_mml(&opt->len, slen, max); + max = slen; + min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc); + set_mml(&opt->len, min, max); } } break; @@ -6161,15 +6261,6 @@ set_optimize_exact(regex_t* reg, OptStr* e) if (e->case_fold) { reg->optimize = OPTIMIZE_STR_CASE_FOLD; - if (e->good_case_fold != 0) { - if (e->len >= 2) { - r = set_sunday_quick_search_or_bmh_skip_table(reg, 1, - reg->exact, reg->exact_end, - reg->map, &(reg->map_offset)); - if (r != 0) return r; - reg->optimize = OPTIMIZE_STR_CASE_FOLD_FAST; - } - } } else { int allow_reverse; @@ -6197,7 +6288,7 @@ set_optimize_exact(regex_t* reg, OptStr* e) if (reg->dmin != INFINITE_LEN) { int n; - if (e->case_fold != 0 && e->good_case_fold == 0) + if (e->case_fold != 0) n = 1; else n = (int )(reg->exact_end - reg->exact); @@ -6419,7 +6510,7 @@ print_optimize_info(FILE* f, regex_t* reg) { static const char* on[] = { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", - "STR_CASE_FOLD_FAST", "STR_CASE_FOLD", "MAP" }; + "STR_CASE_FOLD", "MAP" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); @@ -6596,14 +6687,17 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->string_pool_end = 0; reg->num_mem = 0; reg->num_repeat = 0; - reg->num_null_check = 0; + reg->num_empty_check = 0; reg->repeat_range_alloc = 0; - reg->repeat_range = (OnigRepeatRange* )NULL; + reg->repeat_range = (RepeatRange* )NULL; reg->empty_status_mem = 0; r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; + r = reduce_string_list(root); + if (r != 0) goto err; + /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -6716,7 +6810,13 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #endif - if ((reg->num_repeat != 0) || (reg->push_mem_end != 0) + set_addr_in_repeat_range(reg); + + if ((reg->push_mem_end != 0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + || (reg->num_repeat != 0) + || (reg->num_empty_check != 0) +#endif #ifdef USE_CALLOUT || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) #endif @@ -7008,12 +7108,14 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) #ifdef ONIG_DEBUG_PARSE +#ifdef USE_CALL static void p_string(FILE* f, int len, UChar* s) { fputs(":", f); while (len-- > 0) { fputc(*s++, f); } } +#endif static void Indent(FILE* f, int indent) @@ -7033,7 +7135,7 @@ print_indent_tree(FILE* f, Node* node, int indent) Indent(f, indent); if (IS_NULL(node)) { fprintf(f, "ERROR: null node!!!\n"); - exit (0); + exit(0); } type = NODE_TYPE(node); @@ -7059,32 +7161,20 @@ print_indent_tree(FILE* f, Node* node, int indent) { char* str; char* mode; - char* dont; - char* good; - if (NODE_STRING_IS_RAW(node)) - mode = "-raw"; + if (NODE_STRING_IS_CRUDE(node)) + mode = "-crude"; else if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) mode = "-case_fold_match"; else mode = ""; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - good = "-good"; - else - good = ""; - - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) - dont = " (dont-opt)"; - else - dont = ""; - if (STR_(node)->s == STR_(node)->end) str = "empty-string"; else str = "string"; - fprintf(f, "<%s%s%s%s:%p>", str, mode, good, dont, node); + fprintf(f, "<%s%s:%p>", str, mode, node); for (p = STR_(node)->s; p < STR_(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) fputc(*p, f); diff --git a/oniguruma/src/regenc.c b/oniguruma/src/regenc.c index 8b03bb9db..9e695bcf3 100644 --- a/oniguruma/src/regenc.c +++ b/oniguruma/src/regenc.c @@ -1,5 +1,6 @@ /********************************************************************** regenc.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/regenc.h b/oniguruma/src/regenc.h index 8e5879025..b4189f4e3 100644 --- a/oniguruma/src/regenc.h +++ b/oniguruma/src/regenc.h @@ -2,6 +2,7 @@ #define REGENC_H /********************************************************************** regenc.h - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/regerror.c b/oniguruma/src/regerror.c index b57a276d5..0dd42d1bc 100644 --- a/oniguruma/src/regerror.c +++ b/oniguruma/src/regerror.c @@ -1,5 +1,6 @@ /********************************************************************** regerror.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/regexec.c b/oniguruma/src/regexec.c index 1bd0cf206..833992e78 100644 --- a/oniguruma/src/regexec.c +++ b/oniguruma/src/regexec.c @@ -1,5 +1,6 @@ /********************************************************************** regexec.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako @@ -219,9 +220,13 @@ static OpInfoType OpInfo[] = { { OP_MEM_START_PUSH, "mem-start-push" }, { OP_MEM_START, "mem-start" }, { OP_MEM_END_PUSH, "mem-end-push" }, +#ifdef USE_CALL { OP_MEM_END_PUSH_REC, "mem-end-push-rec" }, +#endif { OP_MEM_END, "mem-end" }, +#ifdef USE_CALL { OP_MEM_END_REC, "mem-end-rec" }, +#endif { OP_FAIL, "fail" }, { OP_JUMP, "jump" }, { OP_PUSH, "push" }, @@ -235,12 +240,12 @@ static OpInfoType OpInfo[] = { { OP_REPEAT_NG, "repeat-ng" }, { OP_REPEAT_INC, "repeat-inc" }, { OP_REPEAT_INC_NG, "repeat-inc-ng" }, - { OP_REPEAT_INC_SG, "repeat-inc-sg" }, - { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg" }, { OP_EMPTY_CHECK_START, "empty-check-start" }, { OP_EMPTY_CHECK_END, "empty-check-end" }, { OP_EMPTY_CHECK_END_MEMST, "empty-check-end-memst" }, +#ifdef USE_CALL { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push" }, +#endif { OP_PREC_READ_START, "push-pos" }, { OP_PREC_READ_END, "pop-pos" }, { OP_PREC_READ_NOT_START, "prec-read-not-start" }, @@ -250,10 +255,12 @@ static OpInfoType OpInfo[] = { { OP_LOOK_BEHIND, "look-behind" }, { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start" }, { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end" }, - { OP_CALL, "call" }, - { OP_RETURN, "return" }, { OP_PUSH_SAVE_VAL, "push-save-val" }, { OP_UPDATE_VAR, "update-var" }, +#ifdef USE_CALL + { OP_CALL, "call" }, + { OP_RETURN, "return" }, +#endif #ifdef USE_CALLOUT { OP_CALLOUT_CONTENTS, "callout-contents" }, { OP_CALLOUT_NAME, "callout-name" }, @@ -466,10 +473,13 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, mem = p->memory_start.num; fprintf(f, ":%d", mem); break; - case OP_MEM_END_PUSH: - case OP_MEM_END_PUSH_REC: + case OP_MEM_END: + case OP_MEM_END_PUSH: +#ifdef USE_CALL case OP_MEM_END_REC: + case OP_MEM_END_PUSH_REC: +#endif mem = p->memory_end.num; fprintf(f, ":%d", mem); break; @@ -513,8 +523,6 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: mem = p->repeat.id; fprintf(f, ":%d", mem); break; @@ -525,7 +533,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; case OP_EMPTY_CHECK_END: case OP_EMPTY_CHECK_END_MEMST: +#ifdef USE_CALL case OP_EMPTY_CHECK_END_MEMST_PUSH: +#endif mem = p->empty_check_end.mem; fprintf(f, ":%d", mem); break; @@ -548,10 +558,12 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, p_rel_addr(f, addr, p, start); break; +#ifdef USE_CALL case OP_CALL: addr = p->call.addr; fprintf(f, ":{/%d}", addr); break; +#endif case OP_PUSH_SAVE_VAL: { @@ -621,7 +633,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_ATOMIC_START: case OP_ATOMIC_END: case OP_LOOK_BEHIND_NOT_END: +#ifdef USE_CALL case OP_RETURN: +#endif break; default: @@ -957,7 +971,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) result = ONIGERR_INVALID_ARGUMENT;\ }\ best_len = result;\ - goto finish;\ + goto match_at_end;\ break;\ }\ } while(0) @@ -979,18 +993,26 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) /* handled by normal-POP */ #define STK_MEM_START 0x0010 #define STK_MEM_END 0x8030 -#define STK_REPEAT_INC 0x0050 +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_REPEAT_INC (0x0040 | STK_MASK_POP_HANDLED) +#else +#define STK_REPEAT_INC 0x0040 +#endif #ifdef USE_CALLOUT #define STK_CALLOUT 0x0070 #endif /* avoided by normal-POP */ #define STK_VOID 0x0000 /* for fill a blank */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_EMPTY_CHECK_START (0x3000 | STK_MASK_POP_HANDLED) +#else #define STK_EMPTY_CHECK_START 0x3000 +#endif #define STK_EMPTY_CHECK_END 0x5000 /* for recursive call */ #define STK_MEM_END_MARK 0x8100 #define STK_TO_VOID_START 0x1200 /* mark for "(?>...)" */ -#define STK_REPEAT 0x0300 +/* #define STK_REPEAT 0x0300 */ #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 @@ -1016,11 +1038,10 @@ typedef struct _StackType { UChar* pstr_prev; /* previous char position of pstr */ } state; struct { - int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ - Operation* pcode; /* byte code position (head of repeated target) */ - } repeat; - struct { - StackIndex si; /* index of stack */ + int count; +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex prev_index; /* index of stack */ +#endif } repeat_inc; struct { UChar *pstr; /* start/end position */ @@ -1029,7 +1050,10 @@ typedef struct _StackType { StackIndex prev_end; /* prev. info (for backtrack "(...)*" ) */ } mem; struct { - UChar *pstr; /* start position */ + UChar *pstr; /* start position */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex prev_index; /* index of stack */ +#endif } empty_check; #ifdef USE_CALL struct { @@ -1075,6 +1099,41 @@ struct OnigCalloutArgsStruct { #endif +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define PTR_NUM_SIZE(reg) ((reg)->num_repeat + (reg)->num_empty_check + ((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + repeat_stk = (StackIndex* )alloc_base;\ + empty_check_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ + mem_start_stk = (StackIndex* )(empty_check_stk + reg->num_empty_check);\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) stk->u.repeat_inc.prev_index = repeat_stk[sid] +#define LOAD_TO_REPEAT_STK_VAR(sid) repeat_stk[sid] = GET_STACK_INDEX(stk) +#define POP_REPEAT_INC else if (stk->type == STK_REPEAT_INC) {repeat_stk[stk->zid] = stk->u.repeat_inc.prev_index;} + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) stk->u.empty_check.prev_index = empty_check_stk[sid] +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) empty_check_stk[sid] = GET_STACK_INDEX(stk) +#define POP_EMPTY_CHECK_START else if (stk->type == STK_EMPTY_CHECK_START) {empty_check_stk[stk->zid] = stk->u.empty_check.prev_index;} + +#else + +#define PTR_NUM_SIZE(reg) (((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + mem_start_stk = (StackIndex* )alloc_base;\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) +#define LOAD_TO_REPEAT_STK_VAR(sid) +#define POP_REPEAT_INC + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) +#define POP_EMPTY_CHECK_START + +#endif /* USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ @@ -1086,7 +1145,7 @@ struct OnigCalloutArgsStruct { (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ (msa).mp = mpv;\ (msa).best_len = ONIG_MISMATCH;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #else #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ @@ -1097,7 +1156,7 @@ struct OnigCalloutArgsStruct { (msa).match_stack_limit = (mpv)->match_stack_limit;\ (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ (msa).mp = mpv;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #endif @@ -1152,12 +1211,6 @@ struct OnigCalloutArgsStruct { };\ } while(0) -#define UPDATE_FOR_STACK_REALLOC do{\ - repeat_stk = (StackIndex* )alloc_base;\ - mem_start_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ - mem_end_stk = mem_start_stk + num_mem + 1;\ -} while(0) - static unsigned int MatchStackLimit = DEFAULT_MATCH_STACK_LIMIT_SIZE; extern unsigned int @@ -1178,7 +1231,9 @@ onig_set_match_stack_limit_size(unsigned int size) static unsigned long RetryLimitInMatch = DEFAULT_RETRY_LIMIT_IN_MATCH; #define CHECK_RETRY_LIMIT_IN_MATCH do {\ - if (retry_in_match_counter++ > retry_limit_in_match) goto retry_limit_in_match_over;\ + if (retry_in_match_counter++ > retry_limit_in_match) {\ + MATCH_AT_ERROR_RETURN(ONIGERR_RETRY_LIMIT_IN_MATCH_OVER);\ + }\ } while (0) #else @@ -1568,19 +1623,23 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT_LOOK_BEHIND_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_LOOK_BEHIND_NOT,pat,s,sprev) +#if 0 #define STACK_PUSH_REPEAT(sid, pat) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT;\ stk->zid = (sid);\ - stk->u.repeat.pcode = (pat);\ - stk->u.repeat.count = 0;\ + stk->u.repeat.pcode = (pat);\ STACK_INC;\ } while(0) +#endif -#define STACK_PUSH_REPEAT_INC(sindex) do {\ +#define STACK_PUSH_REPEAT_INC(sid, ct) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT_INC;\ - stk->u.repeat_inc.si = (sindex);\ + stk->zid = (sid);\ + stk->u.repeat_inc.count = (ct);\ + SAVE_REPEAT_STK_VAR(sid);\ + LOAD_TO_REPEAT_STK_VAR(sid);\ STACK_INC;\ } while(0) @@ -1653,6 +1712,8 @@ stack_double(int is_alloca, char** arg_alloc_base, stk->type = STK_EMPTY_CHECK_START;\ stk->zid = (cnum);\ stk->u.empty_check.pstr = (s);\ + SAVE_EMPTY_CHECK_STK_VAR(cnum);\ + LOAD_TO_EMPTY_CHECK_STK_VAR(cnum);\ STACK_INC;\ } while(0) @@ -1790,7 +1851,7 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_BASE_CHECK(p, at) \ if ((p) < stk_base) {\ fprintf(stderr, "at %s\n", at);\ - goto stack_error;\ + MATCH_AT_ERROR_RETURN(ONIGERR_STACK_BUG);\ } #else #define STACK_BASE_CHECK(p, at) @@ -1841,13 +1902,12 @@ stack_double(int is_alloca, char** arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ POP_CALLOUT_CASE\ }\ }\ @@ -1866,13 +1926,12 @@ stack_double(int is_alloca, char** arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ /* Don't call callout here because negation of total success by (?!..) (?type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - (isnull) = (k->u.empty_check.pstr == (s));\ - break;\ - }\ + if (k->zid == (sid)) break;\ }\ }\ } while(0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define GET_EMPTY_CHECK_START(sid, k) do {\ + if (reg->num_call == 0) {\ + k = STACK_AT(empty_check_stk[sid]);\ + }\ + else {\ + EMPTY_CHECK_START_SEARCH(sid, k);\ + }\ +} while(0) +#else + +#define GET_EMPTY_CHECK_START(sid, k) EMPTY_CHECK_START_SEARCH(sid, k) + +#endif + + +#define STACK_EMPTY_CHECK(isnull, sid, s) do {\ + StackType* k;\ + GET_EMPTY_CHECK_START(sid, k);\ + (isnull) = (k->u.empty_check.pstr == (s));\ +} while(0) + #define STACK_MEM_START_GET_PREV_END_ADDR(k /* STK_MEM_START*/, reg, addr) do {\ if (k->u.mem.prev_end == INVALID_STACK_INDEX) {\ (addr) = 0;\ @@ -1951,39 +2031,30 @@ stack_double(int is_alloca, char** arg_alloc_base, } while (0) #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT -#define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\ - StackType* k = stk;\ - while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM"); \ - if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - if (k->u.empty_check.pstr != (s)) {\ - (isnull) = 0;\ - break;\ +#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\ + StackType* k;\ + GET_EMPTY_CHECK_START(sid, k);\ + if (k->u.empty_check.pstr != (s)) {\ + (isnull) = 0;\ + }\ + else {\ + UChar* endp;\ + (isnull) = 1;\ + while (k < stk) {\ + if (k->type == STK_MEM_START &&\ + MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ + STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ + if (endp == 0) {\ + (isnull) = 0; break;\ }\ - else {\ - UChar* endp;\ - (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START &&\ - MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */ \ - }\ - }\ - k++;\ - }\ - break;\ + else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ + (isnull) = 0; break;\ + }\ + else if (endp != s) {\ + (isnull) = -1; /* empty, but position changed */ \ }\ }\ + k++;\ }\ }\ } while(0) @@ -2064,24 +2135,45 @@ stack_double(int is_alloca, char** arg_alloc_base, } while(0) #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ -#define STACK_GET_REPEAT(sid, k) do {\ - int level = 0;\ - k = stk;\ +#define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\ + StackType* k = stk;\ while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_GET_REPEAT"); \ - if (k->type == STK_REPEAT) {\ - if (level == 0) {\ - if (k->zid == (sid)) {\ - break;\ - }\ + (k)--;\ + STACK_BASE_CHECK(k, "STACK_GET_REPEAT_COUNT_SEARCH");\ + if ((k)->type == STK_REPEAT_INC) {\ + if ((k)->zid == (sid)) {\ + (c) = (k)->u.repeat_inc.count;\ + break;\ + }\ + }\ + else if ((k)->type == STK_RETURN) {\ + int level = -1;\ + while (1) {\ + (k)--;\ + if ((k)->type == STK_CALL_FRAME) {\ + level++;\ + if (level == 0) break;\ + }\ + else if ((k)->type == STK_RETURN) level--;\ }\ }\ - else if (k->type == STK_CALL_FRAME) level--;\ - else if (k->type == STK_RETURN) level++;\ }\ } while(0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define STACK_GET_REPEAT_COUNT(sid, c) do {\ + if (reg->num_call == 0) {\ + (c) = (STACK_AT(repeat_stk[sid]))->u.repeat_inc.count;\ + }\ + else {\ + STACK_GET_REPEAT_COUNT_SEARCH(sid, c);\ + }\ +} while(0) +#else +#define STACK_GET_REPEAT_COUNT(sid, c) STACK_GET_REPEAT_COUNT_SEARCH(sid, c) +#endif + #define STACK_RETURN(addr) do {\ int level = 0;\ StackType* k = stk;\ @@ -2483,6 +2575,8 @@ typedef struct { #define MATCH_DEBUG_OUT(offset) #endif +#define MATCH_AT_ERROR_RETURN(err_code) best_len = err_code; goto match_at_end + /* match data(str - end) from position (sstart). */ /* if sstart == str then set sprev to NULL. */ @@ -2556,9 +2650,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_MEM_START, &&L_MEM_START_PUSH, &&L_MEM_END_PUSH, +#ifdef USE_CALL &&L_MEM_END_PUSH_REC, +#endif &&L_MEM_END, +#ifdef USE_CALL &&L_MEM_END_REC, +#endif &&L_FAIL, &&L_JUMP, &&L_PUSH, @@ -2572,12 +2670,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_REPEAT_NG, &&L_REPEAT_INC, &&L_REPEAT_INC_NG, - &&L_REPEAT_INC_SG, - &&L_REPEAT_INC_NG_SG, &&L_EMPTY_CHECK_START, &&L_EMPTY_CHECK_END, &&L_EMPTY_CHECK_END_MEMST, +#ifdef USE_CALL &&L_EMPTY_CHECK_END_MEMST_PUSH, +#endif &&L_PREC_READ_START, &&L_PREC_READ_END, &&L_PREC_READ_NOT_START, @@ -2587,10 +2685,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_LOOK_BEHIND, &&L_LOOK_BEHIND_NOT_START, &&L_LOOK_BEHIND_NOT_END, - &&L_CALL, - &&L_RETURN, &&L_PUSH_SAVE_VAL, &&L_UPDATE_VAR, +#ifdef USE_CALL + &&L_CALL, + &&L_RETURN, +#endif #ifdef USE_CALLOUT &&L_CALLOUT_CONTENTS, &&L_CALLOUT_NAME, @@ -2608,15 +2708,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, char *alloc_base; StackType *stk_base, *stk, *stk_end; StackType *stkp; /* used as any purpose. */ - StackIndex si; - StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; UChar* keep; + +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex *repeat_stk; + StackIndex *empty_check_stk; +#endif #ifdef USE_RETRY_LIMIT_IN_MATCH unsigned long retry_limit_in_match; unsigned long retry_in_match_counter; #endif - #ifdef USE_CALLOUT int of; #endif @@ -2745,10 +2847,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, stkp = stk_base; r = make_capture_history_tree(region->history_root, &stkp, stk, (UChar* )str, reg); - if (r < 0) { - best_len = r; /* error code */ - goto finish; - } + if (r < 0) MATCH_AT_ERROR_RETURN(r); } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API_REGION_OPTION @@ -2773,7 +2872,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } /* default behavior: return first-matching result. */ - goto finish; + goto match_at_end; CASE_OP(EXACT1) DATA_ENSURE(1); @@ -3293,7 +3392,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; #endif default: - goto bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); break; } @@ -3419,13 +3518,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_CALL CASE_OP(MEM_END_PUSH_REC) - mem = p->memory_end.num; - STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ - si = GET_STACK_INDEX(stkp); - STACK_PUSH_MEM_END(mem, s); - mem_start_stk[mem] = si; - INC_OP; - JUMP_OUT; + { + StackIndex si; + + mem = p->memory_end.num; + STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ + si = GET_STACK_INDEX(stkp); + STACK_PUSH_MEM_END(mem, s); + mem_start_stk[mem] = si; + INC_OP; + JUMP_OUT; + } CASE_OP(MEM_END_REC) mem = p->memory_end.num; @@ -3655,12 +3758,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH: case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: INC_OP; break; default: - goto unexpected_bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNEXPECTED_BYTECODE); break; } #else @@ -3776,10 +3877,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + addr, s, sprev); } @@ -3790,10 +3888,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + 1, s, sprev); p += addr; @@ -3804,64 +3899,42 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(REPEAT_INC) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc: - stkp->u.repeat.count++; - if (stkp->u.repeat.count >= reg->repeat_range[mem].upper) { + STACK_GET_REPEAT_COUNT(mem, n); + n++; + if (n >= reg->repeat_range[mem].upper) { /* end of repeat. Nothing to do. */ INC_OP; } - else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + else if (n >= reg->repeat_range[mem].lower) { INC_OP; STACK_PUSH_ALT(p, s, sprev); - p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */ + p = reg->repeat_range[mem].u.pcode; } else { - p = stkp->u.repeat.pcode; + p = reg->repeat_range[mem].u.pcode; } - STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_REPEAT_INC(mem, n); CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc; - CASE_OP(REPEAT_INC_NG) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc_ng: - stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { - if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - Operation* pcode = stkp->u.repeat.pcode; - - STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev); + STACK_GET_REPEAT_COUNT(mem, n); + n++; + STACK_PUSH_REPEAT_INC(mem, n); + if (n == reg->repeat_range[mem].upper) { + INC_OP; + } + else { + if (n >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(reg->repeat_range[mem].u.pcode, s, sprev); INC_OP; } else { - p = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); + p = reg->repeat_range[mem].u.pcode; } } - else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - STACK_PUSH_REPEAT_INC(si); - INC_OP; - } CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_NG_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc_ng; - CASE_OP(PREC_READ_START) STACK_PUSH_PREC_READ_START(s, sprev); INC_OP; @@ -4040,7 +4113,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, call_result = ONIGERR_INVALID_ARGUMENT; } best_len = call_result; - goto finish; + goto match_at_end; break; } } @@ -4066,7 +4139,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif CASE_OP(FINISH) - goto finish; + goto match_at_end; #ifdef ONIG_DEBUG_STATISTICS fail: @@ -4087,35 +4160,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, JUMP_OUT; DEFAULT_OP - goto bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); } BYTECODE_INTERPRETER_END; - finish: + match_at_end: STACK_SAVE; return best_len; - -#ifdef ONIG_DEBUG - stack_error: - STACK_SAVE; - return ONIGERR_STACK_BUG; -#endif - - bytecode_error: - STACK_SAVE; - return ONIGERR_UNDEFINED_BYTECODE; - -#if defined(ONIG_DEBUG) && !defined(USE_DIRECT_THREADED_CODE) - unexpected_bytecode_error: - STACK_SAVE; - return ONIGERR_UNEXPECTED_BYTECODE; -#endif - -#ifdef USE_RETRY_LIMIT_IN_MATCH - retry_limit_in_match_over: - STACK_SAVE; - return ONIGERR_RETRY_LIMIT_IN_MATCH_OVER; -#endif } typedef struct { @@ -4789,60 +4840,6 @@ sunday_quick_search(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )NULL; } -static UChar* -sunday_quick_search_case_fold(regex_t* reg, - const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) -{ - const UChar *s, *se, *end; - const UChar *tail; - int skip, tlen1; - int map_offset; - int case_fold_flag; - OnigEncoding enc; - -#ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, - "sunday_quick_search_case_fold: text: %p, text_end: %p, text_range: %p\n", text, text_end, text_range); -#endif - - enc = reg->enc; - case_fold_flag = reg->case_fold_flag; - - tail = target_end - 1; - tlen1 = (int )(tail - target); - end = text_range; - if (end + tlen1 > text_end) - end = text_end - tlen1; - - map_offset = reg->map_offset; - s = text; - - while (s < end) { - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - s, text_end)) - return (UChar* )s; - - se = s + tlen1; - if (se + map_offset >= text_end) break; - skip = reg->map[*(se + map_offset)]; -#if 0 - p = s; - do { - s += enclen(enc, s); - } while ((s - p) < skip && s < end); -#else - /* This is faster than prev code for long text. ex: /(?i)Twain/ */ - s += skip; - if (s < end) - s = onigenc_get_right_adjust_char_head(enc, text, s); -#endif - } - - return (UChar* )NULL; -} - static UChar* map_search(OnigEncoding enc, UChar map[], const UChar* text, const UChar* text_range) @@ -4956,11 +4953,6 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, reg->exact, reg->exact_end, p, end, range); break; - case OPTIMIZE_STR_CASE_FOLD_FAST: - p = sunday_quick_search_case_fold(reg, reg->exact, reg->exact_end, p, end, - range); - break; - case OPTIMIZE_STR_FAST: p = sunday_quick_search(reg, reg->exact, reg->exact_end, p, end, range); break; @@ -5081,7 +5073,6 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, break; case OPTIMIZE_STR_CASE_FOLD: - case OPTIMIZE_STR_CASE_FOLD_FAST: p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, reg->exact, reg->exact_end, range, adjrange, end, p); diff --git a/oniguruma/src/regext.c b/oniguruma/src/regext.c index c46f630c5..bc58938da 100644 --- a/oniguruma/src/regext.c +++ b/oniguruma/src/regext.c @@ -1,5 +1,6 @@ /********************************************************************** regext.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/reggnu.c b/oniguruma/src/reggnu.c index 7825b745a..3a33c48d7 100644 --- a/oniguruma/src/reggnu.c +++ b/oniguruma/src/reggnu.c @@ -1,5 +1,6 @@ /********************************************************************** reggnu.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/regint.h b/oniguruma/src/regint.h index a2a300630..448ddee3d 100644 --- a/oniguruma/src/regint.h +++ b/oniguruma/src/regint.h @@ -2,6 +2,7 @@ #define REGINT_H /********************************************************************** regint.h - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako @@ -47,13 +48,6 @@ #endif #endif -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ - (defined(__ppc__) && defined(__APPLE__)) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(__mc68020__) -#define PLATFORM_UNALIGNED_WORD_ACCESS -#endif - #ifndef ONIG_DISABLE_DIRECT_THREADING #ifdef __GNUC__ #define USE_GOTO_LABELS_AS_VALUES @@ -84,6 +78,8 @@ #define USE_VARIABLE_META_CHARS #define USE_POSIX_API_REGION_OPTION #define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE +/* #define USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ + #include "regenc.h" @@ -199,39 +195,6 @@ typedef unsigned int uintptr_t; #define CHAR_MAP_SIZE 256 #define INFINITE_LEN ONIG_INFINITE_DISTANCE -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS - -#define PLATFORM_GET_INC(val,p,type) do{\ - val = *(type* )p;\ - (p) += sizeof(type);\ -} while(0) - -#else - -#define PLATFORM_GET_INC(val,p,type) do{\ - xmemcpy(&val, (p), sizeof(type));\ - (p) += sizeof(type);\ -} while(0) - -/* sizeof(OnigCodePoint) */ -#ifdef SIZEOF_SIZE_T -# define WORD_ALIGNMENT_SIZE SIZEOF_SIZE_T -#else -# define WORD_ALIGNMENT_SIZE SIZEOF_LONG -#endif - -#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ - (pad_size) = WORD_ALIGNMENT_SIZE - ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ - if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\ -} while (0) - -#define ALIGNMENT_RIGHT(addr) do {\ - (addr) += (WORD_ALIGNMENT_SIZE - 1);\ - (addr) -= ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ -} while (0) - -#endif /* PLATFORM_UNALIGNED_WORD_ACCESS */ - #ifdef USE_CALLOUT @@ -274,7 +237,6 @@ enum OptimizeType { OPTIMIZE_STR, /* Slow Search */ OPTIMIZE_STR_FAST, /* Sunday quick search / BMH */ OPTIMIZE_STR_FAST_STEP_FORWARD, /* Sunday quick search / BMH */ - OPTIMIZE_STR_CASE_FOLD_FAST, /* Sunday quick search / BMH (ignore case) */ OPTIMIZE_STR_CASE_FOLD, /* Slow Search (ignore case) */ OPTIMIZE_MAP /* char map */ }; @@ -364,16 +326,12 @@ typedef unsigned int MemStatusType; /* bitset */ #define BITS_PER_BYTE 8 #define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE) -#define BITS_IN_ROOM (sizeof(Bits) * BITS_PER_BYTE) +#define BITS_IN_ROOM 32 /* 4 * BITS_PER_BYTE */ #define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS -typedef unsigned int Bits; -#else -typedef unsigned char Bits; -#endif -typedef Bits BitSet[BITSET_SIZE]; -typedef Bits* BitSetRef; +typedef uint32_t Bits; +typedef Bits BitSet[BITSET_SIZE]; +typedef Bits* BitSetRef; #define SIZE_BITSET sizeof(BitSet) @@ -382,8 +340,8 @@ typedef Bits* BitSetRef; for (i = 0; i < (int )BITSET_SIZE; i++) { (bs)[i] = 0; } \ } while (0) -#define BS_ROOM(bs,pos) (bs)[pos / BITS_IN_ROOM] -#define BS_BIT(pos) (1u << (pos % BITS_IN_ROOM)) +#define BS_ROOM(bs,pos) (bs)[(unsigned int )(pos) >> 5] +#define BS_BIT(pos) (1u << ((unsigned int )(pos) & 0x1f)) #define BITSET_AT(bs, pos) (BS_ROOM(bs,pos) & BS_BIT(pos)) #define BITSET_SET_BIT(bs, pos) BS_ROOM(bs,pos) |= BS_BIT(pos) @@ -559,9 +517,13 @@ enum OpCode { OP_MEM_START, OP_MEM_START_PUSH, /* push back-tracker to stack */ OP_MEM_END_PUSH, /* push back-tracker to stack */ +#ifdef USE_CALL OP_MEM_END_PUSH_REC, /* push back-tracker to stack */ +#endif OP_MEM_END, +#ifdef USE_CALL OP_MEM_END_REC, /* push marker to stack */ +#endif OP_FAIL, /* pop stack and move */ OP_JUMP, OP_PUSH, @@ -575,12 +537,12 @@ enum OpCode { OP_REPEAT_NG, /* {n,m}? (non greedy) */ OP_REPEAT_INC, OP_REPEAT_INC_NG, /* non greedy */ - OP_REPEAT_INC_SG, /* search and get in stack */ - OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */ OP_EMPTY_CHECK_START, /* null loop checker start */ OP_EMPTY_CHECK_END, /* null loop checker end */ OP_EMPTY_CHECK_END_MEMST, /* null loop checker end (with capture status) */ +#ifdef USE_CALL OP_EMPTY_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */ +#endif OP_PREC_READ_START, /* (?=...) start */ OP_PREC_READ_END, /* (?=...) end */ OP_PREC_READ_NOT_START, /* (?!...) start */ @@ -590,10 +552,12 @@ enum OpCode { OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ OP_LOOK_BEHIND_NOT_START, /* (? */ - OP_RETURN, OP_PUSH_SAVE_VAL, OP_UPDATE_VAR, +#ifdef USE_CALL + OP_CALL, /* \g */ + OP_RETURN, +#endif #ifdef USE_CALLOUT OP_CALLOUT_CONTENTS, /* (?{...}) (?{{...}}) */ OP_CALLOUT_NAME, /* (*name) (*name[tag](args...)) */ @@ -642,23 +606,8 @@ typedef int ModeType; #define SIZE_UPDATE_VAR_TYPE sizeof(UpdateVarType) #define SIZE_MODE sizeof(ModeType) -#define GET_RELADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, RelAddrType) -#define GET_ABSADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, AbsAddrType) -#define GET_LENGTH_INC(len,p) PLATFORM_GET_INC(len, p, LengthType) -#define GET_MEMNUM_INC(num,p) PLATFORM_GET_INC(num, p, MemNumType) -#define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType) -#define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType) -#define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType) -#define GET_SAVE_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, SaveType) -#define GET_UPDATE_VAR_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, UpdateVarType) -#define GET_MODE_INC(mode,p) PLATFORM_GET_INC(mode, p, ModeType) - /* code point's address must be aligned address. */ #define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) -#define GET_BYTE_INC(byte,p) do{\ - byte = *(p);\ - (p)++;\ -} while(0) /* op-code + arg size */ @@ -838,7 +787,7 @@ typedef struct { } repeat; /* REPEAT, REPEAT_NG */ struct { MemNumType id; - } repeat_inc; /* REPEAT_INC, REPEAT_INC_SG, REPEAT_INC_NG, REPEAT_INC_NG_SG */ + } repeat_inc; /* REPEAT_INC, REPEAT_INC_NG */ struct { MemNumType mem; } empty_check_start; @@ -889,6 +838,15 @@ typedef struct { #endif } RegexExt; +typedef struct { + int lower; + int upper; + union { + Operation* pcode; /* address of repeated body */ + int offset; + } u; +} RepeatRange; + struct re_pattern_buffer { /* common members of BBuf(bytes-buffer) */ Operation* ops; @@ -903,15 +861,15 @@ struct re_pattern_buffer { int num_mem; /* used memory(...) num counted from 1 */ int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ - int num_null_check; /* OP_EMPTY_CHECK_START/END id counter */ + int num_empty_check; /* OP_EMPTY_CHECK_START/END id counter */ int num_call; /* number of subexp call */ MemStatusType capture_history; /* (?@...) flag (1-31) */ MemStatusType push_mem_start; /* need backtrack flag */ MemStatusType push_mem_end; /* need backtrack flag */ MemStatusType empty_status_mem; int stack_pop_level; - int repeat_range_alloc; - OnigRepeatRange* repeat_range; + int repeat_range_alloc; + RepeatRange* repeat_range; OnigEncoding enc; OnigOptionType options; diff --git a/oniguruma/src/regparse.c b/oniguruma/src/regparse.c index 700285469..f16da5a7c 100644 --- a/oniguruma/src/regparse.c +++ b/oniguruma/src/regparse.c @@ -1,5 +1,6 @@ /********************************************************************** regparse.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako @@ -2173,7 +2174,7 @@ node_new_ctype(int type, int not, OnigOptionType options) static Node* node_new_anychar(void) { - Node* node = node_new_ctype(CTYPE_ANYCHAR, 0, ONIG_OPTION_NONE); + Node* node = node_new_ctype(CTYPE_ANYCHAR, FALSE, ONIG_OPTION_NONE); return node; } @@ -2691,7 +2692,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[1] = NULL_NODE; r = ONIGERR_MEMORY; - ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, 0); + ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, FALSE); if (IS_NULL(ns[0])) goto err; r = node_new_true_anychar(&ns[1], env); @@ -2702,7 +2703,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, INFINITE_REPEAT, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, TRUE); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -2771,7 +2772,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, ns[0] = x; - x = node_new_quantifier(lower, upper, 0); + x = node_new_quantifier(lower, upper, FALSE); if (IS_NULL(x)) goto err0; NODE_BODY(x) = ns[0]; @@ -2800,7 +2801,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, x = make_alt(2, ns); if (IS_NULL(x)) goto err0; - if (is_range_cutter != 0) + if (is_range_cutter != FALSE) NODE_STATUS_ADD(x, SUPER); *node = x; @@ -2890,7 +2891,10 @@ make_range_clear(Node** node, ScanEnv* env) ns[0] = NULL_NODE; ns[1] = x; - r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, 0, env); +#define ID_NOT_USED_DONT_CARE_ME 0 + + r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, + ID_NOT_USED_DONT_CARE_ME, env); if (r != 0) goto err; x = make_alt(2, ns); @@ -3009,7 +3013,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua id1 = GIMMICK_(ns[0])->id; r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive, - 0, env); + FALSE, env); if (r != 0) goto err; ns[2] = ns[3] = NULL_NODE; @@ -3052,7 +3056,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, INFINITE_REPEAT, 0); + quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE); if (IS_NULL(quant)) goto err0; r = node_new_true_anychar(&body, env); @@ -3178,16 +3182,6 @@ node_str_cat_char(Node* node, UChar c) return onig_node_str_cat(node, s, s + 1); } -extern void -onig_node_conv_to_str_node(Node* node, int flag) -{ - NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->flag = flag; - STR_(node)->capacity = 0; - STR_(node)->s = STR_(node)->buf; - STR_(node)->end = STR_(node)->buf; -} - extern void onig_node_str_clear(Node* node) { @@ -3196,10 +3190,11 @@ onig_node_str_clear(Node* node) xfree(STR_(node)->s); } - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; } static Node* @@ -3209,10 +3204,12 @@ node_new_str(const UChar* s, const UChar* end) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; + if (onig_node_str_cat(node, s, end)) { onig_node_free(node); return NULL; @@ -3227,11 +3224,11 @@ onig_node_new_str(const UChar* s, const UChar* end) } static Node* -node_new_str_raw(UChar* s, UChar* end) +node_new_str_crude(UChar* s, UChar* end) { Node* node = node_new_str(s, end); CHECK_NULL_RETURN(node); - NODE_STRING_SET_RAW(node); + NODE_STRING_SET_CRUDE(node); return node; } @@ -3242,14 +3239,14 @@ node_new_empty(void) } static Node* -node_new_str_raw_char(UChar c) +node_new_str_crude_char(UChar c) { int i; UChar p[1]; Node* node; p[0] = c; - node = node_new_str_raw(p, p + 1); + node = node_new_str_crude(p, p + 1); /* clear buf tail */ for (i = 1; i < NODE_STRING_BUF_SIZE; i++) @@ -3272,8 +3269,8 @@ str_node_split_last_char(Node* node, OnigEncoding enc) if (p && p > sn->s) { /* can be split. */ rn = node_new_str(p, sn->end); CHECK_NULL_RETURN(rn); - if (NODE_STRING_IS_RAW(node)) - NODE_STRING_SET_RAW(rn); + if (NODE_STRING_IS_CRUDE(node)) + NODE_STRING_SET_CRUDE(rn); sn->end = (UChar* )p; } @@ -4004,7 +4001,7 @@ node_new_general_newline(Node** node, ScanEnv* env) alen = ONIGENC_CODE_TO_MBC(env->enc, 0x0a, buf + dlen); if (alen < 0) return alen; - crnl = node_new_str_raw(buf, buf + dlen + alen); + crnl = node_new_str_crude(buf, buf + dlen + alen); CHECK_NULL_RETURN_MEMERR(crnl); ncc = node_new_cclass(); @@ -4032,7 +4029,7 @@ node_new_general_newline(Node** node, ScanEnv* env) if (r != 0) goto err1; } - x = node_new_bag_if_else(crnl, 0, ncc); + x = node_new_bag_if_else(crnl, NULL_NODE, ncc); if (IS_NULL(x)) goto err1; *node = x; @@ -4041,7 +4038,7 @@ node_new_general_newline(Node** node, ScanEnv* env) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_RAW_BYTE = 1, + TK_CRUDE_BYTE = 1, TK_CHAR, TK_STRING, TK_CODE_POINT, @@ -4454,7 +4451,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, static int fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int* rback_num, - enum REF_NUM* num_type, int ref) + enum REF_NUM* num_type, int is_ref) { int r, sign; int digit_count; @@ -4484,7 +4481,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, return ONIGERR_EMPTY_GROUP_NAME; if (IS_CODE_DIGIT_ASCII(enc, c)) { - if (ref == 1) + if (is_ref == TRUE) *num_type = IS_ABS_NUM; else { r = ONIGERR_INVALID_GROUP_NAME; @@ -4492,7 +4489,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, digit_count++; } else if (c == '-') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = -1; pnum_head = p; @@ -4502,7 +4499,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } } else if (c == '+') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = 1; pnum_head = p; @@ -4843,7 +4840,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; tok->u.byte = (UChar )code; } @@ -4876,7 +4873,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; tok->u.byte = (UChar )code; } @@ -5246,7 +5243,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; tok->u.byte = (UChar )code; } @@ -5311,7 +5308,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; tok->u.byte = (UChar )code; } @@ -5338,7 +5335,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; #else - r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, 1); + r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) return r; @@ -5401,7 +5398,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { @@ -5464,7 +5461,6 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PUNFETCH; r = fetch_escaped_value(&p, end, env, &c2); if (r < 0) return r; - /* set_raw: */ if (tok->u.code != c2) { tok->type = TK_CODE_POINT; tok->u.code = c2; @@ -5590,8 +5586,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { PINC; name = p; - r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, - &num_type, 0); + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, + &gnum, &num_type, FALSE); if (r < 0) return r; tok->type = TK_CALL; @@ -5623,7 +5619,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { name = p; r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type == IS_NOT_NUM) { @@ -6097,7 +6093,7 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* en *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - r = add_ctype_to_cc(cc, ctype, 0, env); + r = add_ctype_to_cc(cc, ctype, FALSE, env); if (r != 0) return r; if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); @@ -6297,7 +6293,7 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto val_entry2; break; - case TK_RAW_BYTE: + case TK_CRUDE_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { int i, j; @@ -6310,7 +6306,7 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; - if (r != TK_RAW_BYTE || tok->base != base) { + if (r != TK_CRUDE_BYTE || tok->base != base) { fetched = 1; break; } @@ -6340,7 +6336,7 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (i == 1) { in_code = (OnigCodePoint )buf[0]; - goto raw_single; + goto crude_single; } else { in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); @@ -6349,7 +6345,7 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } else { in_code = (OnigCodePoint )tok->u.byte; - raw_single: + crude_single: in_type = CV_SB; } in_raw = 1; @@ -6815,7 +6811,7 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, size_t clen; add_char: - if (skip_mode == 0) { + if (skip_mode == FALSE) { clen = p - e; if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ @@ -6832,7 +6828,7 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, if (max_arg_num >= 0 && n >= max_arg_num) return ONIGERR_INVALID_CALLOUT_ARG; - if (skip_mode == 0) { + if (skip_mode == FALSE) { if ((types[n] & ONIG_TYPE_LONG) != 0) { int fixed = 0; if (cn > 0) { @@ -6964,7 +6960,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en /* read for single check only */ save = p; - arg_num = parse_callout_args(1, '}', &p, end, -1, 0, 0, env); + arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); if (arg_num < 0) return arg_num; is_not_single = PPEEK_IS(cterm) ? 0 : 1; @@ -6978,7 +6974,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en types[i] = get_callout_arg_type_by_name_id(name_id, i); } - arg_num = parse_callout_args(0, '}', &p, end, max_arg_num, types, vals, env); + arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); if (arg_num < 0) return arg_num; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -7078,17 +7074,17 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, group: r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(np, tok, term, &p, end, env, 0); + r = parse_alts(np, tok, term, &p, end, env, FALSE); if (r < 0) return r; *src = p; return 1; /* group */ break; case '=': - *np = onig_node_new_anchor(ANCR_PREC_READ, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ, FALSE); break; case '!': /* preceding read */ - *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, FALSE); break; case '>': /* (?>...) stop backtrack */ *np = node_new_bag(BAG_STOP_BACKTRACK); @@ -7106,9 +7102,9 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; PFETCH(c); if (c == '=') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, FALSE); else if (c == '!') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, FALSE); else { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; @@ -7124,7 +7120,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, named_group2: name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, - &num_type, 0); + &num_type, FALSE); if (r < 0) return r; num = scan_env_add_mem_entry(env); @@ -7173,7 +7169,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&absent, tok, term, &p, end, env, 1); + r = parse_alts(&absent, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(absent); return r; @@ -7260,7 +7256,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r == 1) exist_level = 1; #else r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? c : '('), - &p, end, &name_end, env, &back_num, &num_type, 1); + &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) { if (is_enclosed == 0) { @@ -7284,7 +7280,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_INVALID_BACKREF; } - condition = node_new_backref_checker(1, &back_num, 0, + condition = node_new_backref_checker(1, &back_num, FALSE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7307,7 +7303,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, } } - condition = node_new_backref_checker(num, backs, 1, + condition = node_new_backref_checker(num, backs, TRUE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7349,7 +7345,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, condition_is_checker = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&condition, tok, term, &p, end, env, 0); + r = parse_alts(&condition, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(condition); return r; @@ -7392,7 +7388,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, onig_node_free(condition); return r; } - r = parse_alts(&target, tok, term, &p, end, env, 1); + r = parse_alts(&target, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(condition); onig_node_free(target); @@ -7493,7 +7489,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, case 'm': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { - OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); + OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE)); } else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) { @@ -7529,16 +7525,16 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE); break; #ifdef USE_UNICODE_WORD_BREAK case 'w': if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE); break; #endif default: @@ -7568,7 +7564,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7615,7 +7611,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(target); return r; @@ -7768,6 +7764,29 @@ clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) }\ } while (0) +extern int +onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) +{ + int i; + Node* node; + CClassNode* cc; + + *rnode = NULL_NODE; + + node = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(node); + + cc = CCLASS_(node); + + for (i = 0; i < n; i++) { + ADD_CODE_INTO_CC(cc, codes[i], enc); + } + + *rnode = node; + return 0; +} + typedef struct { ScanEnv* env; CClassNode* cc; @@ -7927,7 +7946,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_alts(&target, tok, term, src, end, env, 0); + r = parse_alts(&target, tok, term, src, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7942,7 +7961,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; - if (tok->escaped) goto tk_raw_byte; + if (tok->escaped) goto tk_crude_byte; else goto tk_byte; break; @@ -7967,23 +7986,23 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; - case TK_RAW_BYTE: - tk_raw_byte: + case TK_CRUDE_BYTE: + tk_crude_byte: { - *np = node_new_str_raw_char(tok->u.byte); + *np = node_new_str_crude_char(tok->u.byte); CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - goto tk_raw_byte_end; + goto tk_crude_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) + if (r != TK_CRUDE_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; r = node_str_cat_char(*np, tok->u.byte); @@ -7992,11 +8011,11 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len++; } - tk_raw_byte_end: + tk_crude_byte_end: if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) return ONIGERR_INVALID_WIDE_CHAR_VALUE; - NODE_STRING_CLEAR_RAW(*np); + NODE_STRING_CLEAR_CRUDE(*np); goto string_end; } break; @@ -8007,7 +8026,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); if (len < 0) return len; #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - *np = node_new_str_raw(buf, buf + len); + *np = node_new_str_crude(buf, buf + len); #else *np = node_new_str(buf, buf + len); #endif @@ -8050,7 +8069,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; @@ -8109,7 +8128,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, INFINITE_REPEAT, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8300,7 +8319,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, headp = &(NODE_CDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { - r = parse_exp(&node, tok, term, src, end, env, 0); + r = parse_exp(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8353,7 +8372,7 @@ parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_branch(&node, tok, term, src, end, env, 0); + r = parse_branch(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8392,7 +8411,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) r = fetch_token(&tok, src, end, env); if (r < 0) return r; - r = parse_alts(top, &tok, TK_EOT, src, end, env, 0); + r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE); if (r < 0) return r; return 0; diff --git a/oniguruma/src/regparse.h b/oniguruma/src/regparse.h index 08811c240..e5b2db950 100644 --- a/oniguruma/src/regparse.h +++ b/oniguruma/src/regparse.h @@ -2,6 +2,7 @@ #define REGPARSE_H /********************************************************************** regparse.h - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako @@ -32,7 +33,7 @@ #include "regint.h" #define NODE_STRING_MARGIN 16 -#define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_STRING_BUF_SIZE 20 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 6 /* node type */ @@ -83,8 +84,9 @@ typedef struct { UChar* s; UChar* end; unsigned int flag; - int capacity; /* (allocated size - 1) or 0: use buf[] */ UChar buf[NODE_STRING_BUF_SIZE]; + int capacity; /* (allocated size - 1) or 0: use buf[] */ + int case_min_len; } StrNode; typedef struct { @@ -293,30 +295,21 @@ typedef struct _Node { #define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) -#define NODE_STRING_RAW (1<<0) /* by backslashed number */ +#define NODE_STRING_CRUDE (1<<0) #define NODE_STRING_CASE_EXPANDED (1<<1) #define NODE_STRING_CASE_FOLD_MATCH (1<<2) -#define NODE_STRING_GOOD_AMBIG (1<<3) -#define NODE_STRING_DONT_GET_OPT_INFO (1<<4) #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) -#define NODE_STRING_SET_RAW(node) (node)->u.str.flag |= NODE_STRING_RAW -#define NODE_STRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NODE_STRING_RAW +#define NODE_STRING_SET_CRUDE(node) (node)->u.str.flag |= NODE_STRING_CRUDE +#define NODE_STRING_CLEAR_CRUDE(node) (node)->u.str.flag &= ~NODE_STRING_CRUDE #define NODE_STRING_SET_CASE_EXPANDED(node) (node)->u.str.flag |= NODE_STRING_CASE_EXPANDED #define NODE_STRING_SET_CASE_FOLD_MATCH(node) (node)->u.str.flag |= NODE_STRING_CASE_FOLD_MATCH -#define NODE_STRING_SET_GOOD_AMBIG(node) (node)->u.str.flag |= NODE_STRING_GOOD_AMBIG -#define NODE_STRING_SET_DONT_GET_OPT_INFO(node) \ - (node)->u.str.flag |= NODE_STRING_DONT_GET_OPT_INFO -#define NODE_STRING_IS_RAW(node) \ - (((node)->u.str.flag & NODE_STRING_RAW) != 0) +#define NODE_STRING_IS_CRUDE(node) \ + (((node)->u.str.flag & NODE_STRING_CRUDE) != 0) #define NODE_STRING_IS_CASE_EXPANDED(node) \ (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0) #define NODE_STRING_IS_CASE_FOLD_MATCH(node) \ (((node)->u.str.flag & NODE_STRING_CASE_FOLD_MATCH) != 0) -#define NODE_STRING_IS_GOOD_AMBIG(node) \ - (((node)->u.str.flag & NODE_STRING_GOOD_AMBIG) != 0) -#define NODE_STRING_IS_DONT_GET_OPT_INFO(node) \ - (((node)->u.str.flag & NODE_STRING_DONT_GET_OPT_INFO) != 0) #define BACKREFS_P(br) \ (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static) @@ -446,7 +439,6 @@ extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); extern void onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode)); -extern void onig_node_conv_to_str_node P_((Node* node, int raw)); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); extern void onig_node_free P_((Node* node)); @@ -460,6 +452,7 @@ extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); extern int onig_free_shared_cclass_table P_((void)); extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); +extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node); #ifdef USE_CALLOUT diff --git a/oniguruma/src/regsyntax.c b/oniguruma/src/regsyntax.c index 513c7f7b2..06ee21dcb 100644 --- a/oniguruma/src/regsyntax.c +++ b/oniguruma/src/regsyntax.c @@ -1,5 +1,6 @@ /********************************************************************** regsyntax.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/regtrav.c b/oniguruma/src/regtrav.c index 8307695d1..a9ba8a294 100644 --- a/oniguruma/src/regtrav.c +++ b/oniguruma/src/regtrav.c @@ -1,5 +1,6 @@ /********************************************************************** regtrav.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/regversion.c b/oniguruma/src/regversion.c index de993d3f6..bdb1c08f1 100644 --- a/oniguruma/src/regversion.c +++ b/oniguruma/src/regversion.c @@ -1,5 +1,6 @@ /********************************************************************** regversion.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/st.c b/oniguruma/src/st.c index e5fd1a15d..b614da1f3 100644 --- a/oniguruma/src/st.c +++ b/oniguruma/src/st.c @@ -1,4 +1,5 @@ /* This is a public domain general purpose hash table package written by Peter Moore @ UCB. */ +/* encoding: UTF8 */ /* static char sccsid[] = "@(#) st.c 5.1 89/12/14 Crucible"; */ diff --git a/oniguruma/src/st.h b/oniguruma/src/st.h index 6f9387021..46491d1bd 100644 --- a/oniguruma/src/st.h +++ b/oniguruma/src/st.h @@ -1,5 +1,5 @@ /* This is a public domain general purpose hash table package written by Peter Moore @ UCB. */ - +/* encoding: UTF8 */ /* @(#) st.h 5.1 89/12/14 */ #ifndef ST_INCLUDED diff --git a/oniguruma/src/unicode.c b/oniguruma/src/unicode.c index 474436ab1..792c4ddab 100644 --- a/oniguruma/src/unicode.c +++ b/oniguruma/src/unicode.c @@ -1,5 +1,6 @@ /********************************************************************** unicode.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/src/unicode_fold1_key.c b/oniguruma/src/unicode_fold1_key.c index 09d34cf01..171a0fa4b 100644 --- a/oniguruma/src/unicode_fold1_key.c +++ b/oniguruma/src/unicode_fold1_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ /* Computed positions: -k'1-3' */ @@ -2983,7 +2983,7 @@ onigenc_unicode_fold1_key(OnigCodePoint codes[]) 4026 }; - if (0 == 0) + { int key = hash(codes); diff --git a/oniguruma/src/unicode_fold2_key.c b/oniguruma/src/unicode_fold2_key.c index 7eb5f959e..c39b19da4 100644 --- a/oniguruma/src/unicode_fold2_key.c +++ b/oniguruma/src/unicode_fold2_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ /* Computed positions: -k'3,6' */ @@ -211,7 +211,7 @@ onigenc_unicode_fold2_key(OnigCodePoint codes[]) 129 }; - if (0 == 0) + { int key = hash(codes); diff --git a/oniguruma/src/unicode_fold3_key.c b/oniguruma/src/unicode_fold3_key.c index 23f466b81..295c44747 100644 --- a/oniguruma/src/unicode_fold3_key.c +++ b/oniguruma/src/unicode_fold3_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ /* Computed positions: -k'3,6,9' */ @@ -121,7 +121,7 @@ onigenc_unicode_fold3_key(OnigCodePoint codes[]) 0 }; - if (0 == 0) + { int key = hash(codes); diff --git a/oniguruma/src/unicode_property_data.c b/oniguruma/src/unicode_property_data.c index 88c24a4c1..0083dd66b 100644 --- a/oniguruma/src/unicode_property_data.c +++ b/oniguruma/src/unicode_property_data.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ /* Computed positions: -k'1-3,5-6,12,16,$' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ @@ -29581,6 +29581,7 @@ unicode_lookup_property_name (register const char *str, register size_t len) #define UNICODE_PROPERTY_VERSION 120100 +#define UNICODE_EMOJI_VERSION 1201 #define PROPERTY_NAME_MAX_SIZE 59 #define CODE_RANGES_NUM 568 diff --git a/oniguruma/src/unicode_property_data_posix.c b/oniguruma/src/unicode_property_data_posix.c index eddc1082e..e299e85de 100644 --- a/oniguruma/src/unicode_property_data_posix.c +++ b/oniguruma/src/unicode_property_data_posix.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ diff --git a/oniguruma/src/unicode_unfold_key.c b/oniguruma/src/unicode_unfold_key.c index 989804af9..51a037b39 100644 --- a/oniguruma/src/unicode_unfold_key.c +++ b/oniguruma/src/unicode_unfold_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_unfold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ /* Computed positions: -k'1-3' */ @@ -3288,7 +3288,7 @@ onigenc_unicode_unfold_key(OnigCodePoint code) {0x1e907, 4005, 1} }; - if (0 == 0) + { int key = hash(&code); diff --git a/oniguruma/src/utf8.c b/oniguruma/src/utf8.c index 3f69d6b75..d2b752ede 100644 --- a/oniguruma/src/utf8.c +++ b/oniguruma/src/utf8.c @@ -1,6 +1,6 @@ -// encoding: UTF8 /********************************************************************** utf8.c - Oniguruma (regular expression library) + encoding: UTF8 **********************************************************************/ /*- * Copyright (c) 2002-2019 K.Kosako diff --git a/oniguruma/windows/testc.c b/oniguruma/windows/testc.c index 2e5a3dee2..86e439031 100644 --- a/oniguruma/windows/testc.c +++ b/oniguruma/windows/testc.c @@ -1,6 +1,6 @@ -// encoding: UTF8 /* * This program was generated by testconv.rb. + * encoding: UTF8 */ #ifdef ONIG_ESCAPE_UCHAR_COLLISION #undef ONIG_ESCAPE_UCHAR_COLLISION