From 99ebacc5aa7b95037e7d8bd58ebc1df27135b151 Mon Sep 17 00:00:00 2001 From: RaiKoHoff Date: Thu, 23 Jan 2020 12:57:48 +0100 Subject: [PATCH] + upd: Oniguruma current DEV --- Build/Notepad3.ini | 2 +- Versions/build.txt | 2 +- oniguruma/doc/RE | 11 +- oniguruma/doc/SYNTAX.md | 132 ++-- oniguruma/src/Makefile.windows | 6 +- oniguruma/src/make_unicode_fold_data.py | 5 +- oniguruma/src/oniguruma.h | 9 +- oniguruma/src/regcomp.c | 959 +++++++++++++++++++----- oniguruma/src/regenc.h | 5 +- oniguruma/src/regerror.c | 18 +- oniguruma/src/regexec.c | 416 +++++----- oniguruma/src/regint.h | 331 ++++---- oniguruma/src/regparse.c | 220 ++++-- oniguruma/src/regparse.h | 148 ++-- oniguruma/src/regsyntax.c | 5 +- oniguruma/src/regversion.c | 7 +- oniguruma/src/st.c | 9 +- oniguruma/src/st.h | 12 +- oniguruma/src/unicode_fold1_key.c | 5 +- oniguruma/src/unicode_fold2_key.c | 5 +- oniguruma/src/unicode_fold3_key.c | 5 +- oniguruma/src/unicode_unfold_key.c | 5 +- res/Notepad3.exe.manifest.conf | 2 +- src/VersionEx.h | 4 +- 24 files changed, 1587 insertions(+), 736 deletions(-) diff --git a/Build/Notepad3.ini b/Build/Notepad3.ini index a34227077..d1c07f7b5 100644 --- a/Build/Notepad3.ini +++ b/Build/Notepad3.ini @@ -146,8 +146,8 @@ SettingsVersion=4 [Web Source Code] [XML Document] [YAML] -[Window] [Suppressed Messages] [Recent Files] [Recent Find] [Recent Replace] +[Window] diff --git a/Versions/build.txt b/Versions/build.txt index dc1d7d075..e41db8b9d 100644 --- a/Versions/build.txt +++ b/Versions/build.txt @@ -1 +1 @@ -2711 +2712 diff --git a/oniguruma/doc/RE b/oniguruma/doc/RE index 599d2a6a0..8975d9e88 100644 --- a/oniguruma/doc/RE +++ b/oniguruma/doc/RE @@ -1,4 +1,4 @@ -Oniguruma Regular Expressions Version 6.9.4 2019/10/31 +Oniguruma Regular Expressions Version 6.9.5 2020/01/23 syntax: ONIG_SYNTAX_ONIGURUMA (default) @@ -279,15 +279,12 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) (?=subexp) look-ahead (?!subexp) negative look-ahead + (?<=subexp) look-behind (?...)`) -_Set in: 
Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl_NG, Ruby_ Enables support for _naming_ capture groups, so that instead of having to refer to captures by position (like `\3` or `$3`), you can refer to them by names @@ -519,7 +519,7 @@ and `(?'name'...)`, but not the Python `(?P...)` syntax. ### 8. ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (enable named backreferences `\k`) -_Set in: Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl_NG, Ruby_ Enables support for substituted backreferences by name, not just by position. This supports using `\k'name'` in addition to supporting `\k`. This also @@ -530,7 +530,7 @@ the match, if the capture matched multiple times, by writing `\k` or ### 9. ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (enable backreferences `\g` and `\g`) -_Set in: Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl_NG, Ruby_ Enables support for substituted backreferences by both name and position using the same syntax. This supports using `\g'name'` and `\g'1'` in addition to @@ -554,7 +554,7 @@ enabled by default in any syntax. ### 11. ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (enable `\C-x`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ Enables support for Ruby legacy control-code escapes, like `\C-m` or `\C-M` for code-point 13. In this shorthand form, control codes may be specified by `\C-` (for "Control") @@ -567,7 +567,7 @@ See also ONIG_SYN_OP_ESC_C_CONTROL, which enables the more-common `\cx` syntax. ### 12. ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (enable `\M-x`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ Enables support for Ruby legacy meta-code escapes. When you write `\M-x`, Oniguruma will match an `x` whose 8th bit is set (i.e., the character code of `x` will be or'ed @@ -577,7 +577,7 @@ with `0x80`). So, for example, you can match `\x81` using `\x81`, or you can wr ### 13. 
ONIG_SYN_OP2_ESC_V_VTAB (enable `\v` as vertical tab) -_Set in: Java, Ruby, Oniguruma_ +_Set in: Oniguruma, Java, Ruby_ Enables support for a C-style `\v` escape code, meaning "vertical tab." If enabled, `\v` will be equivalent to ASCII code point 11. @@ -585,7 +585,7 @@ Enables support for a C-style `\v` escape code, meaning "vertical tab." If enab ### 14. ONIG_SYN_OP2_ESC_U_HEX4 (enable `\uHHHH` for Unicode) -_Set in: Java, Ruby, Oniguruma_ +_Set in: Oniguruma, Java, Ruby_ Enables support for a Java-style `\uHHHH` escape code for representing Unicode code-points by number, using up to four hexadecimal digits (up to `\uFFFF`). So, @@ -611,7 +611,7 @@ These anchor forms are very obscure, and rarely supported by other regex librari ### 16. ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (enable `\p{...}` and `\P{...}`) -_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Java, Perl, Perl_NG, Ruby_ Enables support for an alternate syntax for POSIX character classes; instead of writing `[:alpha:]` when this is enabled, you can instead write `\p{alpha}`. @@ -621,7 +621,7 @@ See also ONIG_SYN_OP_POSIX_BRACKET for the classic POSIX form. ### 17. ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (enable `\p{^...}` and `\P{^...}`) -_Set in: Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG, Ruby_ Enables support for an alternate syntax for POSIX character classes; instead of writing `[:^alpha:]` when this is enabled, you can instead write `\p{^alpha}`. @@ -636,7 +636,7 @@ _(not presently used)_ ### 19. ONIG_SYN_OP2_ESC_H_XDIGIT (enable `\h` and `\H`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ Enables support for the Ruby-specific shorthand `\h` and `\H` metacharacters. Somewhat like `\d` matches decimal digits, `\h` matches hexadecimal digits — that is, @@ -658,7 +658,7 @@ You usually do not want this flag to be enabled. ### 21. 
ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE (enable `(?(...)then|else)`) -_Set in: Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG, Ruby_ Enables support for conditional inclusion of subsequent regex patterns based on whether a prior named or numbered capture matched, or based on whether a pattern will @@ -676,7 +676,7 @@ match. This supports many different forms, including: ### 22. ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP (enable `\K`) -_Set in: Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG, Ruby_ Enables support for `\K`, which excludes all content before it from the overall regex match (i.e., capture #0). So, for example, pattern `foo\Kbar` would match @@ -687,7 +687,7 @@ regex match (i.e., capture #0). So, for example, pattern `foo\Kbar` would match ### 23. ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE (enable `\R`) -_Set in: Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG, Ruby_ Enables support for `\R`, the "general newline" shorthand, which matches `(\r\n|[\n\v\f\r\u0085\u2028\u2029])` (obviously, the Unicode values are cannot be @@ -698,7 +698,7 @@ matched in ASCII encodings). ### 24. ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT (enable `\N` and `\O`) -_Set in: Perl, Perl_NG, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG_ Enables support for `\N` and `\O`. `\N` is "not a line break," which is much like the standard `.` metacharacter, except that while `.` can be affected by @@ -713,7 +713,7 @@ multi-line mode are enabled or disabled. ### 25. ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP (enable `(?~...)`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ Enables support for the `(?~r)` "absent operator" syntax, which matches as much as possible as long as the result _doesn't_ match pattern `r`. This is @@ -731,7 +731,7 @@ excellent article about it is [available on Medium](https://medium.com/rubyinsid ### 26. 
ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT (enable `\X` and `\Y` and `\y`) -_Set in: Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG, Ruby_ `\X` is another variation on `.`, designed to support Unicode, in that it matches a full _grapheme cluster_. In Unicode, `à` can be encoded as one code point, @@ -764,7 +764,7 @@ backreferences. ### 28. ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (enable `(?{...})`) -_Set in: Perl, Perl_NG, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG_ Enables support for Perl-style "callouts" — pattern substitutions that result from invoking a callback method. When `(?{foo})` is reached in a pattern, the callback @@ -779,7 +779,7 @@ Full documentation for this advanced feature can be found in the Oniguruma ### 29. ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (enable `(*name)`) -_Set in: Perl, Perl_NG, Oniguruma_ +_Set in: Oniguruma, Perl, Perl_NG_ Enables support for Perl-style "callouts" — pattern substitutions that result from invoking a callback method. When `(*foo)` is reached in a pattern, the callback @@ -820,7 +820,7 @@ some syntaxes but not in others. ### 0. ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (independent `?`, `*`, `+`, `{n,m}`) -_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby_ This flag specifies how to handle operators like `?` and `*` when they aren't directly attached to an operand, as in `^*` or `(*)`: Are they an error, are @@ -830,7 +830,7 @@ determines if they are errors or if they are discarded. ### 1. 
ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (error or ignore independent operators) -_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby_ If ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS is set, this flag controls what happens when independent operators appear in a pattern: If this flag is set, then independent @@ -847,7 +847,7 @@ character will produce an error message. ### 3. ONIG_SYN_ALLOW_INVALID_INTERVAL (allow `{???`) -_Set in: GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, GnuRegex, Java, Perl, Perl_NG, Ruby_ This flag, if set, causes an invalid range, like `foo{bar}` or `foo{}`, to be silently discarded, as if `foo` had been written instead. If clear, an invalid @@ -855,7 +855,7 @@ range will produce an error message. ### 4. ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (allow `{,n}` to mean `{0,n}`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ If this flag is set, then `r{,n}` will be treated as equivalent to writing `{0,n}`. If this flag is clear, then `r{,n}` will produce an error message. @@ -876,7 +876,7 @@ No built-in syntax has this flag enabled. ### 6. ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (allow `(?<=a|bc)`) -_Set in: Java, Ruby, Oniguruma_ +_Set in: Oniguruma, Java, Ruby_ If this flag is set, lookbehind patterns with alternate options may have differing lengths among those options. If this flag is clear, lookbehind patterns with options @@ -888,7 +888,7 @@ depend on this rule. ### 7. ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (prefer `\k` over `\3`) -_Set in: Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl_NG, Ruby_ If this flag is set on the syntax *and* ONIG_OPTION_CAPTURE_GROUP is set when calling Oniguruma, then if a name is used on any capture, all captures must also use names: A @@ -896,14 +896,14 @@ single use of a named capture prohibits the use of numbered captures. ### 8. 
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (allow `(?)...(?)`) -_Set in: Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, Perl_NG, Ruby_ If this flag is set, multiple capture groups may use the same name. If this flag is clear, then reuse of a name will produce an error message. ### 9. ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (`a{n}?` is equivalent to `(?:a{n})?`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ If this flag is set, then intervals of a fixed size will ignore a lazy (non-greedy) `?` quantifier and treat it as an optional match (an ordinary `r?`), since "match as @@ -912,11 +912,17 @@ then `r{n}?` will mean the same as `r{n}`, and the useless `?` will be discarded ### 10. ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (`..(?i)..`) -_Set in: Perl, Java_ +_Set in: Perl, Perl_NG, Java_ If this flag is set, then an isolated option doesn't break the branch and affects until the end of the group (or end of the pattern). If this flag is not set, then an isolated option is interpreted as the starting point of a new branch. /a(?i)b|c/ ==> /a(?i:b|c)/ +### 11. ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (`(?<=...a+...)`) + +_Set in: Oniguruma, Java_ + +If this flag is set, then variable-length expressions are allowed in look-behind. + ### 20. ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (add `\n` to `[^...]`) _Set in: Grep_ @@ -928,7 +934,7 @@ only exclude those characters and ranges written in them. ### 21. ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (allow `[...\w...]`) -_Set in: GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, GnuRegex, Java, Perl, Perl_NG, Ruby_ If this flag is set, shorthands like `\w` are allowed to describe characters in character classes. If this flag is clear, shorthands like `\w` are treated as a redundantly-escaped @@ -944,7 +950,7 @@ character ranges will produce an error message. ### 23. 
ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (treat `[0-9-a]` as `[0-9\-a]`) -_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby_ If this flag is set, then a trailing `-` after a character range will be taken as a literal `-`, as if it had been escaped as `\-`. If this flag is clear, then a trailing @@ -952,7 +958,7 @@ literal `-`, as if it had been escaped as `\-`. If this flag is clear, then a t ### 24. ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (warn on `[[...]` and `[-x]`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ If this flag is set, Oniguruma will be stricter about warning for bad forms in character classes: `[[...]` will produce a warning, but `[\[...]` will not; @@ -962,7 +968,7 @@ will be silently discarded. ### 25. ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (warn on `(?:a*)+`) -_Set in: Ruby, Oniguruma_ +_Set in: Oniguruma, Ruby_ If this flag is set, Oniguruma will warn about nested repeat operators those have no meaning, like `(?:a*)+`. If this flag is clear, Oniguruma will allow the nested repeat operators without warning about them. @@ -975,7 +981,7 @@ If this flag is set, then invalid code points at the end of range in character c ### 31. ONIG_SYN_CONTEXT_INDEP_ANCHORS -_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ +_Set in: Oniguruma, PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby_ Not currently used, and does nothing. (But still set in several syntaxes for some reason.) 
@@ -1069,10 +1075,12 @@ These tables show which of the built-in syntaxes use which flags and options, fo | 3 | `ONIG_SYN_ALLOW_INVALID_INTERVAL` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | | 4 | `ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV` | - | - | - | - | - | - | - | - | Yes | Yes | | 5 | `ONIG_SYN_STRICT_CHECK_BACKREF` | - | - | - | - | - | - | - | - | - | - | -| 6 | `ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 6 | `ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND` | - | - | - | - | - | Yes | - | - | Yes | Yes | | 7 | `ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP` | - | - | - | - | - | - | - | Yes | Yes | Yes | | 8 | `ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME` | - | - | - | - | - | - | - | Yes | Yes | Yes | | 9 | `ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY` | - | - | - | - | - | - | - | - | Yes | Yes | +| 10 | `ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH` | - | - | - | - | - | Yes | Yes | Yes | - | - | +| 11 | `ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND` | - | - | - | - | - | Yes | - | - | - | Yes | | 20 | `ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC` | - | - | - | Yes | - | - | - | - | - | - | | 21 | `ONIG_SYN_BACKSLASH_ESCAPE_IN_CC` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | | 22 | `ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC` | - | - | Yes | Yes | - | - | - | - | - | - | diff --git a/oniguruma/src/Makefile.windows b/oniguruma/src/Makefile.windows index 1e8750446..90ebf287e 100644 --- a/oniguruma/src/Makefile.windows +++ b/oniguruma/src/Makefile.windows @@ -155,6 +155,10 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/ $(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h $(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h +all-test: test_syntax test_regset test_utf8 testc testp testu + +test_syntax: $(TEST_DIR)/test_syntax.c $(libname) + $(CC) -nologo /Fe:$@ /I. 
/I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_syntax.c $(libname) test_regset: $(TEST_DIR)/test_regset.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_regset.c $(libname) @@ -172,7 +176,7 @@ testu: $(TEST_DIR)/testu.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname) clean: - del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe + del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe samples: all diff --git a/oniguruma/src/make_unicode_fold_data.py b/oniguruma/src/make_unicode_fold_data.py index e2a92c37d..b9085c599 100644 --- a/oniguruma/src/make_unicode_fold_data.py +++ b/oniguruma/src/make_unicode_fold_data.py @@ -254,7 +254,7 @@ HEAD = ''' /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2019 K.Kosako + * Copyright (c) 2017-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -278,8 +278,7 @@ HEAD = ''' * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -#include "regenc.h" +#include "regint.h" %} '''.strip() diff --git a/oniguruma/src/oniguruma.h b/oniguruma/src/oniguruma.h index d0d815bb8..1cf0963f7 100644 --- a/oniguruma/src/oniguruma.h +++ b/oniguruma/src/oniguruma.h @@ -5,7 +5,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -71,6 +71,10 @@ extern "C" { #define ONIG_EXTERN extern #endif +#ifndef ONIG_VARIADIC_FUNC_ATTR +#define ONIG_VARIADIC_FUNC_ATTR +#endif + /* PART: character encoding */ #ifndef ONIG_ESCAPE_UCHAR_COLLISION @@ -532,6 +536,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1U<<8) /* (?)(?) */ #define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ #define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */ +#define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (1U<<11) /* (?<=a+|..) */ /* syntax (behavior) in char class [...] */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */ @@ -794,7 +799,7 @@ int onig_initialize P_((OnigEncoding encodings[], int number_of_encodings)); ONIG_EXTERN int onig_init P_((void)); ONIG_EXTERN -int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...)); +int ONIG_VARIADIC_FUNC_ATTR onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...)); ONIG_EXTERN int onig_is_error_code_needs_param PV_((int code)); ONIG_EXTERN diff --git a/oniguruma/src/regcomp.c b/oniguruma/src/regcomp.c index 9426ee57a..139cef6cd 100644 --- a/oniguruma/src/regcomp.c +++ b/oniguruma/src/regcomp.c @@ -37,8 +37,16 @@ typedef struct { OnigLen max; } MinMaxLen; +typedef struct { + OnigLen min; + OnigLen max; + int min_is_sure; +} MinMaxCharLen; + OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; +static OnigLen node_min_byte_len(Node* node, ScanEnv* env); + #if 0 typedef struct { int n; @@ -596,24 +604,71 @@ enum CharLenReturnType { CHAR_LEN_TOP_ALT_FIXED = 1 }; - static int -mml_is_equal(MinMaxLen* a, MinMaxLen* b) -{ - return a->min == b->min && a->max == b->max; -} - -static int -mml_fixed(MinMaxLen* c) +mmcl_fixed(MinMaxCharLen* c) { return (c->min == c->max && c->min != INFINITE_LEN); } static void -mml_set(MinMaxLen* l, OnigLen len) +mmcl_set(MinMaxCharLen* l, OnigLen len) 
{ l->min = len; l->max = len; + l->min_is_sure = TRUE; +} + +static void +mmcl_set_min_max(MinMaxCharLen* l, OnigLen min, OnigLen max, int min_is_sure) +{ + l->min = min; + l->max = max; + l->min_is_sure = min_is_sure; +} + +static void +mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add) +{ + to->min = distance_add(to->min, add->min); + to->max = distance_add(to->max, add->max); + + to->min_is_sure = add->min_is_sure != 0 && to->min_is_sure != 0; +} + +static void +mmcl_multiply(MinMaxCharLen* to, int m) +{ + to->min = distance_multiply(to->min, m); + to->max = distance_multiply(to->max, m); +} + +static void +mmcl_repeat_range_multiply(MinMaxCharLen* to, int mlow, int mhigh) +{ + to->min = distance_multiply(to->min, mlow); + + if (IS_INFINITE_REPEAT(mhigh)) + to->max = INFINITE_LEN; + else + to->max = distance_multiply(to->max, mhigh); +} + +static void +mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt) +{ + if (to->min > alt->min) { + to->min = alt->min; + if (alt->min_is_sure != 0) + to->min_is_sure = TRUE; + } + + if (to->max < alt->max) to->max = alt->max; +} + +static int +mml_is_equal(MinMaxLen* a, MinMaxLen* b) +{ + return a->min == b->min && a->max == b->max; } static void @@ -643,23 +698,6 @@ mml_add(MinMaxLen* to, MinMaxLen* add) to->max = distance_add(to->max, add->max); } -static void -mml_multiply(MinMaxLen* to, int m) -{ - to->min = distance_multiply(to->min, m); - to->max = distance_multiply(to->max, m); -} - -static void -mml_repeat_range_multiply(MinMaxLen* to, int mlow, int mhigh) -{ - to->min = distance_multiply(to->min, mlow); - if (IS_INFINITE_REPEAT(mhigh)) - to->max = INFINITE_LEN; - else - to->max = distance_multiply(to->max, mhigh); -} - static void mml_alt_merge(MinMaxLen* to, MinMaxLen* alt) { @@ -669,10 +707,10 @@ mml_alt_merge(MinMaxLen* to, MinMaxLen* alt) /* fixed size pattern node only */ static int -node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, +node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, 
ScanEnv* env, int level) { - MinMaxLen tci; + MinMaxCharLen tci; int r = CHAR_LEN_NORMAL; level++; @@ -689,7 +727,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, first = FALSE; } else - mml_add(ci, &tci); + mmcl_add(ci, &tci); } while (IS_NOT_NULL(node = NODE_CDR(node))); } break; @@ -705,14 +743,14 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, while (IS_NOT_NULL(node = NODE_CDR(node))) { r = node_char_len1(NODE_CAR(node), reg, &tci, env, level); if (r < 0) break; - if (! mml_fixed(&tci)) + if (! mmcl_fixed(&tci)) fixed = FALSE; - mml_alt_merge(ci, &tci); + mmcl_alt_merge(ci, &tci); } if (r < 0) break; r = CHAR_LEN_NORMAL; - if (mml_fixed(ci)) break; + if (mmcl_fixed(ci)) break; if (fixed == TRUE && level == 1) { r = CHAR_LEN_TOP_ALT_FIXED; @@ -736,7 +774,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, s += enclen(reg->enc, s); clen = distance_add(clen, 1); } - mml_set(ci, clen); + mmcl_set(ci, clen); } break; @@ -746,18 +784,18 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, if (qn->lower == qn->upper) { if (qn->upper == 0) { - mml_set(ci, 0); + mmcl_set(ci, 0); } else { r = node_char_len1(NODE_BODY(node), reg, ci, env, level); if (r < 0) break; - mml_multiply(ci, qn->lower); + mmcl_multiply(ci, qn->lower); } } else { r = node_char_len1(NODE_BODY(node), reg, ci, env, level); if (r < 0) break; - mml_repeat_range_multiply(ci, qn->lower, qn->upper); + mmcl_repeat_range_multiply(ci, qn->lower, qn->upper); } } break; @@ -765,7 +803,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) - mml_set_min_max(ci, 0, INFINITE_LEN); + mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE); else r = node_char_len1(NODE_BODY(node), reg, ci, env, level); break; @@ -773,7 +811,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, case NODE_CTYPE: case NODE_CCLASS: - mml_set(ci, 1); + 
mmcl_set(ci, 1); break; case NODE_BAG: @@ -783,7 +821,8 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, switch (en->type) { case BAG_MEMORY: if (NODE_IS_FIXED_CLEN(node)) { - mml_set_min_max(ci, en->min_char_len, en->max_char_len); + mmcl_set_min_max(ci, en->min_char_len, en->max_char_len, + NODE_IS_FIXED_CLEN_MIN_SURE(node)); } else { r = node_char_len1(NODE_BODY(node), reg, ci, env, level); @@ -792,7 +831,11 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, en->min_char_len = ci->min; en->max_char_len = ci->max; NODE_STATUS_ADD(node, FIXED_CLEN); + if (ci->min_is_sure != 0) + NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE); } + /* can't optimize look-behind if capture exists. */ + ci->min_is_sure = FALSE; break; case BAG_OPTION: case BAG_STOP_BACKTRACK: @@ -800,7 +843,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, break; case BAG_IF_ELSE: { - MinMaxLen eci; + MinMaxCharLen eci; r = node_char_len1(NODE_BODY(node), reg, ci, env, level); if (r < 0) break; @@ -808,7 +851,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, if (IS_NOT_NULL(en->te.Then)) { r = node_char_len1(en->te.Then, reg, &tci, env, level); if (r < 0) break; - mml_add(ci, &tci); + mmcl_add(ci, &tci); } if (IS_NOT_NULL(en->te.Else)) { @@ -816,10 +859,10 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, if (r < 0) break; } else { - mml_set(&eci, 0); + mmcl_set(&eci, 0); } - mml_alt_merge(ci, &eci); + mmcl_alt_merge(ci, &eci); } break; default: /* never come here */ @@ -830,9 +873,14 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, break; case NODE_ANCHOR: + mmcl_set(ci, 0); + /* can't optimize look-behind if anchor exists. 
*/ + ci->min_is_sure = FALSE; + break; + case NODE_GIMMICK: zero: - mml_set(ci, 0); + mmcl_set(ci, 0); break; case NODE_BACKREF: @@ -842,12 +890,12 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, if (NODE_IS_RECURSION(node)) { #ifdef USE_BACKREF_WITH_LEVEL if (NODE_IS_NEST_LEVEL(node)) { - mml_set_min_max(ci, 0, INFINITE_LEN); + mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE); break; } #endif - mml_set_min_max(ci, 0, 0); + mmcl_set_min_max(ci, 0, 0, FALSE); break; } @@ -860,11 +908,13 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, backs = BACKREFS_P(br); r = node_char_len1(mem_env[backs[0]].mem_node, reg, ci, env, level); if (r < 0) break; + if (! mmcl_fixed(ci)) ci->min_is_sure = FALSE; for (i = 1; i < br->back_num; i++) { r = node_char_len1(mem_env[backs[i]].mem_node, reg, &tci, env, level); if (r < 0) break; - mml_alt_merge(ci, &tci); + if (! mmcl_fixed(&tci)) tci.min_is_sure = FALSE; + mmcl_alt_merge(ci, &tci); } } break; @@ -878,7 +928,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env, } static int -node_char_len(Node* node, regex_t* reg, MinMaxLen* ci, ScanEnv* env) +node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env) { return node_char_len1(node, reg, ci, env, 0); } @@ -1652,10 +1702,10 @@ compile_length_bag_node(BagNode* node, regex_t* reg) v = onig_positive_int_multiply(qn->lower, tlen); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; - len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP; + len = v + OPSIZE_PUSH + tlen + OPSIZE_POP + OPSIZE_JUMP; } else { - len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END; + len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK; } break; @@ -1667,8 +1717,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len = compile_length_tree(cond, reg); if (len < 0) return len; - len += OPSIZE_PUSH; - len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END; + len += OPSIZE_PUSH + OPSIZE_MARK + OPSIZE_CUT_TO_MARK; if 
(IS_NOT_NULL(Then)) { tlen = compile_length_tree(Then, reg); @@ -1676,7 +1725,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } - len += OPSIZE_JUMP + OPSIZE_ATOMIC_END; + len += OPSIZE_JUMP + OPSIZE_CUT_TO_MARK; if (IS_NOT_NULL(Else)) { tlen = compile_length_tree(Else, reg); @@ -1801,35 +1850,49 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP + OPSIZE_JUMP; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); if (r != 0) return r; - r = add_op(reg, OP_POP_OUT); + r = add_op(reg, OP_POP); if (r != 0) return r; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT); + COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP); } else { - r = add_op(reg, OP_ATOMIC_START); + MemNumType mid; + + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = 0; + r = compile_tree(NODE_BAG_BODY(node), reg, env); if (r != 0) return r; - r = add_op(reg, OP_ATOMIC_END); + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = 0; } break; case BAG_IF_ELSE: { int cond_len, then_len, else_len, jump_len; + MemNumType mid; Node* cond = NODE_BAG_BODY(node); Node* Then = node->te.Then; Node* Else = node->te.Else; - r = add_op(reg, OP_ATOMIC_START); + ID_ENTRY(env, mid); + + r = add_op(reg, OP_MARK); if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = 0; cond_len = compile_length_tree(cond, reg); if (cond_len < 0) return cond_len; @@ -1840,7 +1903,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP; + jump_len = cond_len + then_len + 
OPSIZE_CUT_TO_MARK + OPSIZE_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; @@ -1848,8 +1911,10 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = compile_tree(cond, reg, env); if (r != 0) return r; - r = add_op(reg, OP_ATOMIC_END); + r = add_op(reg, OP_CUT_TO_MARK); if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = 0; if (IS_NOT_NULL(Then)) { r = compile_tree(Then, reg, env); @@ -1865,10 +1930,12 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC; + COP(reg)->jump.addr = OPSIZE_CUT_TO_MARK + else_len + SIZE_INC; - r = add_op(reg, OP_ATOMIC_END); + r = add_op(reg, OP_CUT_TO_MARK); if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = 0; if (IS_NOT_NULL(Else)) { r = compile_tree(Else, reg, env); @@ -1893,16 +1960,38 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) switch (node->type) { case ANCR_PREC_READ: - len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END; + len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK; break; case ANCR_PREC_READ_NOT: - len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END; + len = OPSIZE_PUSH + OPSIZE_MARK + tlen + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL; break; case ANCR_LOOK_BEHIND: - len = OPSIZE_LOOK_BEHIND + tlen; + if (node->char_min_len == node->char_max_len) + len = OPSIZE_MARK + OPSIZE_STEP_BACK_START + tlen + OPSIZE_CUT_TO_MARK; + else { + len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_UPDATE_VAR + OPSIZE_FAIL + OPSIZE_JUMP + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_CUT_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP; + + if (IS_NOT_NULL(node->lead_node)) { + int llen = compile_length_tree(node->lead_node, reg); + if (llen < 0) return llen; + + len += OPSIZE_MOVE + llen; + } + } break; case 
ANCR_LOOK_BEHIND_NOT: - len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END; + if (node->char_min_len == node->char_max_len) + len = OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + tlen + OPSIZE_POP_TO_MARK + OPSIZE_FAIL + OPSIZE_POP; + else { + len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_POP; + + if (IS_NOT_NULL(node->lead_node)) { + int llen = compile_length_tree(node->lead_node, reg); + if (llen < 0) return llen; + + len += OPSIZE_MOVE + llen; + } + } break; case ANCR_WORD_BOUNDARY: @@ -1927,11 +2016,255 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) return len; } +static int +compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +{ + int r; + + if (node->char_min_len == node->char_max_len) { + MemNumType mid; + + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = 0; + COP(reg)->step_back_start.addr = 1; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = FALSE; + } + else { + MemNumType mid1, mid2; + OnigLen diff; + + if (IS_NOT_NULL(node->lead_node)) { + MinMaxCharLen ci; + + r = node_char_len(node->lead_node, reg, &ci, env); + if (r < 0) return r; + r = add_op(reg, OP_MOVE); + if (r != 0) return r; + //ORIG: COP(reg)->move.n = (RelPositionType )(-ci.min); + COP(reg)->move.n = (RelPositionType )(0-ci.min); + r = compile_tree(node->lead_node, reg, env); + if (r != 0) return r; + } + + 
ID_ENTRY(env, mid1); + r = add_op(reg, OP_SAVE_VAL); + if (r != 0) return r; + COP(reg)->save_val.type = SAVE_RIGHT_RANGE; + COP(reg)->save_val.id = mid1; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S; + + ID_ENTRY(env, mid2); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid2; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP; + + r = add_op(reg, OP_JUMP); + if (r != 0) return r; + COP(reg)->jump.addr = SIZE_INC + OPSIZE_UPDATE_VAR + OPSIZE_FAIL; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + r = add_op(reg, OP_FAIL); + if (r != 0) return r; + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + + if (node->char_max_len != INFINITE_LEN) + diff = node->char_max_len - node->char_min_len; + else + diff = INFINITE_LEN; + + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = diff; + COP(reg)->step_back_start.addr = 2; + + r = add_op(reg, OP_STEP_BACK_NEXT); + if (r != 0) return r; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CHECK_POSITION); + if (r != 0) return r; + COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE; + + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid2; + COP(reg)->cut_to_mark.restore_pos = FALSE; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + + r = add_op(reg, OP_POP); + } + + return r; +} + +static int +compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg, + ScanEnv* env) +{ + int r; + int len; + + len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); + + if 
(node->char_min_len == node->char_max_len) { + MemNumType mid; + + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + len + OPSIZE_POP_TO_MARK + OPSIZE_FAIL; + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = 0; + COP(reg)->step_back_start.addr = 1; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_POP_TO_MARK); + if (r != 0) return r; + COP(reg)->pop_to_mark.id = mid; + r = add_op(reg, OP_FAIL); + if (r != 0) return r; + r = add_op(reg, OP_POP); + } + else { + MemNumType mid1, mid2; + OnigLen diff; + + ID_ENTRY(env, mid1); + r = add_op(reg, OP_SAVE_VAL); + if (r != 0) return r; + COP(reg)->save_val.type = SAVE_RIGHT_RANGE; + COP(reg)->save_val.id = mid1; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S; + + ID_ENTRY(env, mid2); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid2; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + len + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL; + + if (IS_NOT_NULL(node->lead_node)) { + int clen; + MinMaxCharLen ci; + + clen = compile_length_tree(node->lead_node, reg); + COP(reg)->push.addr += OPSIZE_MOVE + clen; + + r = node_char_len(node->lead_node, reg, &ci, env); + if (r < 0) return r; + r = add_op(reg, OP_MOVE); + if (r != 0) return r; + //ORIG: COP(reg)->move.n = (RelPositionType )(-ci.min); + COP(reg)->move.n = (RelPositionType )(0-ci.min); + + r = compile_tree(node->lead_node, reg, env); + if (r != 0) return 
r; + } + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + + if (node->char_max_len != INFINITE_LEN) + diff = node->char_max_len - node->char_min_len; + else + diff = INFINITE_LEN; + + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = diff; + COP(reg)->step_back_start.addr = 2; + + r = add_op(reg, OP_STEP_BACK_NEXT); + if (r != 0) return r; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CHECK_POSITION); + if (r != 0) return r; + COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE; + + r = add_op(reg, OP_POP_TO_MARK); + if (r != 0) return r; + COP(reg)->pop_to_mark.id = mid2; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + + r = add_op(reg, OP_POP); /* pop save val */ + if (r != 0) return r; + r = add_op(reg, OP_FAIL); + if (r != 0) return r; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + + r = add_op(reg, OP_POP); /* pop mark */ + if (r != 0) return r; + r = add_op(reg, OP_POP); /* pop save val */ + } + + return r; +} + static int compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) { int r, len; enum OpCode op; + MemNumType mid; switch (node->type) { case ANCR_BEGIN_BUF: r = add_op(reg, OP_BEGIN_BUF); break; @@ -1939,7 +2272,11 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) case ANCR_BEGIN_LINE: r = add_op(reg, OP_BEGIN_LINE); break; case ANCR_END_LINE: r = add_op(reg, OP_END_LINE); break; case ANCR_SEMI_END_BUF: r = add_op(reg, OP_SEMI_END_BUF); break; - case ANCR_BEGIN_POSITION: r = add_op(reg, OP_BEGIN_POSITION); break; + case ANCR_BEGIN_POSITION: + r = add_op(reg, OP_CHECK_POSITION); + if (r != 0) return r; + COP(reg)->check_position.type = CHECK_POSITION_SEARCH_START; + 
break; case ANCR_WORD_BOUNDARY: op = OP_WORD_BOUNDARY; @@ -1982,43 +2319,58 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) break; case ANCR_PREC_READ: - r = add_op(reg, OP_PREC_READ_START); - if (r != 0) return r; - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); - if (r != 0) return r; - r = add_op(reg, OP_PREC_READ_END); + { + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = TRUE; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = TRUE; + } break; case ANCR_PREC_READ_NOT: - len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); - if (len < 0) return len; + { + len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); + if (len < 0) return len; - r = add_op(reg, OP_PREC_READ_NOT_START); - if (r != 0) return r; - COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END; - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); - if (r != 0) return r; - r = add_op(reg, OP_PREC_READ_NOT_END); + ID_ENTRY(env, mid); + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_MARK + len + + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL; + + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = FALSE; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_POP_TO_MARK); + if (r != 0) return r; + COP(reg)->pop_to_mark.id = mid; + + r = add_op(reg, OP_POP); + if (r != 0) return r; + r = add_op(reg, OP_FAIL); + } break; case ANCR_LOOK_BEHIND: - r = add_op(reg, OP_LOOK_BEHIND); - if (r != 0) return r; - COP(reg)->look_behind.len = node->char_len; - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + r = compile_anchor_look_behind_node(node, reg, env); break; case 
ANCR_LOOK_BEHIND_NOT: - len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); - r = add_op(reg, OP_LOOK_BEHIND_NOT_START); - if (r != 0) return r; - - COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; - COP(reg)->look_behind_not_start.len = node->char_len; - - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); - if (r != 0) return r; - r = add_op(reg, OP_LOOK_BEHIND_NOT_END); + r = compile_anchor_look_behind_not_node(node, reg, env); break; default: @@ -2032,7 +2384,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) static int compile_gimmick_node(GimmickNode* node, regex_t* reg) { - int r; + int r = 0; switch (node->type) { case GIMMICK_FAIL: @@ -2863,7 +3215,7 @@ is_exclusive(Node* x, Node* y, regex_t* reg) } static Node* -get_head_value_node(Node* node, int exact, regex_t* reg) +get_tree_head_literal(Node* node, int exact, regex_t* reg) { Node* n = NULL_NODE; @@ -2886,7 +3238,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; case NODE_LIST: - n = get_head_value_node(NODE_CAR(node), exact, reg); + n = get_tree_head_literal(NODE_CAR(node), exact, reg); break; case NODE_STRING: @@ -2910,7 +3262,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) if (IS_NOT_NULL(qn->head_exact)) n = qn->head_exact; else - n = get_head_value_node(NODE_BODY(node), exact, reg); + n = get_tree_head_literal(NODE_BODY(node), exact, reg); } } break; @@ -2920,19 +3272,10 @@ get_head_value_node(Node* node, int exact, regex_t* reg) BagNode* en = BAG_(node); switch (en->type) { case BAG_OPTION: - { - OnigOptionType options = reg->options; - - reg->options = BAG_(node)->o.options; - n = get_head_value_node(NODE_BODY(node), exact, reg); - reg->options = options; - } - break; - case BAG_MEMORY: case BAG_STOP_BACKTRACK: case BAG_IF_ELSE: - n = get_head_value_node(NODE_BODY(node), exact, reg); + n = get_tree_head_literal(NODE_BODY(node), exact, reg); break; } } @@ -2940,7 +3283,7 @@ get_head_value_node(Node* node, 
int exact, regex_t* reg) case NODE_ANCHOR: if (ANCHOR_(node)->type == ANCR_PREC_READ) - n = get_head_value_node(NODE_BODY(node), exact, reg); + n = get_tree_head_literal(NODE_BODY(node), exact, reg); break; case NODE_GIMMICK: @@ -2951,42 +3294,239 @@ get_head_value_node(Node* node, int exact, regex_t* reg) return n; } +enum GetValue { + GET_VALUE_NONE = -1, + GET_VALUE_IGNORE = 0, + GET_VALUE_FOUND = 1 +}; + static int -check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask) +get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg) { + int r; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + if (IS_NULL(NODE_CDR(node))) { + r = get_tree_tail_literal(NODE_CAR(node), rnode, reg); + } + else { + r = get_tree_tail_literal(NODE_CDR(node), rnode, reg); + if (r == GET_VALUE_IGNORE) { + r = get_tree_tail_literal(NODE_CAR(node), rnode, reg); + } + } + break; + +#ifdef USE_CALL + case NODE_CALL: + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + break; +#endif + + case NODE_CTYPE: + if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) { + r = GET_VALUE_NONE; + break; + } + /* fall */ + case NODE_CCLASS: + *rnode = node; + r = GET_VALUE_FOUND; + break; + + case NODE_STRING: + { + StrNode* sn = STR_(node); + + if (sn->end <= sn->s) { + r = GET_VALUE_IGNORE; + break; + } + + if (NODE_IS_IGNORECASE(node) && ! 
NODE_STRING_IS_CRUDE(node)) { + r = GET_VALUE_NONE; + break; + } + + *rnode = node; + r = GET_VALUE_FOUND; + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + if (qn->lower != 0) { + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + } + else + r = GET_VALUE_NONE; + } + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_MARK1(node)) + r = GET_VALUE_NONE; + else { + NODE_STATUS_ADD(node, MARK1); + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + NODE_STATUS_REMOVE(node, MARK1); + } + } + else { + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + } + } + break; + + case NODE_ANCHOR: + case NODE_GIMMICK: + r = GET_VALUE_IGNORE; + break; + + case NODE_ALT: + case NODE_BACKREF: + default: + r = GET_VALUE_NONE; + break; + } + + return r; +} + +static int +check_called_node_in_look_behind(Node* node, int not) +{ + int r; + + r = 0; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = check_called_node_in_look_behind(NODE_CAR(node), not); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_QUANT: + r = check_called_node_in_look_behind(NODE_BODY(node), not); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_MARK1(node)) + return 0; + else { + NODE_STATUS_ADD(node, MARK1); + r = check_called_node_in_look_behind(NODE_BODY(node), not); + NODE_STATUS_REMOVE(node, MARK1); + } + } + else { + r = check_called_node_in_look_behind(NODE_BODY(node), not); + if (r == 0 && en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = check_called_node_in_look_behind(en->te.Then, not); + if (r != 0) break; + } + if (IS_NOT_NULL(en->te.Else)) { + r = check_called_node_in_look_behind(en->te.Else, not); + } + } + } + } + break; + + case NODE_ANCHOR: + if (IS_NOT_NULL(NODE_BODY(node))) + r = check_called_node_in_look_behind(NODE_BODY(node), not); + break; + + case NODE_GIMMICK: 
+ if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0) + return 1; + break; + + default: + break; + } + + return r; +} + +/* allowed node types in look-behind */ +#define ALLOWED_TYPE_IN_LB \ + ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \ + | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \ + | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK) + +#define ALLOWED_BAG_IN_LB ( 1<type) & bag_mask) == 0) + if (((1<type) & bag_mask[not]) == 0) return 1; - r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(NODE_BODY(node), not); if (r == 0 && en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = check_type_tree(en->te.Then, type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(en->te.Then, not); if (r != 0) break; } if (IS_NOT_NULL(en->te.Else)) { - r = check_type_tree(en->te.Else, type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(en->te.Else, not); } } } @@ -2994,14 +3534,22 @@ check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask) case NODE_ANCHOR: type = ANCHOR_(node)->type; - if ((type & anchor_mask) == 0) + if ((type & anchor_mask[not]) == 0) return 1; if (IS_NOT_NULL(NODE_BODY(node))) - r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(NODE_BODY(node), not); break; case NODE_GIMMICK: + if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0) + return 1; + break; + + case NODE_CALL: + r = check_called_node_in_look_behind(NODE_BODY(node), not); + break; + default: break; } @@ -4006,9 +4554,13 @@ reduce_string_list(Node* node) static int divide_look_behind_alternatives(Node* node) { + int r; + int anc_type; Node *head, *np, *insert_node; - AnchorNode* an = ANCHOR_(node); - int anc_type = an->type; + AnchorNode* an; + + an = ANCHOR_(node); + anc_type = an->type; head = NODE_ANCHOR_BODY(an); np = NODE_CAR(head); @@ -4018,7 +4570,8 @@ divide_look_behind_alternatives(Node* 
node) np = node; while (IS_NOT_NULL(np = NODE_CDR(np))) { - insert_node = onig_node_copy(head); + r = onig_node_copy(&insert_node, head); + if (r != 0) return r; CHECK_NULL_RETURN_MEMERR(insert_node); NODE_BODY(insert_node) = NODE_CAR(np); NODE_CAR(np) = insert_node; @@ -4033,6 +4586,73 @@ divide_look_behind_alternatives(Node* node) return 0; } +static int +node_reduce_in_look_behind(Node* node) +{ + NodeType type; + Node* body; + + if (NODE_TYPE(node) != NODE_QUANT) return 0; + + body = NODE_BODY(node); + type = NODE_TYPE(body); + if (type == NODE_STRING || type == NODE_CTYPE || + type == NODE_CCLASS || type == NODE_BACKREF) { + QuantNode* qn = QUANT_(node); + qn->upper = qn->lower; + if (qn->upper == 0) + return 1; /* removed */ + } + + return 0; +} + +static int +list_reduce_in_look_behind(Node* node) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_QUANT: + r = node_reduce_in_look_behind(node); + if (r > 0) r = 0; + break; + + case NODE_LIST: + do { + r = node_reduce_in_look_behind(NODE_CAR(node)); + if (r <= 0) break; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + default: + r = 0; + break; + } + + return r; +} + +static int +alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_ALT: + do { + r = list_reduce_in_look_behind(NODE_CAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + default: + r = list_reduce_in_look_behind(node); + break; + } + + return r; +} + static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); static int @@ -4040,7 +4660,8 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) { int r; int state1; - MinMaxLen ci; + MinMaxCharLen ci; + Node* body; AnchorNode* an = ANCHOR_(node); if (an->type == ANCR_LOOK_BEHIND_NOT) @@ -4048,29 +4669,59 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) else state1 = state | IN_LOOK_BEHIND; + body = NODE_ANCHOR_BODY(an); /* Execute 
tune_tree(body) before call node_char_len(). Because case-fold expansion must be done before node_char_len(). */ - r = tune_tree(NODE_ANCHOR_BODY(an), reg, state1, env); + r = tune_tree(body, reg, state1, env); if (r != 0) return r; - r = node_char_len(NODE_ANCHOR_BODY(an), reg, &ci, env); + r = alt_reduce_in_look_behind(body, reg, env); + if (r != 0) return r; + + r = node_char_len(body, reg, &ci, env); if (r >= 0) { + if (ci.min == 0 && ci.min_is_sure != 0) { + if (an->type == ANCR_LOOK_BEHIND_NOT) + r = onig_node_reset_fail(node); + else + r = onig_node_reset_empty(node); + + return r; + } + if (r == CHAR_LEN_TOP_ALT_FIXED) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) { r = divide_look_behind_alternatives(node); if (r == 0) r = tune_tree(node, reg, state, env); } + else if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) + goto normal; else r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } else { /* CHAR_LEN_NORMAL */ - if (mml_fixed(&ci)) { - an->char_len = ci.min; + normal: + if (ci.min == INFINITE_LEN) { + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } else { - r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + if (ci.min != ci.max && + ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) { + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else { + Node* tail; + an->char_min_len = ci.min; + an->char_max_len = ci.max; + r = get_tree_tail_literal(body, &tail, reg); + if (r == GET_VALUE_FOUND) { + r = onig_node_copy(&(an->lead_node), tail); + if (r != 0) return r; + } + r = ONIG_NORMAL; + } } } } @@ -4089,7 +4740,7 @@ tune_next(Node* node, Node* next_node, regex_t* reg) QuantNode* qn = QUANT_(node); if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT - Node* n = get_head_value_node(next_node, 1, reg); + Node* n = get_tree_head_literal(next_node, 1, reg); /* '\0': for UTF-16BE etc... 
*/ if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { qn->next_head_exact = n; @@ -4099,9 +4750,9 @@ tune_next(Node* node, Node* next_node, regex_t* reg) if (qn->lower <= 1) { if (is_strict_real_node(NODE_BODY(node))) { Node *x, *y; - x = get_head_value_node(NODE_BODY(node), 0, reg); + x = get_tree_head_literal(NODE_BODY(node), 0, reg); if (IS_NOT_NULL(x)) { - y = get_head_value_node(next_node, 0, reg); + y = get_tree_head_literal(next_node, 0, reg); if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) { Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); @@ -4946,27 +5597,6 @@ __inline static int tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) { -/* allowed node types in look-behind */ -#define ALLOWED_TYPE_IN_LB \ - ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \ - | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \ - | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK) - -#define ALLOWED_BAG_IN_LB ( 1< 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = tune_look_behind(node, reg, state, env); - } - break; - case ANCR_LOOK_BEHIND_NOT: - { - r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB, - ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); - if (r < 0) return r; - if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = tune_look_behind(node, reg, state, env); - } + r = check_node_in_look_behind(NODE_ANCHOR_BODY(an), + an->type == ANCR_LOOK_BEHIND_NOT ? 
1 : 0); + if (r < 0) return r; + if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + r = tune_look_behind(node, reg, state, env); break; default: @@ -5071,7 +5690,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } } else { - qn->head_exact = get_head_value_node(NODE_BODY(node), 1, reg); + qn->head_exact = get_tree_head_literal(NODE_BODY(node), 1, reg); } } diff --git a/oniguruma/src/regenc.h b/oniguruma/src/regenc.h index 2b5fb5a3b..df029aa96 100644 --- a/oniguruma/src/regenc.h +++ b/oniguruma/src/regenc.h @@ -5,7 +5,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,7 +35,10 @@ #endif #include "config.h" + +#ifndef ONIG_NO_STANDARD_C_HEADERS #include +#endif #ifdef ONIG_ESCAPE_UCHAR_COLLISION #undef ONIG_ESCAPE_UCHAR_COLLISION diff --git a/oniguruma/src/regerror.c b/oniguruma/src/regerror.c index 7b87e5ce3..ab0decd67 100644 --- a/oniguruma/src/regerror.c +++ b/oniguruma/src/regerror.c @@ -3,7 +3,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,10 +28,12 @@ * SUCH DAMAGE. */ -#include "regint.h" -#include /* for vsnprintf() */ +#ifndef NEED_TO_INCLUDE_STDIO +/* for vsnprintf() */ +#define NEED_TO_INCLUDE_STDIO +#endif -#include +#include "regint.h" extern UChar* onig_error_code_to_format(int code) @@ -278,7 +280,8 @@ onig_is_error_code_needs_param(int code) /* for ONIG_MAX_ERROR_MESSAGE_LEN */ #define MAX_ERROR_PAR_LEN 30 -extern int onig_error_code_to_str(UChar* s, int code, ...) +extern int ONIG_VARIADIC_FUNC_ATTR +onig_error_code_to_str(UChar* s, int code, ...) 
{ UChar *p, *q; OnigErrorInfo* einfo; @@ -338,8 +341,9 @@ extern int onig_error_code_to_str(UChar* s, int code, ...) } -void onig_snprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, - UChar* pat, UChar* pat_end, const UChar *fmt, ...) +void ONIG_VARIADIC_FUNC_ATTR +onig_snprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, + UChar* pat, UChar* pat_end, const UChar *fmt, ...) { int n, need, len; UChar *p, *s, *bp; diff --git a/oniguruma/src/regexec.c b/oniguruma/src/regexec.c index 5efe3a50a..0487f88d2 100644 --- a/oniguruma/src/regexec.c +++ b/oniguruma/src/regexec.c @@ -26,6 +26,13 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ + +#ifndef ONIG_NO_PRINT +#ifndef NEED_TO_INCLUDE_STDIO +#define NEED_TO_INCLUDE_STDIO +#endif +#endif + #include "regint.h" #define IS_MBC_WORD_ASCII_MODE(enc,s,end,mode) \ @@ -203,7 +210,7 @@ static OpInfoType OpInfo[] = { { OP_BEGIN_LINE, "begin-line"}, { OP_END_LINE, "end-line"}, { OP_SEMI_END_BUF, "semi-end-buf"}, - { OP_BEGIN_POSITION, "begin-position"}, + { OP_CHECK_POSITION, "check-position"}, { OP_BACKREF1, "backref1"}, { OP_BACKREF2, "backref2"}, { OP_BACKREF_N, "backref-n"}, @@ -228,7 +235,8 @@ static OpInfoType OpInfo[] = { { OP_JUMP, "jump"}, { OP_PUSH, "push"}, { OP_PUSH_SUPER, "push-super"}, - { OP_POP_OUT, "pop-out"}, + { OP_POP, "pop"}, + { OP_POP_TO_MARK, "pop-to-mark"}, #ifdef USE_OP_PUSH_OR_JUMP_EXACT { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1"}, #endif @@ -243,15 +251,11 @@ static OpInfoType OpInfo[] = { #ifdef USE_CALL { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push"}, #endif - { OP_PREC_READ_START, "push-pos"}, - { OP_PREC_READ_END, "pop-pos"}, - { OP_PREC_READ_NOT_START, "prec-read-not-start"}, - { OP_PREC_READ_NOT_END, "prec-read-not-end"}, - { OP_ATOMIC_START, "atomic-start"}, - { OP_ATOMIC_END, "atomic-end"}, - { OP_LOOK_BEHIND, "look-behind"}, - { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start"}, - { OP_LOOK_BEHIND_NOT_END, 
"look-behind-not-end"}, + { OP_MOVE, "move"}, + { OP_STEP_BACK_START, "step-back-start"}, + { OP_STEP_BACK_NEXT, "step-back-next"}, + { OP_CUT_TO_MARK, "cut-to-mark"}, + { OP_MARK, "mark"}, { OP_SAVE_VAL, "save-val"}, { OP_UPDATE_VAR, "update-var"}, #ifdef USE_CALL @@ -529,24 +533,6 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, fprintf(f, ":%d", mem); break; - case OP_PREC_READ_NOT_START: - addr = p->prec_read_not_start.addr; - fputc(':', f); - p_rel_addr(f, addr, p, start); - break; - - case OP_LOOK_BEHIND: - len = p->look_behind.len; - fprintf(f, ":%d", len); - break; - - case OP_LOOK_BEHIND_NOT_START: - addr = p->look_behind_not_start.addr; - len = p->look_behind_not_start.len; - fprintf(f, ":%d:", len); - p_rel_addr(f, addr, p, start); - break; - #ifdef USE_CALL case OP_CALL: addr = p->call.addr; @@ -554,6 +540,43 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; #endif + case OP_MOVE: + fprintf(f, ":%d", p->move.n); + break; + + case OP_STEP_BACK_START: + addr = p->step_back_start.addr; + fprintf(f, ":%d:%d:", + p->step_back_start.initial, + p->step_back_start.remaining); + p_rel_addr(f, addr, p, start); + break; + + case OP_POP_TO_MARK: + mem = p->pop_to_mark.id; + fprintf(f, ":%d", mem); + break; + + case OP_CUT_TO_MARK: + { + int restore; + + mem = p->cut_to_mark.id; + restore = p->cut_to_mark.restore_pos; + fprintf(f, ":%d:%d", mem, restore); + } + break; + + case OP_MARK: + { + int save; + + mem = p->mark.id; + save = p->mark.save_pos; + fprintf(f, ":%d:%d", mem, save); + } + break; + case OP_SAVE_VAL: { SaveType type; @@ -596,6 +619,17 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, fprintf(f, ":not"); break; + case OP_CHECK_POSITION: + switch (p->check_position.type) { + case CHECK_POSITION_SEARCH_START: + fprintf(f, ":search-start"); break; + case CHECK_POSITION_CURRENT_RIGHT_RANGE: + fprintf(f, ":current-right-range"); break; + default: + break; + }; + break; + case OP_FINISH: case OP_END: case 
OP_ANYCHAR: @@ -611,17 +645,11 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_BEGIN_LINE: case OP_END_LINE: case OP_SEMI_END_BUF: - case OP_BEGIN_POSITION: case OP_BACKREF1: case OP_BACKREF2: case OP_FAIL: - case OP_POP_OUT: - case OP_PREC_READ_START: - case OP_PREC_READ_END: - case OP_PREC_READ_NOT_END: - case OP_ATOMIC_START: - case OP_ATOMIC_END: - case OP_LOOK_BEHIND_NOT_END: + case OP_POP: + case OP_STEP_BACK_NEXT: #ifdef USE_CALL case OP_RETURN: #endif @@ -976,8 +1004,6 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) /* used by normal-POP */ #define STK_SUPER_ALT STK_ALT_FLAG #define STK_ALT (0x0002 | STK_ALT_FLAG) -#define STK_ALT_PREC_READ_NOT (0x0004 | STK_ALT_FLAG) -#define STK_ALT_LOOK_BEHIND_NOT (0x0006 | STK_ALT_FLAG) /* handled by normal-POP */ #define STK_MEM_START 0x0010 @@ -1000,13 +1026,10 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #endif #define STK_EMPTY_CHECK_END 0x5000 /* for recursive call */ #define STK_MEM_END_MARK 0x8100 -#define STK_TO_VOID_START 0x1200 /* mark for "(?>...)" */ -/* #define STK_REPEAT 0x0300 */ #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 -#define STK_PREC_READ_START 0x0700 -#define STK_PREC_READ_END 0x0800 +#define STK_MARK 0x0704 /* stack type check mask */ #define STK_MASK_POP_USED STK_ALT_FLAG @@ -1580,6 +1603,16 @@ stack_double(int is_alloca, char** arg_alloc_base, STACK_INC;\ } while(0) +#define STACK_PUSH_WITH_ZID(stack_type,pat,s,sprev,id) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + stk->zid = (int )(id);\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + STACK_INC;\ +} while(0) + #define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ @@ -1604,13 +1637,8 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_SUPER_ALT(pat,s,sprev) 
STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_PREC_READ_START(s,sprev) \ - STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev) -#define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \ - STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev) -#define STACK_PUSH_TO_VOID_START STACK_PUSH_TYPE(STK_TO_VOID_START) -#define STACK_PUSH_ALT_LOOK_BEHIND_NOT(pat,s,sprev) \ - STACK_PUSH(STK_ALT_LOOK_BEHIND_NOT,pat,s,sprev) +#define STACK_PUSH_ALT_WITH_ZID(pat,s,sprev,id) \ + STACK_PUSH_WITH_ZID(STK_ALT,pat,s,sprev,id) #if 0 #define STACK_PUSH_REPEAT(sid, pat) do {\ @@ -1726,6 +1754,22 @@ stack_double(int is_alloca, char** arg_alloc_base, STACK_INC;\ } while(0) +#define STACK_PUSH_MARK(sid) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MARK;\ + stk->zid = (sid);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_MARK_WITH_POS(sid, s, sprev) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MARK;\ + stk->zid = (sid);\ + stk->u.val.v = (UChar* )(s);\ + stk->u.val.v2 = (sprev);\ + STACK_INC;\ +} while(0) + #define STACK_PUSH_SAVE_VAL(sid, stype, sval) do {\ STACK_ENSURE(1);\ stk->type = STK_SAVE_VAL;\ @@ -1884,6 +1928,32 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while(0) +#define STACK_POP_TO_MARK(sid) do {\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk, "STACK_POP_TO_MARK");\ + if ((stk->type & STK_MASK_POP_HANDLED_TIL) != 0) {\ + if (stk->type == STK_MARK) {\ + if (stk->zid == (sid)) break;\ + }\ + else {\ + if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ + mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ + }\ + else if (stk->type == STK_MEM_END) {\ + mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ + mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ + }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ + /* Don't call callout here because negation of total success by (?!..) 
(?type == STK_TO_VOID_START) {\ + if (k->type == STK_MARK) {\ + if (k->zid == (sid)) {\ + k->type = STK_VOID;\ + break;\ + } /* don't void different id mark */ \ + }\ + else\ k->type = STK_VOID;\ - break;\ - }\ - k->type = STK_VOID;\ }\ }\ } while(0) -#define STACK_GET_PREC_READ_START(k) do {\ - int level = 0;\ - k = stk;\ - while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\ - if (IS_TO_VOID_TARGET(k)) {\ - k->type = STK_VOID;\ - }\ - else if (k->type == STK_PREC_READ_START) {\ - if (level == 0) {\ - break;\ - }\ - level--;\ - }\ - else if (k->type == STK_PREC_READ_END) {\ - level++;\ - }\ - }\ -} while(0) - - #define EMPTY_CHECK_START_SEARCH(sid, k) do {\ k = stk;\ while (1) {\ @@ -2373,27 +2416,19 @@ backref_check_at_nested_level(regex_t* reg, #ifdef ONIG_DEBUG_STATISTICS -#define USE_TIMEOFDAY - #ifdef USE_TIMEOFDAY -#ifdef HAVE_SYS_TIME_H -#include -#endif -#ifdef HAVE_UNISTD_H -#include -#endif + static struct timeval ts, te; #define GETTIME(t) gettimeofday(&(t), (struct timezone* )0) #define TIMEDIFF(te,ts) (((te).tv_usec - (ts).tv_usec) + \ (((te).tv_sec - (ts).tv_sec)*1000000)) #else -#ifdef HAVE_SYS_TIMES_H -#include -#endif + static struct tms ts, te; #define GETTIME(t) times(&(t)) #define TIMEDIFF(te,ts) ((te).tms_utime - (ts).tms_utime) -#endif + +#endif /* USE_TIMEOFDAY */ static int OpCounter[256]; static int OpPrevCounter[256]; @@ -2605,7 +2640,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_BEGIN_LINE, &&L_END_LINE, &&L_SEMI_END_BUF, - &&L_BEGIN_POSITION, + &&L_CHECK_POSITION, &&L_BACKREF1, &&L_BACKREF2, &&L_BACKREF_N, @@ -2630,7 +2665,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_JUMP, &&L_PUSH, &&L_PUSH_SUPER, - &&L_POP_OUT, + &&L_POP, + &&L_POP_TO_MARK, #ifdef USE_OP_PUSH_OR_JUMP_EXACT &&L_PUSH_OR_JUMP_EXACT1, #endif @@ -2645,15 +2681,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_CALL &&L_EMPTY_CHECK_END_MEMST_PUSH, #endif - &&L_PREC_READ_START, - 
&&L_PREC_READ_END, - &&L_PREC_READ_NOT_START, - &&L_PREC_READ_NOT_END, - &&L_ATOMIC_START, - &&L_ATOMIC_END, - &&L_LOOK_BEHIND, - &&L_LOOK_BEHIND_NOT_START, - &&L_LOOK_BEHIND_NOT_END, + &&L_MOVE, + &&L_STEP_BACK_START, + &&L_STEP_BACK_NEXT, + &&L_CUT_TO_MARK, + &&L_MARK, &&L_SAVE_VAL, &&L_UPDATE_VAR, #ifdef USE_CALL @@ -2671,7 +2703,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, LengthType tlen, tlen2; MemNumType mem; RelAddrType addr; - UChar *s, *q, *ps, *sbegin; + UChar *s, *ps, *sbegin; UChar *right_range; int is_alloca; char *alloc_base; @@ -3404,10 +3436,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif goto fail; - CASE_OP(BEGIN_POSITION) - if (s != msa->start) - goto fail; - + CASE_OP(CHECK_POSITION) + switch (p->check_position.type) { + case CHECK_POSITION_SEARCH_START: + if (s != msa->start) goto fail; + break; + case CHECK_POSITION_CURRENT_RIGHT_RANGE: + if (s != right_range) goto fail; + break; + default: + break; + } INC_OP; JUMP_OUT; @@ -3753,13 +3792,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(POP_OUT) + CASE_OP(POP) STACK_POP_ONE; /* for stop backtrack */ /* CHECK_RETRY_LIMIT_IN_MATCH; */ INC_OP; JUMP_OUT; + CASE_OP(POP_TO_MARK) + STACK_POP_TO_MARK(p->pop_to_mark.id); + INC_OP; + JUMP_OUT; + #ifdef USE_OP_PUSH_OR_JUMP_EXACT CASE_OP(PUSH_OR_JUMP_EXACT1) { @@ -3854,70 +3898,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(PREC_READ_START) - STACK_PUSH_PREC_READ_START(s, sprev); - INC_OP; - JUMP_OUT; - - CASE_OP(PREC_READ_END) - STACK_GET_PREC_READ_START(stkp); - s = stkp->u.state.pstr; - sprev = stkp->u.state.pstr_prev; - STACK_PUSH(STK_PREC_READ_END,0,0,0); - INC_OP; - JUMP_OUT; - - CASE_OP(PREC_READ_NOT_START) - addr = p->prec_read_not_start.addr; - STACK_PUSH_ALT_PREC_READ_NOT(p + addr, s, sprev); - INC_OP; - JUMP_OUT; - - CASE_OP(PREC_READ_NOT_END) - STACK_POP_TIL_ALT_PREC_READ_NOT; - goto fail; - - 
CASE_OP(ATOMIC_START) - STACK_PUSH_TO_VOID_START; - INC_OP; - JUMP_OUT; - - CASE_OP(ATOMIC_END) - STACK_EXEC_TO_VOID(stkp); - INC_OP; - JUMP_OUT; - - CASE_OP(LOOK_BEHIND) - tlen = p->look_behind.len; - s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); - if (IS_NULL(s)) goto fail; - sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); - INC_OP; - JUMP_OUT; - - CASE_OP(LOOK_BEHIND_NOT_START) - addr = p->look_behind_not_start.addr; - tlen = p->look_behind_not_start.len; - q = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); - if (IS_NULL(q)) { - /* too short case -> success. ex. /(?call.addr; @@ -3931,6 +3911,72 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, JUMP_OUT; #endif + CASE_OP(MOVE) + if (p->move.n < 0) { + s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, -p->move.n); + if (IS_NULL(s)) goto fail; + } + else { + int len; + + for (tlen = 0; tlen < p->move.n; tlen++) { + len = enclen(encode, s); + if (s + len > end) goto fail; + sprev = s; + s += len; + } + } + sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + INC_OP; + JUMP_OUT; + + CASE_OP(STEP_BACK_START) + tlen = p->step_back_start.initial; + if (tlen != 0) { + s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); + if (IS_NULL(s)) goto fail; + sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + } + if (p->step_back_start.remaining != 0) { + STACK_PUSH_ALT_WITH_ZID(p + 1, s, sprev, p->step_back_start.remaining); + p += p->step_back_start.addr; + } + else + INC_OP; + JUMP_OUT; + + CASE_OP(STEP_BACK_NEXT) + tlen = (LengthType )stk->zid; /* remaining count */ + if (tlen != INFINITE_LEN) tlen--; + s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, 1); + if (IS_NULL(s)) goto fail; + sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + if (tlen != 0) { + STACK_PUSH_ALT_WITH_ZID(p, s, sprev, (int )tlen); + } + INC_OP; + JUMP_OUT; + + CASE_OP(CUT_TO_MARK) + mem = p->cut_to_mark.id; /* mem: mark id */ + STACK_TO_VOID_TO_MARK(stkp, mem); + if 
(p->cut_to_mark.restore_pos != 0) { + s = stkp->u.val.v; + sprev = stkp->u.val.v2; + } + INC_OP; + JUMP_OUT; + + CASE_OP(MARK) + mem = p->mark.id; /* mem: mark id */ + if (p->mark.save_pos != 0) + STACK_PUSH_MARK_WITH_POS(mem, s, sprev); + else + STACK_PUSH_MARK(mem); + + INC_OP; + JUMP_OUT; + CASE_OP(SAVE_VAL) { SaveType type; @@ -3960,13 +4006,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, enum SaveType save_type; type = p->update_var.type; - mem = p->update_var.id; /* mem: save id */ switch ((enum UpdateVarType )type) { case UPDATE_VAR_KEEP_FROM_STACK_LAST: STACK_GET_SAVE_VAL_TYPE_LAST(SAVE_KEEP, keep); break; case UPDATE_VAR_S_FROM_STACK: + mem = p->update_var.id; /* mem: save id */ STACK_GET_SAVE_VAL_TYPE_LAST_ID_WITH_SPREV(SAVE_S, mem, s); break; case UPDATE_VAR_RIGHT_RANGE_FROM_S_STACK: @@ -3976,8 +4022,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case UPDATE_VAR_RIGHT_RANGE_FROM_STACK: save_type = SAVE_RIGHT_RANGE; get_save_val_type_last_id: + mem = p->update_var.id; /* mem: save id */ STACK_GET_SAVE_VAL_TYPE_LAST_ID(save_type, mem, right_range); break; + case UPDATE_VAR_RIGHT_RANGE_TO_S: + right_range = s; + break; case UPDATE_VAR_RIGHT_RANGE_INIT: INIT_RIGHT_RANGE; break; @@ -6238,9 +6288,7 @@ onig_builtin_cmp(OnigCalloutArgs* args, void* user_data ARG_UNUSED) } -#ifndef ONIGURUMA_UNSUPPORTED_PRINT - -#include +#ifndef ONIG_NO_PRINT static FILE* OutFp; @@ -6338,6 +6386,6 @@ onig_setup_builtin_monitors_by_ascii_encoded_name(void* fp /* FILE* */) return ONIG_NORMAL; } -#endif /* ONIGURUMA_UNSUPPORTED_PRINT */ +#endif /* ONIG_NO_PRINT */ #endif /* USE_CALLOUT */ diff --git a/oniguruma/src/regint.h b/oniguruma/src/regint.h index 0e32c7377..926e96cc9 100644 --- a/oniguruma/src/regint.h +++ b/oniguruma/src/regint.h @@ -74,76 +74,31 @@ #define USE_OP_PUSH_OR_JUMP_EXACT #define USE_QUANT_PEEK_NEXT #define USE_ST_LIBRARY +#define USE_TIMEOFDAY -#define USE_WORD_BEGIN_END /* "\<", "\>" */ +#define USE_WORD_BEGIN_END /* "\<", 
"\>" */ #define USE_CAPTURE_HISTORY #define USE_VARIABLE_META_CHARS #define USE_POSIX_API_REGION_OPTION #define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE /* #define USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ +#define INIT_MATCH_STACK_SIZE 160 +#define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */ +#define DEFAULT_RETRY_LIMIT_IN_MATCH 10000000 +#define DEFAULT_PARSE_DEPTH_LIMIT 4096 + #include "regenc.h" -#define INIT_MATCH_STACK_SIZE 160 -#define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */ -#define DEFAULT_RETRY_LIMIT_IN_MATCH 10000000 -#define DEFAULT_PARSE_DEPTH_LIMIT 4096 - -/* */ -/* escape other system UChar definition */ -#ifdef ONIG_ESCAPE_UCHAR_COLLISION -#undef ONIG_ESCAPE_UCHAR_COLLISION -#endif - -#define xmalloc malloc -#define xrealloc realloc -#define xcalloc calloc -#define xfree free - -#define st_init_table onig_st_init_table -#define st_init_table_with_size onig_st_init_table_with_size -#define st_init_numtable onig_st_init_numtable -#define st_init_numtable_with_size onig_st_init_numtable_with_size -#define st_init_strtable onig_st_init_strtable -#define st_init_strtable_with_size onig_st_init_strtable_with_size -#define st_delete onig_st_delete -#define st_delete_safe onig_st_delete_safe -#define st_insert onig_st_insert -#define st_lookup onig_st_lookup -#define st_foreach onig_st_foreach -#define st_add_direct onig_st_add_direct -#define st_free_table onig_st_free_table -#define st_cleanup_safe onig_st_cleanup_safe -#define st_copy onig_st_copy -#define st_nothing_key_clone onig_st_nothing_key_clone -#define st_nothing_key_free onig_st_nothing_key_free -/* */ -#define onig_st_is_member st_is_member - - -#ifndef ONIGURUMA_SYS_UEFI - -#define xmemset memset -#define xmemcpy memcpy -#define xmemmove memmove - -#if defined(_WIN32) && !defined(__GNUC__) -#define xalloca _alloca -#define xvsnprintf(buf,size,fmt,args) _vsnprintf_s(buf,size,_TRUNCATE,fmt,args) -#define xsnprintf sprintf_s -#define xstrcat(dest,src,size) strcat_s(dest,size,src) -#else 
-#define xalloca alloca -#define xvsnprintf vsnprintf -#define xsnprintf snprintf -#define xstrcat(dest,src,size) strcat(dest,src) -#endif - +#ifndef ONIG_NO_STANDARD_C_HEADERS #include +#include #include #include +#include +#include #ifdef HAVE_STDINT_H #include @@ -153,9 +108,6 @@ #include #endif -#include - -#include #ifdef HAVE_SYS_TYPES_H #ifndef __BORLANDC__ #include @@ -166,32 +118,50 @@ #include #endif -#ifdef __BORLANDC__ +#if defined(_WIN32) || defined(__BORLANDC__) #include #endif -#ifdef ONIG_DEBUG +#if defined(ONIG_DEBUG) || defined(NEED_TO_INCLUDE_STDIO) # include #endif -#ifdef _WIN32 -#if defined(_MSC_VER) && (_MSC_VER < 1300) -typedef int intptr_t; -typedef unsigned int uintptr_t; +#ifdef ONIG_DEBUG_STATISTICS +#ifdef USE_TIMEOFDAY + +#ifdef HAVE_SYS_TIME_H +#include #endif +#ifdef HAVE_UNISTD_H +#include #endif -/* strend hash */ -typedef void hash_table_type; +#else /* USE_TIMEOFDAY */ -#ifdef _WIN32 -# include -typedef ULONG_PTR hash_data_type; -#else -typedef unsigned long hash_data_type; +#ifdef HAVE_SYS_TIMES_H +#include #endif -#endif /* ONIGURUMA_SYS_UEFI */ +#endif /* USE_TIMEOFDAY */ +#endif /* ONIG_DEBUG_STATISTICS */ + +/* I don't think these x....'s need to be included in + ONIG_NO_STANDARD_C_HEADERS, but they are required by Issue #170 + and do so since there is no problem. 
+ */ +#ifndef xmemset +#define xmemset memset +#endif + +#ifndef xmemcpy +#define xmemcpy memcpy +#endif + +#ifndef xmemmove +#define xmemmove memmove +#endif + +#endif /* ONIG_NO_STANDARD_C_HEADERS */ #ifdef MIN @@ -213,6 +183,86 @@ typedef unsigned long hash_data_type; #define CHAR_MAP_SIZE 256 #define INFINITE_LEN ONIG_INFINITE_DISTANCE +/* escape other system UChar definition */ +#ifdef ONIG_ESCAPE_UCHAR_COLLISION +#undef ONIG_ESCAPE_UCHAR_COLLISION +#endif + +#define xmalloc malloc +#define xrealloc realloc +#define xcalloc calloc +#define xfree free + +#define st_init_table onig_st_init_table +#define st_init_table_with_size onig_st_init_table_with_size +#define st_init_numtable onig_st_init_numtable +#define st_init_numtable_with_size onig_st_init_numtable_with_size +#define st_init_strtable onig_st_init_strtable +#define st_init_strtable_with_size onig_st_init_strtable_with_size +#define st_delete onig_st_delete +#define st_delete_safe onig_st_delete_safe +#define st_insert onig_st_insert +#define st_lookup onig_st_lookup +#define st_foreach onig_st_foreach +#define st_add_direct onig_st_add_direct +#define st_free_table onig_st_free_table +#define st_cleanup_safe onig_st_cleanup_safe +#define st_copy onig_st_copy +#define st_nothing_key_clone onig_st_nothing_key_clone +#define st_nothing_key_free onig_st_nothing_key_free +/* */ +#define onig_st_is_member st_is_member + + +#if defined(_WIN32) && !defined(__GNUC__) + +#ifndef xalloca +#define xalloca _alloca +#endif +#ifndef xvsnprintf +#define xvsnprintf(buf,size,fmt,args) _vsnprintf_s(buf,size,_TRUNCATE,fmt,args) +#endif +#ifndef xsnprintf +#define xsnprintf sprintf_s +#endif +#ifndef xstrcat +#define xstrcat(dest,src,size) strcat_s(dest,size,src) +#endif + +#else + +#ifndef xalloca +#define xalloca alloca +#endif +#ifndef xvsnprintf +#define xvsnprintf vsnprintf +#endif +#ifndef xsnprintf +#define xsnprintf snprintf +#endif +#ifndef xstrcat +#define xstrcat(dest,src,size) strcat(dest,src) +#endif + 
+#endif /* defined(_WIN32) && !defined(__GNUC__) */ + + +#ifdef _WIN32 +#if defined(_MSC_VER) && (_MSC_VER < 1300) +typedef int intptr_t; +typedef unsigned int uintptr_t; +#endif +#endif + +#if SIZEOF_VOIDP == SIZEOF_LONG +typedef unsigned long hash_data_type; +#elif SIZEOF_VOIDP == SIZEOF_LONG_LONG +typedef unsigned long long hash_data_type; +#endif + +/* strend hash */ +typedef void* hash_table_type; + #ifdef USE_CALLOUT @@ -338,7 +388,7 @@ typedef uint32_t Bits; typedef Bits BitSet[BITSET_REAL_SIZE]; typedef Bits* BitSetRef; -#define SIZE_BITSET sizeof(BitSet) +#define SIZE_BITSET sizeof(BitSet) #define BITSET_CLEAR(bs) do {\ int i;\ @@ -362,14 +412,6 @@ typedef struct _BBuf { #define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) -/* -#define BB_SIZE_INC(buf,inc) do{\ - (buf)->alloc += (inc);\ - (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ - if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ -} while (0) -*/ - #define BB_EXPAND(buf,low) do{\ do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ @@ -466,20 +508,20 @@ typedef struct _BBuf { /* operation code */ enum OpCode { - OP_FINISH = 0, /* matching process terminator (no more alternative) */ - OP_END = 1, /* pattern code terminator (success end) */ - OP_STR_1 = 2, /* single byte, N = 1 */ - OP_STR_2, /* single byte, N = 2 */ - OP_STR_3, /* single byte, N = 3 */ - OP_STR_4, /* single byte, N = 4 */ - OP_STR_5, /* single byte, N = 5 */ - OP_STR_N, /* single byte */ - OP_STR_MB2N1, /* mb-length = 2 N = 1 */ - OP_STR_MB2N2, /* mb-length = 2 N = 2 */ - OP_STR_MB2N3, /* mb-length = 2 N = 3 */ - OP_STR_MB2N, /* mb-length = 2 */ - OP_STR_MB3N, /* mb-length = 3 */ - OP_STR_MBN, /* other length */ + OP_FINISH = 0, /* matching process terminator (no more alternative) */ + OP_END = 1, /* pattern code terminator (success end) */ + OP_STR_1 = 2, /* single byte, N = 1 */ + OP_STR_2, /* single byte, N = 2 */ + OP_STR_3, /* single byte, N = 3 
*/ + OP_STR_4, /* single byte, N = 4 */ + OP_STR_5, /* single byte, N = 5 */ + OP_STR_N, /* single byte */ + OP_STR_MB2N1, /* mb-length = 2 N = 1 */ + OP_STR_MB2N2, /* mb-length = 2 N = 2 */ + OP_STR_MB2N3, /* mb-length = 2 N = 3 */ + OP_STR_MB2N, /* mb-length = 2 */ + OP_STR_MB3N, /* mb-length = 3 */ + OP_STR_MBN, /* other length */ OP_CCLASS, OP_CCLASS_MB, OP_CCLASS_MIX, @@ -506,7 +548,7 @@ enum OpCode { OP_BEGIN_LINE, OP_END_LINE, OP_SEMI_END_BUF, - OP_BEGIN_POSITION, + OP_CHECK_POSITION, OP_BACKREF1, OP_BACKREF2, OP_BACKREF_N, @@ -531,7 +573,8 @@ enum OpCode { OP_JUMP, OP_PUSH, OP_PUSH_SUPER, - OP_POP_OUT, + OP_POP, + OP_POP_TO_MARK, #ifdef USE_OP_PUSH_OR_JUMP_EXACT OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ #endif @@ -546,15 +589,11 @@ enum OpCode { #ifdef USE_CALL OP_EMPTY_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */ #endif - OP_PREC_READ_START, /* (?=...) start */ - OP_PREC_READ_END, /* (?=...) end */ - OP_PREC_READ_NOT_START, /* (?!...) start */ - OP_PREC_READ_NOT_END, /* (?!...) end */ - OP_ATOMIC_START, /* (?>...) start */ - OP_ATOMIC_END, /* (?>...) end */ - OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ - OP_LOOK_BEHIND_NOT_START, /* (? 
-#endif - #define INIT_TAG_NAMES_ALLOC_NUM 5 #define WARN_BUFSIZE 256 @@ -95,6 +97,7 @@ OnigSyntaxType OnigSyntaxOniguruma = { , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | + ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | @@ -389,16 +392,6 @@ onig_strcpy(UChar* dest, const UChar* src, const UChar* end) } } -static int -save_entry(ScanEnv* env, enum SaveType type, int* id) -{ - int nid = env->save_num; - - env->save_num++; - *id = nid; - return 0; -} - /* scan pattern methods */ #define PEND_VALUE 0 @@ -499,7 +492,7 @@ str_end_hash(st_str_end_key* x) return (int) (val + (val >> 5)); } -extern hash_table_type* +extern hash_table_type onig_st_init_strend_table_with_size(int size) { static struct st_hash_type hashType = { @@ -507,12 +500,11 @@ onig_st_init_strend_table_with_size(int size) str_end_hash, }; - return (hash_table_type* ) - onig_st_init_table_with_size(&hashType, size); + return (hash_table_type )onig_st_init_table_with_size(&hashType, size); } extern int -onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, +onig_st_lookup_strend(hash_table_type table, const UChar* str_key, const UChar* end_key, hash_data_type *value) { st_str_end_key key; @@ -524,7 +516,7 @@ onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, } extern int -onig_st_insert_strend(hash_table_type* table, const UChar* str_key, +onig_st_insert_strend(hash_table_type table, const UChar* str_key, const UChar* end_key, hash_data_type value) { st_str_end_key* key; @@ -590,7 +582,7 @@ callout_name_table_hash(st_callout_name_key* x) return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type); } -extern hash_table_type* +extern hash_table_type onig_st_init_callout_name_table_with_size(int size) { static struct st_hash_type hashType = { @@ -598,12 +590,11 @@ 
onig_st_init_callout_name_table_with_size(int size) callout_name_table_hash, }; - return (hash_table_type* ) - onig_st_init_table_with_size(&hashType, size); + return (hash_table_type )onig_st_init_table_with_size(&hashType, size); } extern int -onig_st_lookup_callout_name_table(hash_table_type* table, +onig_st_lookup_callout_name_table(hash_table_type table, OnigEncoding enc, int type, const UChar* str_key, @@ -621,7 +612,7 @@ onig_st_lookup_callout_name_table(hash_table_type* table, } static int -st_insert_callout_name_table(hash_table_type* table, +st_insert_callout_name_table(hash_table_type table, OnigEncoding enc, int type, UChar* str_key, UChar* end_key, hash_data_type value) @@ -2003,15 +1994,15 @@ scan_env_clear(ScanEnv* env) xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static)); - env->parse_depth = 0; + env->parse_depth = 0; #ifdef ONIG_DEBUG_PARSE - env->max_parse_depth = 0; + env->max_parse_depth = 0; #endif - env->backref_num = 0; - env->keep_num = 0; - env->save_num = 0; - env->save_alloc_num = 0; - env->saves = 0; + env->backref_num = 0; + env->keep_num = 0; + env->id_num = 0; + env->save_alloc_num = 0; + env->saves = 0; } static int @@ -2062,16 +2053,11 @@ scan_env_set_mem_node(ScanEnv* env, int num, Node* node) return 0; } -extern void -onig_node_free(Node* node) +static void +node_free_body(Node* node) { - start: if (IS_NULL(node)) return ; -#ifdef DEBUG_NODE_FREE - fprintf(stderr, "onig_node_free: %p\n", node); -#endif - switch (NODE_TYPE(node)) { case NODE_STRING: if (STR_(node)->capacity != 0 && @@ -2083,12 +2069,12 @@ onig_node_free(Node* node) case NODE_LIST: case NODE_ALT: onig_node_free(NODE_CAR(node)); - { - Node* next_node = NODE_CDR(node); - + node = NODE_CDR(node); + while (IS_NOT_NULL(node)) { + Node* next = NODE_CDR(node); + onig_node_free(NODE_CAR(node)); xfree(node); - node = next_node; - goto start; + node = next; } break; @@ -2120,9 +2106,15 @@ onig_node_free(Node* node) break; case NODE_QUANT: + if (NODE_BODY(node)) + 
onig_node_free(NODE_BODY(node)); + break; + case NODE_ANCHOR: if (NODE_BODY(node)) onig_node_free(NODE_BODY(node)); + if (IS_NOT_NULL(ANCHOR_(node)->lead_node)) + onig_node_free(ANCHOR_(node)->lead_node); break; case NODE_CTYPE: @@ -2130,7 +2122,18 @@ onig_node_free(Node* node) case NODE_GIMMICK: break; } +} +extern void +onig_node_free(Node* node) +{ + if (IS_NULL(node)) return ; + +#ifdef DEBUG_NODE_FREE + fprintf(stderr, "onig_node_free: %p\n", node); +#endif + + node_free_body(node); xfree(node); } @@ -2157,16 +2160,64 @@ node_new(void) return node; } -extern Node* -onig_node_copy(Node* from) +extern int +onig_node_copy(Node** rcopy, Node* from) { + int r; Node* copy; + *rcopy = NULL_NODE; + + switch (NODE_TYPE(from)) { + case NODE_LIST: + case NODE_ALT: + case NODE_ANCHOR: + /* These node's link to other nodes are processed by caller. */ + break; + case NODE_STRING: + case NODE_CCLASS: + case NODE_CTYPE: + /* Fixed contents after copy. */ + break; + default: + /* Not supported yet. 
*/ + return ONIGERR_TYPE_BUG; + break; + } + copy = node_new(); - CHECK_NULL_RETURN(copy); + CHECK_NULL_RETURN_MEMERR(copy); xmemcpy(copy, from, sizeof(*copy)); - return copy; + switch (NODE_TYPE(copy)) { + case NODE_STRING: + r = onig_node_str_set(copy, STR_(from)->s, STR_(from)->end, FALSE); + if (r != 0) { + err: + onig_node_free(copy); + return r; + } + break; + + case NODE_CCLASS: + { + CClassNode *fcc, *tcc; + + fcc = CCLASS_(from); + tcc = CCLASS_(copy); + if (IS_NOT_NULL(fcc->mbuf)) { + r = bbuf_clone(&(tcc->mbuf), fcc->mbuf); + if (r != 0) goto err; + } + } + break; + + default: + break; + } + + *rcopy = copy; + return ONIG_NORMAL; } @@ -2323,8 +2374,10 @@ node_new_anchor(int type) NODE_SET_TYPE(node, NODE_ANCHOR); ANCHOR_(node)->type = type; - ANCHOR_(node)->char_len = INFINITE_LEN; + ANCHOR_(node)->char_min_len = 0; + ANCHOR_(node)->char_max_len = INFINITE_LEN; ANCHOR_(node)->ascii_mode = 0; + ANCHOR_(node)->lead_node = NULL_NODE; return node; } @@ -2560,25 +2613,36 @@ node_drop_group(Node* group) return content; } +static int +node_set_fail(Node* node) +{ + NODE_SET_TYPE(node, NODE_GIMMICK); + GIMMICK_(node)->type = GIMMICK_FAIL; + return ONIG_NORMAL; +} + static int node_new_fail(Node** node, ScanEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); - NODE_SET_TYPE(*node, NODE_GIMMICK); - GIMMICK_(*node)->type = GIMMICK_FAIL; - return ONIG_NORMAL; + return node_set_fail(*node); +} + +extern int +onig_node_reset_fail(Node* node) +{ + node_free_body(node); + return node_set_fail(node); } static int node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env) { int id; - int r; - r = save_entry(env, save_type, &id); - if (r != ONIG_NORMAL) return r; + ID_ENTRY(env, id); *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); @@ -2806,6 +2870,9 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, id, env); if (r != 0) goto err; + if (is_range_cutter != 0) + NODE_STATUS_ADD(ns[2], ABSENT_WITH_SIDE_EFFECTS); + r = 
node_new_fail(&ns[3], env); if (r != 0) goto err; @@ -2945,6 +3012,7 @@ make_range_clear(Node** node, ScanEnv* env) r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, ID_NOT_USED_DONT_CARE_ME, env); if (r != 0) goto err; + NODE_STATUS_ADD(ns[0], ABSENT_WITH_SIDE_EFFECTS); x = make_alt(2, ns); if (IS_NULL(x)) goto err0; @@ -3216,9 +3284,9 @@ onig_node_str_cat(Node* node, const UChar* s, const UChar* end) } extern int -onig_node_str_set(Node* node, const UChar* s, const UChar* end) +onig_node_str_set(Node* node, const UChar* s, const UChar* end, int need_free) { - onig_node_str_clear(node); + onig_node_str_clear(node, need_free); return onig_node_str_cat(node, s, end); } @@ -3232,9 +3300,10 @@ node_str_cat_char(Node* node, UChar c) } extern void -onig_node_str_clear(Node* node) +onig_node_str_clear(Node* node, int need_free) { - if (STR_(node)->capacity != 0 && + if (need_free != 0 && + STR_(node)->capacity != 0 && IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) { xfree(STR_(node)->s); } @@ -3245,11 +3314,10 @@ onig_node_str_clear(Node* node) STR_(node)->capacity = 0; } -static Node* -node_new_str(const UChar* s, const UChar* end) +static int +node_set_str(Node* node, const UChar* s, const UChar* end) { - Node* node = node_new(); - CHECK_NULL_RETURN(node); + int r; NODE_SET_TYPE(node, NODE_STRING); STR_(node)->flag = 0; @@ -3257,13 +3325,39 @@ node_new_str(const UChar* s, const UChar* end) STR_(node)->end = STR_(node)->buf; STR_(node)->capacity = 0; - if (onig_node_str_cat(node, s, end)) { + r = onig_node_str_cat(node, s, end); + return r; +} + +static Node* +node_new_str(const UChar* s, const UChar* end) +{ + int r; + Node* node = node_new(); + CHECK_NULL_RETURN(node); + + r = node_set_str(node, s, end); + if (r != 0) { onig_node_free(node); return NULL; } + return node; } +static int +node_reset_str(Node* node, const UChar* s, const UChar* end) +{ + node_free_body(node); + return node_set_str(node, s, end); +} + +extern int 
+onig_node_reset_empty(Node* node) +{ + return node_reset_str(node, NULL, NULL); +} + extern Node* onig_node_new_str(const UChar* s, const UChar* end) { diff --git a/oniguruma/src/regparse.h b/oniguruma/src/regparse.h index 964401653..6097062b1 100644 --- a/oniguruma/src/regparse.h +++ b/oniguruma/src/regparse.h @@ -5,7 +5,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,23 +32,23 @@ #include "regint.h" -#define NODE_STRING_MARGIN 16 -#define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ -#define NODE_BACKREFS_SIZE 6 +#define NODE_STRING_MARGIN 16 +#define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_BACKREFS_SIZE 6 /* node type */ typedef enum { - NODE_STRING = 0, - NODE_CCLASS = 1, - NODE_CTYPE = 2, - NODE_BACKREF = 3, - NODE_QUANT = 4, - NODE_BAG = 5, - NODE_ANCHOR = 6, - NODE_LIST = 7, - NODE_ALT = 8, - NODE_CALL = 9, - NODE_GIMMICK = 10 + NODE_STRING = 0, + NODE_CCLASS = 1, + NODE_CTYPE = 2, + NODE_BACKREF = 3, + NODE_QUANT = 4, + NODE_BAG = 5, + NODE_ANCHOR = 6, + NODE_LIST = 7, + NODE_ALT = 8, + NODE_CALL = 9, + NODE_GIMMICK = 10 } NodeType; enum BagType { @@ -85,7 +85,7 @@ typedef struct { UChar* end; unsigned int flag; UChar buf[NODE_STRING_BUF_SIZE]; - int capacity; /* (allocated size - 1) or 0: use buf[] */ + int capacity; /* (allocated size - 1) or 0: use buf[] */ } StrNode; typedef struct { @@ -110,7 +110,7 @@ typedef struct { enum BodyEmptyType emptiness; struct _Node* head_exact; struct _Node* next_head_exact; - int include_referred; /* include called node. don't eliminate even if {0} */ + int include_referred; /* include called node. 
don't eliminate even if {0} */ } QuantNode; typedef struct { @@ -190,8 +190,10 @@ typedef struct { struct _Node* body; int type; - OnigLen char_len; + OnigLen char_min_len; + OnigLen char_max_len; int ascii_mode; + struct _Node* lead_node; } AnchorNode; typedef struct { @@ -248,6 +250,11 @@ typedef struct _Node { } u; } Node; +typedef struct { + int new_val; +} GroupNumMap; + + #define NULL_NODE ((Node* )0) @@ -280,8 +287,8 @@ typedef struct _Node { #define CALL_(node) (&((node)->u.call)) #define GIMMICK_(node) (&((node)->u.gimmick)) -#define NODE_CAR(node) (CONS_(node)->car) -#define NODE_CDR(node) (CONS_(node)->cdr) +#define NODE_CAR(node) (CONS_(node)->car) +#define NODE_CDR(node) (CONS_(node)->cdr) #define CTYPE_ANYCHAR -1 #define NODE_IS_ANYCHAR(node) \ @@ -291,8 +298,8 @@ typedef struct _Node { #define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) -#define NODE_STRING_CRUDE (1<<0) -#define NODE_STRING_CASE_EXPANDED (1<<1) +#define NODE_STRING_CRUDE (1<<0) +#define NODE_STRING_CASE_EXPANDED (1<<1) #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) #define NODE_STRING_SET_CRUDE(node) (node)->u.str.flag |= NODE_STRING_CRUDE @@ -307,30 +314,32 @@ typedef struct _Node { (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static) /* node status bits */ -#define NODE_ST_FIXED_MIN (1<<0) -#define NODE_ST_FIXED_MAX (1<<1) -#define NODE_ST_FIXED_CLEN (1<<2) -#define NODE_ST_MARK1 (1<<3) -#define NODE_ST_MARK2 (1<<4) -#define NODE_ST_STRICT_REAL_REPEAT (1<<5) -#define NODE_ST_RECURSION (1<<6) -#define NODE_ST_CALLED (1<<7) -#define NODE_ST_FIXED_ADDR (1<<8) -#define NODE_ST_NAMED_GROUP (1<<9) -#define NODE_ST_IN_REAL_REPEAT (1<<10) /* STK_REPEAT is nested in stack. 
*/ -#define NODE_ST_IN_ZERO_REPEAT (1<<11) /* (....){0} */ -#define NODE_ST_IN_MULTI_ENTRY (1<<12) -#define NODE_ST_NEST_LEVEL (1<<13) -#define NODE_ST_BY_NUMBER (1<<14) /* {n,m} */ -#define NODE_ST_BY_NAME (1<<15) /* backref by name */ -#define NODE_ST_BACKREF (1<<16) -#define NODE_ST_CHECKER (1<<17) -#define NODE_ST_PROHIBIT_RECURSION (1<<18) -#define NODE_ST_SUPER (1<<19) -#define NODE_ST_EMPTY_STATUS_CHECK (1<<20) -#define NODE_ST_IGNORECASE (1<<21) -#define NODE_ST_MULTILINE (1<<22) -#define NODE_ST_TEXT_SEGMENT_WORD (1<<23) +#define NODE_ST_FIXED_MIN (1<<0) +#define NODE_ST_FIXED_MAX (1<<1) +#define NODE_ST_FIXED_CLEN (1<<2) +#define NODE_ST_MARK1 (1<<3) +#define NODE_ST_MARK2 (1<<4) +#define NODE_ST_STRICT_REAL_REPEAT (1<<5) +#define NODE_ST_RECURSION (1<<6) +#define NODE_ST_CALLED (1<<7) +#define NODE_ST_FIXED_ADDR (1<<8) +#define NODE_ST_NAMED_GROUP (1<<9) +#define NODE_ST_IN_REAL_REPEAT (1<<10) /* STK_REPEAT is nested in stack. */ +#define NODE_ST_IN_ZERO_REPEAT (1<<11) /* (....){0} */ +#define NODE_ST_IN_MULTI_ENTRY (1<<12) +#define NODE_ST_NEST_LEVEL (1<<13) +#define NODE_ST_BY_NUMBER (1<<14) /* {n,m} */ +#define NODE_ST_BY_NAME (1<<15) /* backref by name */ +#define NODE_ST_BACKREF (1<<16) +#define NODE_ST_CHECKER (1<<17) +#define NODE_ST_PROHIBIT_RECURSION (1<<18) +#define NODE_ST_SUPER (1<<19) +#define NODE_ST_EMPTY_STATUS_CHECK (1<<20) +#define NODE_ST_IGNORECASE (1<<21) +#define NODE_ST_MULTILINE (1<<22) +#define NODE_ST_TEXT_SEGMENT_WORD (1<<23) +#define NODE_ST_ABSENT_WITH_SIDE_EFFECTS (1<<24) /* stopper or clear */ +#define NODE_ST_FIXED_CLEN_MIN_SURE (1<<25) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -364,6 +373,8 @@ typedef struct _Node { #define NODE_IS_IGNORECASE(node) ((NODE_STATUS(node) & NODE_ST_IGNORECASE) != 0) #define NODE_IS_MULTILINE(node) ((NODE_STATUS(node) & NODE_ST_MULTILINE) != 0) #define NODE_IS_TEXT_SEGMENT_WORD(node) ((NODE_STATUS(node) & NODE_ST_TEXT_SEGMENT_WORD) != 0) +#define 
NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0) +#define NODE_IS_FIXED_CLEN_MIN_SURE(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0) #define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) @@ -372,11 +383,20 @@ typedef struct _Node { #define NODE_CALL_BODY(node) ((node)->body) #define NODE_ANCHOR_BODY(node) ((node)->body) -#define SCANENV_MEMENV_SIZE 8 +#define SCANENV_MEMENV_SIZE 8 #define SCANENV_MEMENV(senv) \ (IS_NOT_NULL((senv)->mem_env_dynamic) ? \ (senv)->mem_env_dynamic : (senv)->mem_env_static) +#define IS_SYNTAX_OP(syn, opm) (((syn)->op & (opm)) != 0) +#define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0) +#define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0) + +#define ID_ENTRY(env, id) do {\ + id = (env)->id_num++;\ +} while(0) + + typedef struct { Node* mem_node; Node* empty_repeat_node; @@ -400,50 +420,44 @@ typedef struct { UChar* error_end; regex_t* reg; /* for reg->names only */ int num_call; -#ifdef USE_CALL - UnsetAddrList* unset_addr_list; - int has_call_zero; -#endif int num_mem; int num_named; int mem_alloc; MemEnv mem_env_static[SCANENV_MEMENV_SIZE]; MemEnv* mem_env_dynamic; + int backref_num; + int keep_num; + int id_num; + int save_alloc_num; + SaveItem* saves; +#ifdef USE_CALL + UnsetAddrList* unset_addr_list; + int has_call_zero; +#endif unsigned int parse_depth; #ifdef ONIG_DEBUG_PARSE unsigned int max_parse_depth; #endif - int backref_num; - int keep_num; - int save_num; - int save_alloc_num; - SaveItem* saves; } ScanEnv; -#define IS_SYNTAX_OP(syn, opm) (((syn)->op & (opm)) != 0) -#define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0) -#define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0) - -typedef struct { - int new_val; -} GroupNumMap; - extern int onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void 
onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); extern int onig_reduce_nested_quantifier P_((Node* pnode)); -extern Node* onig_node_copy(Node* from); +extern int onig_node_copy(Node** rcopy, Node* from); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); -extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); +extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end, int need_free)); +extern void onig_node_str_clear P_((Node* node, int need_free)); extern void onig_node_free P_((Node* node)); +extern int onig_node_reset_empty P_((Node* node)); +extern int onig_node_reset_fail P_((Node* node)); extern Node* onig_node_new_bag P_((enum BagType type)); extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); extern Node* onig_node_new_alt P_((Node* left, Node* right)); -extern void onig_node_str_clear P_((Node* node)); extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); extern int onig_free_shared_cclass_table P_((void)); diff --git a/oniguruma/src/regsyntax.c b/oniguruma/src/regsyntax.c index 428e21bfd..23de0f218 100644 --- a/oniguruma/src/regsyntax.c +++ b/oniguruma/src/regsyntax.c @@ -3,7 +3,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -154,7 +154,8 @@ OnigSyntaxType OnigSyntaxJava = { ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 | ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | - ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) + ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | + ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND ) , ONIG_OPTION_SINGLELINE , { diff --git a/oniguruma/src/regversion.c b/oniguruma/src/regversion.c index f954b9b6e..8a87694ad 100644 --- a/oniguruma/src/regversion.c +++ b/oniguruma/src/regversion.c @@ -3,7 +3,7 @@ encoding: UTF-8 **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,8 +28,11 @@ * SUCH DAMAGE. */ +#ifndef NEED_TO_INCLUDE_STDIO +#define NEED_TO_INCLUDE_STDIO +#endif + #include "regint.h" -#include extern const char* onig_version(void) diff --git a/oniguruma/src/st.c b/oniguruma/src/st.c index da849a5c8..6d3dc6dc4 100644 --- a/oniguruma/src/st.c +++ b/oniguruma/src/st.c @@ -3,17 +3,14 @@ /* static char sccsid[] = "@(#) st.c 5.1 89/12/14 Crucible"; */ -#include -#include -#include - -#ifdef _WIN32 -#include +#ifndef NEED_TO_INCLUDE_STDIO +#define NEED_TO_INCLUDE_STDIO #endif #include "regint.h" #include "st.h" + typedef struct st_table_entry st_table_entry; struct st_table_entry { diff --git a/oniguruma/src/st.h b/oniguruma/src/st.h index 3b57bc134..0746cdb20 100644 --- a/oniguruma/src/st.h +++ b/oniguruma/src/st.h @@ -3,20 +3,14 @@ /* @(#) st.h 5.1 89/12/14 */ #ifndef ST_INCLUDED - #define ST_INCLUDED -#ifndef ONIGURUMA_SYS_UEFI - -#ifdef _WIN32 -# include -typedef ULONG_PTR st_data_t; -#else +#if SIZEOF_VOIDP == SIZEOF_LONG typedef unsigned long st_data_t; +#elif SIZEOF_VOIDP == SIZEOF_LONG_LONG +typedef unsigned long long st_data_t; #endif -#endif /* 
ONIGURUMA_SYS_UEFI */ - #define ST_DATA_T_DEFINED typedef struct st_table st_table; diff --git a/oniguruma/src/unicode_fold1_key.c b/oniguruma/src/unicode_fold1_key.c index 171a0fa4b..2089a883f 100644 --- a/oniguruma/src/unicode_fold1_key.c +++ b/oniguruma/src/unicode_fold1_key.c @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2019 K.Kosako + * Copyright (c) 2017-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -33,8 +33,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -#include "regenc.h" +#include "regint.h" #define TOTAL_KEYWORDS 1353 #define MIN_WORD_LENGTH 3 diff --git a/oniguruma/src/unicode_fold2_key.c b/oniguruma/src/unicode_fold2_key.c index c39b19da4..e06ba0be1 100644 --- a/oniguruma/src/unicode_fold2_key.c +++ b/oniguruma/src/unicode_fold2_key.c @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2019 K.Kosako + * Copyright (c) 2017-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -33,8 +33,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -#include "regenc.h" +#include "regint.h" #define TOTAL_KEYWORDS 59 #define MIN_WORD_LENGTH 6 diff --git a/oniguruma/src/unicode_fold3_key.c b/oniguruma/src/unicode_fold3_key.c index 295c44747..7ab24d1d4 100644 --- a/oniguruma/src/unicode_fold3_key.c +++ b/oniguruma/src/unicode_fold3_key.c @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2019 K.Kosako + * Copyright (c) 2017-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -33,8 +33,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ -#include -#include "regenc.h" +#include "regint.h" #define TOTAL_KEYWORDS 14 #define MIN_WORD_LENGTH 9 diff --git a/oniguruma/src/unicode_unfold_key.c b/oniguruma/src/unicode_unfold_key.c index 51a037b39..65bfa2603 100644 --- a/oniguruma/src/unicode_unfold_key.c +++ b/oniguruma/src/unicode_unfold_key.c @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2019 K.Kosako + * Copyright (c) 2017-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -33,8 +33,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -#include "regenc.h" +#include "regint.h" #define TOTAL_KEYWORDS 1487 #define MIN_WORD_LENGTH 3 diff --git a/res/Notepad3.exe.manifest.conf b/res/Notepad3.exe.manifest.conf index 1b45b8774..b992e3f1a 100644 --- a/res/Notepad3.exe.manifest.conf +++ b/res/Notepad3.exe.manifest.conf @@ -3,7 +3,7 @@ Notepad3 BETA diff --git a/src/VersionEx.h b/src/VersionEx.h index ea2a99a55..556d9d06c 100644 --- a/src/VersionEx.h +++ b/src/VersionEx.h @@ -8,8 +8,8 @@ #define SAPPNAME "Notepad3" #define VERSION_MAJOR 5 #define VERSION_MINOR 20 -#define VERSION_REV 118 -#define VERSION_BUILD 2711 +#define VERSION_REV 123 +#define VERSION_BUILD 2712 #define SCINTILLA_VER 430 #define ONIGURUMA_REGEX_VER 6.9.4 #define UCHARDET_VER 2018.09.27