From cf2ca6de9a968a045a27d1e301b98df00e31b343 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Wed, 12 Jun 2019 18:49:38 +0200 Subject: [PATCH] + enh: a little bit more advanced TOML Lexer --- oniguruma/doc/SYNTAX.md | 1069 +++++++++++++++++++++++++++ sciXlexers/CharSetX.h | 49 ++ sciXlexers/LexAHKL.cxx | 26 +- sciXlexers/LexTOML.cxx | 350 ++++++--- sciXlexers/SciXLexer.h | 20 +- scintilla/Scintilla.vcxproj | 1 + scintilla/Scintilla.vcxproj.filters | 3 + src/Config/SimpleIni.h | 60 +- src/Edit.c | 14 +- src/Notepad3.c | 8 +- src/StyleLexers/styleLexTOML.c | 12 +- 11 files changed, 1433 insertions(+), 179 deletions(-) create mode 100644 oniguruma/doc/SYNTAX.md create mode 100644 sciXlexers/CharSetX.h diff --git a/oniguruma/doc/SYNTAX.md b/oniguruma/doc/SYNTAX.md new file mode 100644 index 000000000..449f262ff --- /dev/null +++ b/oniguruma/doc/SYNTAX.md @@ -0,0 +1,1069 @@ + +# Oniguruma syntax (operator) configuration + +_Documented for Oniguruma 6.9.2 (2019/03/28)_ + + +---------- + + +## Overview + +This document details how to configure Oniguruma's syntax, by describing the desired +syntax operators and behaviors in an instance of the OnigSyntaxType struct, just like +the built-in Oniguruma syntaxes do. + +Configuration operators are bit flags, and are broken into multiple groups, somewhat arbitrarily, +because Oniguruma takes its configuration as a trio of 32-bit `unsigned int` values, assigned as +the first three fields in an `OnigSyntaxType` struct: + +```C +typedef struct { + unsigned int op; + unsigned int op2; + unsigned int behavior; + OnigOptionType options; /* default option */ + OnigMetaCharTableType meta_char_table; +} OnigSyntaxType; +``` + +The first group of configuration flags (`op`) roughly corresponds to the +configuration for "basic regex." The second group (`op2`) roughly corresponds +to the configuration for "advanced regex." 
And the third group (`behavior`) +describes more-or-less what to do for broken input, bad input, or other corner-case +regular expressions whose meaning is not well-defined. These three groups of +flags are described in full below, and tables of their usages for various syntaxes +follow. + +The `options` field describes the default compile options to use if the caller does +not specify any options when invoking `onig_new()`. + +The `meta_char_table` field is used exclusively by the ONIG_SYN_OP_VARIABLE_META_CHARACTERS +option, which allows the various regex metacharacters, like `*` and `?`, to be replaced +with alternates (for example, SQL typically uses `%` instead of `.*` and `_` instead of `?`). + + +---------- + + +## Group One Flags (op) + + +This group contains "basic regex" constructs, features common to most regex systems. + + +### 0. ONIG_SYN_OP_VARIABLE_META_CHARACTERS + +_Set in: none_ + +Enables support for `onig_set_meta_char()`, which allows you to provide alternate +characters that will be used instead of the six special characters that are normally +these characters below: + + - `ONIG_META_CHAR_ESCAPE`: `\` + - `ONIG_META_CHAR_ANYCHAR`: `.` + - `ONIG_META_CHAR_ANYTIME`: `*` + - `ONIG_META_CHAR_ZERO_OR_ONE_TIME`: `?` + - `ONIG_META_CHAR_ONE_OR_MORE_TIME`: `+` + - `ONIG_META_CHAR_ANYCHAR_ANYTIME`: Equivalent in normal regex to `.*`, but supported + explicitly so that Oniguruma can support matching SQL `%` wildcards or shell `*` wildcards. + +If this flag is set, then the values defined using `onig_set_meta_char()` will be used; +if this flag is clear, then the default regex characters will be used instead, and +data set by `onig_set_meta_char()` will be ignored. + + +### 1. ONIG_SYN_OP_DOT_ANYCHAR (enable `.`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `.` metacharacter, meaning "any one character." 
You +usually want this flag on unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `.` instead. + + +### 2. ONIG_SYN_OP_ASTERISK_ZERO_INF (enable `r*`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `r*` metacharacter, meaning "zero or more r's." +You usually want this flag set unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `*` instead. + + +### 3. ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (enable `r\*`) + +_Set in: none_ + +Enables support for an escaped `r\*` metacharacter, meaning "zero or more r's." This is +useful if you have disabled support for the normal `r*` metacharacter because you want `*` +to simply match a literal `*` character, but you still want some way of activating "zero or more" +behavior. + + +### 4. ONIG_SYN_OP_PLUS_ONE_INF (enable `r+`) + +_Set in: PosixExtended, Emacs, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `r+` metacharacter, meaning "one or more r's." +You usually want this flag set unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `+` instead. + + +### 5. ONIG_SYN_OP_ESC_PLUS_ONE_INF (enable `r\+`) + +_Set in: Grep_ + +Enables support for an escaped `r\+` metacharacter, meaning "one or more r's." This is +useful if you have disabled support for the normal `r+` metacharacter because you want `+` +to simply match a literal `+` character, but you still want some way of activating "one or more" +behavior. + + +### 6. ONIG_SYN_OP_QMARK_ZERO_ONE (enable `r?`) + +_Set in: PosixExtended, Emacs, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `r?` metacharacter, meaning "zero or one r" or "an optional r." 
+You usually want this flag set unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `?` instead. + + +### 7. ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (enable `r\?`) + +_Set in: Grep_ + +Enables support for an escaped `r\?` metacharacter, meaning "zero or one r" or "an optional +r." This is useful if you have disabled support for the normal `r?` metacharacter because +you want `?` to simply match a literal `?` character, but you still want some way of activating +"optional" behavior. + + +### 8. ONIG_SYN_OP_BRACE_INTERVAL (enable `r{l,u}`) + +_Set in: PosixExtended, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the `r{lower,upper}` range form, common to more advanced +regex engines, which lets you specify precisely a minimum and maximum range on how many r's +must match (and not simply "zero or more"). + +This form also allows `r{count}` to specify a precise count of r's that must match. + +This form also allows `r{lower,}` to be equivalent to `r{lower,infinity}`. + +If and only if the `ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV` behavior flag is set, +this form also allows `r{,upper}` to be equivalent to `r{0,upper}`; otherwise, +`r{,upper}` will be treated as an error. + + +### 9. ONIG_SYN_OP_ESC_BRACE_INTERVAL (enable `\{` and `\}`) + +_Set in: PosixBasic, Emacs, Grep_ + +Enables support for an escaped `r\{lower,upper\}` range form. This is useful if you +have disabled support for the normal `r{...}` range form and want curly braces to simply +match literal curly brace characters, but you still want some way of activating +"range" behavior. + + +### 10. ONIG_SYN_OP_VBAR_ALT (enable `r|s`) + +_Set in: PosixExtended, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `r|s` alternation operator. You usually want this +flag set. + + +### 11. ONIG_SYN_OP_ESC_VBAR_ALT (enable `\|`) + +_Set in: Emacs, Grep_ + +Enables support for an escaped `r\|s` alternation form. 
This is useful if you +have disabled support for the normal `r|s` alternation form and want `|` to simply +match a literal `|` character, but you still want some way of activating "alternate" behavior. + + +### 12. ONIG_SYN_OP_LPAREN_SUBEXP (enable `(r)`) + +_Set in: PosixExtended, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `(...)` grouping-and-capturing operators. You usually +want this flag set. + + +### 13. ONIG_SYN_OP_ESC_LPAREN_SUBEXP (enable `\(` and `\)`) + +_Set in: PosixBasic, Emacs, Grep_ + +Enables support for escaped `\(...\)` grouping-and-capturing operators. This is useful if you +have disabled support for the normal `(...)` grouping-and-capturing operators and want +parentheses to simply match literal parenthesis characters, but you still want some way of +activating "grouping" or "capturing" behavior. + + +### 14. ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (enable `\A` and `\Z` and `\z`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the anchors `\A` (start-of-string), `\Z` (end-of-string or +newline-at-end-of-string), and `\z` (end-of-string) escapes. + +(If the escape metacharacter has been changed from the default of `\`, this +option will recognize that metacharacter instead.) + + +### 15. ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (enable `\G`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the special anchor `\G` (start-of-previous-match). + +(If the escape metacharacter has been changed from the default of `\`, this +option will recognize that metacharacter instead.) + +Note that `OnigRegex`/`regex_t` are not stateful objects, and do _not_ record +the location of the previous match. 
The `\G` flag uses the `start` parameter +explicitly passed to `onig_search()` (or `onig_search_with_param()`) to determine +the "start of the previous match," so if the caller always passes the start of +the entire buffer as the function's `start` parameter, then `\G` will behave +exactly the same as `\A`. + + +### 16. ONIG_SYN_OP_DECIMAL_BACKREF (enable `\num`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for subsequent matches to back references to prior capture groups `(...)` using +the common `\num` syntax (like `\3`). + +If this flag is clear, then a numeric escape like `\3` will either be treated as a literal `3`, +or, if `ONIG_SYN_OP_ESC_OCTAL3` is set, will be treated as an octal character code `\3`. + +You usually want this enabled, and it is enabled by default in every built-in syntax. + + +### 17. ONIG_SYN_OP_BRACKET_CC (enable `[...]`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for recognizing character classes, like `[a-z]`. If this flag is not set, `[` +and `]` will be treated as ordinary literal characters instead of as metacharacters. + +You usually want this enabled, and it is enabled by default in every built-in syntax. + + +### 18. ONIG_SYN_OP_ESC_W_WORD (enable `\w` and `\W`) + +_Set in: Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\w` and `\W` shorthand forms. These match "word characters," +whose meaning varies depending on the encoding being used. + +In ASCII encoding, `\w` is equivalent to `[A-Za-z0-9_]`. 
+ +In most other encodings, `\w` matches many more characters, including accented letters, Greek letters, +Cyrillic letters, Braille letters and numbers, Runic letters, Hebrew letters, Arabic letters and numerals, +Chinese Han ideographs, Japanese Katakana and Hiragana, Korean Hangul, and generally any symbol that +could qualify as a phonetic "letter" or counting "number" in any language. (Note that emoji are _not_ +considered "word characters.") + +`\W` always matches the opposite of whatever `\w` matches. + + +### 19. ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (enable `\<` and `\>`) + +_Set in: Grep, GnuRegex_ + +Enables support for the GNU-specific `\<` and `\>` word-boundary metacharacters. These work like +the `\b` word-boundary metacharacter, but only match at one end of the word or the other: `\<` +only matches at a transition from a non-word character to a word character (i.e., at the start +of a word), and `\>` only matches at a transition from a word character to a non-word character +(i.e., at the end of a word). + +Most regex syntaxes do _not_ support these metacharacters. + + +### 20. ONIG_SYN_OP_ESC_B_WORD_BOUND (enable `\b` and `\B`) + +_Set in: Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\b` and `\B` word-boundary metacharacters. The `\b` metacharacter +matches a zero-width position at a transition from word-characters to non-word-characters, or vice +versa. The `\B` metacharacter matches at all positions _not_ matched by `\b`. + +See details in `ONIG_SYN_OP_ESC_W_WORD` above for an explanation as to which characters +are considered "word characters." + + +### 21. ONIG_SYN_OP_ESC_S_WHITE_SPACE (enable `\s` and `\S`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\s` and `\S` whitespace-matching metacharacters. 
+ +The `\s` metacharacter in ASCII encoding is exactly equivalent to the character class +`[\t\n\v\f\r ]`, or character codes 9 through 13 (inclusive), and 32. + +The `\s` metacharacter in Unicode is exactly equivalent to the character class +`[\t\n\v\f\r \x85\xA0\x1680\x2000-\x200A\x2028-\x2029\x202F\x205F\x3000]` — that is, it matches +the same as ASCII, plus U+0085 (next line), U+00A0 (nonbreaking space), U+1680 (Ogham space mark), +U+2000 (en quad) through U+200A (hair space) (this range includes several widths of Unicode spaces), +U+2028 (line separator) through U+2029 (paragraph separator), +U+202F (narrow no-break space), U+205F (medium mathematical space), and U+3000 (CJK ideographic space). + +All non-Unicode encodings are handled by converting their code points to the appropriate +Unicode-equivalent code points, and then matching according to Unicode rules. + +`\S` always matches any one character that is _not_ in the set matched by `\s`. + + +### 22. ONIG_SYN_OP_ESC_D_DIGIT (enable `\d` and `\D`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\d` and `\D` digit-matching metacharacters. + +The `\d` metacharacter in ASCII encoding is exactly equivalent to the character class +`[0-9]`, or character codes 48 through 57 (inclusive). + +The `\d` metacharacter in Unicode matches `[0-9]`, as well as digits in Arabic, Devanagari, +Bengali, Laotian, Mongolian, CJK fullwidth numerals, and many more. + +All non-Unicode encodings are handled by converting their code points to the appropriate +Unicode-equivalent code points, and then matching according to Unicode rules. + +`\D` always matches any one character that is _not_ in the set matched by `\d`. + + +### 23. ONIG_SYN_OP_LINE_ANCHOR (enable `^r` and `r$`) + +_Set in: Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `^` and `$` line-anchor metacharacters. 
+ +In single-line mode, `^` matches the start of the input buffer, and `$` matches +the end of the input buffer. In multi-line mode, `^` matches if the preceding +character is `\n`; and `$` matches if the following character is `\n`. + +(Note that Oniguruma does not recognize other newline types: It only matches +`^` and `$` against `\n`: not `\r`, not `\r\n`, not the U+2028 line separator, +and not any other form.) + + +### 24. ONIG_SYN_OP_POSIX_BRACKET (enable POSIX `[:xxxx:]`) + +_Set in: PosixBasic, PosixExtended, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the POSIX `[:xxxx:]` character classes, like `[:alpha:]` and `[:digit:]`. +The supported POSIX character classes are `alnum`, `alpha`, `blank`, `cntrl`, `digit`, +`graph`, `lower`, `print`, `punct`, `space`, `upper`, `xdigit`, `ascii`, `word`. + + +### 25. ONIG_SYN_OP_QMARK_NON_GREEDY (enable `r??`, `r*?`, `r+?`, and `r{n,m}?`) + +_Set in: Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for lazy (non-greedy) quantifiers: That is, if you append a `?` after +another quantifier such as `?`, `*`, `+`, or `{n,m}`, Oniguruma will try to match +as _little_ as possible instead of as _much_ as possible. + + +### 26. ONIG_SYN_OP_ESC_CONTROL_CHARS (enable `\n`, `\r`, `\t`, etc.) + +_Set in: PosixBasic, PosixExtended, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for C-style control-code escapes, like `\n` and `\r`. Specifically, +this recognizes `\a` (7), `\b` (8), `\t` (9), `\n` (10), `\f` (12), `\r` (13), and +`\e` (27). If ONIG_SYN_OP2_ESC_V_VTAB is enabled (see below), this also enables +support for recognizing `\v` as code point 11. + + +### 27. ONIG_SYN_OP_ESC_C_CONTROL (enable `\cx` control codes) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for named control-code escapes, like `\cm` or `\cM` for code-point +13. 
In this shorthand form, control codes may be specified by `\c` (for "Control") +followed by an alphabetic letter, a-z or A-Z, indicating which code point to represent +(1 through 26). So `\cA` is code point 1, and `\cZ` is code point 26. + + +### 28. ONIG_SYN_OP_ESC_OCTAL3 (enable `\OOO` octal codes) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for octal-style escapes of up to three digits, like `\1` for code +point 1, and `\177` for code point 127. Octal values greater than 255 will result +in an error message. + + +### 29. ONIG_SYN_OP_ESC_X_HEX2 (enable `\xHH` hex codes) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for hexadecimal-style escapes of up to two digits, like `\x1` for code +point 1, and `\x7F` for code point 127. + + +### 30. ONIG_SYN_OP_ESC_X_BRACE_HEX8 (enable `\x{7HHHHHHH}` hex codes) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for brace-wrapped hexadecimal-style escapes of up to eight digits, +like `\x{1}` for code point 1, and `\x{FFFE}` for code point 65534. + + +### 31. ONIG_SYN_OP_ESC_O_BRACE_OCTAL (enable `\o{1OOOOOOOOOO}` octal codes) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for brace-wrapped octal-style escapes of up to eleven digits, +like `\o{1}` for code point 1, and `\o{177776}` for code point 65534. + +(New feature as of Oniguruma 6.3.) + + +---------- + + +## Group Two Flags (op2) + + +This group contains support for lesser-known regex syntax constructs. + + +### 0. ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (enable `\Q...\E`) + +_Set in: Java, Perl, Perl_NG_ + +Enables support for "quoted" parts of a pattern: Between `\Q` and `\E`, all +syntax parsing is turned off, so that metacharacters like `*` and `+` will no +longer be treated as metacharacters, and instead will be matched as literal +`*` and `+`, as if they had been escaped with `\*` and `\+`. + + +### 1. 
ONIG_SYN_OP2_QMARK_GROUP_EFFECT (enable `(?...)`) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for the fairly-common `(?:...)` grouping operator, which +controls precedence but which does _not_ capture its contents. + + +### 2. ONIG_SYN_OP2_OPTION_PERL (enable options `(?imsx)` and `(?-imsx)`) + +_Set in: Java, Perl, Perl_NG_ + +Enables support of regex options. (i,m,s,x) +The supported toggle-able options for this flag are: + + - `i` - Case-insensitivity + - `m` - Multi-line mode (`^` and `$` match at `\n` as well as start/end of buffer) + - `s` - Single-line mode (`.` can match `\n`) + - `x` - Extended pattern (free-formatting: whitespace will be ignored) + + +### 3. ONIG_SYN_OP2_OPTION_RUBY (enable options `(?imx)` and `(?-imx)`) + +_Set in: Ruby, Oniguruma_ + +Enables support of regex options. (i,m,x) +The supported toggle-able options for this flag are: + + - `i` - Case-insensitivity + - `m` - Multi-line mode (`.` can match `\n`) + - `x` - Extended pattern (free-formatting: whitespace will be ignored) + + +### 4. ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (enable `r?+`, `r*+`, and `r++`) + +_Set in: Ruby, Oniguruma_ + +Enables support for the _possessive_ quantifiers `?+`, `*+`, and `++`, which +work similarly to `?` and `*` and `+`, respectively, but which do not backtrack +after matching: Like the normal greedy quantifiers, they match as much as +possible, but they do not attempt to match _less_ than their maximum possible +extent if subsequent parts of the pattern fail to match. + + +### 5. ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (enable `r{n,m}+`) + +_Set in: Java_ + +Enables support for the _possessive_ quantifier `{n,m}+`, which +works similarly to `{n,m}`, but which does not backtrack +after matching: Like the normal greedy quantifier, it matches as much as +possible, but it does not attempt to match _less_ than its maximum possible +extent if subsequent parts of the pattern fail to match. + + +### 6. 
ONIG_SYN_OP2_CCLASS_SET_OP (enable `&&` within `[...]`) + +_Set in: Java, Ruby, Oniguruma_ + +Enables support for character-class _intersection_. For example, with this +feature enabled, you can write `[a-z&&[^aeiou]]` to produce a character class +of only consonants, or `[\0-\37&&[^\n\r]]` to produce a character class of +all control codes _except_ newlines. + + +### 7. ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (enable named captures `(?<name>...)`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +Enables support for _naming_ capture groups, so that instead of having to +refer to captures by position (like `\3` or `$3`), you can refer to them by names +(like `server` and `path`). This supports the Perl/Ruby naming syntaxes `(?<name>...)` +and `(?'name'...)`, but not the Python `(?P<name>...)` syntax. + + +### 8. ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (enable named backreferences `\k<name>`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +Enables support for substituted backreferences by name, not just by position. +This supports using `\k'name'` in addition to supporting `\k<name>`. This also +supports an Oniguruma-specific extension that lets you specify the _distance_ of +the match, if the capture matched multiple times, by writing `\k<name+n>` or +`\k<name-n>`. + + +### 9. ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (enable backreferences `\g<name>` and `\g<n>`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +Enables support for substituted backreferences by both name and position using +the same syntax. This supports using `\g'name'` and `\g'1'` in addition to +supporting `\g<name>` and `\g<1>`. + + +### 10. ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (enable `(?@...)` and `(?@<name>...)`) + +_Set in: none_ + +Enables support for _capture history_, which can answer via the `onig_*capture*()` +functions exactly which captures were matched, how many times, and where in the +input they were matched, by placing `?@` in front of the capture. 
Per Oniguruma's +regex syntax documentation (appendix A-5): + +`/(?@a)*/.match("aaa")` ==> `[<0-1>, <1-2>, <2-3>]` + +This can require substantial memory, is primarily useful for debugging, and is not +enabled by default in any syntax. + + +### 11. ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (enable `\C-x`) + +_Set in: Ruby, Oniguruma_ + +Enables support for Ruby legacy control-code escapes, like `\C-m` or `\C-M` for code-point +13. In this shorthand form, control codes may be specified by `\C-` (for "Control") +followed by a single character (or equivalent), indicating which code point to represent, +based on that character's lowest five bits. So, like `\c`, you can represent code-point +10 with `\C-j`, but you can also represent it with `\C-*` as well. + +See also ONIG_SYN_OP_ESC_C_CONTROL, which enables the more-common `\cx` syntax. + + +### 12. ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (enable `\M-x`) + +_Set in: Ruby, Oniguruma_ + +Enables support for Ruby legacy meta-code escapes. When you write `\M-x`, Oniguruma +will match an `x` whose 8th bit is set (i.e., the character code of `x` will be or'ed +with `0x80`). So, for example, you can match `\x81` using `\x81`, or you can write +`\M-\1`. This is mostly useful when working with legacy 8-bit character encodings. + + +### 13. ONIG_SYN_OP2_ESC_V_VTAB (enable `\v` as vertical tab) + +_Set in: Java, Ruby, Oniguruma_ + +Enables support for a C-style `\v` escape code, meaning "vertical tab." If enabled, +`\v` will be equivalent to ASCII code point 11. + + +### 14. ONIG_SYN_OP2_ESC_U_HEX4 (enable `\uHHHH` for Unicode) + +_Set in: Java, Ruby, Oniguruma_ + +Enables support for a Java-style `\uHHHH` escape code for representing Unicode +code-points by number, using up to four hexadecimal digits (up to `\uFFFF`). So, +for example, `\u221E` will match an infinity symbol, `∞`. 
+ +For code points larger than four digits, like the emoji `🚡` (aerial tramway, or code +point U+1F6A1), you must either represent the character directly using an encoding like +UTF-8, or you must enable support for ONIG_SYN_OP_ESC_X_BRACE_HEX8 or +ONIG_SYN_OP_ESC_O_BRACE_OCTAL, which support more than four digits. + +(New feature as of Oniguruma 6.7.) + + +### 15. ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (enable ``\` `` and `\'` anchors) + +_Set in: Emacs_ + +This flag makes the ``\` `` and `\'` escapes function identically to +`\A` and `\z`, respectively (when ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR is enabled). + +These anchor forms are very obscure, and rarely supported by other regex libraries. + + +### 16. ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (enable `\p{...}` and `\P{...}`) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for an alternate syntax for POSIX character classes; instead of +writing `[:alpha:]` when this is enabled, you can instead write `\p{alpha}`. + +See also ONIG_SYN_OP_POSIX_BRACKET for the classic POSIX form. + + +### 17. ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (enable `\p{^...}` and `\P{^...}`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for an alternate syntax for POSIX character classes; instead of +writing `[:^alpha:]` when this is enabled, you can instead write `\p{^alpha}`. + +See also ONIG_SYN_OP_POSIX_BRACKET for the classic POSIX form. + + +### 18. ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS + +_(not presently used)_ + + +### 19. ONIG_SYN_OP2_ESC_H_XDIGIT (enable `\h` and `\H`) + +_Set in: Ruby, Oniguruma_ + +Enables support for the Ruby-specific shorthand `\h` and `\H` metacharacters. +Somewhat like `\d` matches decimal digits, `\h` matches hexadecimal digits — that is, +characters in `[0-9a-fA-F]`. + +`\H` matches the opposite of whatever `\h` matches. + + +### 20. 
ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (disable `\`) + +_Set in: As-is_ + +If set, this disables all escape codes, shorthands, and metacharacters that start +with `\` (or whatever the configured escape character is), allowing `\` to be treated +as a literal `\`. + +You usually do not want this flag to be enabled. + + +### 21. ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE (enable `(?(...)then|else)`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for conditional inclusion of subsequent regex patterns based on whether +a prior named or numbered capture matched, or based on whether a pattern will +match. This supports many different forms, including: + + - `(?(<foo>)then|else)` - condition based on a capture by name. + - `(?('foo')then|else)` - condition based on a capture by name. + - `(?(3)then|else)` - condition based on a capture by number. + - `(?(+3)then|else)` - forward conditional to a future match, by relative position. + - `(?(-3)then|else)` - backward conditional to a prior match, by relative position. + - `(?(foo)then|else)` - this matches a pattern `foo`. (foo is any sub-expression) + +(New feature as of Oniguruma 6.5.) + + +### 22. ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP (enable `\K`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for `\K`, which excludes all content before it from the overall +regex match (i.e., capture #0). So, for example, pattern `foo\Kbar` would match +`foobar`, but capture #0 would only include `bar`. + +(New feature as of Oniguruma 6.5.) + + +### 23. ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE (enable `\R`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for `\R`, the "general newline" shorthand, which matches +`(\r\n|[\n\v\f\r\u0085\u2028\u2029])` (obviously, the Unicode values cannot be +matched in ASCII encodings). + +(New feature as of Oniguruma 6.5.) + + +### 24. ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT (enable `\N` and `\O`) + +_Set in: Perl, Perl_NG, Oniguruma_ + +Enables support for `\N` and `\O`. 
`\N` is "not a line break," which is much +like the standard `.` metacharacter, except that while `.` can be affected by +the single-line setting, `\N` always matches exactly one character that is not +one of the various line-break characters (like `\n` and `\r`). + +`\O` matches exactly one character, regardless of whether single-line or +multi-line mode are enabled or disabled. + +(New feature as of Oniguruma 6.5.) + + +### 25. ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP (enable `(?~...)`) + +_Set in: Ruby, Oniguruma_ + +Enables support for the `(?~r)` "absent operator" syntax, which matches +as much as possible as long as the result _doesn't_ match pattern `r`. This is +_not_ the same as negative lookahead or negative lookbehind. + +Among the most useful examples of this is `\/\*(?~\*\/)\*\/`, which matches +C-style comments by simply saying "starts with /*, ends with */, and _doesn't_ +contain a */ in between." + +A full explanation of this feature is complicated, but it is useful, and an +excellent article about it is [available on Medium](https://medium.com/rubyinside/the-new-absent-operator-in-ruby-s-regular-expressions-7c3ef6cd0b99). + +(New feature as of Oniguruma 6.5.) + + +### 26. ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT (enable `\X` and `\Y` and `\y`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +`\X` is another variation on `.`, designed to support Unicode, in that it matches +a full _grapheme cluster_. In Unicode, `à` can be encoded as one code point, +`U+00E0`, or as two, `U+0061 U+0300`. If those are further escaped using UTF-8, +the former becomes two bytes, and the latter becomes three. Unfortunately, `.` +would naively match only one or two bytes, depending on the encoding, and would +likely incorrectly match anything from just `a` to a broken half of a code point. +`\X` is designed to fix this: It matches the full `à`, no matter how `à` is +encoded or decomposed. 
+ +`\y` matches a cluster boundary, i.e., a zero-width position between +graphemes, somewhat like `\b` matches boundaries between words. `\Y` matches +the _opposite_ of `\y`, that is, a zero-width position between code points in +the _middle_ of a grapheme. + +(New feature as of Oniguruma 6.6.) + + +### 27. ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL (enable `(?R)` and `(?&name)`) + +_Set in: Perl_NG_ + +Enables support for substituted backreferences by both name and position using +Perl-5-specific syntax. This supports using `(?R3)` and `(?&name)` to reference +previous (and future) matches, similar to the more-common `\g<3>` and `\g<name>` +backreferences. + +(New feature as of Oniguruma 6.7.) + + +### 28. ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (enable `(?{...})`) + +_Set in: Perl, Perl_NG, Oniguruma_ + +Enables support for Perl-style "callouts" — pattern substitutions that result from +invoking a callback method. When `(?{foo})` is reached in a pattern, the callback +function set in `onig_set_progress_callout()` will be invoked, and be able to perform +custom computation during the pattern match (and during backtracking). + +Full documentation for this advanced feature can be found in the Oniguruma +`docs/CALLOUT.md` file, with an example in `samples/callout.c`. + +(New feature as of Oniguruma 6.8.) + + +### 29. ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (enable `(*name)`) + +_Set in: Perl, Perl_NG, Oniguruma_ + +Enables support for Perl-style "callouts" — pattern substitutions that result from +invoking a callback method. When `(*foo)` is reached in a pattern, the callback +function set in `onig_set_callout_of_name()` will be invoked, passing the given name +`foo` to it, and it can perform custom computation during the pattern match (and +during backtracking). + +Full documentation for this advanced feature can be found in the Oniguruma +`docs/CALLOUT.md` file, with an example in `samples/callout.c`. + +(New feature as of Oniguruma 6.8.) + + +### 30. 
ONIG_SYN_OP2_OPTION_ONIGURUMA (enable options `(?imxWSDPy)` and `(?-imxWDSP)`) + +_Set in: Oniguruma_ + +Enables support of regex options. (i,m,x,W,S,D,P,y) + +(New feature as of Oniguruma 6.9.2) + + - `i` - Case-insensitivity + - `m` - Multi-line mode (`.` can match `\n`) + - `x` - Extended pattern (free-formatting: whitespace will be ignored) + - `W` - ASCII only word. + - `D` - ASCII only digit. + - `S` - ASCII only space. + - `P` - ASCII only POSIX properties. (includes W,D,S) + +---------- + + +## Syntax Flags (syn) + + +This group contains rules to handle corner cases and constructs that are errors in +some syntaxes but not in others. + +### 0. ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (independent `?`, `*`, `+`, `{n,m}`) + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +This flag specifies how to handle operators like `?` and `*` when they aren't +directly attached to an operand, as in `^*` or `(*)`: Are they an error, are +they discarded, or are they taken as literals? If this flag is clear, they +are taken as literals; otherwise, the ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS flag +determines if they are errors or if they are discarded. + +### 1. ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (error or ignore independent operators) + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +If ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS is set, this flag controls what happens when +independent operators appear in a pattern: If this flag is set, then independent +operators produce an error message; if this flag is clear, then independent +operators are silently discarded. + +### 2. ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (allow `...)...`) + +_Set in: PosixExtended_ + +This flag, if set, causes a `)` character without a preceding `(` to be treated as +a literal `)`, equivalent to `\)`. If this flag is clear, then an unmatched `)` +character will produce an error message. + +### 3.
ONIG_SYN_ALLOW_INVALID_INTERVAL (allow `{???`) + +_Set in: GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +This flag, if set, causes an invalid range, like `foo{bar}` or `foo{}`, to be +silently discarded, as if `foo` had been written instead. If clear, an invalid +range will produce an error message. + +### 4. ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (allow `{,n}` to mean `{0,n}`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, then `r{,n}` will be treated as equivalent to writing +`r{0,n}`. If this flag is clear, then `r{,n}` will produce an error message. + +Note that regardless of whether this flag is set or clear, if +ONIG_SYN_OP_BRACE_INTERVAL is enabled, then `r{n,}` will always be legal: This +flag *only* controls the behavior of the opposite form, `r{,n}`. + +### 5. ONIG_SYN_STRICT_CHECK_BACKREF (error on invalid backrefs) + +_Set in: none_ + +If this flag is set, an invalid backref, like `\1` in a pattern with no captures, +will produce an error. If this flag is clear, then an invalid backref will be +equivalent to the empty string. + +No built-in syntax has this flag enabled. + +### 6. ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (allow `(?<=a|bc)`) + +_Set in: Java, Ruby, Oniguruma_ + +If this flag is set, lookbehind patterns with alternate options may have differing +lengths among those options. If this flag is clear, lookbehind patterns with options +must have each option have identical length to the other options. + +Oniguruma can handle either form, but not all regex engines can, so for compatibility, +Oniguruma allows you to cause regexes for other regex engines to fail if they might +depend on this rule. + +### 7. ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (prefer `\k<name>` over `\3`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +If this flag is set on the syntax *and* ONIG_OPTION_CAPTURE_GROUP is set when calling +Oniguruma, then if a name is used on any capture, all captures must also use names: A +single use of a named capture prohibits the use of numbered captures.
+ +### 8. ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (allow `(?<x>)...(?<x>)`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +If this flag is set, multiple capture groups may use the same name. If this flag is +clear, then reuse of a name will produce an error message. + +### 9. ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (`a{n}?` is equivalent to `(?:a{n})?`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, then intervals of a fixed size will ignore a lazy (non-greedy) +`?` quantifier and treat it as an optional match (an ordinary `r?`), since "match as +little as possible" is meaningless for a fixed-size interval. If this flag is clear, +then `r{n}?` will mean the same as `r{n}`, and the useless `?` will be discarded. + +### 20. ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (add `\n` to `[^...]`) + +_Set in: Grep_ + +If this flag is set, all newline characters (like `\n`) will be excluded from a negative +character class automatically, as if the pattern had been written as `[^...\n]`. If this +flag is clear, negative character classes do not automatically exclude newlines, and +only exclude those characters and ranges written in them. + +### 21. ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (allow `[...\w...]`) + +_Set in: GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +If this flag is set, shorthands like `\w` are allowed to describe characters in character +classes. If this flag is clear, shorthands like `\w` are treated as a redundantly-escaped +literal `w`. + +### 22. ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (silently discard `[z-a]`) + +_Set in: Emacs, Grep_ + +If this flag is set, then character ranges like `[z-a]` that are broken or contain no +characters will be silently ignored. If this flag is clear, then broken or empty +character ranges will produce an error message. + +### 23.
ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (treat `[0-9-a]` as `[0-9\-a]`) + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +If this flag is set, then a trailing `-` after a character range will be taken as a +literal `-`, as if it had been escaped as `\-`. If this flag is clear, then a trailing +`-` after a character range will produce an error message. + +### 24. ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (warn on `[[...]` and `[-x]`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, Oniguruma will be stricter about warning for bad forms in +character classes: `[[...]` will produce a warning, but `[\[...]` will not; +`[-x]` will produce a warning, but `[\-x]` will not; `[x&&-y]` will produce a warning, +while `[x&&\-y]` will not; and so on. If this flag is clear, all of these warnings +will be silently discarded. + +### 25. ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (warn on `(?:a*)+`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, Oniguruma will warn about nested repeat operators that have no meaning, like `(?:a*)+`. +If this flag is clear, Oniguruma will allow the nested repeat operators without warning about them. + +### 31. ONIG_SYN_CONTEXT_INDEP_ANCHORS + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Not currently used, and does nothing. (But still set in several syntaxes for some +reason.) + +---------- + +## Usage tables + +These tables show which of the built-in syntaxes use which flags and options, for easy comparison between them.
+ +### Group One Flags (op) + +| ID | Option | PosB | PosEx | Emacs | Grep | Gnu | Java | Perl | PeNG | Ruby | Onig | +| ----- | --------------------------------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| 0 | `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` | - | - | - | - | - | - | - | - | - | - | +| 1 | `ONIG_SYN_OP_DOT_ANYCHAR` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 2 | `ONIG_SYN_OP_ASTERISK_ZERO_INF` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 3 | `ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF` | - | - | - | - | - | - | - | - | - | - | +| 4 | `ONIG_SYN_OP_PLUS_ONE_INF` | - | Yes | Yes | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 5 | `ONIG_SYN_OP_ESC_PLUS_ONE_INF` | - | - | - | Yes | - | - | - | - | - | - | +| 6 | `ONIG_SYN_OP_QMARK_ZERO_ONE` | - | Yes | Yes | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 7 | `ONIG_SYN_OP_ESC_QMARK_ZERO_ONE` | - | - | - | Yes | - | - | - | - | - | - | +| 8 | `ONIG_SYN_OP_BRACE_INTERVAL` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 9 | `ONIG_SYN_OP_ESC_BRACE_INTERVAL` | Yes | - | Yes | Yes | - | - | - | - | - | - | +| 10 | `ONIG_SYN_OP_VBAR_ALT` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 11 | `ONIG_SYN_OP_ESC_VBAR_ALT` | - | - | Yes | Yes | - | - | - | - | - | - | +| 12 | `ONIG_SYN_OP_LPAREN_SUBEXP` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 13 | `ONIG_SYN_OP_ESC_LPAREN_SUBEXP` | Yes | - | Yes | Yes | - | - | - | - | - | - | +| 14 | `ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 15 | `ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 16 | `ONIG_SYN_OP_DECIMAL_BACKREF` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 17 | `ONIG_SYN_OP_BRACKET_CC` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 18 | `ONIG_SYN_OP_ESC_W_WORD` | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 19 | 
`ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END` | - | - | - | Yes | Yes | - | - | - | - | - | +| 20 | `ONIG_SYN_OP_ESC_B_WORD_BOUND` | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 21 | `ONIG_SYN_OP_ESC_S_WHITE_SPACE` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 22 | `ONIG_SYN_OP_ESC_D_DIGIT` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 23 | `ONIG_SYN_OP_LINE_ANCHOR` | - | - | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 24 | `ONIG_SYN_OP_POSIX_BRACKET` | Yes | Yes | Yes | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 25 | `ONIG_SYN_OP_QMARK_NON_GREEDY` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 26 | `ONIG_SYN_OP_ESC_CONTROL_CHARS` | Yes | Yes | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 27 | `ONIG_SYN_OP_ESC_C_CONTROL` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 28 | `ONIG_SYN_OP_ESC_OCTAL3` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 29 | `ONIG_SYN_OP_ESC_X_HEX2` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 30 | `ONIG_SYN_OP_ESC_X_BRACE_HEX8` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 31 | `ONIG_SYN_OP_ESC_O_BRACE_OCTAL` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | + +### Group Two Flags (op2) + +| ID | Option | PosB | PosEx | Emacs | Grep | Gnu | Java | Perl | PeNG | Ruby | Onig | +| ----- | --------------------------------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| 0 | `ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE` | - | - | - | - | - | Yes | Yes | Yes | - | - | +| 1 | `ONIG_SYN_OP2_QMARK_GROUP_EFFECT` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 2 | `ONIG_SYN_OP2_OPTION_PERL` | - | - | - | - | - | Yes | Yes | Yes | - | - | +| 3 | `ONIG_SYN_OP2_OPTION_RUBY` | - | - | - | - | - | - | - | - | Yes | - | +| 4 | `ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT` | - | - | - | - | - | - | - | - | Yes | Yes | +| 5 | `ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL` | - | - | - | - | - | Yes | - | - | - | - | +| 6 | `ONIG_SYN_OP2_CCLASS_SET_OP` | - | - 
| - | - | - | - | - | Yes | Yes | Yes | +| 7 | `ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 8 | `ONIG_SYN_OP2_ESC_K_NAMED_BACKREF` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 9 | `ONIG_SYN_OP2_ESC_G_SUBEXP_CALL` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 10 | `ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY` | - | - | - | - | - | - | - | - | - | - | +| 11 | `ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL` | - | - | - | - | - | - | - | - | Yes | Yes | +| 12 | `ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META` | - | - | - | - | - | - | - | - | Yes | Yes | +| 13 | `ONIG_SYN_OP2_ESC_V_VTAB` | - | - | - | - | - | Yes | - | - | Yes | Yes | +| 14 | `ONIG_SYN_OP2_ESC_U_HEX4` | - | - | - | - | - | Yes | - | - | Yes | Yes | +| 15 | `ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR` | - | - | Yes | - | - | - | - | - | - | - | +| 16 | `ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 17 | `ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 18 | `ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS` | - | - | - | - | - | - | - | - | - | - | +| 19 | `ONIG_SYN_OP2_ESC_H_XDIGIT` | - | - | - | - | - | - | - | - | Yes | Yes | +| 20 | `ONIG_SYN_OP2_INEFFECTIVE_ESCAPE` | - | - | - | - | - | - | - | - | - | - | +| 21 | `ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 22 | `ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 23 | `ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 24 | `ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT` | - | - | - | - | - | - | Yes | Yes | - | Yes | +| 25 | `ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP` | - | - | - | - | - | - | - | - | Yes | Yes | +| 26 | `ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 27 | `ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL` | - | - | - | - | - | - | - | Yes | - | - | +| 28 | 
`ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS` | - | - | - | - | - | - | Yes | Yes | Yes | - | +| 29 | `ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME` | - | - | - | - | - | - | Yes | Yes | Yes | - | +| 30 | `ONIG_SYN_OP2_OPTION_ONIGURUMA` | - | - | - | - | - | - | - | - | - | Yes | + +### Syntax Flags (syn) + +| ID | Option | PosB | PosEx | Emacs | Grep | Gnu | Java | Perl | PeNG | Ruby | Onig | +| ----- | --------------------------------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| 0 | `ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 1 | `ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 2 | `ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP` | - | Yes | - | - | - | - | - | - | - | - | +| 3 | `ONIG_SYN_ALLOW_INVALID_INTERVAL` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 4 | `ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV` | - | - | - | - | - | - | - | - | Yes | Yes | +| 5 | `ONIG_SYN_STRICT_CHECK_BACKREF` | - | - | - | - | - | - | - | - | - | - | +| 6 | `ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 7 | `ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 8 | `ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 9 | `ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY` | - | - | - | - | - | - | - | - | Yes | Yes | +| 20 | `ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC` | - | - | - | Yes | - | - | - | - | - | - | +| 21 | `ONIG_SYN_BACKSLASH_ESCAPE_IN_CC` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 22 | `ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC` | - | - | Yes | Yes | - | - | - | - | - | - | +| 23 | `ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 24 | `ONIG_SYN_WARN_CC_OP_NOT_ESCAPED` | - | - | - | - | - | - | - | - | Yes | Yes | +| 25 | `ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT` | - | - | - | - | - | - | - 
| - | Yes | Yes | +| 31 | `ONIG_SYN_CONTEXT_INDEP_ANCHORS` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | diff --git a/sciXlexers/CharSetX.h b/sciXlexers/CharSetX.h new file mode 100644 index 000000000..e0e776d5a --- /dev/null +++ b/sciXlexers/CharSetX.h @@ -0,0 +1,49 @@ +#pragma once +#ifndef _CHARSETX_H_ +#define _CHARSETX_H_ + +#include "StyleContext.h" +#include "CharacterSet.h" + +// Functions for classifying characters + +// *** Methods from "scintilla\lexlib\CharacterSet.h" *** +//- IsASpace(int ch); +//- IsASpaceOrTab(int ch); +//- IsADigit(int ch); +//- IsADigit(int ch, int base); +//- IsASCII(int ch); +//- IsLowerCase(int ch); +//- IsUpperCase(int ch); +//- IsUpperOrLowerCase(int ch); +//- IsAlphaNumeric(int ch); + +constexpr bool IsALetter(const int ch) noexcept { + // 97 to 122 || 65 to 90 + return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); +} + +constexpr bool IsLineBreak(const int ch) noexcept { + return ((ch == '\n') || (ch == '\r')); +} + +constexpr int IsNumHex(const Scintilla::StyleContext& sc) noexcept { + return (sc.chNext == 'x') || (sc.chNext == 'X'); +} + +constexpr int IsNumBinary(const Scintilla::StyleContext& sc) noexcept { + return (sc.chNext == 'b') || (sc.chNext == 'B'); +} + + +inline int IsNumOctal(const Scintilla::StyleContext& sc) { + return Scintilla::IsADigit(sc.chNext) || (sc.chNext == 'o'); +} + +inline bool IsAIdentifierChar(const int ch) { + return (Scintilla::IsAlphaNumeric(ch) || ch == '_' || ch == '.'); +} + + + +#endif //_CHARSETX_H_ diff --git a/sciXlexers/LexAHKL.cxx b/sciXlexers/LexAHKL.cxx index afa6932e8..291352803 100644 --- a/sciXlexers/LexAHKL.cxx +++ b/sciXlexers/LexAHKL.cxx @@ -26,7 +26,7 @@ #include "LexAccessor.h" #include "Accessor.h" #include "StyleContext.h" -#include "CharacterSet.h" +#include "CharSetX.h" #include "LexerModule.h" #include "OptionSet.h" #include "DefaultLexer.h" @@ -508,7 +508,7 @@ void SCI_METHOD LexerAHKL::Lex(Sci_PositionU startPos, Sci_Position lengthDoc, i inCommand 
= true; } - // if ((OnlySpaces || isspace(sc.chPrev)) && sc.Match(';')) { + // if ((OnlySpaces || IsASpace(sc.chPrev)) && sc.Match(';')) { // sc.SetState(SCE_AHKL_STRINGCOMMENT); @@ -613,16 +613,17 @@ void SCI_METHOD LexerAHKL::Lex(Sci_PositionU startPos, Sci_Position lengthDoc, i expLevel += 1; inExpression = true; - if (sc.Match(" % ")) - inCommand = false; + if (sc.Match(" % ")) { + inCommand = false; + } } else if (sc.ch == ']' || sc.ch == ')') { expLevel -= 1, inCommand = false; - if (expLevel == 0) - inExpression = false; - + if (expLevel == 0) { + inExpression = false; + } } // Handle Command continuation section @@ -635,7 +636,7 @@ void SCI_METHOD LexerAHKL::Lex(Sci_PositionU startPos, Sci_Position lengthDoc, i if (valIdentifier.Contains(sc.ch)) validFunction = true; - if (isdigit(sc.ch & 0xFF)) + if (IsADigit(sc.ch)) sc.SetState(SCE_AHKL_DECNUMBER); else if (inCommand && sc.ch == '+') @@ -699,14 +700,11 @@ void SCI_METHOD LexerAHKL::Lex(Sci_PositionU startPos, Sci_Position lengthDoc, i inHotstring = true; sc.SetState(SCE_AHKL_HOTSTRINGOPT); - } - } - - if (!isspace(sc.ch)) - OnlySpaces = false; - + if (!IsASpace((sc.ch))) { + OnlySpaces = false; + } } sc.Complete(); diff --git a/sciXlexers/LexTOML.cxx b/sciXlexers/LexTOML.cxx index e85b7e4f7..568051fcd 100644 --- a/sciXlexers/LexTOML.cxx +++ b/sciXlexers/LexTOML.cxx @@ -25,11 +25,12 @@ #include "LexAccessor.h" #include "Accessor.h" #include "StyleContext.h" -#include "CharacterSet.h" +#include "CharSetX.h" #include "LexerModule.h" #include "OptionSet.h" #include "DefaultLexer.h" + using namespace Scintilla; namespace { @@ -48,34 +49,9 @@ namespace { int GetDataTypeStyle(const int numType) { if (numType == DataType::Unknown) { - return SCE_TOML_TYPEERROR; + return SCE_TOML_PARSINGERROR; } - return SCE_TOML_DATATYPE; - } - - inline bool IsLetter(const int ch) { - // 97 to 122 || 65 to 90 - return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); - } - - inline bool IsAWordChar(const int ch) { - return 
(ch < 0x80) && (isalnum(ch) || ch == '_' || ch == '.'); - } - - inline int IsNumHex(const StyleContext& sc) { - return (sc.chNext == 'x') || (sc.chNext == 'X'); - } - - inline int IsNumBinary(const StyleContext& sc) { - return (sc.chNext == 'b') || (sc.chNext == 'B'); - } - - inline int IsNumOctal(const StyleContext& sc) { - return IsADigit(sc.chNext) || sc.chNext == 'o'; - } - - constexpr bool IsNewline(const int ch) noexcept { - return (ch == '\n' || ch == '\r'); + return SCE_TOML_VALUE; } inline bool IsFuncName(const char* str) { @@ -88,34 +64,14 @@ namespace { "iterator", "converter" }; - for (const char* id : identifiers) { if (strcmp(str, id) == 0) { return true; } } - return false; } - //constexpr bool IsTripleLiteral(const int style) noexcept { - // return style == SCE_TOML_TRIPLE || style == SCE_TOML_TRIPLEDOUBLE; - //} - // - //constexpr bool IsLineComment(const int style) noexcept { - // return style == SCE_TOML_COMMENTLINE || style == SCE_TOML_COMMENTLINEDOC; - //} - // - //constexpr bool IsStreamComment(const int style) noexcept { - // return style == SCE_TOML_COMMENT || style == SCE_TOML_COMMENTDOC; - //} - - - constexpr bool IsAssignChar(unsigned char ch) { - return (ch == '=') || (ch == ':'); - } - - struct OptionsTOML { bool fold; bool foldCompact; @@ -127,7 +83,7 @@ namespace { }; static const char* const tomlWordListDesc[] = { - "Keywords", + "TOML", nullptr }; @@ -141,17 +97,16 @@ namespace { } }; - LexicalClass lexicalClasses[] = { // Lexer TOML SCLEX_TOML SCE_TOML_: - 0, "SCE_TOML_DEFAULT", "default", "White space", - 1, "SCE_TOML_COMMENT", "comment block", "Block comment", - 2, "SCE_TOML_KEY", "keyc", "Keyword", + 0, "SCE_TOML_DEFAULT", "default", "Default", + 1, "SCE_TOML_COMMENT", "comment", "Comment", + 2, "SCE_TOML_KEY", "key", "Key", 3, "SCE_TOML_SECTION", "section", "Section", 4, "SCE_TOML_ASSIGNMENT", "assignment", "Assignment", 5, "SCE_TOML_DEFVAL", "default value", "Default Value", - 6, "SCE_TOML_DATATYPE", "datatype", "Datatype", - 
7, "SCE_TOML_TYPEERROR", "type error", "Type Error", + 6, "SCE_TOML_VALUETYPE", "value type", "Value Type", + 7, "SCE_TOML_PARSINGERROR", "type error", "Type Error", }; } // end of namespace @@ -163,9 +118,10 @@ class LexerTOML : public DefaultLexer { OptionSetTOML osTOML; public: - LexerTOML() : - DefaultLexer(lexicalClasses, ELEMENTS(lexicalClasses)), - setWord(CharacterSet::setAlphaNum, "_", 0x80, true) { } + LexerTOML() + : DefaultLexer(lexicalClasses, ELEMENTS(lexicalClasses)) + , setWord(CharacterSet::setAlphaNum, "_", 0x80, true) + { } virtual ~LexerTOML() { } @@ -189,19 +145,13 @@ public: return osTOML.DescribeProperty(name); } - Sci_Position SCI_METHOD PropertySet(const char* key, const char* val) override; const char* SCI_METHOD DescribeWordListSets() override { return osTOML.DescribeWordListSets(); } - Sci_Position SCI_METHOD WordListSet(int n, const char* wl) override; - - void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument* pAccess) override; - void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument* pAccess) override; - void* SCI_METHOD PrivateCall(int, void*) override { - return 0; + return nullptr; } int SCI_METHOD LineEndTypesSupported() override { @@ -215,8 +165,20 @@ public: static ILexer4* LexerFactoryTOML() { return new LexerTOML(); } + + // -------------------------------------------------------------------------- + + Sci_Position SCI_METHOD PropertySet(const char* key, const char* val) override; + + Sci_Position SCI_METHOD WordListSet(int n, const char* wl) override; + + void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument* pAccess) override; + + void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument* pAccess) override; + }; + Sci_Position SCI_METHOD LexerTOML::PropertySet(const char* key, const char* val) { if (osTOML.PropertySet(&options, key, val)) { return 0; @@ -225,8 +187,9 @@ Sci_Position 
SCI_METHOD LexerTOML::PropertySet(const char* key, const char* val) } -Sci_Position SCI_METHOD LexerTOML::WordListSet(int n, const char* wl) { - WordList* wordListN = 0; +Sci_Position SCI_METHOD LexerTOML::WordListSet(int n, const char* wl) +{ + WordList* wordListN = nullptr; switch (n) { case 0: @@ -245,100 +208,259 @@ Sci_Position SCI_METHOD LexerTOML::WordListSet(int n, const char* wl) { firstModification = 0; } } - return firstModification; } +// ---------------------------------------------------------------------------- - - -static inline bool AtEOL(Accessor& styler, Sci_PositionU i) -{ - return (styler[i] == '\n') || ((styler[i] == '\r') && (styler.SafeGetCharAt(i + 1) != '\n')); +constexpr bool IsCommentChar(const int ch) noexcept { + //return (ch == '#') || (ch == ':'); + return (ch == '#'); } +// ---------------------------------------------------------------------------- + +constexpr bool IsAssignChar(const int ch) noexcept { + //return (ch == '=') || (ch == ':'); + return (ch == '='); +} +// ---------------------------------------------------------------------------- + +inline bool IsAKeyChar(const int ch) { + return (IsAlphaNumeric(ch) || ch == '_'); +} +// ---------------------------------------------------------------------------- + + +static int GetBracketLevel(StyleContext& sc) +{ + Sci_Position const posCurrent = static_cast(sc.currentPos); + + bool ignore = false; + int iBracketLevel = -1; + + Sci_Position i = 0; + while ((--i + posCurrent) >= 0) + { + if (sc.GetRelative(i) == '"') { + ignore = !ignore; // toggle string + } + else if (!ignore) { + if (IsAssignChar(sc.GetRelative(i))) { + break; // must be within assignment + } + else if (sc.GetRelative(i) == ']') { + --iBracketLevel; + } + else if (sc.GetRelative(i) == '[') { + ++iBracketLevel; + } + } + } + return iBracketLevel; +} +// ---------------------------------------------------------------------------- + + void SCI_METHOD LexerTOML::Lex(Sci_PositionU startPos, Sci_Position length, int 
initStyle, IDocument* pAccess) { - Accessor styler(pAccess, nullptr); StyleContext sc(startPos, length, initStyle, styler); + bool inSectionDef = false; + bool inMultiLnString = (sc.state == SCE_TOML_STRING); + bool inMultiLnArrayDef = (sc.state == SCE_TOML_ARRAY); + for (; sc.More(); sc.Forward()) { + + // -------------------------------------------------- + // check if in the middle of a line continuation ... + // -------------------------------------------------- if (sc.atLineStart) { - sc.SetState(SCE_TOML_DEFAULT); + switch (sc.state) + { + case SCE_TOML_STRING: + if (!inMultiLnString) { + sc.SetState(SCE_TOML_PARSINGERROR); + } + break; + case SCE_TOML_ARRAY: + if (!inMultiLnArrayDef) { + sc.SetState(SCE_TOML_PARSINGERROR); + } + break; + case SCE_TOML_PARSINGERROR: + // preserve error + break; + default: + sc.SetState(SCE_TOML_DEFAULT); // reset + break; + } } - - switch (sc.state) + + // ------------------------- + // current state independent + // ------------------------- + if (IsLineBreak(sc.ch)) { + continue; // eat line breaks + } + + if (sc.ch != SCE_TOML_PARSINGERROR) + { + if (IsCommentChar(sc.ch)) { + if (inSectionDef) { + sc.SetState(SCE_TOML_PARSINGERROR); + } + else if (inMultiLnString || inMultiLnArrayDef) { + sc.ForwardSetState(sc.state); // ignore + } + else { + sc.SetState(SCE_TOML_COMMENT); + } + } + + } // SCE_TOML_PARSINGERROR + + + // ------------------------- + // state dependent + // ------------------------- + switch (sc.state) { case SCE_TOML_DEFAULT: - { - if (sc.ch == '#' || sc.ch == '!' 
|| sc.ch == ';') { + if (IsASpaceOrTab(sc.ch)) { + // eat + } + else if (IsCommentChar(sc.ch)) { sc.SetState(SCE_TOML_COMMENT); } else if (sc.ch == '[') { sc.SetState(SCE_TOML_SECTION); + inSectionDef = true; } - else if (sc.ch == '@') { - sc.SetState(SCE_TOML_DEFVAL); - } - else if (IsAssignChar(sc.ch)) { - sc.SetState(SCE_TOML_ASSIGNMENT); - } - else if (IsLetter(sc.ch)) { + else if (IsAKeyChar(sc.ch)) { sc.SetState(SCE_TOML_KEY); } - } - break; - - case SCE_TOML_COMMENT: + else { + sc.SetState(SCE_TOML_PARSINGERROR); + } break; - case SCE_TOML_KEY: - if (!IsLetter(sc.ch)) { - if (IsAssignChar(sc.ch)) { - sc.SetState(SCE_TOML_ASSIGNMENT); - } - else { - sc.SetState(SCE_TOML_DEFAULT); - } - } + case SCE_TOML_COMMENT: + // eat - rest of line is comment break; case SCE_TOML_SECTION: if (sc.ch == ']') { - sc.Forward(); - sc.SetState(SCE_TOML_DEFAULT); + inSectionDef = false; } - break; + else if (IsCommentChar(sc.ch)) { + if (!inSectionDef) { + sc.SetState(SCE_TOML_COMMENT); + } + else { + sc.SetState(SCE_TOML_PARSINGERROR); + } + } + break; + + case SCE_TOML_KEY: + if (IsASpaceOrTab(sc.ch)) { + sc.SetState(SCE_TOML_ASSIGNMENT); // end of key + } + else if (IsAssignChar(sc.ch)) { + sc.SetState(SCE_TOML_ASSIGNMENT); + } + else if (!IsAKeyChar(sc.ch)) { + sc.SetState(SCE_TOML_PARSINGERROR); + } + break; case SCE_TOML_ASSIGNMENT: - if (!IsAssignChar(sc.ch)) { - sc.SetState(SCE_TOML_DEFVAL); + if (IsAssignChar(sc.ch)) { + sc.ForwardSetState(SCE_TOML_VALUE); + // fall through case SCE_TOML_VALUE: + } + else if (IsASpaceOrTab(sc.ch)) { + break; // OK + } + else { + sc.SetState(SCE_TOML_PARSINGERROR); + break; + } + // fall through + + case SCE_TOML_VALUE: + if (sc.ch == '[') { + sc.SetState(SCE_TOML_ARRAY); + inMultiLnArrayDef = true; + } + else if (sc.ch == ']') { + sc.SetState(SCE_TOML_PARSINGERROR); + } + else if (sc.ch == '"') { + sc.SetState(SCE_TOML_STRING); + if (sc.Match(R"(""")")) { + inMultiLnString = true; + sc.Forward(2); + } } break; - case SCE_TOML_DEFVAL: + 
case SCE_TOML_STRING: + if (sc.ch == '\\') { + sc.ForwardSetState(SCE_TOML_STRING); + } + else if (sc.ch == '"') { + if (!inMultiLnString) { + sc.ForwardSetState(SCE_TOML_VALUE); + } + else { + // inMultiLnString + if (sc.Match(R"(""")")) { + sc.Forward(2); + sc.ForwardSetState(SCE_TOML_VALUE); + inMultiLnString = false; + } + else { + sc.SetState(SCE_TOML_PARSINGERROR); + } + } + } break; - case SCE_TOML_DATATYPE: + case SCE_TOML_ARRAY: + if (sc.ch == ']') { + int const level = GetBracketLevel(sc); + if (level == 0) { + sc.ForwardSetState(SCE_TOML_VALUE); + inMultiLnArrayDef = false; + } + else if (level < 0) { + sc.SetState(SCE_TOML_PARSINGERROR); + inMultiLnArrayDef = false; + } + } break; - case SCE_TOML_TYPEERROR: + case SCE_TOML_PARSINGERROR: + // still parsing error until new line break; default: + sc.SetState(SCE_TOML_PARSINGERROR); // unknown break; } - if (sc.atLineEnd) { - sc.SetState(SCE_TOML_DEFAULT); - } + //if (sc.atLineEnd) { + // // --- + //} } sc.Complete(); } - +// ---------------------------------------------------------------------------- @@ -348,7 +470,7 @@ void SCI_METHOD LexerTOML::Fold(Sci_PositionU startPos, Sci_Position length, int return; } - Accessor styler(pAccess, NULL); + Accessor styler(pAccess, nullptr); //const Sci_Position docLines = styler.GetLine(styler.Length()); //const Sci_Position maxPos = startPos + length; @@ -426,5 +548,9 @@ void SCI_METHOD LexerTOML::Fold(Sci_PositionU startPos, Sci_Position length, int styler.SetLevel(lineCurrent, lev | (flagsNext & ~SC_FOLDLEVELNUMBERMASK)); } +// ---------------------------------------------------------------------------- LexerModule lmTOML(SCLEX_TOML, LexerTOML::LexerFactoryTOML, "toml", tomlWordListDesc); + +// ---------------------------------------------------------------------------- + diff --git a/sciXlexers/SciXLexer.h b/sciXlexers/SciXLexer.h index b3e6610e6..d8138df0a 100644 --- a/sciXlexers/SciXLexer.h +++ b/sciXlexers/SciXLexer.h @@ -1,11 +1,12 @@ -#ifndef SCIXLEXER_H 
-#define SCIXLEXER_H +#pragma once +#ifndef _SCIXLEXER_H_ +#define _SCIXLEXER_H_ #define SCLEX_AHKL 200 #define SCLEX_TOML 201 // ----------------------------------------------------------------------------- -// !!!!! ADD Lexer Linkage in: Notepad3\scintilla\src\Catalogue.cxx !!!!! +// !!!!! ADD Lexer Linkage in: scintilla\src\Catalogue.cxx !!!!! // ----------------------------------------------------------------------------- #define SCE_AHKL_NEUTRAL 0 @@ -43,12 +44,13 @@ #define SCE_TOML_DEFAULT 0 #define SCE_TOML_COMMENT 1 -#define SCE_TOML_KEY 2 -#define SCE_TOML_SECTION 3 +#define SCE_TOML_SECTION 2 +#define SCE_TOML_KEY 3 #define SCE_TOML_ASSIGNMENT 4 -#define SCE_TOML_DEFVAL 5 -#define SCE_TOML_DATATYPE 6 -#define SCE_TOML_TYPEERROR 7 +#define SCE_TOML_VALUE 5 +#define SCE_TOML_STRING 6 +#define SCE_TOML_ARRAY 7 +#define SCE_TOML_PARSINGERROR 8 -#endif +#endif //_SCIXLEXER_H_ diff --git a/scintilla/Scintilla.vcxproj b/scintilla/Scintilla.vcxproj index 08572c606..2d6a5e446 100644 --- a/scintilla/Scintilla.vcxproj +++ b/scintilla/Scintilla.vcxproj @@ -326,6 +326,7 @@ + diff --git a/scintilla/Scintilla.vcxproj.filters b/scintilla/Scintilla.vcxproj.filters index eb012fa93..87aa77cff 100644 --- a/scintilla/Scintilla.vcxproj.filters +++ b/scintilla/Scintilla.vcxproj.filters @@ -554,5 +554,8 @@ oniguruma + + sciXlexers + \ No newline at end of file diff --git a/src/Config/SimpleIni.h b/src/Config/SimpleIni.h index de830f129..9a14b0cde 100644 --- a/src/Config/SimpleIni.h +++ b/src/Config/SimpleIni.h @@ -325,7 +325,7 @@ template class CSimpleIniTempl { public: - typedef SI_CHAR SI_CHAR_T; + using SI_CHAR_T = SI_CHAR; /** key entry */ struct Entry { @@ -377,15 +377,15 @@ public: }; /** map keys to values */ - typedef std::multimap TKeyVal; + using TKeyVal = std::multimap; /** map sections to key/value map */ - typedef std::map TSection; + using TSection = std::map; /** set of dependent string pointers. 
Note that these pointers are dependent on memory owned by CSimpleIni. */ - typedef std::list TNamesDepend; + using TNamesDepend = std::list; /** interface definition for the OutputWriter object to pass to Save() in order to output the INI file data. @@ -895,8 +895,8 @@ public: const SI_CHAR * GetValue( const SI_CHAR * a_pSection, const SI_CHAR * a_pKey, - const SI_CHAR * a_pDefault = NULL, - bool * a_pHasMultiple = NULL + const SI_CHAR * a_pDefault = nullptr, + bool * a_pHasMultiple = nullptr ) const; /** Retrieve a numeric value for a specific key. If multiple keys are enabled @@ -916,7 +916,7 @@ public: const SI_CHAR * a_pSection, const SI_CHAR * a_pKey, long a_nDefault = 0, - bool * a_pHasMultiple = NULL + bool * a_pHasMultiple = nullptr ) const; /** Retrieve a numeric value for a specific key. If multiple keys are enabled @@ -936,7 +936,7 @@ public: const SI_CHAR * a_pSection, const SI_CHAR * a_pKey, double a_nDefault = 0, - bool * a_pHasMultiple = NULL + bool * a_pHasMultiple = nullptr ) const; /** Retrieve a boolean value for a specific key. If multiple keys are enabled @@ -961,7 +961,7 @@ public: const SI_CHAR * a_pSection, const SI_CHAR * a_pKey, bool a_bDefault = false, - bool * a_pHasMultiple = NULL + bool * a_pHasMultiple = nullptr ) const; /** Add or update a section or value. 
This will always insert @@ -1358,7 +1358,7 @@ CSimpleIniTempl::LoadFile( ) { strcpy_s(m_FilePathA, _countof(m_FilePathA), a_pszFile); - FILE * fp = NULL; + FILE * fp = nullptr; #if __STDC_WANT_SECURE_LIB__ && !_WIN32_WCE fopen_s(&fp, a_pszFile, "rb"); #else // !__STDC_WANT_SECURE_LIB__ @@ -1380,7 +1380,7 @@ CSimpleIniTempl::LoadFile( ) { #ifdef _WIN32 - FILE * fp = NULL; + FILE * fp = nullptr; #if __STDC_WANT_SECURE_LIB__ && !_WIN32_WCE _wfopen_s(&fp, a_pwszFile, L"rb"); #else // !__STDC_WANT_SECURE_LIB__ @@ -1418,7 +1418,7 @@ CSimpleIniTempl::LoadFile( } // allocate and ensure NULL terminated - char * pData = new(std::nothrow) char[lSize+1]; + auto * pData = new(std::nothrow) char[lSize+1]; if (!pData) { return SI_NOMEM; } @@ -1474,8 +1474,8 @@ CSimpleIniTempl::LoadData( return SI_NOMEM; } - size_t const convCnt = (size_t)WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR)a_pData + 1, (int)(a_uDataLen / sizeof(WCHAR) - 1), - (LPSTR)pDataUTF16toUTF8, (int)(a_uDataLen * 3 + 1), NULL, NULL); + auto const convCnt = (size_t)WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR)a_pData + 1, (int)(a_uDataLen / sizeof(WCHAR) - 1), + (LPSTR)pDataUTF16toUTF8, (int)(a_uDataLen * 3 + 1), nullptr, nullptr); if (convCnt == 0) { delete[] pDataUTF16toUTF8; return SI_FAIL; @@ -1504,7 +1504,7 @@ CSimpleIniTempl::LoadData( // allocate memory for the data, ensure that there is a NULL // terminator wherever the converted data ends - SI_CHAR * pData = new(std::nothrow) SI_CHAR[uLen+1]; + auto * pData = new(std::nothrow) SI_CHAR[uLen+1]; if (!pData) { delete[] pDataUTF16toUTF8; return SI_NOMEM; @@ -1861,7 +1861,7 @@ CSimpleIniTempl::LoadMultiLineText( // move this line down to the location that it should be if necessary if (pDataLine < pCurrLine) { - size_t nLen = (size_t) (a_pData - pCurrLine); + auto nLen = (size_t) (a_pData - pCurrLine); memmove(pDataLine, pCurrLine, nLen * sizeof(SI_CHAR)); pDataLine[nLen] = '\0'; } @@ -1933,7 +1933,7 @@ CSimpleIniTempl::CopyString( for ( ; a_pString[uLen]; ++uLen) 
/*loop*/ ; } ++uLen; // NULL character - SI_CHAR * pCopy = new(std::nothrow) SI_CHAR[uLen]; + auto * pCopy = new(std::nothrow) SI_CHAR[uLen]; if (!pCopy) { return SI_NOMEM; } @@ -1983,7 +1983,7 @@ CSimpleIniTempl::AddEntry( } typename TSection::value_type oEntry(oSection, TKeyVal()); - typedef typename TSection::iterator SectionIterator; + using SectionIterator = typename TSection::iterator; std::pair i = m_data.insert(oEntry); iSection = i.first; bInserted = true; @@ -2176,7 +2176,7 @@ CSimpleIniTempl::GetDoubleValue( return a_nDefault; } - char * pszSuffix = NULL; + char * pszSuffix = nullptr; double nValue = strtod(szValue, &pszSuffix); // any invalid strings will return the default value @@ -2235,6 +2235,8 @@ CSimpleIniTempl::GetBoolValue( switch (pszValue[0]) { case 't': case 'T': // true case 'y': case 'Y': // yes + case '9': case '8': case '7': case '6': // != 0 + case '5': case '4': case '3': case '2': // != 0 case '1': // 1 (one) return true; @@ -2415,7 +2417,7 @@ CSimpleIniTempl::SaveFile( bool a_bAddSignature ) const { - FILE * fp = NULL; + FILE * fp = nullptr; #if __STDC_WANT_SECURE_LIB__ && !_WIN32_WCE fopen_s(&fp, a_pszFile, "wb"); #else // !__STDC_WANT_SECURE_LIB__ @@ -2436,7 +2438,7 @@ CSimpleIniTempl::SaveFile( ) const { #ifdef _WIN32 - FILE * fp = NULL; + FILE * fp = nullptr; #if __STDC_WANT_SECURE_LIB__ && !_WIN32_WCE _wfopen_s(&fp, a_pwszFile, L"wb"); #else // !__STDC_WANT_SECURE_LIB__ @@ -3455,10 +3457,10 @@ public: // TYPE DEFINITIONS // --------------------------------------------------------------------------- -typedef CSimpleIniTempl,SI_ConvertA > CSimpleIniA; -typedef CSimpleIniTempl,SI_ConvertA > CSimpleIniCaseA; +using CSimpleIniA = CSimpleIniTempl,SI_ConvertA >; +using CSimpleIniCaseA = CSimpleIniTempl,SI_ConvertA >; #if defined(SI_CONVERT_ICU) typedef CSimpleIniTempl,SI_ConvertW > CSimpleIniCaseW; #else -typedef CSimpleIniTempl,SI_ConvertW > CSimpleIniW; -typedef CSimpleIniTempl,SI_ConvertW > CSimpleIniCaseW; +using CSimpleIniW = 
CSimpleIniTempl,SI_ConvertW >; +using CSimpleIniCaseW = CSimpleIniTempl,SI_ConvertW >; #endif #ifdef _UNICODE diff --git a/src/Edit.c b/src/Edit.c index 4c4d93ab4..922fd5f18 100644 --- a/src/Edit.c +++ b/src/Edit.c @@ -7023,7 +7023,7 @@ void EditHideNotMarkedLineRange(HWND hwnd, bool bHideLines) // // _HighlightIfBrace() // -static bool _HighlightIfBrace(HWND hwnd, DocPos iPos) +static bool _HighlightIfBrace(const HWND hwnd, const DocPos iPos) { UNUSED(hwnd); if (iPos < 0) { @@ -7033,15 +7033,15 @@ static bool _HighlightIfBrace(HWND hwnd, DocPos iPos) return true; } - char c = SciCall_GetCharAt(iPos); + char const c = SciCall_GetCharAt(iPos); if (StrChrA(NP3_BRACES_TO_MATCH, c)) { - DocPos iBrace2 = SciCall_BraceMatch(iPos); - if (iBrace2 != -1) { - DocPos col1 = SciCall_GetColumn(iPos); - DocPos col2 = SciCall_GetColumn(iBrace2); + DocPos const iBrace2 = SciCall_BraceMatch(iPos); + if (iBrace2 != (DocPos)-1) { + int const col1 = (int)SciCall_GetColumn(iPos); + int const col2 = (int)SciCall_GetColumn(iBrace2); SciCall_BraceHighLight(iPos, iBrace2); - SciCall_SetHighLightGuide(min_i((int)col1, (int)col2)); + SciCall_SetHighLightGuide(min_i(col1, col2)); } else { SciCall_BraceBadLight(iPos); diff --git a/src/Notepad3.c b/src/Notepad3.c index 750f0fd02..256e6b329 100644 --- a/src/Notepad3.c +++ b/src/Notepad3.c @@ -688,21 +688,25 @@ static bool _InsertLanguageMenu(HMENU hMenuBar) if (s_hmenuLanguage) { DestroyMenu(s_hmenuLanguage); } s_hmenuLanguage = CreatePopupMenu(); - WCHAR wchMenuItemFmt[128] = L"%s"; + WCHAR wchMenuItemFmt[128] = { L'\0' }; WCHAR wchMenuItemStrg[196] = { L'\0' }; for (int lng = 0; lng < MuiLanguages_CountOf(); ++lng) { if (MUI_LanguageDLLs[lng].bHasDLL) { // GetLngString(MUI_LanguageDLLs[lng].rid, wchMenuItemFmt, COUNTOF(wchMenuItemFmt)); + bool found = false; for (int i = 0; i < COUNTOF(s_LanguageMenu); ++i) { if (MUI_LanguageDLLs[lng].LangId == s_LanguageMenu[i].LangID) { StringCchCopy(wchMenuItemFmt, COUNTOF(wchMenuItemFmt), 
s_LanguageMenu[i].MenuItem); + found = true; break; } } - + if (!found) { + StringCchCopy(wchMenuItemFmt, COUNTOF(wchMenuItemFmt), L"Lang-(Sub)\t\t\t[%s]"); + } StringCchPrintfW(wchMenuItemStrg, COUNTOF(wchMenuItemStrg), wchMenuItemFmt, MUI_LanguageDLLs[lng].szLocaleName); AppendMenu(s_hmenuLanguage, MF_ENABLED | MF_STRING, MUI_LanguageDLLs[lng].rid, wchMenuItemStrg); } diff --git a/src/StyleLexers/styleLexTOML.c b/src/StyleLexers/styleLexTOML.c index 9fbbdb1f5..4e643c053 100644 --- a/src/StyleLexers/styleLexTOML.c +++ b/src/StyleLexers/styleLexTOML.c @@ -14,10 +14,10 @@ SCLEX_TOML, IDS_LEX_TOML_CFG, L"TOML Config", L"toml", L"", { {STYLE_DEFAULT}, IDS_LEX_STR_63126, L"Default", L"", L"" }, //{ {SCE_TOML_DEFAULT}, IDS_LEX_STR_63126, L"Default", L"", L"" }, { {SCE_TOML_COMMENT}, IDS_LEX_STR_63127, L"Comment", L"fore:#008000", L"" }, - { {SCE_TOML_KEY}, IDS_LEX_STR_63128, L"Key", L"bold; fore:#5E8F60", L"" }, - { {SCE_TOML_SECTION}, IDS_LEX_STR_63232, L"Section", L"bold; fore:#000000; back:#FF8040; eolfilled", L"" }, - { {SCE_TOML_ASSIGNMENT}, IDS_LEX_STR_63233, L"Assignment", L"fore:#FFA500", L"" }, - { {SCE_TOML_DEFVAL}, IDS_LEX_STR_63234, L"Default Value", L"fore:#00FF00", L"" }, - { {SCE_TOML_DATATYPE}, IDS_LEX_STR_63234, L"Datatype", L"fore:#0000FF", L"" }, - { {SCE_TOML_TYPEERROR}, IDS_LEX_STR_63234, L"Type Error", L"fore:#FF0000", L"" }, + { {SCE_TOML_SECTION}, IDS_LEX_STR_63232, L"Section", L"bold; fore:#000000; back:#FFF1A8; eolfilled", L"" }, + { {SCE_TOML_KEY}, IDS_LEX_STR_63348, L"Key", L"bold; fore:#5E608F", L"" }, + { {SCE_TOML_ASSIGNMENT}, IDS_LEX_STR_63233, L"Assignment", L"bold; fore:#FF2020", L"" }, + { {SCE_TOML_VALUE}, IDS_LEX_STR_63201, L"Value", L"fore:#202020", L"" }, + { {SCE_TOML_STRING}, IDS_LEX_STR_63131, L"String", L"italic; fore:#800000", L"" }, + { {SCE_TOML_PARSINGERROR}, IDS_LEX_STR_63252, L"Parsing Error", L"fore:#FFFF00; back:#A00000", L"" }, EDITLEXER_SENTINEL } };