mirror of
https://github.com/rizonesoft/Notepad3.git
synced 2026-06-11 21:03:05 +08:00
fix: hardening of JSON5 lexer
This commit is contained in:
parent
e24dc017cb
commit
45d8b4b15c
@ -31,6 +31,7 @@
|
||||
#include "DefaultLexer.h"
|
||||
|
||||
#include "CharSetX.h"
|
||||
#include "CharacterCategory.h"
|
||||
#include "SciXLexer.h"
|
||||
|
||||
using namespace Scintilla;
|
||||
@ -110,6 +111,70 @@ struct EscapeSequence {
|
||||
}
|
||||
};
|
||||
|
||||
// Decode a single UTF-8 code point at byte position `pos`.
|
||||
// Returns the code point and sets `outWidth` to its byte width (1-4).
|
||||
// On invalid UTF-8 (lone trail byte, truncated sequence), returns the raw
|
||||
// lead byte with width 1 so callers degrade gracefully.
|
||||
static int DecodeUTF8At(LexAccessor &styler, Sci_PositionU pos, int &outWidth) noexcept {
|
||||
const unsigned char b0 = static_cast<unsigned char>(styler.SafeGetCharAt(pos, '\0'));
|
||||
outWidth = 1;
|
||||
if (b0 < 0x80) {
|
||||
return b0;
|
||||
}
|
||||
if (b0 < 0xC2 || b0 > 0xF4) {
|
||||
return b0;
|
||||
}
|
||||
const unsigned char b1 = static_cast<unsigned char>(styler.SafeGetCharAt(pos + 1, '\0'));
|
||||
if ((b1 & 0xC0) != 0x80) {
|
||||
return b0;
|
||||
}
|
||||
if (b0 < 0xE0) {
|
||||
outWidth = 2;
|
||||
return ((b0 & 0x1F) << 6) | (b1 & 0x3F);
|
||||
}
|
||||
const unsigned char b2 = static_cast<unsigned char>(styler.SafeGetCharAt(pos + 2, '\0'));
|
||||
if ((b2 & 0xC0) != 0x80) {
|
||||
return b0;
|
||||
}
|
||||
if (b0 < 0xF0) {
|
||||
outWidth = 3;
|
||||
return ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
|
||||
}
|
||||
const unsigned char b3 = static_cast<unsigned char>(styler.SafeGetCharAt(pos + 3, '\0'));
|
||||
if ((b3 & 0xC0) != 0x80) {
|
||||
return b0;
|
||||
}
|
||||
outWidth = 4;
|
||||
return ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
|
||||
}
|
||||
|
||||
// JSON5 IdentifierStart per ECMA-262 / JSON5 §5.4:
|
||||
// $ | _ | UnicodeLetter (UAX #31 ID_Start).
|
||||
static bool IsJSON5IdStart(int cp) noexcept {
|
||||
if (cp < 0x80) {
|
||||
return (cp == '$') || (cp == '_')
|
||||
|| (cp >= 'A' && cp <= 'Z')
|
||||
|| (cp >= 'a' && cp <= 'z');
|
||||
}
|
||||
return IsIdStart(cp);
|
||||
}
|
||||
|
||||
// JSON5 IdentifierPart per ECMA-262 / JSON5 §5.4:
|
||||
// IdentifierStart | UnicodeCombiningMark | UnicodeDigit
|
||||
// | UnicodeConnectorPunctuation | ZWNJ (U+200C) | ZWJ (U+200D).
|
||||
static bool IsJSON5IdContinue(int cp) noexcept {
|
||||
if (cp < 0x80) {
|
||||
return (cp == '$') || (cp == '_')
|
||||
|| (cp >= '0' && cp <= '9')
|
||||
|| (cp >= 'A' && cp <= 'Z')
|
||||
|| (cp >= 'a' && cp <= 'z');
|
||||
}
|
||||
if (cp == 0x200C || cp == 0x200D) {
|
||||
return true;
|
||||
}
|
||||
return IsIdContinue(cp);
|
||||
}
|
||||
|
||||
struct OptionsJSON5 {
|
||||
bool foldCompact;
|
||||
bool fold;
|
||||
@ -118,8 +183,9 @@ struct OptionsJSON5 {
|
||||
OptionsJSON5() {
|
||||
foldCompact = false;
|
||||
fold = false;
|
||||
allowComments = false;
|
||||
escapeSequence = false;
|
||||
// JSON5 spec mandates comments and ECMAScript-style escape sequences.
|
||||
allowComments = true;
|
||||
escapeSequence = true;
|
||||
}
|
||||
};
|
||||
|
||||
@ -167,34 +233,83 @@ class LexerJSON5 : public DefaultLexer {
|
||||
/**
|
||||
* Looks for the colon following the end quote
|
||||
*
|
||||
* Assumes property names of lengths no longer than a 120 characters.
|
||||
* Assumes property names of lengths no longer than 120 code points.
|
||||
* The colon is also expected to be less than 50 spaces after the end
|
||||
* quote for the string to be considered a property name
|
||||
* quote for the string to be considered a property name.
|
||||
*/
|
||||
static constexpr bool IsPropChar(int ch) noexcept {
|
||||
// JSON5 / ECMAScript IdentifierStart + IdentifierPart (ASCII subset)
|
||||
return IsAlphaNumeric(ch) || ch == '$' || ch == '_';
|
||||
|
||||
// Match a JSON5 reserved literal at `start`. Returns the literal length
|
||||
// (including any leading sign for Infinity/NaN) or 0 if no match.
|
||||
// On match, sets `style` to SCE_JSON5_NUMBER (Infinity/NaN) or
|
||||
// SCE_JSON5_KEYWORD (true/false/null).
|
||||
static int MatchJSON5Literal(LexAccessor &styler, Sci_PositionU start, int &style) noexcept {
|
||||
struct Lit { const char *name; int style; bool allowSign; };
|
||||
static constexpr Lit literals[] = {
|
||||
{ "Infinity", SCE_JSON5_NUMBER, true },
|
||||
{ "NaN", SCE_JSON5_NUMBER, true },
|
||||
{ "true", SCE_JSON5_KEYWORD, false },
|
||||
{ "false", SCE_JSON5_KEYWORD, false },
|
||||
{ "null", SCE_JSON5_KEYWORD, false },
|
||||
};
|
||||
const char first = styler.SafeGetCharAt(start, '\0');
|
||||
const int signLen = (first == '+' || first == '-') ? 1 : 0;
|
||||
for (const auto &lit : literals) {
|
||||
if (signLen && !lit.allowSign) {
|
||||
continue;
|
||||
}
|
||||
int n = 0;
|
||||
while (lit.name[n] != '\0') {
|
||||
if (styler.SafeGetCharAt(start + signLen + n, '\0') != lit.name[n]) {
|
||||
n = -1;
|
||||
break;
|
||||
}
|
||||
++n;
|
||||
}
|
||||
if (n <= 0) {
|
||||
continue;
|
||||
}
|
||||
int afterWidth = 0;
|
||||
const int after = DecodeUTF8At(styler, start + signLen + n, afterWidth);
|
||||
if (IsJSON5IdContinue(after)) {
|
||||
continue;
|
||||
}
|
||||
style = lit.style;
|
||||
return signLen + n;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Decides whether the text beginning at `start` is a property name.
 *
 * The code point at `start` is assumed to have been validated by the caller
 * and is skipped; the scan then inspects at most 120 further code points,
 * advancing one whole UTF-8 code point per step so that identifier checks
 * are applied to code points rather than to raw lead/trail bytes.
 *
 * @param styler   accessor used to read the document bytes
 * @param start    byte position of the first character of the candidate
 * @param bQuoted  true when the candidate lies inside a quoted string
 * @return true when the candidate is terminated in a way that makes it a
 *         property name:
 *           - unquoted: a ':' directly after the name, or whitespace for
 *             which IsNextNonWhitespace(..., ':') succeeds;
 *           - quoted: a '"' or '\'' for which IsNextNonWhitespace(..., ':')
 *             succeeds.
 *         false on a NUL / unreadable byte, on an unquoted code point that
 *         is not a JSON5 IdentifierPart, or when the scan limit is reached.
 */
static bool AtPropertyName(LexAccessor &styler, const Sci_PositionU start, bool bQuoted) {
    // The first code point may occupy several bytes (e.g. Cyrillic or CJK
    // identifier starts). Skip its full width so that the scan never begins
    // on a UTF-8 trail byte, which would be misread as a separate character.
    int firstWidth = 0;
    DecodeUTF8At(styler, start, firstWidth);
    Sci_PositionU i = static_cast<Sci_PositionU>(firstWidth);
    bool escaped = false;
    for (int iter = 0; iter < 120; ++iter) {
        int width = 0;
        const int cp = DecodeUTF8At(styler, start + i, width);
        if (cp == 0) {
            return false;
        }
        if (escaped) {
            // The code point following a backslash is consumed uninspected.
            escaped = false;
            i += width;
            continue;
        }
        escaped = (cp == '\\');
        if (cp == ':' && !bQuoted) {
            return true;
        }
        if ((cp == '"' || cp == '\'') && bQuoted) {
            return IsNextNonWhitespace(styler, start + i, ':');
        }
        // isspacechar() is only consulted for ASCII code points.
        if (cp < 0x80 && isspacechar(cp) && !bQuoted) {
            return IsNextNonWhitespace(styler, start + i, ':');
        }
        if (!bQuoted && !IsJSON5IdContinue(cp)) {
            return false;
        }
        i += width;
    }
    return false;
}
|
||||
@ -266,7 +381,8 @@ class LexerJSON5 : public DefaultLexer {
|
||||
LexerJSON5() :
|
||||
DefaultLexer("json5", SCLEX_JSON5),
|
||||
setOperators(CharacterSet::setNone, "[{}]:,"),
|
||||
setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
|
||||
// RFC 3986 unreserved + gen-delims + sub-delims (URI characters)
|
||||
setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,;="),
|
||||
setKeywordJSON5_LD(CharacterSet::setAlpha, ":@"),
|
||||
setKeywordJSON5(CharacterSet::setAlpha, "$_+-") {
|
||||
}
|
||||
@ -466,7 +582,7 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos,
|
||||
context.SetState(SCE_JSON5_LDKEYWORD);
|
||||
}
|
||||
}
|
||||
else if (IsPropChar(context.ch)) {
|
||||
else if (IsJSON5IdContinue(context.ch)) {
|
||||
if (!AtPropertyName(styler, context.currentPos, (doubleQuotCntx || singleQuotCntx))) {
|
||||
if (context.state == SCE_JSON5_PROPERTYNAME) {
|
||||
context.SetState(SCE_JSON5_ERROR);
|
||||
@ -531,7 +647,15 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos,
|
||||
context.SetState(SCE_JSON5_PROPERTYNAME);
|
||||
}
|
||||
} else if (setKeywordJSON5.Contains(context.ch)) {
|
||||
if (IsNextWordInList(keywordsJSON5, setKeywordJSON5, context, styler)) {
|
||||
// Hardcoded JSON5 reserved literals (true, false, null, Infinity, NaN, +/-Infinity, +/-NaN)
|
||||
int litStyle = 0;
|
||||
const int litLen = MatchJSON5Literal(styler, context.currentPos, litStyle);
|
||||
if (litLen > 0) {
|
||||
context.SetState(litStyle);
|
||||
for (int k = 1; k < litLen; ++k) {
|
||||
context.Forward();
|
||||
}
|
||||
} else if (IsNextWordInList(keywordsJSON5, setKeywordJSON5, context, styler)) {
|
||||
context.SetState(SCE_JSON5_KEYWORD);
|
||||
}
|
||||
} else if (setOperators.Contains(context.ch)) {
|
||||
@ -547,7 +671,7 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos,
|
||||
context.SetState(SCE_JSON5_NUMBER);
|
||||
}
|
||||
else if (context.state == SCE_JSON5_DEFAULT) {
|
||||
if (IsPropChar(context.ch)) {
|
||||
if (IsJSON5IdStart(context.ch)) {
|
||||
if (AtPropertyName(styler, context.currentPos, (doubleQuotCntx || singleQuotCntx))) {
|
||||
context.SetState(SCE_JSON5_PROPERTYNAME);
|
||||
}
|
||||
@ -577,17 +701,26 @@ void SCI_METHOD LexerJSON5::Fold(Sci_PositionU startPos,
|
||||
currLevel = styler.LevelAt(currLine - 1) >> 16;
|
||||
int nextLevel = currLevel;
|
||||
int visibleChars = 0;
|
||||
int prevStyle = (startPos > 0) ? styler.StyleAt(startPos - 1) : SCE_JSON5_DEFAULT;
|
||||
for (Sci_PositionU i = startPos; i < endPos; i++) {
|
||||
char curr = styler.SafeGetCharAt(i);
|
||||
char next = styler.SafeGetCharAt(i+1);
|
||||
bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
|
||||
if (styler.StyleAt(i) == SCE_JSON5_OPERATOR) {
|
||||
const int style = styler.StyleAt(i);
|
||||
if (style == SCE_JSON5_OPERATOR) {
|
||||
if (curr == '{' || curr == '[') {
|
||||
nextLevel++;
|
||||
} else if (curr == '}' || curr == ']') {
|
||||
nextLevel--;
|
||||
}
|
||||
}
|
||||
// Fold /* ... */ block comments. Single-line block comments self-cancel
|
||||
// (enter + exit on the same line), so they never produce a fold.
|
||||
if (style == SCE_JSON5_BLOCKCOMMENT && prevStyle != SCE_JSON5_BLOCKCOMMENT) {
|
||||
nextLevel++;
|
||||
} else if (prevStyle == SCE_JSON5_BLOCKCOMMENT && style != SCE_JSON5_BLOCKCOMMENT) {
|
||||
nextLevel--;
|
||||
}
|
||||
if (atEOL || i == (endPos-1)) {
|
||||
int level = currLevel | nextLevel << 16;
|
||||
if (!visibleChars && options.foldCompact) {
|
||||
@ -605,6 +738,7 @@ void SCI_METHOD LexerJSON5::Fold(Sci_PositionU startPos,
|
||||
if (!isspacechar(curr)) {
|
||||
visibleChars++;
|
||||
}
|
||||
prevStyle = style;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -167,6 +167,83 @@ single quotes too',
|
||||
l4LS: "before\
continued",
|
||||
l4PS: "before\
continued",
|
||||
|
||||
// =====================================================================
|
||||
// BELOW: extensions covering the strict-ID / hardcoded-literal /
|
||||
// signed-NaN / setURL-typo / block-comment-folding lexer changes.
|
||||
// =====================================================================
|
||||
|
||||
// ---------- Hardcoded JSON5 reserved literals: signed NaN ----------
|
||||
// Pre-fix the word-list path handled +Infinity / -Infinity but missed
|
||||
// +NaN / -NaN (those tokens were never in the keyword list). After the
|
||||
// hardcoded-literal change both style as NUMBER.
|
||||
posNaN: +NaN,
|
||||
negNaN: -NaN,
|
||||
|
||||
// ---------- Literal-boundary check ----------
|
||||
// true / false / null / Infinity / NaN are reserved ONLY when the next
|
||||
// character is NOT an identifier-continue char. These keys start with
|
||||
// literal text but continue with more identifier characters, so they
|
||||
// MUST style as PROPERTYNAME (not KEYWORD / NUMBER).
|
||||
nullable: 'starts with literal "null"',
|
||||
truer: 'starts with literal "true"',
|
||||
falsehood: 'starts with literal "false"',
|
||||
Infinityx: 'starts with literal "Infinity"',
|
||||
NaNny: 'starts with literal "NaN"',
|
||||
|
||||
// ---------- Strict Unicode IdentifierName: accepted (UAX #31 ID_Start) ----------
|
||||
// Pre-strict accepted any byte >= 0x80 indiscriminately; post-strict
|
||||
// requires the actual code point to be a Unicode Letter / $ / _.
|
||||
café: 'Latin Extended (precomposed)',
|
||||
naïve: 'Latin Extended with diaeresis',
|
||||
Σmega: 'Greek capital sigma + ASCII',
|
||||
λambda: 'Greek small lambda + ASCII',
|
||||
имя: 'Cyrillic letters (Russian "name")',
|
||||
名前: 'CJK Han ideographs (Japanese "name")',
|
||||
中文: 'CJK Han ideographs (Chinese)',
|
||||
|
||||
// ---------- IdentifierPart: ZWNJ / ZWJ inside identifier (invisible chars) ----------
|
||||
// ZWNJ (U+200C) and ZWJ (U+200D) are valid IdentifierPart per ECMA-262.
|
||||
// The bytes are present in the source even though they don't render --
|
||||
// inspect with a hex viewer; the unquoted keys below must lex as
|
||||
// PROPERTYNAME, not break at the joiner.
|
||||
xy_zwnj: 'unquoted key with ZWNJ between x and y',
|
||||
xy_zwj: 'unquoted key with ZWJ between x and y',
|
||||
|
||||
// ---------- Strict Unicode rejection: should style as PARSING ERROR ----------
|
||||
// Each leading character is NOT a Unicode Letter, so not ID_Start.
|
||||
// Pre-strict accepted these (over-accepting all bytes >= 0x80);
|
||||
// post-strict flags them as ERROR.
|
||||
|
||||
// Emoji 😀 (U+1F600, category So) -- ERROR:
|
||||
😀: 'leading emoji must be ERROR',
|
||||
|
||||
// Math symbol ∞ (U+221E, category Sm) -- ERROR:
|
||||
∞: 'leading math symbol must be ERROR',
|
||||
|
||||
// Currency € (U+20AC, category Sc) -- JSON5 allows ONLY ASCII '$':
|
||||
€: 'leading currency symbol must be ERROR',
|
||||
|
||||
// ---------- URI charset typo fix: ';' inside URL ----------
|
||||
// Pre-fix setURL was "...,)," (duplicated ')' with ';' missing), so the
|
||||
// URI style terminated at the first ';'. Post-fix ';' is part of the
|
||||
// RFC-3986 sub-delim set and the whole URL highlights as URI through
|
||||
// the matrix parameters.
|
||||
urlSemicolon: "http://example.com/path;jsessionid=abc;v=2",
|
||||
|
||||
// ---------- Folding: multi-line block comment ----------
|
||||
// Fold() now treats a multi-line /* ... */ as a fold region (HEADERFLAG
|
||||
// on the opening line). The block below should be foldable; the
|
||||
// single-line /* ... */ further down should NOT produce a fold (its
|
||||
// enter+exit cancel within the same line).
|
||||
|
||||
/* This block comment
|
||||
spans three lines
|
||||
and should produce a fold marker on the opening line. */
|
||||
foldDemo: 'after the foldable block comment',
|
||||
|
||||
/* single-line comment -- should NOT produce a fold marker */
|
||||
noFold: 'after the non-foldable comment',
|
||||
|
||||
// ---------- Trailing item to verify the closing brace styling ----------
|
||||
last: 'end-of-test',
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user