From 45d8b4b15c85581ed87de0d287c2e0a3d99bd6a2 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Mon, 11 May 2026 14:23:32 +0200 Subject: [PATCH] fix: hardening of JSON5 lexer --- lexilla/lexers_x/LexJSON5.cxx | 188 +++++++++++++++--- .../styleLexJSON/Sample_JSON5_extended.json | 77 +++++++ 2 files changed, 238 insertions(+), 27 deletions(-) diff --git a/lexilla/lexers_x/LexJSON5.cxx b/lexilla/lexers_x/LexJSON5.cxx index 6ad059f38..160c75715 100644 --- a/lexilla/lexers_x/LexJSON5.cxx +++ b/lexilla/lexers_x/LexJSON5.cxx @@ -31,6 +31,7 @@ #include "DefaultLexer.h" #include "CharSetX.h" +#include "CharacterCategory.h" #include "SciXLexer.h" using namespace Scintilla; @@ -110,6 +111,70 @@ struct EscapeSequence { } }; +// Decode a single UTF-8 code point at byte position `pos`. +// Returns the code point and sets `outWidth` to its byte width (1-4). +// On invalid UTF-8 (lone trail byte, truncated sequence), returns the raw +// lead byte with width 1 so callers degrade gracefully. +static int DecodeUTF8At(LexAccessor &styler, Sci_PositionU pos, int &outWidth) noexcept { + const unsigned char b0 = static_cast(styler.SafeGetCharAt(pos, '\0')); + outWidth = 1; + if (b0 < 0x80) { + return b0; + } + if (b0 < 0xC2 || b0 > 0xF4) { + return b0; + } + const unsigned char b1 = static_cast(styler.SafeGetCharAt(pos + 1, '\0')); + if ((b1 & 0xC0) != 0x80) { + return b0; + } + if (b0 < 0xE0) { + outWidth = 2; + return ((b0 & 0x1F) << 6) | (b1 & 0x3F); + } + const unsigned char b2 = static_cast(styler.SafeGetCharAt(pos + 2, '\0')); + if ((b2 & 0xC0) != 0x80) { + return b0; + } + if (b0 < 0xF0) { + outWidth = 3; + return ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); + } + const unsigned char b3 = static_cast(styler.SafeGetCharAt(pos + 3, '\0')); + if ((b3 & 0xC0) != 0x80) { + return b0; + } + outWidth = 4; + return ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); +} + +// JSON5 IdentifierStart per ECMA-262 / JSON5 §5.4: +// $ | _ | UnicodeLetter (UAX #31 ID_Start). +static bool IsJSON5IdStart(int cp) noexcept { + if (cp < 0x80) { + return (cp == '$') || (cp == '_') + || (cp >= 'A' && cp <= 'Z') + || (cp >= 'a' && cp <= 'z'); + } + return IsIdStart(cp); +} + +// JSON5 IdentifierPart per ECMA-262 / JSON5 §5.4: +// IdentifierStart | UnicodeCombiningMark | UnicodeDigit +// | UnicodeConnectorPunctuation | ZWNJ (U+200C) | ZWJ (U+200D). +static bool IsJSON5IdContinue(int cp) noexcept { + if (cp < 0x80) { + return (cp == '$') || (cp == '_') + || (cp >= '0' && cp <= '9') + || (cp >= 'A' && cp <= 'Z') + || (cp >= 'a' && cp <= 'z'); + } + if (cp == 0x200C || cp == 0x200D) { + return true; + } + return IsIdContinue(cp); +} + struct OptionsJSON5 { bool foldCompact; bool fold; @@ -118,8 +183,9 @@ struct OptionsJSON5 { OptionsJSON5() { foldCompact = false; fold = false; - allowComments = false; - escapeSequence = false; + // JSON5 spec mandates comments and ECMAScript-style escape sequences. + allowComments = true; + escapeSequence = true; } }; @@ -167,34 +233,83 @@ class LexerJSON5 : public DefaultLexer { /** * Looks for the colon following the end quote * - * Assumes property names of lengths no longer than a 120 characters. + * Assumes property names of lengths no longer than 120 code points. * The colon is also expected to be less than 50 spaces after the end - * quote for the string to be considered a property name + * quote for the string to be considered a property name. */ - static constexpr bool IsPropChar(int ch) noexcept { - // JSON5 / ECMAScript IdentifierStart + IdentifierPart (ASCII subset) - return IsAlphaNumeric(ch) || ch == '$' || ch == '_'; + + // Match a JSON5 reserved literal at `start`. Returns the literal length + // (including any leading sign for Infinity/NaN) or 0 if no match. + // On match, sets `style` to SCE_JSON5_NUMBER (Infinity/NaN) or + // SCE_JSON5_KEYWORD (true/false/null). + static int MatchJSON5Literal(LexAccessor &styler, Sci_PositionU start, int &style) noexcept { + struct Lit { const char *name; int style; bool allowSign; }; + static constexpr Lit literals[] = { + { "Infinity", SCE_JSON5_NUMBER, true }, + { "NaN", SCE_JSON5_NUMBER, true }, + { "true", SCE_JSON5_KEYWORD, false }, + { "false", SCE_JSON5_KEYWORD, false }, + { "null", SCE_JSON5_KEYWORD, false }, + }; + const char first = styler.SafeGetCharAt(start, '\0'); + const int signLen = (first == '+' || first == '-') ? 1 : 0; + for (const auto &lit : literals) { + if (signLen && !lit.allowSign) { + continue; + } + int n = 0; + while (lit.name[n] != '\0') { + if (styler.SafeGetCharAt(start + signLen + n, '\0') != lit.name[n]) { + n = -1; + break; + } + ++n; + } + if (n <= 0) { + continue; + } + int afterWidth = 0; + const int after = DecodeUTF8At(styler, start + signLen + n, afterWidth); + if (IsJSON5IdContinue(after)) { + continue; + } + style = lit.style; + return signLen + n; + } + return 0; } static bool AtPropertyName(LexAccessor &styler, const Sci_PositionU start, bool bQuoted) { - Sci_PositionU i = 0; + // Walk forward one code point per iteration (rather than one byte) so + // strict IsJSON5IdContinue is applied to actual code points, not raw + // UTF-8 lead/trail bytes. + Sci_PositionU i = 1; // start+0 is the identifier-start char (caller validated) bool escaped = false; - while (++i < 120) { - char curr = styler.SafeGetCharAt(start+i, '\0'); - if (escaped) { - escaped = false; - continue; - } - escaped = (curr == '\\'); - if (curr == ':' && !bQuoted) { - return true; - } else if ((curr == '"' || curr == '\'') && bQuoted) { - return IsNextNonWhitespace(styler, start + i, ':'); - } else if (isspacechar(curr) && !bQuoted) { - return IsNextNonWhitespace(styler, start + i, ':'); - } if (!curr || (!bQuoted && !IsPropChar(curr))) { + for (int iter = 0; iter < 120; ++iter) { + int width = 0; + const int cp = DecodeUTF8At(styler, start + i, width); + if (cp == 0) { return false; } + if (escaped) { + escaped = false; + i += width; + continue; + } + escaped = (cp == '\\'); + if (cp == ':' && !bQuoted) { + return true; + } + if ((cp == '"' || cp == '\'') && bQuoted) { + return IsNextNonWhitespace(styler, start + i, ':'); + } + if (cp < 0x80 && isspacechar(cp) && !bQuoted) { + return IsNextNonWhitespace(styler, start + i, ':'); + } + if (!bQuoted && !IsJSON5IdContinue(cp)) { + return false; + } + i += width; } return false; } @@ -266,7 +381,8 @@ class LexerJSON5 : public DefaultLexer { LexerJSON5() : DefaultLexer("json5", SCLEX_JSON5), setOperators(CharacterSet::setNone, "[{}]:,"), - setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="), + // RFC 3986 unreserved + gen-delims + sub-delims (URI characters) + setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,;="), setKeywordJSON5_LD(CharacterSet::setAlpha, ":@"), setKeywordJSON5(CharacterSet::setAlpha, "$_+-") { } @@ -466,7 +582,7 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos, context.SetState(SCE_JSON5_LDKEYWORD); } } - else if (IsPropChar(context.ch)) { + else if (IsJSON5IdContinue(context.ch)) { if (!AtPropertyName(styler, context.currentPos, (doubleQuotCntx || singleQuotCntx))) { if (context.state == SCE_JSON5_PROPERTYNAME) { context.SetState(SCE_JSON5_ERROR); @@ -531,7 +647,15 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos, context.SetState(SCE_JSON5_PROPERTYNAME); } } else if (setKeywordJSON5.Contains(context.ch)) { - if (IsNextWordInList(keywordsJSON5, setKeywordJSON5, context, styler)) { + // Hardcoded JSON5 reserved literals (true, false, null, Infinity, NaN, +/-Infinity, +/-NaN) + int litStyle = 0; + const int litLen = MatchJSON5Literal(styler, context.currentPos, litStyle); + if (litLen > 0) { + context.SetState(litStyle); + for (int k = 1; k < litLen; ++k) { + context.Forward(); + } + } else if (IsNextWordInList(keywordsJSON5, setKeywordJSON5, context, styler)) { context.SetState(SCE_JSON5_KEYWORD); } } else if (setOperators.Contains(context.ch)) { @@ -547,7 +671,7 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos, context.SetState(SCE_JSON5_NUMBER); } else if (context.state == SCE_JSON5_DEFAULT) { - if (IsPropChar(context.ch)) { + if (IsJSON5IdStart(context.ch)) { if (AtPropertyName(styler, context.currentPos, (doubleQuotCntx || singleQuotCntx))) { context.SetState(SCE_JSON5_PROPERTYNAME); } @@ -577,17 +701,26 @@ void SCI_METHOD LexerJSON5::Fold(Sci_PositionU startPos, currLevel = styler.LevelAt(currLine - 1) >> 16; int nextLevel = currLevel; int visibleChars = 0; + int prevStyle = (startPos > 0) ? styler.StyleAt(startPos - 1) : SCE_JSON5_DEFAULT; for (Sci_PositionU i = startPos; i < endPos; i++) { char curr = styler.SafeGetCharAt(i); char next = styler.SafeGetCharAt(i+1); bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n'); - if (styler.StyleAt(i) == SCE_JSON5_OPERATOR) { + const int style = styler.StyleAt(i); + if (style == SCE_JSON5_OPERATOR) { if (curr == '{' || curr == '[') { nextLevel++; } else if (curr == '}' || curr == ']') { nextLevel--; } } + // Fold /* ... */ block comments. Single-line block comments self-cancel + // (enter + exit on the same line), so they never produce a fold. + if (style == SCE_JSON5_BLOCKCOMMENT && prevStyle != SCE_JSON5_BLOCKCOMMENT) { + nextLevel++; + } else if (prevStyle == SCE_JSON5_BLOCKCOMMENT && style != SCE_JSON5_BLOCKCOMMENT) { + nextLevel--; + } if (atEOL || i == (endPos-1)) { int level = currLevel | nextLevel << 16; if (!visibleChars && options.foldCompact) { @@ -605,6 +738,7 @@ void SCI_METHOD LexerJSON5::Fold(Sci_PositionU startPos, if (!isspacechar(curr)) { visibleChars++; } + prevStyle = style; } } diff --git a/test/test_files/StyleLexers/styleLexJSON/Sample_JSON5_extended.json b/test/test_files/StyleLexers/styleLexJSON/Sample_JSON5_extended.json index 4fa3babfa..a61772bde 100644 --- a/test/test_files/StyleLexers/styleLexJSON/Sample_JSON5_extended.json +++ b/test/test_files/StyleLexers/styleLexJSON/Sample_JSON5_extended.json @@ -167,6 +167,83 @@ single quotes too', l4LS: "before\
continued", l4PS: "before\
continued", + // ===================================================================== + // BELOW: extensions covering the strict-ID / hardcoded-literal / + // signed-NaN / setURL-typo / block-comment-folding lexer changes. + // ===================================================================== + + // ---------- Hardcoded JSON5 reserved literals: signed NaN ---------- + // Pre-fix the word-list path handled +Infinity / -Infinity but missed + // +NaN / -NaN (those tokens were never in the keyword list). After the + // hardcoded-literal change both style as NUMBER. + posNaN: +NaN, + negNaN: -NaN, + + // ---------- Literal-boundary check ---------- + // true / false / null / Infinity / NaN are reserved ONLY when the next + // character is NOT an identifier-continue char. These keys start with + // literal text but continue with more identifier characters, so they + // MUST style as PROPERTYNAME (not KEYWORD / NUMBER). + nullable: 'starts with literal "null"', + truer: 'starts with literal "true"', + falsehood: 'starts with literal "false"', + Infinityx: 'starts with literal "Infinity"', + NaNny: 'starts with literal "NaN"', + + // ---------- Strict Unicode IdentifierName: accepted (UAX #31 ID_Start) ---------- + // Pre-strict accepted any byte >= 0x80 indiscriminately; post-strict + // requires the actual code point to be a Unicode Letter / $ / _. + café: 'Latin Extended (precomposed)', + naïve: 'Latin Extended with diaeresis', + Σmega: 'Greek capital sigma + ASCII', + λambda: 'Greek small lambda + ASCII', + имя: 'Cyrillic letters (Russian "name")', + 名前: 'CJK Han ideographs (Japanese "name")', + 中文: 'CJK Han ideographs (Chinese)', + + // ---------- IdentifierPart: ZWNJ / ZWJ inside identifier (invisible chars) ---------- + // ZWNJ (U+200C) and ZWJ (U+200D) are valid IdentifierPart per ECMA-262. + // The bytes are present in the source even though they don't render -- + // inspect with a hex viewer; the unquoted keys below must lex as + // PROPERTYNAME, not break at the joiner. + x‌y_zwnj: 'unquoted key with ZWNJ between x and y', + x‍y_zwj: 'unquoted key with ZWJ between x and y', + + // ---------- Strict Unicode rejection: should style as PARSING ERROR ---------- + // Each leading character is NOT a Unicode Letter, so not ID_Start. + // Pre-strict accepted these (over-accepting all bytes >= 0x80); + // post-strict flags them as ERROR. + + // Emoji 😀 (U+1F600, category So) -- ERROR: + 😀: 'leading emoji must be ERROR', + + // Math symbol ∞ (U+221E, category Sm) -- ERROR: + ∞: 'leading math symbol must be ERROR', + + // Currency € (U+20AC, category Sc) -- JSON5 allows ONLY ASCII '$': + €: 'leading currency symbol must be ERROR', + + // ---------- URI charset typo fix: ';' inside URL ---------- + // Pre-fix setURL was "...,)," (duplicated ')' with ';' missing), so the + // URI style terminated at the first ';'. Post-fix ';' is part of the + // RFC-3986 sub-delim set and the whole URL highlights as URI through + // the matrix parameters. + urlSemicolon: "http://example.com/path;jsessionid=abc;v=2", + + // ---------- Folding: multi-line block comment ---------- + // Fold() now treats a multi-line /* ... */ as a fold region (HEADERFLAG + // on the opening line). The block below should be foldable; the + // single-line /* ... */ further down should NOT produce a fold (its + // enter+exit cancel within the same line). + + /* This block comment + spans three lines + and should produce a fold marker on the opening line. */ + foldDemo: 'after the foldable block comment', + + /* single-line comment -- should NOT produce a fold marker */ + noFold: 'after the non-foldable comment', + // ---------- Trailing item to verify the closing brace styling ---------- last: 'end-of-test', }