fix: hardening of JSON5 lexer

2026-06-11 21:03:05 +08:00 · 2026-05-11 14:23:32 +02:00 · 2026-05-11 14:23:32 +02:00 · 45d8b4b15c
commit 45d8b4b15c
parent e24dc017cb
2 changed files with 238 additions and 27 deletions
--- a/lexilla/lexers_x/LexJSON5.cxx
+++ b/lexilla/lexers_x/LexJSON5.cxx
@ -31,6 +31,7 @@
 #include "DefaultLexer.h"

 #include "CharSetX.h"
+#include "CharacterCategory.h"
 #include "SciXLexer.h"

 using namespace Scintilla;
@ -110,6 +111,70 @@ struct EscapeSequence {
 	}
 };

+// Decode a single UTF-8 code point at byte position `pos`.
+// Returns the code point and sets `outWidth` to its byte width (1-4).
+// On invalid UTF-8 (lone trail byte, truncated sequence), returns the raw
+// lead byte with width 1 so callers degrade gracefully.
+static int DecodeUTF8At(LexAccessor &styler, Sci_PositionU pos, int &outWidth) noexcept {
+	const unsigned char b0 = static_cast<unsigned char>(styler.SafeGetCharAt(pos, '\0'));
+	outWidth = 1;
+	if (b0 < 0x80) {
+		return b0;
+	}
+	if (b0 < 0xC2 || b0 > 0xF4) {
+		return b0;
+	}
+	const unsigned char b1 = static_cast<unsigned char>(styler.SafeGetCharAt(pos + 1, '\0'));
+	if ((b1 & 0xC0) != 0x80) {
+		return b0;
+	}
+	if (b0 < 0xE0) {
+		outWidth = 2;
+		return ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+	}
+	const unsigned char b2 = static_cast<unsigned char>(styler.SafeGetCharAt(pos + 2, '\0'));
+	if ((b2 & 0xC0) != 0x80) {
+		return b0;
+	}
+	if (b0 < 0xF0) {
+		outWidth = 3;
+		return ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+	}
+	const unsigned char b3 = static_cast<unsigned char>(styler.SafeGetCharAt(pos + 3, '\0'));
+	if ((b3 & 0xC0) != 0x80) {
+		return b0;
+	}
+	outWidth = 4;
+	return ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+}
+
+// JSON5 IdentifierStart per ECMA-262 / JSON5 §5.4:
+//   $ | _ | UnicodeLetter (UAX #31 ID_Start).
+static bool IsJSON5IdStart(int cp) noexcept {
+	if (cp < 0x80) {
+		return (cp == '$') || (cp == '_')
+			|| (cp >= 'A' && cp <= 'Z')
+			|| (cp >= 'a' && cp <= 'z');
+	}
+	return IsIdStart(cp);
+}
+
+// JSON5 IdentifierPart per ECMA-262 / JSON5 §5.4:
+//   IdentifierStart | UnicodeCombiningMark | UnicodeDigit
+//   | UnicodeConnectorPunctuation | ZWNJ (U+200C) | ZWJ (U+200D).
+static bool IsJSON5IdContinue(int cp) noexcept {
+	if (cp < 0x80) {
+		return (cp == '$') || (cp == '_')
+			|| (cp >= '0' && cp <= '9')
+			|| (cp >= 'A' && cp <= 'Z')
+			|| (cp >= 'a' && cp <= 'z');
+	}
+	if (cp == 0x200C || cp == 0x200D) {
+		return true;
+	}
+	return IsIdContinue(cp);
+}
+
 struct OptionsJSON5 {
 	bool foldCompact;
 	bool fold;
@ -118,8 +183,9 @@ struct OptionsJSON5 {
 	OptionsJSON5() {
 		foldCompact = false;
 		fold = false;
-		allowComments = false;
-		escapeSequence = false;
+		// JSON5 spec mandates comments and ECMAScript-style escape sequences.
+		allowComments = true;
+		escapeSequence = true;
 	}
 };

@ -167,34 +233,83 @@ class LexerJSON5 : public DefaultLexer {
 	/**
 	 * Looks for the colon following the end quote
 	 *
-	 * Assumes property names of lengths no longer than a 120 characters.
+	 * Assumes property names of lengths no longer than 120 code points.
 	 * The colon is also expected to be less than 50 spaces after the end
-	 * quote for the string to be considered a property name
+	 * quote for the string to be considered a property name.
 	 */
-	static constexpr bool IsPropChar(int ch) noexcept {
-		// JSON5 / ECMAScript IdentifierStart + IdentifierPart (ASCII subset)
-		return IsAlphaNumeric(ch) || ch == '$' || ch == '_';
+
+	// Match a JSON5 reserved literal at `start`. Returns the literal length
+	// (including any leading sign for Infinity/NaN) or 0 if no match.
+	// On match, sets `style` to SCE_JSON5_NUMBER (Infinity/NaN) or
+	// SCE_JSON5_KEYWORD (true/false/null).
+	static int MatchJSON5Literal(LexAccessor &styler, Sci_PositionU start, int &style) noexcept {
+		struct Lit { const char *name; int style; bool allowSign; };
+		static constexpr Lit literals[] = {
+			{ "Infinity", SCE_JSON5_NUMBER,  true  },
+			{ "NaN",      SCE_JSON5_NUMBER,  true  },
+			{ "true",     SCE_JSON5_KEYWORD, false },
+			{ "false",    SCE_JSON5_KEYWORD, false },
+			{ "null",     SCE_JSON5_KEYWORD, false },
+		};
+		const char first = styler.SafeGetCharAt(start, '\0');
+		const int signLen = (first == '+' || first == '-') ? 1 : 0;
+		for (const auto &lit : literals) {
+			if (signLen && !lit.allowSign) {
+				continue;
+			}
+			int n = 0;
+			while (lit.name[n] != '\0') {
+				if (styler.SafeGetCharAt(start + signLen + n, '\0') != lit.name[n]) {
+					n = -1;
+					break;
+				}
+				++n;
+			}
+			if (n <= 0) {
+				continue;
+			}
+			int afterWidth = 0;
+			const int after = DecodeUTF8At(styler, start + signLen + n, afterWidth);
+			if (IsJSON5IdContinue(after)) {
+				continue;
+			}
+			style = lit.style;
+			return signLen + n;
+		}
+		return 0;
 	}

 	static bool AtPropertyName(LexAccessor &styler, const Sci_PositionU start, bool bQuoted) {
-		Sci_PositionU i = 0;
+		// Walk forward one code point per iteration (rather than one byte) so
+		// strict IsJSON5IdContinue is applied to actual code points, not raw
+		// UTF-8 lead/trail bytes.
+		Sci_PositionU i = 1;       // start+0 is the identifier-start char (caller validated)
 		bool escaped = false;
-		while (++i < 120) {
-			char curr = styler.SafeGetCharAt(start+i, '\0');
-			if (escaped) {
-				escaped = false;
-				continue;
-			}
-			escaped = (curr == '\\');
-			if (curr == ':' && !bQuoted) {
-				return true;
-			} else if ((curr == '"' || curr == '\'') && bQuoted) {
-				return IsNextNonWhitespace(styler, start + i, ':');
-			} else if (isspacechar(curr) && !bQuoted) {
-				return IsNextNonWhitespace(styler, start + i, ':');
-			} if (!curr || (!bQuoted && !IsPropChar(curr))) {
+		for (int iter = 0; iter < 120; ++iter) {
+			int width = 0;
+			const int cp = DecodeUTF8At(styler, start + i, width);
+			if (cp == 0) {
 				return false;
 			}
+			if (escaped) {
+				escaped = false;
+				i += width;
+				continue;
+			}
+			escaped = (cp == '\\');
+			if (cp == ':' && !bQuoted) {
+				return true;
+			}
+			if ((cp == '"' || cp == '\'') && bQuoted) {
+				return IsNextNonWhitespace(styler, start + i, ':');
+			}
+			if (cp < 0x80 && isspacechar(cp) && !bQuoted) {
+				return IsNextNonWhitespace(styler, start + i, ':');
+			}
+			if (!bQuoted && !IsJSON5IdContinue(cp)) {
+				return false;
+			}
+			i += width;
 		}
 		return false;
 	}
@ -266,7 +381,8 @@ class LexerJSON5 : public DefaultLexer {
 	LexerJSON5() :
 		DefaultLexer("json5", SCLEX_JSON5),
 		setOperators(CharacterSet::setNone, "[{}]:,"),
-		setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
+		// RFC 3986 unreserved + gen-delims + sub-delims (URI characters)
+		setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,;="),
 		setKeywordJSON5_LD(CharacterSet::setAlpha, ":@"),
 		setKeywordJSON5(CharacterSet::setAlpha, "$_+-") {
 	}
@ -466,7 +582,7 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos,
 						context.SetState(SCE_JSON5_LDKEYWORD);
 					}
 				}
-				else if (IsPropChar(context.ch)) {
+				else if (IsJSON5IdContinue(context.ch)) {
 					if (!AtPropertyName(styler, context.currentPos, (doubleQuotCntx || singleQuotCntx))) {
 						if (context.state == SCE_JSON5_PROPERTYNAME) {
 							context.SetState(SCE_JSON5_ERROR);
@ -531,7 +647,15 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos,
 					context.SetState(SCE_JSON5_PROPERTYNAME);
 				}
 			} else if (setKeywordJSON5.Contains(context.ch)) {
-				if (IsNextWordInList(keywordsJSON5, setKeywordJSON5, context, styler)) {
+				// Hardcoded JSON5 reserved literals (true, false, null, Infinity, NaN, +/-Infinity, +/-NaN)
+				int litStyle = 0;
+				const int litLen = MatchJSON5Literal(styler, context.currentPos, litStyle);
+				if (litLen > 0) {
+					context.SetState(litStyle);
+					for (int k = 1; k < litLen; ++k) {
+						context.Forward();
+					}
+				} else if (IsNextWordInList(keywordsJSON5, setKeywordJSON5, context, styler)) {
 					context.SetState(SCE_JSON5_KEYWORD);
 				}
 			} else if (setOperators.Contains(context.ch)) {
@ -547,7 +671,7 @@ void SCI_METHOD LexerJSON5::Lex(Sci_PositionU startPos,
 				context.SetState(SCE_JSON5_NUMBER);
 			}
 			else if (context.state == SCE_JSON5_DEFAULT) {
-				if (IsPropChar(context.ch)) {
+				if (IsJSON5IdStart(context.ch)) {
 					if (AtPropertyName(styler, context.currentPos, (doubleQuotCntx || singleQuotCntx))) {
 						context.SetState(SCE_JSON5_PROPERTYNAME);
 					}
@ -577,17 +701,26 @@ void SCI_METHOD LexerJSON5::Fold(Sci_PositionU startPos,
 		currLevel = styler.LevelAt(currLine - 1) >> 16;
 	int nextLevel = currLevel;
 	int visibleChars = 0;
+	int prevStyle = (startPos > 0) ? styler.StyleAt(startPos - 1) : SCE_JSON5_DEFAULT;
 	for (Sci_PositionU i = startPos; i < endPos; i++) {
 		char curr = styler.SafeGetCharAt(i);
 		char next = styler.SafeGetCharAt(i+1);
 		bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
-		if (styler.StyleAt(i) == SCE_JSON5_OPERATOR) {
+		const int style = styler.StyleAt(i);
+		if (style == SCE_JSON5_OPERATOR) {
 			if (curr == '{' || curr == '[') {
 				nextLevel++;
 			} else if (curr == '}' || curr == ']') {
 				nextLevel--;
 			}
 		}
+		// Fold /* ... */ block comments. Single-line block comments self-cancel
+		// (enter + exit on the same line), so they never produce a fold.
+		if (style == SCE_JSON5_BLOCKCOMMENT && prevStyle != SCE_JSON5_BLOCKCOMMENT) {
+			nextLevel++;
+		} else if (prevStyle == SCE_JSON5_BLOCKCOMMENT && style != SCE_JSON5_BLOCKCOMMENT) {
+			nextLevel--;
+		}
 		if (atEOL || i == (endPos-1)) {
 			int level = currLevel | nextLevel << 16;
 			if (!visibleChars && options.foldCompact) {
@ -605,6 +738,7 @@ void SCI_METHOD LexerJSON5::Fold(Sci_PositionU startPos,
 		if (!isspacechar(curr)) {
 			visibleChars++;
 		}
+		prevStyle = style;
 	}
 }

--- a/test/test_files/StyleLexers/styleLexJSON/Sample_JSON5_extended.json
+++ b/test/test_files/StyleLexers/styleLexJSON/Sample_JSON5_extended.json
@ -167,6 +167,83 @@ single quotes too',
  l4LS:         "before\ continued",
  l4PS:         "before\ continued",

+  // =====================================================================
+  // BELOW: extensions covering the strict-ID / hardcoded-literal /
+  // signed-NaN / setURL-typo / block-comment-folding lexer changes.
+  // =====================================================================
+
+  // ---------- Hardcoded JSON5 reserved literals: signed NaN ----------
+  // Pre-fix the word-list path handled +Infinity / -Infinity but missed
+  // +NaN / -NaN (those tokens were never in the keyword list).  After the
+  // hardcoded-literal change both style as NUMBER.
+  posNaN:         +NaN,
+  negNaN:         -NaN,
+
+  // ---------- Literal-boundary check ----------
+  // true / false / null / Infinity / NaN are reserved ONLY when the next
+  // character is NOT an identifier-continue char.  These keys start with
+  // literal text but continue with more identifier characters, so they
+  // MUST style as PROPERTYNAME (not KEYWORD / NUMBER).
+  nullable:       'starts with literal "null"',
+  truer:          'starts with literal "true"',
+  falsehood:      'starts with literal "false"',
+  Infinityx:      'starts with literal "Infinity"',
+  NaNny:          'starts with literal "NaN"',
+
+  // ---------- Strict Unicode IdentifierName: accepted (UAX #31 ID_Start) ----------
+  // Pre-strict accepted any byte >= 0x80 indiscriminately; post-strict
+  // requires the actual code point to be a Unicode Letter / $ / _.
+  café:           'Latin Extended (precomposed)',
+  naïve:          'Latin Extended with diaeresis',
+  Σmega:          'Greek capital sigma + ASCII',
+  λambda:         'Greek small lambda + ASCII',
+  имя:            'Cyrillic letters (Russian "name")',
+  名前:           'CJK Han ideographs (Japanese "name")',
+  中文:           'CJK Han ideographs (Chinese)',
+
+  // ---------- IdentifierPart: ZWNJ / ZWJ inside identifier (invisible chars) ----------
+  // ZWNJ (U+200C) and ZWJ (U+200D) are valid IdentifierPart per ECMA-262.
+  // The bytes are present in the source even though they don't render --
+  // inspect with a hex viewer; the unquoted keys below must lex as
+  // PROPERTYNAME, not break at the joiner.
+  x‌y_zwnj:        'unquoted key with ZWNJ between x and y',
+  x‍y_zwj:         'unquoted key with ZWJ between x and y',
+
+  // ---------- Strict Unicode rejection: should style as PARSING ERROR ----------
+  // Each leading character is NOT a Unicode Letter, so not ID_Start.
+  // Pre-strict accepted these (over-accepting all bytes >= 0x80);
+  // post-strict flags them as ERROR.
+
+  // Emoji 😀 (U+1F600, category So) -- ERROR:
+  😀: 'leading emoji must be ERROR',
+
+  // Math symbol ∞ (U+221E, category Sm) -- ERROR:
+  ∞: 'leading math symbol must be ERROR',
+
+  // Currency € (U+20AC, category Sc) -- JSON5 allows ONLY ASCII '$':
+  €: 'leading currency symbol must be ERROR',
+
+  // ---------- URI charset typo fix: ';' inside URL ----------
+  // Pre-fix setURL was "...,)," (duplicated ')' with ';' missing), so the
+  // URI style terminated at the first ';'.  Post-fix ';' is part of the
+  // RFC-3986 sub-delim set and the whole URL highlights as URI through
+  // the matrix parameters.
+  urlSemicolon:   "http://example.com/path;jsessionid=abc;v=2",
+
+  // ---------- Folding: multi-line block comment ----------
+  // Fold() now treats a multi-line /* ... */ as a fold region (HEADERFLAG
+  // on the opening line).  The block below should be foldable; the
+  // single-line /* ... */ further down should NOT produce a fold (its
+  // enter+exit cancel within the same line).
+
+  /* This block comment
+     spans three lines
+     and should produce a fold marker on the opening line. */
+  foldDemo:       'after the foldable block comment',
+
+  /* single-line comment -- should NOT produce a fold marker */
+  noFold:         'after the non-foldable comment',
+
  // ---------- Trailing item to verify the closing brace styling ----------
  last: 'end-of-test',
 }