diff --git a/scintilla/lexers/LexPython.cxx b/scintilla/lexers/LexPython.cxx index 19dd0ca3b..6a2edde11 100644 --- a/scintilla/lexers/LexPython.cxx +++ b/scintilla/lexers/LexPython.cxx @@ -25,6 +25,7 @@ #include "Accessor.h" #include "StyleContext.h" #include "CharacterSet.h" +#include "CharacterCategory.h" #include "LexerModule.h" #include "OptionSet.h" #include "SubStyles.h" @@ -39,7 +40,7 @@ namespace { /* kwCDef, kwCTypeName only used for Cython */ enum kwType { kwOther, kwClass, kwDef, kwImport, kwCDef, kwCTypeName, kwCPDef }; -enum literalsAllowed { litNone = 0, litU = 1, litB = 2 }; +enum literalsAllowed { litNone = 0, litU = 1, litB = 2, litF = 4 }; const int indicatorWhitespace = 1; @@ -50,7 +51,8 @@ bool IsPyComment(Accessor &styler, Sci_Position pos, Sci_Position len) { bool IsPyStringTypeChar(int ch, literalsAllowed allowed) { return ((allowed & litB) && (ch == 'b' || ch == 'B')) || - ((allowed & litU) && (ch == 'u' || ch == 'U')); + ((allowed & litU) && (ch == 'u' || ch == 'U')) || + ((allowed & litF) && (ch == 'f' || ch == 'F')); } bool IsPyStringStart(int ch, int chNext, int chNext2, literalsAllowed allowed) { @@ -73,7 +75,7 @@ int GetPyStringState(Accessor &styler, Sci_Position i, Sci_PositionU *nextIndex, char ch = styler.SafeGetCharAt(i); char chNext = styler.SafeGetCharAt(i + 1); - // Advance beyond r, u, or ur prefix (or r, b, or br in Python 3.0), but bail if there are any unexpected chars + // Advance beyond r, u, or ur prefix (or r, b, or br in Python 2.7+ and r, f, or fr in Python 3.6+), but bail if there are any unexpected chars if (ch == 'r' || ch == 'R') { i++; ch = styler.SafeGetCharAt(i); @@ -109,12 +111,32 @@ int GetPyStringState(Accessor &styler, Sci_Position i, Sci_PositionU *nextIndex, } } -inline bool IsAWordChar(int ch) { - return (ch < 0x80) && (isalnum(ch) || ch == '.' || ch == '_'); +inline bool IsAWordChar(int ch, bool unicodeIdentifiers) { + if (ch < 0x80) + return (isalnum(ch) || ch == '.' || ch == '_'); + + if (!unicodeIdentifiers) + return false; + + // Approximation, Python uses the XID_Continue set from unicode data + // see http://unicode.org/reports/tr31/ + CharacterCategory c = CategoriseCharacter(ch); + return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo + || c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc); } -inline bool IsAWordStart(int ch) { - return (ch < 0x80) && (isalnum(ch) || ch == '_'); +inline bool IsAWordStart(int ch, bool unicodeIdentifiers) { + if (ch < 0x80) + return (isalpha(ch) || ch == '_'); + + if (!unicodeIdentifiers) + return false; + + // Approximation, Python uses the XID_Start set from unicode data + // see http://unicode.org/reports/tr31/ + CharacterCategory c = CategoriseCharacter(ch); + return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo + || c == ccNl); } static bool IsFirstNonWhitespace(Sci_Position pos, Accessor &styler) { @@ -134,28 +156,34 @@ struct OptionsPython { bool base2or8Literals; bool stringsU; bool stringsB; + bool stringsF; bool stringsOverNewline; bool keywords2NoSubIdentifiers; bool fold; bool foldQuotes; bool foldCompact; + bool unicodeIdentifiers; OptionsPython() { whingeLevel = 0; base2or8Literals = true; stringsU = true; stringsB = true; + stringsF = true; stringsOverNewline = false; keywords2NoSubIdentifiers = false; fold = false; foldQuotes = false; foldCompact = false; + unicodeIdentifiers = true; } literalsAllowed AllowedLiterals() const { literalsAllowed allowedLiterals = stringsU ? litU : litNone; if (stringsB) allowedLiterals = static_cast(allowedLiterals | litB); + if (stringsF) + allowedLiterals = static_cast(allowedLiterals | litF); return allowedLiterals; } }; @@ -186,6 +214,9 @@ struct OptionSetPython : public OptionSet { DefineProperty("lexer.python.strings.b", &OptionsPython::stringsB, "Set to 0 to not recognise Python 3 bytes literals b\"x\"."); + DefineProperty("lexer.python.strings.f", &OptionsPython::stringsF, + "Set to 0 to not recognise Python 3.6 f-string literals f\"var={var}\"."); + DefineProperty("lexer.python.strings.over.newline", &OptionsPython::stringsOverNewline, "Set to 1 to allow strings to span newline characters."); @@ -200,6 +231,9 @@ struct OptionSetPython : public OptionSet { DefineProperty("fold.compact", &OptionsPython::foldCompact); + DefineProperty("lexer.python.unicode.identifiers", &OptionsPython::unicodeIdentifiers, + "Set to 0 to not recognise Python 3 unicode identifiers."); + DefineWordListSets(pythonWordListDesc); } }; @@ -410,12 +444,12 @@ void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, in kwLast = kwOther; sc.SetState(SCE_P_DEFAULT); } else if (sc.state == SCE_P_NUMBER) { - if (!IsAWordChar(sc.ch) && + if (!IsAWordChar(sc.ch, false) && !(!base_n_number && ((sc.ch == '+' || sc.ch == '-') && (sc.chPrev == 'e' || sc.chPrev == 'E')))) { sc.SetState(SCE_P_DEFAULT); } } else if (sc.state == SCE_P_IDENTIFIER) { - if ((sc.ch == '.') || (!IsAWordChar(sc.ch))) { + if ((sc.ch == '.') || (!IsAWordChar(sc.ch, options.unicodeIdentifiers))) { char s[100]; sc.GetCurrent(s, sizeof(s)); int style = SCE_P_IDENTIFIER; @@ -487,7 +521,7 @@ void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, in sc.SetState(SCE_P_DEFAULT); } } else if (sc.state == SCE_P_DECORATOR) { - if (!IsAWordChar(sc.ch)) { + if (!IsAWordStart(sc.ch, options.unicodeIdentifiers)) { sc.SetState(SCE_P_DEFAULT); } } else if ((sc.state == SCE_P_STRING) || (sc.state == SCE_P_CHARACTER)) { @@ -581,7 +615,7 @@ void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, in while (nextIndex > (sc.currentPos + 1) && sc.More()) { sc.Forward(); } - } else if (IsAWordStart(sc.ch)) { + } else if (IsAWordStart(sc.ch, options.unicodeIdentifiers)) { sc.SetState(SCE_P_IDENTIFIER); } }