mirror of
https://github.com/zulip/zulip.git
synced 2026-06-21 21:32:29 +08:00
Now we only tokenize the file once, and we pass
**validated** tokens to the pretty printer.
There are a few reasons for this:
* It obviously saves a lot of extra computation
just in terms of tokenization.
* It allows our validator to add fields
to the Token objects that help the pretty
printer.
I also removed/tweaked a lot of legacy tests for
pretty_print.py that were exercising bizarrely
formatted HTML that we now simply ban during the
validation phase.
695 lines
20 KiB
Python
695 lines
20 KiB
Python
from typing import Callable, List, Optional
|
|
|
|
|
|
class FormattedException(Exception):
    """Tokenization error whose message already carries line/column context."""
|
|
|
|
|
|
class TemplateParserException(Exception):
    """Raised when a template fails tokenization or validation.

    The human-readable description is available both as ``message`` and
    through ``str()``.
    """

    def __init__(self, message: str) -> None:
        # Forward the message to Exception so that ``args``, ``repr()`` and
        # pickling behave like any other exception.
        super().__init__(message)
        self.message = message

    def __str__(self) -> str:
        return self.message
|
|
|
|
|
|
class TokenizationException(Exception):
    """Low-level tokenizer failure.

    ``message`` describes the problem; ``line_content`` optionally holds the
    snippet of template text that triggered it.
    """

    def __init__(self, message: str, line_content: Optional[str] = None) -> None:
        # Forward the message to Exception so that ``str()`` and ``args`` are
        # populated instead of being empty.
        super().__init__(message)
        self.message = message
        self.line_content = line_content
|
|
|
|
|
|
class TokenizerState:
    """Mutable cursor used while scanning template text."""

    def __init__(self) -> None:
        # i is a 0-based offset into the text; line and col are 1-based.
        self.i, self.line, self.col = 0, 1, 1
|
|
|
|
|
|
class Token:
    """One lexical unit of a template.

    Stores the token kind, its raw text ``s``, its tag name, and where it
    starts (``line``/``col``) plus how many lines it spans.
    """

    def __init__(self, kind: str, s: str, tag: str, line: int, col: int, line_span: int) -> None:
        # What the token is.
        self.kind, self.s, self.tag = kind, s, tag
        # Where the token sits in the source text.
        self.line, self.col, self.line_span = line, col, line_span

        # Cross-links between matching tokens; filled in by the validation
        # pass, None until then.
        self.start_token: Optional["Token"] = None
        self.end_token: Optional["Token"] = None
|
|
|
|
|
|
def tokenize(text: str) -> List[Token]:
    """Split template text into a flat list of Token objects.

    Recognizes HTML tags and comments, Handlebars constructs, Django/Jinja2
    tags and comments, template variables, newlines, runs of spaces and
    plain text.  Each token records the line and column where it starts.

    Raises:
        FormattedException: a tag, comment or variable is malformed; the
            message includes the line/column and the offending snippet.
        TemplateParserException: an HTML tag has no tag name.
        AssertionError: a "{" or "<" begins something no rule can parse.
    """

    def advance(n: int) -> None:
        # Move the cursor n characters forward, keeping line/col in sync.
        for _ in range(n):
            state.i += 1
            if text[state.i - 1] == "\n":
                state.line += 1
                state.col = 1
            else:
                state.col += 1

    def looking_at(s: str) -> bool:
        return text[state.i : state.i + len(s)] == s

    def looking_at_htmlcomment() -> bool:
        return looking_at("<!--")

    def looking_at_handlebarcomment() -> bool:
        return looking_at("{{!")

    def looking_at_djangocomment() -> bool:
        return looking_at("{#")

    def looking_at_handlebarpartial() -> bool:
        return looking_at("{{>")

    def looking_at_html_start() -> bool:
        return looking_at("<") and not looking_at("</")

    def looking_at_html_end() -> bool:
        return looking_at("</")

    def looking_at_handlebars_start() -> bool:
        return looking_at("{{#") or looking_at("{{^") or looking_at("{{~#")

    def looking_at_handlebars_else() -> bool:
        return looking_at("{{else")

    def looking_at_template_var() -> bool:
        return looking_at("{")

    def looking_at_handlebars_end() -> bool:
        return looking_at("{{/") or looking_at("{{~/")

    def looking_at_django_start() -> bool:
        return looking_at("{% ")

    def looking_at_django_else() -> bool:
        return looking_at("{% else") or looking_at("{% elif")

    def looking_at_django_end() -> bool:
        return looking_at("{% end")

    def looking_at_jinja2_end_whitespace_stripped() -> bool:
        return looking_at("{%- end")

    def looking_at_jinja2_start_whitespace_stripped_type2() -> bool:
        # This function detects tag like {%- if foo -%}...{% endif %}
        return looking_at("{%-") and not looking_at("{%- end")

    state = TokenizerState()
    tokens: List[Token] = []

    while state.i < len(text):
        try:
            # The order of these checks matters: more specific prefixes
            # must be tested before the generic ones they overlap with.
            if looking_at_htmlcomment():
                s = get_html_comment(text, state.i)
                tag = s[4:-3]
                kind = "html_comment"
            elif looking_at_handlebarcomment():
                s = get_handlebar_comment(text, state.i)
                tag = s[3:-2]
                kind = "handlebar_comment"
            elif looking_at_djangocomment():
                s = get_django_comment(text, state.i)
                tag = s[2:-2]
                kind = "django_comment"
            elif looking_at_handlebarpartial():
                s = get_handlebar_partial(text, state.i)
                tag = s[9:-2]
                kind = "handlebars_singleton"
            elif looking_at_html_start():
                s = get_html_tag(text, state.i)
                if s.endswith("/>"):
                    end_offset = -2
                else:
                    end_offset = -1
                tag_parts = s[1:end_offset].split()

                if not tag_parts:
                    raise TemplateParserException("Tag name missing")

                tag = tag_parts[0]

                if tag == "!DOCTYPE":
                    kind = "html_doctype"
                elif s.endswith("/>"):
                    kind = "html_singleton"
                else:
                    kind = "html_start"
            elif looking_at_html_end():
                s = get_html_tag(text, state.i)
                tag = s[2:-1]
                kind = "html_end"
            elif looking_at_handlebars_else():
                s = get_handlebars_tag(text, state.i)
                tag = "else"
                kind = "handlebars_else"
            elif looking_at_handlebars_start():
                s = get_handlebars_tag(text, state.i)
                tag = s[3:-2].split()[0].strip("#")
                if tag.startswith("*"):
                    tag = tag[1:]
                kind = "handlebars_start"
            elif looking_at_handlebars_end():
                s = get_handlebars_tag(text, state.i)
                tag = s[3:-2].strip("/#~")
                kind = "handlebars_end"
            elif looking_at_django_else():
                s = get_django_tag(text, state.i)
                tag = "else"
                kind = "django_else"
            elif looking_at_django_end():
                s = get_django_tag(text, state.i)
                tag = s[6:-3]
                kind = "django_end"
            elif looking_at_django_start():
                # must check this after end/else
                s = get_django_tag(text, state.i)
                tag = s[3:-2].split()[0]
                kind = "django_start"

                if s[-3] == "-":
                    kind = "jinja2_whitespace_stripped_start"
            elif looking_at_jinja2_end_whitespace_stripped():
                s = get_django_tag(text, state.i)
                tag = s[7:-3]
                kind = "jinja2_whitespace_stripped_end"
            elif looking_at_jinja2_start_whitespace_stripped_type2():
                s = get_django_tag(text, state.i, stripped=True)
                tag = s[3:-3].split()[0]
                kind = "jinja2_whitespace_stripped_type2_start"
            elif looking_at_template_var():
                # order is important here
                s = get_template_var(text, state.i)
                tag = "var"
                kind = "template_var"
            elif looking_at("\n"):
                s = "\n"
                tag = "newline"
                kind = "newline"
            elif looking_at(" "):
                s = get_spaces(text, state.i)
                tag = ""
                # Spaces at the very start of a line are indentation;
                # anywhere else they are ordinary whitespace.
                if not tokens or tokens[-1].kind == "newline":
                    kind = "indent"
                else:
                    kind = "whitespace"
            elif text[state.i] in "{<":
                snippet = text[state.i :][:15]
                raise AssertionError(f"tool cannot parse {snippet}")
            else:
                s = get_text(text, state.i)
                if s == "" or text[state.i].isspace():
                    # The cursor sits on whitespace that is neither a space
                    # nor a newline (e.g. a tab or "\r").  get_text() strips
                    # it, so advancing by len(s) would either not move at
                    # all (looping forever) or leave the cursor out of step
                    # with the text.  Step over the single character and
                    # rescan instead.
                    advance(1)
                    continue
                tag = ""
                kind = "text"
        except TokenizationException as e:
            raise FormattedException(
                f'''{e.message} at line {state.line} col {state.col}:"{e.line_content}"''',
            ) from e

        # Surrounding newlines do not count toward the span.
        line_span = len(s.strip("\n").split("\n"))
        token = Token(
            kind=kind,
            s=s,
            tag=tag.strip(),
            line=state.line,
            col=state.col,
            line_span=line_span,
        )
        tokens.append(token)
        advance(len(s))

    return tokens
|
|
|
|
|
|
# Tags treated as void elements by the validator.
HTML_VOID_TAGS = {
    "area", "base", "br", "col",
    "command", "embed", "hr", "img",
    "input", "keygen", "link", "meta",
    "param", "source", "track", "wbr",
}  # fmt: skip
|
|
|
|
# Inline HTML tags.  Some obscure tags that are never used in Zulip code
# are deliberately left out.
HTML_INLINE_TAGS = {
    "a", "b", "br", "button", "cite",
    "code", "em", "i", "img", "input",
    "kbd", "label", "object", "script", "select",
    "small", "span", "strong", "textarea",
}  # fmt: skip
|
|
|
|
|
|
def tag_flavor(token: Token) -> Optional[str]:
    """Classify a token as opening a block, closing one, or neither.

    Returns "start", "end", or None for tokens that do not take part in
    start/end matching.  Raises AssertionError for an unrecognized kind.
    """
    kind = token.kind
    tag = token.tag

    # Kinds that never open or close a block.
    standalone = {
        "django_comment",
        "handlebar_comment",
        "handlebars_singleton",
        "html_comment",
        "html_doctype",
        "html_singleton",
        "indent",
        "newline",
        "template_var",
        "text",
        "whitespace",
    }
    # else/elif tags are treated as closers here.
    closers = {
        "django_else",
        "django_end",
        "handlebars_else",
        "handlebars_end",
        "html_end",
        "jinja2_whitespace_stripped_end",
    }
    django_like_starts = {
        "django_start",
        "jinja2_whitespace_stripped_start",
        "jinja2_whitespace_stripped_type2_start",
    }

    if kind in standalone:
        return None
    if kind == "handlebars_start" or kind == "html_start":
        return "start"
    if kind in closers:
        return "end"
    if kind in django_like_starts:
        # Only block tags open a scope.
        return "start" if is_django_block_tag(tag) else None

    raise AssertionError(f"tools programmer neglected to handle {kind} tokens")
|
|
|
|
|
|
def validate(fn: Optional[str] = None, text: Optional[str] = None) -> List[Token]:
    """Tokenize a template and check that it is well formed.

    Either ``fn`` (a path to read) or ``text`` (the template contents) must
    be supplied; when ``text`` is given, ``fn`` is only used in messages.

    Checks performed, in order: tokenization, whitespace rules, void /
    self-closing tag rules, start/end tag matching, and matching
    indentation of start/end tags.  Matching start and end tokens are
    linked to each other through their ``start_token`` / ``end_token``
    attributes.

    Returns the list of tokens.  Raises TemplateParserException when a
    check fails.
    """
    assert fn or text

    if fn is None:
        # Placeholder name used in error messages for in-memory text.
        fn = "<in memory file>"

    if text is None:
        with open(fn) as f:
            text = f.read()

    lines = text.split("\n")

    try:
        tokens = tokenize(text)
    except FormattedException as e:
        # Re-raise with the file name prepended to the tokenizer's message.
        raise TemplateParserException(
            f"""
fn: {fn}
{e}"""
        )

    prevent_whitespace_violations(fn, tokens)

    class State:
        # depth: number of currently open start tags.
        # foreign: True while inside a <math> or <svg> element.
        # matcher: callback invoked with the next end-flavored token.
        def __init__(self, func: Callable[[Token], None]) -> None:
            self.depth = 0
            self.foreign = False
            self.matcher = func

    def no_start_tag(token: Token) -> None:
        # Matcher in effect at depth 0: any end tag seen here has no
        # corresponding start tag.
        raise TemplateParserException(
            f"""
No start tag
fn: {fn}
end tag:
{token.tag}
line {token.line}, col {token.col}
"""
        )

    state = State(no_start_tag)

    def start_tag_matcher(start_token: Token) -> None:
        # Called for every start-flavored token.  Installs a matcher `f`
        # that validates the corresponding end token and, once the block is
        # closed, restores the matcher and foreign flag saved here.
        state.depth += 1
        start_tag = start_token.tag.strip("~")
        start_line = start_token.line
        start_col = start_token.col

        old_matcher = state.matcher
        old_foreign = state.foreign

        if start_tag in ["math", "svg"]:
            state.foreign = True

        def f(end_token: Token) -> None:
            is_else_tag = end_token.tag == "else"

            end_tag = end_token.tag.strip("~")
            end_line = end_token.line
            end_col = end_token.col

            def report_problem() -> Optional[str]:
                # Returns a description of the mismatch, or None if the end
                # token is acceptable for this start token.
                if (start_tag == "code") and (end_line == start_line + 1):
                    return "Code tag is split across two lines."

                if is_else_tag:
                    # We are not completely rigorous about having a sensible
                    # order of if/elif/elif/else, but we catch obviously
                    # mismatching else tags.
                    if start_tag not in ("if", "else", "unless"):
                        return f"Unexpected else/elif tag encountered after {start_tag} tag."
                elif start_tag != end_tag:
                    return f"Mismatched tags: ({start_tag} != {end_tag})"

                return None

            problem = report_problem()
            if problem:
                raise TemplateParserException(
                    f"""
fn: {fn}
{problem}
start:
{start_token.s}
line {start_line}, col {start_col}
end tag:
{end_tag}
line {end_line}, col {end_col}
"""
                )

            if not is_else_tag:
                # A real end tag closes the block; an else tag leaves the
                # block open, so this matcher stays installed.
                state.matcher = old_matcher
                state.foreign = old_foreign
                state.depth -= 1

            # TODO: refine this for the else/elif use cases
            end_token.start_token = start_token
            start_token.end_token = end_token

        state.matcher = f

    for token in tokens:
        kind = token.kind
        tag = token.tag

        # Void/self-closing rules are not enforced inside foreign content.
        if not state.foreign:
            if kind == "html_start":
                if tag in HTML_VOID_TAGS:
                    raise TemplateParserException(
                        f"Tag must be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
                    )
            elif kind == "html_singleton":
                if not state.foreign and tag not in HTML_VOID_TAGS:
                    raise TemplateParserException(
                        f"Tag must not be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
                    )

        flavor = tag_flavor(token)
        if flavor == "start":
            start_tag_matcher(token)
        elif flavor == "end":
            state.matcher(token)

    # Any start tag still open at the end of the input was never closed.
    if state.depth != 0:
        raise TemplateParserException("Missing end tag")

    ensure_matching_indentation(fn, tokens, lines)

    return tokens
|
|
|
|
|
|
def ensure_matching_indentation(fn: str, tokens: List[Token], lines: List[str]) -> None:
    """Check that each end tag starts in the same column as its start tag.

    Only tokens linked to a start token are examined, and only when the end
    tag is more than one line below the start tag.  For inline HTML tags the
    check applies only if the end tag is the first thing on its line.

    Raises TemplateParserException on the first mismatch.
    """
    for closer in tokens:
        opener = closer.start_token
        if opener is None:
            continue

        is_inline_tag = opener.tag in HTML_INLINE_TAGS and opener.kind == "html_start"

        # Tags that close on the same or the following line are exempt.
        if closer.line <= opener.line + 1:
            continue

        # An inline end tag that trails other content on its line is exempt.
        if is_inline_tag and not lines[closer.line - 1].lstrip().startswith(closer.s):
            continue

        if closer.col == opener.col:
            continue

        closing_tag = closer.tag.strip("~")
        raise TemplateParserException(
            "\n"
            f"fn: {fn}\n"
            "Indentation for start/end tags does not match.\n"
            f"start tag: {opener.s}\n"
            "\n"
            "start:\n"
            f"line {opener.line}, col {opener.col}\n"
            "end:\n"
            f"{closing_tag}\n"
            f"line {closer.line}, col {closer.col}\n"
        )
|
|
|
|
|
|
def prevent_extra_newlines(fn: str, tokens: List[Token]) -> None:
    """Reject runs of four or more consecutive newline tokens.

    Raises TemplateParserException naming the row where the run reaches
    four newlines.
    """
    run_length = 0

    for token in tokens:
        # Any non-newline token resets the run.
        run_length = run_length + 1 if token.kind == "newline" else 0

        if run_length >= 4:
            raise TemplateParserException(
                f"""Please avoid so many blank lines near row {token.line} in {fn}."""
            )
|
|
|
|
|
|
def prevent_whitespace_violations(fn: str, tokens: List[Token]) -> None:
    """Enforce the whitespace conventions for template files.

    Rejects leading whitespace at the start of the file, too many
    consecutive blank lines, lines consisting only of spaces, indents that
    are not a multiple of four spaces, runs of more than one space between
    tokens, and trailing whitespace.

    An empty token list (an empty template) has nothing to check and is
    accepted.

    Raises:
        TemplateParserException: a convention is violated.
        AssertionError: an indent token is directly followed by another
            indent/whitespace token, which the tokenizer should never emit.
    """
    if not tokens:
        # Nothing to check; also avoids indexing tokens[0] below.
        return

    if tokens[0].kind in ("indent", "whitespace"):
        raise TemplateParserException(f" Please remove the whitespace at the beginning of {fn}.")

    prevent_extra_newlines(fn, tokens)

    # Walk every token that has a successor, skipping the first token
    # (already checked above).
    for token, next_token in zip(tokens[1:], tokens[2:]):
        if token.kind == "indent":
            if next_token.kind in ("indent", "whitespace"):
                raise AssertionError("programming error parsing indents")

            if next_token.kind == "newline":
                raise TemplateParserException(
                    f"""Please just make row {token.line} in {fn} a truly blank line (no spaces)."""
                )

            if len(token.s) % 4 != 0:
                raise TemplateParserException(
                    "\n"
                    "Please use 4-space indents for template files. Most of our\n"
                    "codebase (including Python and JavaScript) uses 4-space indents,\n"
                    "so it's worth investing in configuring your editor to use\n"
                    "4-space indents for files like\n"
                    f"{fn}\n"
                    "\n"
                    f"The line at row {token.line} is indented with {len(token.s)} spaces.\n"
                )

        if token.kind == "whitespace":
            if len(token.s) > 1:
                raise TemplateParserException(
                    "\n"
                    f"We did not expect this much whitespace at row {token.line} column {token.col} in {fn}.\n"
                )
            if next_token.kind == "newline":
                raise TemplateParserException(
                    "\n"
                    f"Unexpected trailing whitespace at row {token.line} column {token.col} in {fn}.\n"
                )
|
|
|
|
|
|
def is_django_block_tag(tag: str) -> bool:
    """Return True if ``tag`` is one of the recognized Django/Jinja2 block tags."""
    block_tags = {
        "autoescape",
        "block",
        "blocktrans",
        "comment",
        "for",
        "if",
        "ifequal",
        "macro",
        "raw",
        "trans",
        "verbatim",
        "with",
    }
    return tag in block_tags
|
|
|
|
|
|
def get_handlebars_tag(text: str, i: int) -> str:
    """Return the Handlebars tag that starts at ``text[i]``, braces included.

    Scans from just past the opening "{{" to the first "}" and requires it
    to be immediately followed by a second "}".

    Raises:
        TokenizationException: the closing "}}" is missing, including when
            the input ends first.
    """
    end = i + 2
    while end < len(text) - 1 and text[end] != "}":
        end += 1
    # Slicing (rather than indexing end and end + 1) cannot run past the end
    # of the text, so truncated input is reported as a missing "}}".
    if text[end : end + 2] != "}}":
        raise TokenizationException('Tag missing "}}"', text[i : end + 2])
    return text[i : end + 2]
|
|
|
|
|
|
def get_spaces(text: str, i: int) -> str:
    """Return the run of space characters in ``text`` beginning at index ``i``."""
    stop = i
    limit = len(text)
    while stop < limit and text[stop] == " ":
        stop += 1
    return text[i:stop]
|
|
|
|
|
|
def get_text(text: str, i: int) -> str:
    """Return the plain text starting at ``i`` up to the next "{" or "<".

    Leading and trailing whitespace is stripped from the result.
    """
    # Index of the first delimiter at or after i, or end of text if none.
    stop = next(
        (pos for pos in range(i, len(text)) if text[pos] in "{<"),
        len(text),
    )
    return text[i:stop].strip()
|
|
|
|
|
|
def get_django_tag(text: str, i: int, stripped: bool = False) -> str:
    """Return the Django/Jinja2 tag that starts at ``text[i]``.

    Args:
        text: the full template text.
        i: index of the opening "{".
        stripped: if True, start scanning one character later, for tags
            that open with "{%-".

    Scans to the first "%" after the opener and requires it to be
    immediately followed by "}".

    Raises:
        TokenizationException: the closing "%}" is missing, including when
            the input ends first.
    """
    end = i + 2
    if stripped:
        end += 1
    while end < len(text) - 1 and text[end] != "%":
        end += 1
    # Slicing (rather than indexing end and end + 1) cannot run past the end
    # of the text, so truncated input is reported as a missing "%}".
    if text[end : end + 2] != "%}":
        raise TokenizationException('Tag missing "%}"', text[i : end + 2])
    return text[i : end + 2]
|
|
|
|
|
|
def get_html_tag(text: str, i: int) -> str:
    """Return the HTML tag that starts at ``text[i]``, through its closing ">".

    A ">" inside a double-quoted attribute value does not end the tag.

    Raises:
        TokenizationException: the double quotes are unbalanced, or no
            closing ">" is found.
    """
    in_quotes = False
    # Index of the first "<" seen after the opening one; 0 means none yet.
    nested_open = 0
    pos = i + 1
    size = len(text)

    while pos < size:
        ch = text[pos]
        if ch == ">" and not in_quotes:
            break
        if ch == '"':
            in_quotes = not in_quotes
        elif ch == "<" and not nested_open:
            nested_open = pos
        pos += 1

    if in_quotes:
        # Prefer to cut the snippet at the next tag opener, if there is one.
        snippet = text[i:nested_open] if nested_open else text[i : pos + 1]
        raise TokenizationException("Unbalanced quotes", snippet)

    if pos == size:
        raise TokenizationException('Tag missing ">"', text[i : pos + 1])

    return text[i : pos + 1]
|
|
|
|
|
|
def get_html_comment(text: str, i: int) -> str:
    """Return the HTML comment that starts at ``text[i]``, through "-->".

    Raises:
        TokenizationException: no closing "-->" is found.  The reported
            snippet stops at the first "<" seen during the scan, or runs to
            the end of the text if there is none.
    """
    end = i + 7
    unclosed_end = 0
    while end <= len(text):
        if text[end - 3 : end] == "-->":
            return text[i:end]
        # end may equal len(text) on the last pass, so bounds-check before
        # indexing.
        if not unclosed_end and end < len(text) and text[end] == "<":
            unclosed_end = end
        end += 1
    snippet = text[i:unclosed_end] if unclosed_end else text[i:]
    raise TokenizationException("Unclosed comment", snippet)
|
|
|
|
|
|
def get_handlebar_comment(text: str, i: int) -> str:
    """Return the Handlebars comment that starts at ``text[i]``, through "}}".

    Raises:
        TokenizationException: no closing "}}" is found.  The reported
            snippet stops at the first "<" seen during the scan, or runs to
            the end of the text if there is none.
    """
    end = i + 5
    unclosed_end = 0
    while end <= len(text):
        if text[end - 2 : end] == "}}":
            return text[i:end]
        # end may equal len(text) on the last pass, so bounds-check before
        # indexing.
        if not unclosed_end and end < len(text) and text[end] == "<":
            unclosed_end = end
        end += 1
    snippet = text[i:unclosed_end] if unclosed_end else text[i:]
    raise TokenizationException("Unclosed comment", snippet)
|
|
|
|
|
|
def get_template_var(text: str, i: int) -> str:
    """Return the template variable that starts at ``text[i]``.

    The variable ends at the first "}" found at or after ``i + 2``; if that
    brace is immediately followed by another "}", the second one is
    included as well.

    Raises:
        TokenizationException: no closing "}" is found.  The reported
            snippet stops at the first "<" seen during the scan, or runs to
            the end of the text if there is none.
    """
    end = i + 3
    unclosed_end = 0
    while end <= len(text):
        if text[end - 1] == "}":
            if end < len(text) and text[end] == "}":
                end += 1
            return text[i:end]
        # end may equal len(text) on the last pass, so bounds-check before
        # indexing.
        if not unclosed_end and end < len(text) and text[end] == "<":
            unclosed_end = end
        end += 1
    snippet = text[i:unclosed_end] if unclosed_end else text[i:]
    raise TokenizationException("Unclosed var", snippet)
|
|
|
|
|
|
def get_django_comment(text: str, i: int) -> str:
    """Return the Django comment that starts at ``text[i]``, through "#}".

    Raises:
        TokenizationException: no closing "#}" is found.  The reported
            snippet stops at the first "<" seen during the scan, or runs to
            the end of the text if there is none.
    """
    end = i + 4
    unclosed_end = 0
    while end <= len(text):
        if text[end - 2 : end] == "#}":
            return text[i:end]
        # end may equal len(text) on the last pass, so bounds-check before
        # indexing.
        if not unclosed_end and end < len(text) and text[end] == "<":
            unclosed_end = end
        end += 1
    snippet = text[i:unclosed_end] if unclosed_end else text[i:]
    raise TokenizationException("Unclosed comment", snippet)
|
|
|
|
|
|
def get_handlebar_partial(text: str, i: int) -> str:
    """Return the Handlebars partial that starts at ``text[i]``, through "}}".

    The search for the closing "}}" only considers positions ending at
    ``i + 10`` or later.

    Raises:
        TokenizationException: no closing "}}" is found.  The reported
            snippet stops at the first "<" seen during the scan, or runs to
            the end of the text if there is none.
    """
    end = i + 10
    unclosed_end = 0
    while end <= len(text):
        if text[end - 2 : end] == "}}":
            return text[i:end]
        # end may equal len(text) on the last pass, so bounds-check before
        # indexing.
        if not unclosed_end and end < len(text) and text[end] == "<":
            unclosed_end = end
        end += 1
    snippet = text[i:unclosed_end] if unclosed_end else text[i:]
    raise TokenizationException("Unclosed partial", snippet)
|