Update dashboard, memory, root +2 more (+3 ~5)
venv/lib/python3.12/site-packages/pycparser/c_lexer.py (new file, 706 lines)
@@ -0,0 +1,706 @@
# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class _Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    Parameters for construction:
        error_func:
            Called with (msg, line, column) on lexing errors.
        on_lbrace_func:
            Called when an LBRACE token is produced (used for scope tracking).
        on_rbrace_func:
            Called when an RBRACE token is produced (used for scope tracking).
        type_lookup_func:
            Called with an identifier name; expected to return True if it is
            a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the file from which the input
        comes. The lexer can modify it if #line directives are encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[_Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[_Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   byte offset of the current line start. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can yield
        #     both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
        #     so we stash the PPPRAGMASTR as _pending_tok to return on the next
        #     token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and other
        #     complex tokens (including error-producing patterns). The lexer
        #     uses a single combined _regex_master so all alternatives are
        #     tried in one scan.
        #   * _fixed_tokens: exact string matches for operators and punctuation,
        #     resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally, which
        #   keeps lexing resilient while reporting useful diagnostics.
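        #
        # Illustrative example (editorial note; it assumes type_lookup_func
        # returns False for every name): for the input "int x = 42;",
        # successive token() calls yield INT("int"), ID("x"), EQUALS("="),
        # INT_CONST_DEC("42"), SEMI(";") and then None. A "#pragma once" line
        # yields PPPRAGMA on one call and PPPRAGMASTR("once") on the next.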
        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue

    def _match_token(self) -> Optional[_Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a longer
        # token that should take precedence.
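        #
        # Worked example (editorial note): at the input "...", no regex rule
        # matches, so the '.' bucket's longest literal wins and ELLIPSIS is
        # emitted. At ".5", FLOAT_CONST matches two characters while the fixed
        # PERIOD matches only one, so the regex match takes precedence.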
        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            msg = f"Illegal character {repr(text[pos])}"
            self._error(msg, pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        if action == _RegexAction.ERROR:
            if tok_type == "BAD_CHAR_CONST":
                msg = f"Invalid char constant {value}"
            # All other ERROR rules provide a message.
            assert msg is not None
            self._error(msg, pos)
            self._pos += max(1, length)
            return None

        if action == _RegexAction.ID:
            tok_type = _keyword_map.get(value, "ID")
            if tok_type == "ID" and self.type_lookup_func(value):
                tok_type = "TYPEID"

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute byte offset pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line tracking.
        """
        column = pos - self._line_start + 1
        tok = _Token(tok_type, value, self._lineno, column)
        return tok

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords), this
        # method does a bit of parsing on its own. It doesn't return anything,
        # but its side effect is to update self._pos past the directive, and
        # potentially update self._lineno and self._filename, based on the
        # directive's contents.
        #
        # Accepted #line forms from preprocessors:
        # - "#line 66 \"kwas\\df.h\""
        # - "# 9"
        # - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
        # - "# 1 \"file.h\" 3"
        # Errors we must report:
        # - "#line \"file.h\"" (filename before line number)
        # - "#line df" (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
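        #
        # For instance (editorial note), after the directive line
        # '# 7 "inc/x.h"' the tokens on the following line report lineno 7
        # and self.filename becomes "inc/x.h".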
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[_Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens -
        # PPPRAGMA and an optional PPPRAGMASTR. If an empty list is returned,
        # it means an error occurred, or we're at the end of input.
        #
        # Examples:
        # - "#pragma" -> PPPRAGMA only
        # - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        # - "# pragma omp parallel private(th_id)" -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        # - "#\tpragma {pack: 2, smack: 3}" -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks


##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
    "AUTO",
    "BREAK",
    "CASE",
    "CHAR",
    "CONST",
    "CONTINUE",
    "DEFAULT",
    "DO",
    "DOUBLE",
    "ELSE",
    "ENUM",
    "EXTERN",
    "FLOAT",
    "FOR",
    "GOTO",
    "IF",
    "INLINE",
    "INT",
    "LONG",
    "REGISTER",
    "OFFSETOF",
    "RESTRICT",
    "RETURN",
    "SHORT",
    "SIGNED",
    "SIZEOF",
    "STATIC",
    "STRUCT",
    "SWITCH",
    "TYPEDEF",
    "UNION",
    "UNSIGNED",
    "VOID",
    "VOLATILE",
    "WHILE",
    "__INT128",
    "_BOOL",
    "_COMPLEX",
    "_NORETURN",
    "_THREAD_LOCAL",
    "_STATIC_ASSERT",
    "_ATOMIC",
    "_ALIGNOF",
    "_ALIGNAS",
    "_PRAGMA",
)

_keyword_map: Dict[str, str] = {}

for keyword in _keywords:
    # Keywords from new C standard are mixed-case, like _Bool, _Alignas, etc.
    if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
        _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
    else:
        _keyword_map[keyword.lower()] = keyword
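# For example (editorial note), the loop above maps "while" -> "WHILE",
# "_Bool" -> "_BOOL", and "__int128" -> "__INT128" (a second underscore is not
# alphabetic, so the whole keyword is simply lowercased).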

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
# simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
# decimal_escape = r"""(\d+)"""
# hex_escape = r"""(x[0-9a-fA-F]+)"""
# bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking: (https://github.com/eliben/pycparser/issues/61)
#
# - \x was removed from simple_escape, unless it was not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that the
#   next character (if any) is not a decimal
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
#   permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
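
# For example (editorial note), with the lookaheads above there is exactly one
# way to lex the escape in '\123': _decimal_escape must consume all three
# digits, because stopping after "\1" or "\12" would leave a digit immediately
# after the match and fail the (?!\d) lookahead.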

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# The lookahead-heavy escape_sequence above could be slow when applied inside
# strings. Because all of the valid escapes (including \x) allow 0 or more
# non-escaped characters after the first character, the combination of
# simple_escape + decimal_escape + hex_escape simplifies, for string bodies, to

_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)


class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report an error
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]
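
# Note (editorial): the alternation built below is tried in order, so the order
# of _regex_rules matters. For example, FLOAT_CONST is listed before
# INT_CONST_DEC so that "1.5" lexes as a single floating constant rather than
# the integer "1" followed by other tokens.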

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
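# For example (editorial note), _regex_master.match("0x1F") succeeds via the
# INT_CONST_HEX alternative, so m.lastgroup == "INT_CONST_HEX" and
# m.group("INT_CONST_HEX") == "0x1F".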


@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
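# For example (editorial note), the bucket for '>' ends up as
# [">>=", ">>", ">=", ">"], so a scan starting at ">>=" matches RSHIFTEQUAL
# rather than GT followed by GE.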

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")
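

# Minimal smoke-test sketch (editorial addition, not part of upstream
# pycparser). It wires CLexer with no-op brace callbacks and assumes no typedef
# names are in scope, so type_lookup_func always returns False.
if __name__ == "__main__":
    def _demo_error(msg: str, line: int, column: int) -> None:
        print(f"lex error at {line}:{column}: {msg}")

    _demo = CLexer(_demo_error, lambda: None, lambda: None, lambda _name: False)
    _demo.input("int x = 42; char c = 'a';")
    while (t := _demo.token()) is not None:
        print(f"{t.type:<14} {t.value!r} (line {t.lineno}, col {t.column})")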