# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re

from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class _Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    Parameters for construction:
    error_func:
        Called with (msg, line, column) on lexing errors.
    on_lbrace_func:
        Called when an LBRACE token is produced (used for scope tracking).
    on_rbrace_func:
        Called when an RBRACE token is produced (used for scope tracking).
    type_lookup_func:
        Called with an identifier name; expected to return True if it is
        a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the file from which the input
        comes. The lexer can modify it if #line directives are encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[_Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename
    def token(self) -> Optional[_Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   character offset of the current line start. The lexer is a simple
        #   loop that skips whitespace/newlines and emits one token per call.
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can yield
        #     both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
        #     so we stash the PPPRAGMASTR as _pending_tok to return on the next
        #     token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and other
        #     complex tokens (including error-producing patterns). The lexer
        #     uses a combined _regex_master to try all of these alternatives in
        #     a single scan.
        #   * _fixed_tokens: exact string matches for operators and punctuation,
        #     resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally, which
        #   keeps lexing resilient while reporting useful diagnostics.
        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue
    def _match_token(self) -> Optional[_Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a longer
        # token that should take precedence.
        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            msg = f"Illegal character {repr(text[pos])}"
            self._error(msg, pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        if action == _RegexAction.ERROR:
            if tok_type == "BAD_CHAR_CONST":
                msg = f"Invalid char constant {value}"
            # All other ERROR rules provide a message.
            assert msg is not None
            self._error(msg, pos)
            self._pos += max(1, length)
            return None

        if action == _RegexAction.ID:
            tok_type = _keyword_map.get(value, "ID")
            if tok_type == "ID" and self.type_lookup_func(value):
                tok_type = "TYPEID"

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok
    def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute character offset pos in the
        current input. Does not advance lexer state; callers manage _pos
        themselves. Returns a Token with lineno/column computed from current
        line tracking.
        """
        column = pos - self._line_start + 1
        tok = _Token(tok_type, value, self._lineno, column)
        return tok
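
    # Columns are 1-based: a token that starts at the first character of a
    # line (pos == self._line_start) is reported at column 1.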

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords), this
        # method does a bit of parsing on its own. It doesn't return anything,
        # but its side effect is to update self._pos past the directive, and
        # potentially update self._lineno and self._filename, based on the
        # directive's contents.
        #
        # Accepted #line forms from preprocessors:
        # - "#line 66 \"kwas\\df.h\""
        # - "# 9"
        # - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
        # - "# 1 \"file.h\" 3"
        # Errors we must report:
        # - "#line \"file.h\"" (filename before line number)
        # - "#line df" (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[_Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens -
        # PPPRAGMA and an optional PPPRAGMASTR. If an empty list is returned,
        # it means an error occurred, or we're at the end of input.
        #
        # Examples:
        # - "#pragma" -> PPPRAGMA only
        # - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        # - "# pragma omp parallel private(th_id)" -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        # - "#\tpragma {pack: 2, smack: 3}" -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks

##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
    "AUTO",
    "BREAK",
    "CASE",
    "CHAR",
    "CONST",
    "CONTINUE",
    "DEFAULT",
    "DO",
    "DOUBLE",
    "ELSE",
    "ENUM",
    "EXTERN",
    "FLOAT",
    "FOR",
    "GOTO",
    "IF",
    "INLINE",
    "INT",
    "LONG",
    "REGISTER",
    "OFFSETOF",
    "RESTRICT",
    "RETURN",
    "SHORT",
    "SIGNED",
    "SIZEOF",
    "STATIC",
    "STRUCT",
    "SWITCH",
    "TYPEDEF",
    "UNION",
    "UNSIGNED",
    "VOID",
    "VOLATILE",
    "WHILE",
    "__INT128",
    "_BOOL",
    "_COMPLEX",
    "_NORETURN",
    "_THREAD_LOCAL",
    "_STATIC_ASSERT",
    "_ATOMIC",
    "_ALIGNOF",
    "_ALIGNAS",
    "_PRAGMA",
)

_keyword_map: Dict[str, str] = {}

for keyword in _keywords:
    # Keywords from the newer C standards are mixed-case, like _Bool, _Alignas, etc.
    if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
        _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
    else:
        _keyword_map[keyword.lower()] = keyword
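
# For example, the loop above produces "auto" -> "AUTO", "_Bool" -> "_BOOL",
# and "_Static_assert" -> "_STATIC_ASSERT"; "__INT128" takes the else branch
# (its second character is not alphabetic), yielding "__int128" -> "__INT128".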

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
# simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
# decimal_escape = r"""(\d+)"""
# hex_escape = r"""(x[0-9a-fA-F]+)"""
# bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking (https://github.com/eliben/pycparser/issues/61):
#
# - \x was removed from simple_escape, unless it was not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that the
#   next character (if any) is not a decimal
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
#   permissive decimal_escape.
#
# Without this change, Python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# The escape_sequence regex above uses lookaheads and might be slow when
# scanning strings. Because all of the valid escapes (including \x) allow zero
# or more non-escaped characters after the first character,
# simple_escape + decimal_escape + hex_escape is simplified to the following
# for use inside string literals:
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
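# For example, _floating_constant accepts ".5", "1.", "3.14e-2f", and "1e10L",
# but not a plain "42", which is left to the integer-constant rules.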
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)

class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report an error
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")

# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
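# For instance, matching _regex_master at the start of "0x1Fu + 2" fires the
# INT_CONST_HEX alternative, so m.lastgroup is "INT_CONST_HEX" and
# m.group(m.lastgroup) is "0x1Fu".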


@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
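
# For example, the '>' bucket ends up ordered [">>=", ">>", ">=", ">"], so the
# first literal that matches at a given position is also the longest one.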

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")