Update dashboard, memory, root +2 more (+3 ~5)

This commit is contained in:
Echo
2026-02-02 16:21:41 +00:00
parent 2e8d47353b
commit 84701a062e
2212 changed files with 2938184 additions and 37 deletions

View File

@@ -0,0 +1,99 @@
# -----------------------------------------------------------------
# pycparser: __init__.py
#
# This package file exports some convenience functions for
# interacting with pycparser
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# -----------------------------------------------------------------
__all__ = ["c_lexer", "c_parser", "c_ast"]
__version__ = "3.00"
import io
from subprocess import check_output
from . import c_parser
CParser = c_parser.CParser
def preprocess_file(filename, cpp_path="cpp", cpp_args=""):
"""Preprocess a file using cpp.
filename:
Name of the file you want to preprocess.
cpp_path:
cpp_args:
Refer to the documentation of parse_file for the meaning of these
arguments.
When successful, returns the preprocessed file's contents.
Errors from cpp will be printed out.
"""
path_list = [cpp_path]
if isinstance(cpp_args, list):
path_list += cpp_args
elif cpp_args != "":
path_list += [cpp_args]
path_list += [filename]
try:
# Note the use of universal_newlines to treat all newlines
# as \n for Python's purpose
text = check_output(path_list, universal_newlines=True)
except OSError as e:
raise RuntimeError(
"Unable to invoke 'cpp'. "
+ "Make sure its path was passed correctly\n"
+ f"Original error: {e}"
)
return text
def parse_file(
filename, use_cpp=False, cpp_path="cpp", cpp_args="", parser=None, encoding=None
):
"""Parse a C file using pycparser.
filename:
Name of the file you want to parse.
use_cpp:
Set to True if you want to execute the C pre-processor
on the file prior to parsing it.
cpp_path:
If use_cpp is True, this is the path to 'cpp' on your
system. If no path is provided, it attempts to just
execute 'cpp', so it must be in your PATH.
cpp_args:
If use_cpp is True, set this to the command line arguments strings
to cpp. Be careful with quotes - it's best to pass a raw string
(r'') here. For example:
r'-I../utils/fake_libc_include'
If several arguments are required, pass a list of strings.
encoding:
Encoding to use for the file to parse
parser:
Optional parser object to be used instead of the default CParser
When successful, an AST is returned. ParseError can be
thrown if the file doesn't parse successfully.
Errors from cpp will be printed out.
"""
if use_cpp:
text = preprocess_file(filename, cpp_path, cpp_args)
else:
with io.open(filename, encoding=encoding) as f:
text = f.read()
if parser is None:
parser = CParser()
return parser.parse(text, filename)
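
A minimal usage sketch for the API above (illustrative, not part of this file). It assumes the package is importable as pycparser, that cpp is available on the PATH, and that a local example.c plus the fake_libc_include directory mentioned in the docstring exist:

    from pycparser import parse_file

    # Preprocess with cpp, parse into a c_ast.FileAST, and pretty-print it.
    # "example.c" and the include path are placeholders for this sketch.
    ast = parse_file("example.c", use_cpp=True,
                     cpp_args=r"-I../utils/fake_libc_include")
    ast.show(attrnames=True, showcoord=True)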

View File

@@ -0,0 +1,355 @@
# -----------------------------------------------------------------
# _ast_gen.py
#
# Generates the AST Node classes from a specification given in
# a configuration file. This module can also be run as a script to
# regenerate c_ast.py from _c_ast.cfg (from the repo root or the
# pycparser/ directory). Use 'make check' to reformat the generated
# file after running this script.
#
# The design of this module was inspired by astgen.py from the
# Python 2.5 code-base.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# -----------------------------------------------------------------
from string import Template
import os
from typing import IO
class ASTCodeGenerator:
def __init__(self, cfg_filename="_c_ast.cfg"):
"""Initialize the code generator from a configuration
file.
"""
self.cfg_filename = cfg_filename
self.node_cfg = [
NodeCfg(name, contents)
for (name, contents) in self.parse_cfgfile(cfg_filename)
]
def generate(self, file: IO[str]) -> None:
"""Generates the code into file, an open file buffer."""
src = Template(_PROLOGUE_COMMENT).substitute(cfg_filename=self.cfg_filename)
src += _PROLOGUE_CODE
for node_cfg in self.node_cfg:
src += node_cfg.generate_source() + "\n\n"
file.write(src)
def parse_cfgfile(self, filename):
"""Parse the configuration file and yield pairs of
(name, contents) for each node.
"""
with open(filename, "r") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
colon_i = line.find(":")
lbracket_i = line.find("[")
rbracket_i = line.find("]")
if colon_i < 1 or lbracket_i <= colon_i or rbracket_i <= lbracket_i:
raise RuntimeError(f"Invalid line in {filename}:\n{line}\n")
name = line[:colon_i]
val = line[lbracket_i + 1 : rbracket_i]
vallist = [v.strip() for v in val.split(",")] if val else []
yield name, vallist
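# For example, the configuration line
#     Assignment: [op, lvalue*, rvalue*]
# is yielded as ("Assignment", ["op", "lvalue*", "rvalue*"]); NodeCfg below
# then classifies 'op' as an attribute and 'lvalue'/'rvalue' as child nodes.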
class NodeCfg:
"""Node configuration.
name: node name
contents: a list of contents - attributes and child nodes
See comment at the top of the configuration file for details.
"""
def __init__(self, name, contents):
self.name = name
self.all_entries = []
self.attr = []
self.child = []
self.seq_child = []
for entry in contents:
clean_entry = entry.rstrip("*")
self.all_entries.append(clean_entry)
if entry.endswith("**"):
self.seq_child.append(clean_entry)
elif entry.endswith("*"):
self.child.append(clean_entry)
else:
self.attr.append(entry)
def generate_source(self):
src = self._gen_init()
src += "\n" + self._gen_children()
src += "\n" + self._gen_iter()
src += "\n" + self._gen_attr_names()
return src
def _gen_init(self):
src = f"class {self.name}(Node):\n"
if self.all_entries:
args = ", ".join(self.all_entries)
slots = ", ".join(f"'{e}'" for e in self.all_entries)
slots += ", 'coord', '__weakref__'"
arglist = f"(self, {args}, coord=None)"
else:
slots = "'coord', '__weakref__'"
arglist = "(self, coord=None)"
src += f" __slots__ = ({slots})\n"
src += f" def __init__{arglist}:\n"
for name in self.all_entries + ["coord"]:
src += f" self.{name} = {name}\n"
return src
def _gen_children(self):
src = " def children(self):\n"
if self.all_entries:
src += " nodelist = []\n"
for child in self.child:
src += f" if self.{child} is not None:\n"
src += f' nodelist.append(("{child}", self.{child}))\n'
for seq_child in self.seq_child:
src += f" for i, child in enumerate(self.{seq_child} or []):\n"
src += f' nodelist.append((f"{seq_child}[{{i}}]", child))\n'
src += " return tuple(nodelist)\n"
else:
src += " return ()\n"
return src
def _gen_iter(self):
src = " def __iter__(self):\n"
if self.all_entries:
for child in self.child:
src += f" if self.{child} is not None:\n"
src += f" yield self.{child}\n"
for seq_child in self.seq_child:
src += f" for child in (self.{seq_child} or []):\n"
src += " yield child\n"
if not (self.child or self.seq_child):
# Empty generator
src += " return\n" + " yield\n"
else:
# Empty generator
src += " return\n" + " yield\n"
return src
def _gen_attr_names(self):
src = " attr_names = (" + "".join(f"{nm!r}, " for nm in self.attr) + ")"
return src
_PROLOGUE_COMMENT = r"""#-----------------------------------------------------------------
# ** ATTENTION **
# This code was automatically generated from _c_ast.cfg
#
# Do not modify it directly. Modify the configuration file and
# run the generator again.
# ** ** *** ** **
#
# pycparser: c_ast.py
#
# AST Node classes.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
"""
_PROLOGUE_CODE = r'''
import sys
from typing import Any, ClassVar, IO, Optional
def _repr(obj):
"""
Get the representation of an object, with dedicated pprint-like format for lists.
"""
if isinstance(obj, list):
return '[' + (',\n '.join((_repr(e).replace('\n', '\n ') for e in obj))) + '\n]'
else:
return repr(obj)
class Node:
__slots__ = ()
""" Abstract base class for AST nodes.
"""
attr_names: ClassVar[tuple[str, ...]] = ()
coord: Optional[Any]
def __repr__(self):
""" Generates a python representation of the current node
"""
result = self.__class__.__name__ + '('
indent = ''
separator = ''
for name in self.__slots__[:-2]:
result += separator
result += indent
result += name + '=' + (_repr(getattr(self, name)).replace('\n', '\n ' + (' ' * (len(name) + len(self.__class__.__name__)))))
separator = ','
indent = '\n ' + (' ' * len(self.__class__.__name__))
result += indent + ')'
return result
def children(self):
""" A sequence of all children that are Nodes
"""
pass
def show(
self,
buf: IO[str] = sys.stdout,
offset: int = 0,
attrnames: bool = False,
showemptyattrs: bool = True,
nodenames: bool = False,
showcoord: bool = False,
_my_node_name: Optional[str] = None,
):
""" Pretty print the Node and all its attributes and
children (recursively) to a buffer.
buf:
Open IO buffer into which the Node is printed.
offset:
Initial offset (amount of leading spaces)
attrnames:
True if you want to see the attribute names in
name=value pairs. False to only see the values.
showemptyattrs:
False if you want to suppress printing empty attributes.
nodenames:
True if you want to see the actual node names
within their parents.
showcoord:
Do you want the coordinates of each Node to be
displayed.
"""
lead = ' ' * offset
if nodenames and _my_node_name is not None:
buf.write(lead + self.__class__.__name__+ ' <' + _my_node_name + '>: ')
else:
buf.write(lead + self.__class__.__name__+ ': ')
if self.attr_names:
def is_empty(v):
    return v is None or (hasattr(v, '__len__') and len(v) == 0)
nvlist = [(n, getattr(self,n)) for n in self.attr_names \
if showemptyattrs or not is_empty(getattr(self,n))]
if attrnames:
attrstr = ', '.join(f'{name}={value}' for name, value in nvlist)
else:
attrstr = ', '.join(f'{value}' for _, value in nvlist)
buf.write(attrstr)
if showcoord:
buf.write(f' (at {self.coord})')
buf.write('\n')
for (child_name, child) in self.children():
child.show(
buf,
offset=offset + 2,
attrnames=attrnames,
showemptyattrs=showemptyattrs,
nodenames=nodenames,
showcoord=showcoord,
_my_node_name=child_name)
class NodeVisitor:
""" A base NodeVisitor class for visiting c_ast nodes.
Subclass it and define your own visit_XXX methods, where
XXX is the class name you want to visit with these
methods.
For example:
class ConstantVisitor(NodeVisitor):
def __init__(self):
self.values = []
def visit_Constant(self, node):
self.values.append(node.value)
Creates a list of values of all the constant nodes
encountered below the given node. To use it:
cv = ConstantVisitor()
cv.visit(node)
Notes:
* generic_visit() will be called for AST nodes for which
no visit_XXX method was defined.
* The children of nodes for which a visit_XXX was
defined will not be visited - if you need this, call
generic_visit() on the node.
You can use:
NodeVisitor.generic_visit(self, node)
* Modeled after Python's own AST visiting facilities
(the ast module of Python 3.0)
"""
_method_cache = None
def visit(self, node: Node):
""" Visit a node.
"""
if self._method_cache is None:
self._method_cache = {}
visitor = self._method_cache.get(node.__class__.__name__, None)
if visitor is None:
method = 'visit_' + node.__class__.__name__
visitor = getattr(self, method, self.generic_visit)
self._method_cache[node.__class__.__name__] = visitor
return visitor(node)
def generic_visit(self, node: Node):
""" Called if no explicit visitor function exists for a
node. Implements preorder visiting of the node.
"""
for _, c in node.children():
self.visit(c)
'''
if __name__ == "__main__":
base_dir = os.path.dirname(os.path.abspath(__file__))
cfg_path = os.path.join(base_dir, "_c_ast.cfg")
out_path = os.path.join(base_dir, "c_ast.py")
ast_gen = ASTCodeGenerator(cfg_path)
with open(out_path, "w") as out:
ast_gen.generate(out)
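# For illustration: an entry such as 'Constant: [type, value]' in _c_ast.cfg
# makes this generator emit (roughly, up to whitespace) the following class:
#
#     class Constant(Node):
#         __slots__ = ('type', 'value', 'coord', '__weakref__')
#         def __init__(self, type, value, coord=None):
#             self.type = type
#             self.value = value
#             self.coord = coord
#         def children(self):
#             nodelist = []
#             return tuple(nodelist)
#         def __iter__(self):
#             return
#             yield
#         attr_names = ('type', 'value', )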

View File

@@ -0,0 +1,195 @@
#-----------------------------------------------------------------
# pycparser: _c_ast.cfg
#
# Defines the AST Node classes used in pycparser.
#
# Each entry is a Node sub-class name, listing the attributes
# and child nodes of the class:
# <name>* - a child node
# <name>** - a sequence of child nodes
# <name> - an attribute
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
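# As an illustration of the notation: the entry 'Assignment: [op, lvalue*, rvalue*]'
# below generates a c_ast.Assignment class whose attr_names is ('op',), whose
# children() reports the 'lvalue' and 'rvalue' child nodes, and whose __init__
# signature is (self, op, lvalue, rvalue, coord=None).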
# ArrayDecl is a nested declaration of an array with the given type.
# dim: the dimension (for example, constant 42)
# dim_quals: list of dimension qualifiers, to support C99's allowing 'const'
# and 'static' within the array dimension in function declarations.
ArrayDecl: [type*, dim*, dim_quals]
ArrayRef: [name*, subscript*]
# op: =, +=, /= etc.
#
Assignment: [op, lvalue*, rvalue*]
Alignas: [alignment*]
BinaryOp: [op, left*, right*]
Break: []
Case: [expr*, stmts**]
Cast: [to_type*, expr*]
# Compound statement in C99 is a list of block items (declarations or
# statements).
#
Compound: [block_items**]
# Compound literal (anonymous aggregate) for C99.
# (type-name) {initializer_list}
# type: the typename
# init: InitList for the initializer list
#
CompoundLiteral: [type*, init*]
# type: int, char, float, string, etc.
#
Constant: [type, value]
Continue: []
# name: the variable being declared
# quals: list of qualifiers (const, volatile)
# funcspec: list function specifiers (i.e. inline in C99)
# storage: list of storage specifiers (extern, register, etc.)
# type: declaration type (probably nested with all the modifiers)
# init: initialization value, or None
# bitsize: bit field size, or None
#
Decl: [name, quals, align, storage, funcspec, type*, init*, bitsize*]
DeclList: [decls**]
Default: [stmts**]
DoWhile: [cond*, stmt*]
# Represents the ellipsis (...) parameter in a function
# declaration
#
EllipsisParam: []
# An empty statement (a semicolon ';' on its own)
#
EmptyStatement: []
# Enumeration type specifier
# name: an optional ID
# values: an EnumeratorList
#
Enum: [name, values*]
# A name/value pair for enumeration values
#
Enumerator: [name, value*]
# A list of enumerators
#
EnumeratorList: [enumerators**]
# A list of expressions separated by the comma operator.
#
ExprList: [exprs**]
# This is the top of the AST, representing a single C file (a
# translation unit in K&R jargon). It contains a list of
# "external-declaration"s, which is either declarations (Decl),
# Typedef or function definitions (FuncDef).
#
FileAST: [ext**]
# for (init; cond; next) stmt
#
For: [init*, cond*, next*, stmt*]
# name: Id
# args: ExprList
#
FuncCall: [name*, args*]
# type <decl>(args)
#
FuncDecl: [args*, type*]
# Function definition: a declarator for the function name and
# a body, which is a compound statement.
# There's an optional list of parameter declarations for old
# K&R-style definitions
#
FuncDef: [decl*, param_decls**, body*]
Goto: [name]
ID: [name]
# Holder for types that are a simple identifier (e.g. the built
# ins void, char etc. and typedef-defined types)
#
IdentifierType: [names]
If: [cond*, iftrue*, iffalse*]
# An initialization list used for compound literals.
#
InitList: [exprs**]
Label: [name, stmt*]
# A named initializer for C99.
# The name of a NamedInitializer is a sequence of Nodes, because
# names can be hierarchical and contain constant expressions.
#
NamedInitializer: [name**, expr*]
# a list of comma separated function parameter declarations
#
ParamList: [params**]
PtrDecl: [quals, type*]
Return: [expr*]
StaticAssert: [cond*, message*]
# name: struct tag name
# decls: declaration of members
#
Struct: [name, decls**]
# type: . or ->
# name.field or name->field
#
StructRef: [name*, type, field*]
Switch: [cond*, stmt*]
# cond ? iftrue : iffalse
#
TernaryOp: [cond*, iftrue*, iffalse*]
# A base type declaration
#
TypeDecl: [declname, quals, align, type*]
# A typedef declaration.
# Very similar to Decl, but without some attributes
#
Typedef: [name, quals, storage, type*]
Typename: [name, quals, align, type*]
UnaryOp: [op, expr*]
# name: union tag name
# decls: declaration of members
#
Union: [name, decls**]
While: [cond*, stmt*]
Pragma: [string]

View File

@@ -0,0 +1,174 @@
# ------------------------------------------------------------------------------
# pycparser: ast_transforms.py
#
# Some utilities used by the parser to create a friendlier AST.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
from typing import Any, List, Tuple, cast
from . import c_ast
def fix_switch_cases(switch_node: c_ast.Switch) -> c_ast.Switch:
"""The 'case' statements in a 'switch' come out of parsing with one
child node, so subsequent statements are just tucked to the parent
Compound. Additionally, consecutive (fall-through) case statements
come out messy. This is a peculiarity of the C grammar. The following:
switch (myvar) {
case 10:
k = 10;
p = k + 1;
return 10;
case 20:
case 30:
return 20;
default:
break;
}
    Creates this tree (pseudo-dump):
        Switch
            ID: myvar
            Compound:
                Case 10:
                    k = 10
                p = k + 1
                return 10
                Case 20:
                    Case 30:
                        return 20
                Default:
                    break
    The goal of this transform is to fix this mess, turning it into the
    following:
        Switch
            ID: myvar
            Compound:
                Case 10:
                    k = 10
                    p = k + 1
                    return 10
                Case 20:
                Case 30:
                    return 20
                Default:
                    break
A fixed AST node is returned. The argument may be modified.
"""
assert isinstance(switch_node, c_ast.Switch)
if not isinstance(switch_node.stmt, c_ast.Compound):
return switch_node
# The new Compound child for the Switch, which will collect children in the
# correct order
new_compound = c_ast.Compound([], switch_node.stmt.coord)
# The last Case/Default node
last_case: c_ast.Case | c_ast.Default | None = None
# Goes over the children of the Compound below the Switch, adding them
# either directly below new_compound or below the last Case as appropriate
# (for `switch(cond) {}`, block_items would have been None)
for child in switch_node.stmt.block_items or []:
if isinstance(child, (c_ast.Case, c_ast.Default)):
# If it's a Case/Default:
# 1. Add it to the Compound and mark as "last case"
# 2. If its immediate child is also a Case or Default, promote it
# to a sibling.
new_compound.block_items.append(child)
_extract_nested_case(child, new_compound.block_items)
last_case = new_compound.block_items[-1]
else:
# Other statements are added as children to the last case, if it
# exists.
if last_case is None:
new_compound.block_items.append(child)
else:
last_case.stmts.append(child)
switch_node.stmt = new_compound
return switch_node
def _extract_nested_case(
case_node: c_ast.Case | c_ast.Default, stmts_list: List[c_ast.Node]
) -> None:
"""Recursively extract consecutive Case statements that are made nested
by the parser and add them to the stmts_list.
"""
if isinstance(case_node.stmts[0], (c_ast.Case, c_ast.Default)):
nested = case_node.stmts.pop()
stmts_list.append(nested)
_extract_nested_case(cast(Any, nested), stmts_list)
def fix_atomic_specifiers(
decl: c_ast.Decl | c_ast.Typedef,
) -> c_ast.Decl | c_ast.Typedef:
"""Atomic specifiers like _Atomic(type) are unusually structured,
conferring a qualifier upon the contained type.
This function fixes a decl with atomic specifiers to have a sane AST
structure, by removing spurious Typename->TypeDecl pairs and attaching
the _Atomic qualifier in the right place.
"""
# There can be multiple levels of _Atomic in a decl; fix them until a
# fixed point is reached.
while True:
decl, found = _fix_atomic_specifiers_once(decl)
if not found:
break
# Make sure to add an _Atomic qual on the topmost decl if needed. Also
# restore the declname on the innermost TypeDecl (it gets placed in the
# wrong place during construction).
typ: Any = decl
while not isinstance(typ, c_ast.TypeDecl):
try:
typ = typ.type
except AttributeError:
return decl
if "_Atomic" in typ.quals and "_Atomic" not in decl.quals:
decl.quals.append("_Atomic")
if typ.declname is None:
typ.declname = decl.name
return decl
def _fix_atomic_specifiers_once(
decl: c_ast.Decl | c_ast.Typedef,
) -> Tuple[c_ast.Decl | c_ast.Typedef, bool]:
"""Performs one 'fix' round of atomic specifiers.
Returns (modified_decl, found) where found is True iff a fix was made.
"""
parent: Any = decl
grandparent: Any = None
node: Any = decl.type
while node is not None:
if isinstance(node, c_ast.Typename) and "_Atomic" in node.quals:
break
try:
grandparent = parent
parent = node
node = node.type
except AttributeError:
# If we've reached a node without a `type` field, it means we won't
# find what we're looking for at this point; give up the search
# and return the original decl unmodified.
return decl, False
assert isinstance(parent, c_ast.TypeDecl)
assert grandparent is not None
cast(Any, grandparent).type = node.type
if "_Atomic" not in node.type.quals:
node.type.quals.append("_Atomic")
return decl, True
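
The transforms above are applied by the parser itself, so their effect is easiest to see by parsing a snippet and dumping the AST. A small sketch, assuming the package is importable as pycparser and that CParser.parse accepts (text, filename) as used in __init__.py above:

    from pycparser import c_parser

    src = """
    int classify(int x) {
        switch (x) {
            case 1:
            case 2: return 2;
            default: break;
        }
        return 0;
    }
    """
    ast = c_parser.CParser().parse(src, "<demo>")
    # After fix_switch_cases, 'case 1:' and 'case 2:' appear as siblings under
    # the switch's Compound instead of one being nested inside the other.
    ast.show(nodenames=True)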

File diff suppressed because it is too large

View File

@@ -0,0 +1,573 @@
# ------------------------------------------------------------------------------
# pycparser: c_generator.py
#
# C code generator from pycparser AST nodes.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
from typing import Callable, List, Optional
from . import c_ast
class CGenerator:
"""Uses the same visitor pattern as c_ast.NodeVisitor, but modified to
return a value from each visit method, using string accumulation in
generic_visit.
"""
indent_level: int
reduce_parentheses: bool
def __init__(self, reduce_parentheses: bool = False) -> None:
"""Constructs C-code generator
reduce_parentheses:
if True, eliminates needless parentheses on binary operators
"""
# Statements start with indentation of self.indent_level spaces, using
# the _make_indent method.
self.indent_level = 0
self.reduce_parentheses = reduce_parentheses
def _make_indent(self) -> str:
return " " * self.indent_level
def visit(self, node: c_ast.Node) -> str:
method = "visit_" + node.__class__.__name__
return getattr(self, method, self.generic_visit)(node)
def generic_visit(self, node: Optional[c_ast.Node]) -> str:
if node is None:
return ""
else:
return "".join(self.visit(c) for c_name, c in node.children())
def visit_Constant(self, n: c_ast.Constant) -> str:
return n.value
def visit_ID(self, n: c_ast.ID) -> str:
return n.name
def visit_Pragma(self, n: c_ast.Pragma) -> str:
ret = "#pragma"
if n.string:
ret += " " + n.string
return ret
def visit_ArrayRef(self, n: c_ast.ArrayRef) -> str:
arrref = self._parenthesize_unless_simple(n.name)
return arrref + "[" + self.visit(n.subscript) + "]"
def visit_StructRef(self, n: c_ast.StructRef) -> str:
sref = self._parenthesize_unless_simple(n.name)
return sref + n.type + self.visit(n.field)
def visit_FuncCall(self, n: c_ast.FuncCall) -> str:
fref = self._parenthesize_unless_simple(n.name)
args = self.visit(n.args) if n.args is not None else ""
return fref + "(" + args + ")"
def visit_UnaryOp(self, n: c_ast.UnaryOp) -> str:
match n.op:
case "sizeof":
# Always parenthesize the argument of sizeof since it can be
# a name.
return f"sizeof({self.visit(n.expr)})"
case "p++":
operand = self._parenthesize_unless_simple(n.expr)
return f"{operand}++"
case "p--":
operand = self._parenthesize_unless_simple(n.expr)
return f"{operand}--"
case _:
operand = self._parenthesize_unless_simple(n.expr)
return f"{n.op}{operand}"
# Precedence map of binary operators:
precedence_map = {
# Should be in sync with c_parser.CParser.precedence
# Higher numbers are stronger binding
"||": 0, # weakest binding
"&&": 1,
"|": 2,
"^": 3,
"&": 4,
"==": 5,
"!=": 5,
">": 6,
">=": 6,
"<": 6,
"<=": 6,
">>": 7,
"<<": 7,
"+": 8,
"-": 8,
"*": 9,
"/": 9,
"%": 9, # strongest binding
}
def visit_BinaryOp(self, n: c_ast.BinaryOp) -> str:
# Note: all binary operators are left-to-right associative
#
# If `n.left.op` has a stronger or equally binding precedence in
# comparison to `n.op`, no parenthesis are needed for the left:
# e.g., `(a*b) + c` is equivalent to `a*b + c`, as well as
# `(a+b) - c` is equivalent to `a+b - c` (same precedence).
# If the left operator is weaker binding than the current, then
# parentheses are necessary:
# e.g., `(a+b) * c` is NOT equivalent to `a+b * c`.
lval_str = self._parenthesize_if(
n.left,
lambda d: not (
self._is_simple_node(d)
or self.reduce_parentheses
and isinstance(d, c_ast.BinaryOp)
and self.precedence_map[d.op] >= self.precedence_map[n.op]
),
)
# If `n.right.op` has a stronger -but not equal- binding precedence,
# parenthesis can be omitted on the right:
# e.g., `a + (b*c)` is equivalent to `a + b*c`.
# If the right operator is weaker or equally binding, then parentheses
# are necessary:
# e.g., `a * (b+c)` is NOT equivalent to `a * b+c` and
# `a - (b+c)` is NOT equivalent to `a - b+c` (same precedence).
rval_str = self._parenthesize_if(
n.right,
lambda d: not (
self._is_simple_node(d)
or self.reduce_parentheses
and isinstance(d, c_ast.BinaryOp)
and self.precedence_map[d.op] > self.precedence_map[n.op]
),
)
return f"{lval_str} {n.op} {rval_str}"
def visit_Assignment(self, n: c_ast.Assignment) -> str:
rval_str = self._parenthesize_if(
n.rvalue, lambda n: isinstance(n, c_ast.Assignment)
)
return f"{self.visit(n.lvalue)} {n.op} {rval_str}"
def visit_IdentifierType(self, n: c_ast.IdentifierType) -> str:
return " ".join(n.names)
def _visit_expr(self, n: c_ast.Node) -> str:
match n:
case c_ast.InitList():
return "{" + self.visit(n) + "}"
case c_ast.ExprList() | c_ast.Compound():
return "(" + self.visit(n) + ")"
case _:
return self.visit(n)
def visit_Decl(self, n: c_ast.Decl, no_type: bool = False) -> str:
# no_type is used when a Decl is part of a DeclList, where the type is
# explicitly only for the first declaration in a list.
#
s = n.name if no_type else self._generate_decl(n)
if n.bitsize:
s += " : " + self.visit(n.bitsize)
if n.init:
s += " = " + self._visit_expr(n.init)
return s
def visit_DeclList(self, n: c_ast.DeclList) -> str:
s = self.visit(n.decls[0])
if len(n.decls) > 1:
s += ", " + ", ".join(
self.visit_Decl(decl, no_type=True) for decl in n.decls[1:]
)
return s
def visit_Typedef(self, n: c_ast.Typedef) -> str:
s = ""
if n.storage:
s += " ".join(n.storage) + " "
s += self._generate_type(n.type)
return s
def visit_Cast(self, n: c_ast.Cast) -> str:
s = "(" + self._generate_type(n.to_type, emit_declname=False) + ")"
return s + " " + self._parenthesize_unless_simple(n.expr)
def visit_ExprList(self, n: c_ast.ExprList) -> str:
visited_subexprs = []
for expr in n.exprs:
visited_subexprs.append(self._visit_expr(expr))
return ", ".join(visited_subexprs)
def visit_InitList(self, n: c_ast.InitList) -> str:
visited_subexprs = []
for expr in n.exprs:
visited_subexprs.append(self._visit_expr(expr))
return ", ".join(visited_subexprs)
def visit_Enum(self, n: c_ast.Enum) -> str:
return self._generate_struct_union_enum(n, name="enum")
def visit_Alignas(self, n: c_ast.Alignas) -> str:
return "_Alignas({})".format(self.visit(n.alignment))
def visit_Enumerator(self, n: c_ast.Enumerator) -> str:
if not n.value:
return "{indent}{name},\n".format(
indent=self._make_indent(),
name=n.name,
)
else:
return "{indent}{name} = {value},\n".format(
indent=self._make_indent(),
name=n.name,
value=self.visit(n.value),
)
def visit_FuncDef(self, n: c_ast.FuncDef) -> str:
decl = self.visit(n.decl)
self.indent_level = 0
body = self.visit(n.body)
if n.param_decls:
knrdecls = ";\n".join(self.visit(p) for p in n.param_decls)
return decl + "\n" + knrdecls + ";\n" + body + "\n"
else:
return decl + "\n" + body + "\n"
def visit_FileAST(self, n: c_ast.FileAST) -> str:
s = ""
for ext in n.ext:
match ext:
case c_ast.FuncDef():
s += self.visit(ext)
case c_ast.Pragma():
s += self.visit(ext) + "\n"
case _:
s += self.visit(ext) + ";\n"
return s
def visit_Compound(self, n: c_ast.Compound) -> str:
s = self._make_indent() + "{\n"
self.indent_level += 2
if n.block_items:
s += "".join(self._generate_stmt(stmt) for stmt in n.block_items)
self.indent_level -= 2
s += self._make_indent() + "}\n"
return s
def visit_CompoundLiteral(self, n: c_ast.CompoundLiteral) -> str:
return "(" + self.visit(n.type) + "){" + self.visit(n.init) + "}"
def visit_EmptyStatement(self, n: c_ast.EmptyStatement) -> str:
return ";"
def visit_ParamList(self, n: c_ast.ParamList) -> str:
return ", ".join(self.visit(param) for param in n.params)
def visit_Return(self, n: c_ast.Return) -> str:
s = "return"
if n.expr:
s += " " + self.visit(n.expr)
return s + ";"
def visit_Break(self, n: c_ast.Break) -> str:
return "break;"
def visit_Continue(self, n: c_ast.Continue) -> str:
return "continue;"
def visit_TernaryOp(self, n: c_ast.TernaryOp) -> str:
s = "(" + self._visit_expr(n.cond) + ") ? "
s += "(" + self._visit_expr(n.iftrue) + ") : "
s += "(" + self._visit_expr(n.iffalse) + ")"
return s
def visit_If(self, n: c_ast.If) -> str:
s = "if ("
if n.cond:
s += self.visit(n.cond)
s += ")\n"
s += self._generate_stmt(n.iftrue, add_indent=True)
if n.iffalse:
s += self._make_indent() + "else\n"
s += self._generate_stmt(n.iffalse, add_indent=True)
return s
def visit_For(self, n: c_ast.For) -> str:
s = "for ("
if n.init:
s += self.visit(n.init)
s += ";"
if n.cond:
s += " " + self.visit(n.cond)
s += ";"
if n.next:
s += " " + self.visit(n.next)
s += ")\n"
s += self._generate_stmt(n.stmt, add_indent=True)
return s
def visit_While(self, n: c_ast.While) -> str:
s = "while ("
if n.cond:
s += self.visit(n.cond)
s += ")\n"
s += self._generate_stmt(n.stmt, add_indent=True)
return s
def visit_DoWhile(self, n: c_ast.DoWhile) -> str:
s = "do\n"
s += self._generate_stmt(n.stmt, add_indent=True)
s += self._make_indent() + "while ("
if n.cond:
s += self.visit(n.cond)
s += ");"
return s
def visit_StaticAssert(self, n: c_ast.StaticAssert) -> str:
s = "_Static_assert("
s += self.visit(n.cond)
if n.message:
s += ","
s += self.visit(n.message)
s += ")"
return s
def visit_Switch(self, n: c_ast.Switch) -> str:
s = "switch (" + self.visit(n.cond) + ")\n"
s += self._generate_stmt(n.stmt, add_indent=True)
return s
def visit_Case(self, n: c_ast.Case) -> str:
s = "case " + self.visit(n.expr) + ":\n"
for stmt in n.stmts:
s += self._generate_stmt(stmt, add_indent=True)
return s
def visit_Default(self, n: c_ast.Default) -> str:
s = "default:\n"
for stmt in n.stmts:
s += self._generate_stmt(stmt, add_indent=True)
return s
def visit_Label(self, n: c_ast.Label) -> str:
return n.name + ":\n" + self._generate_stmt(n.stmt)
def visit_Goto(self, n: c_ast.Goto) -> str:
return "goto " + n.name + ";"
def visit_EllipsisParam(self, n: c_ast.EllipsisParam) -> str:
return "..."
def visit_Struct(self, n: c_ast.Struct) -> str:
return self._generate_struct_union_enum(n, "struct")
def visit_Typename(self, n: c_ast.Typename) -> str:
return self._generate_type(n.type)
def visit_Union(self, n: c_ast.Union) -> str:
return self._generate_struct_union_enum(n, "union")
def visit_NamedInitializer(self, n: c_ast.NamedInitializer) -> str:
s = ""
for name in n.name:
if isinstance(name, c_ast.ID):
s += "." + name.name
else:
s += "[" + self.visit(name) + "]"
s += " = " + self._visit_expr(n.expr)
return s
def visit_FuncDecl(self, n: c_ast.FuncDecl) -> str:
return self._generate_type(n)
def visit_ArrayDecl(self, n: c_ast.ArrayDecl) -> str:
return self._generate_type(n, emit_declname=False)
def visit_TypeDecl(self, n: c_ast.TypeDecl) -> str:
return self._generate_type(n, emit_declname=False)
def visit_PtrDecl(self, n: c_ast.PtrDecl) -> str:
return self._generate_type(n, emit_declname=False)
def _generate_struct_union_enum(
self, n: c_ast.Struct | c_ast.Union | c_ast.Enum, name: str
) -> str:
"""Generates code for structs, unions, and enums. name should be
'struct', 'union', or 'enum'.
"""
if name in ("struct", "union"):
assert isinstance(n, (c_ast.Struct, c_ast.Union))
members = n.decls
body_function = self._generate_struct_union_body
else:
assert name == "enum"
assert isinstance(n, c_ast.Enum)
members = None if n.values is None else n.values.enumerators
body_function = self._generate_enum_body
s = name + " " + (n.name or "")
if members is not None:
# None means no members
# Empty sequence means an empty list of members
s += "\n"
s += self._make_indent()
self.indent_level += 2
s += "{\n"
s += body_function(members)
self.indent_level -= 2
s += self._make_indent() + "}"
return s
def _generate_struct_union_body(self, members: List[c_ast.Node]) -> str:
return "".join(self._generate_stmt(decl) for decl in members)
def _generate_enum_body(self, members: List[c_ast.Enumerator]) -> str:
# `[:-2] + '\n'` removes the final `,` from the enumerator list
return "".join(self.visit(value) for value in members)[:-2] + "\n"
def _generate_stmt(self, n: c_ast.Node, add_indent: bool = False) -> str:
"""Generation from a statement node. This method exists as a wrapper
for individual visit_* methods to handle different treatment of
some statements in this context.
"""
if add_indent:
self.indent_level += 2
indent = self._make_indent()
if add_indent:
self.indent_level -= 2
match n:
case (
c_ast.Decl()
| c_ast.Assignment()
| c_ast.Cast()
| c_ast.UnaryOp()
| c_ast.BinaryOp()
| c_ast.TernaryOp()
| c_ast.FuncCall()
| c_ast.ArrayRef()
| c_ast.StructRef()
| c_ast.Constant()
| c_ast.ID()
| c_ast.Typedef()
| c_ast.ExprList()
):
# These can also appear in an expression context so no semicolon
# is added to them automatically
#
return indent + self.visit(n) + ";\n"
case c_ast.Compound():
# No extra indentation required before the opening brace of a
# compound - because it consists of multiple lines it has to
# compute its own indentation.
#
return self.visit(n)
case c_ast.If():
return indent + self.visit(n)
case _:
return indent + self.visit(n) + "\n"
def _generate_decl(self, n: c_ast.Decl) -> str:
"""Generation from a Decl node."""
s = ""
if n.funcspec:
s = " ".join(n.funcspec) + " "
if n.storage:
s += " ".join(n.storage) + " "
if n.align:
s += self.visit(n.align[0]) + " "
s += self._generate_type(n.type)
return s
def _generate_type(
self,
n: c_ast.Node,
modifiers: List[c_ast.Node] = [],
emit_declname: bool = True,
) -> str:
"""Recursive generation from a type node. n is the type node.
modifiers collects the PtrDecl, ArrayDecl and FuncDecl modifiers
encountered on the way down to a TypeDecl, to allow proper
generation from it.
"""
# ~ print(n, modifiers)
match n:
case c_ast.TypeDecl():
s = ""
if n.quals:
s += " ".join(n.quals) + " "
s += self.visit(n.type)
nstr = n.declname if n.declname and emit_declname else ""
# Resolve modifiers.
# Wrap in parens to distinguish pointer to array and pointer to
# function syntax.
#
for i, modifier in enumerate(modifiers):
match modifier:
case c_ast.ArrayDecl():
if i != 0 and isinstance(modifiers[i - 1], c_ast.PtrDecl):
nstr = "(" + nstr + ")"
nstr += "["
if modifier.dim_quals:
nstr += " ".join(modifier.dim_quals) + " "
if modifier.dim is not None:
nstr += self.visit(modifier.dim)
nstr += "]"
case c_ast.FuncDecl():
if i != 0 and isinstance(modifiers[i - 1], c_ast.PtrDecl):
nstr = "(" + nstr + ")"
args = (
self.visit(modifier.args)
if modifier.args is not None
else ""
)
nstr += "(" + args + ")"
case c_ast.PtrDecl():
if modifier.quals:
quals = " ".join(modifier.quals)
suffix = f" {nstr}" if nstr else ""
nstr = f"* {quals}{suffix}"
else:
nstr = "*" + nstr
if nstr:
s += " " + nstr
return s
case c_ast.Decl():
return self._generate_decl(n.type)
case c_ast.Typename():
return self._generate_type(n.type, emit_declname=emit_declname)
case c_ast.IdentifierType():
return " ".join(n.names) + " "
case c_ast.ArrayDecl() | c_ast.PtrDecl() | c_ast.FuncDecl():
return self._generate_type(
n.type, modifiers + [n], emit_declname=emit_declname
)
case _:
return self.visit(n)
def _parenthesize_if(
self, n: c_ast.Node, condition: Callable[[c_ast.Node], bool]
) -> str:
"""Visits 'n' and returns its string representation, parenthesized
if the condition function applied to the node returns True.
"""
s = self._visit_expr(n)
if condition(n):
return "(" + s + ")"
else:
return s
def _parenthesize_unless_simple(self, n: c_ast.Node) -> str:
"""Common use case for _parenthesize_if"""
return self._parenthesize_if(n, lambda d: not self._is_simple_node(d))
def _is_simple_node(self, n: c_ast.Node) -> bool:
"""Returns True for nodes that are "simple" - i.e. nodes that always
have higher precedence than operators.
"""
return isinstance(
n,
(c_ast.Constant, c_ast.ID, c_ast.ArrayRef, c_ast.StructRef, c_ast.FuncCall),
)
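
A short round-trip sketch for the generator (illustrative; it assumes the package is importable as pycparser and that CParser.parse accepts (text, filename) as used in __init__.py above):

    from pycparser import c_parser
    from pycparser.c_generator import CGenerator

    src = "int f(int a, int b, int c) { return (a + b) * c + a * (b + c); }"
    ast = c_parser.CParser().parse(src, "<demo>")

    # Default behaviour: every compound operand of a binary operator is
    # parenthesized, so the return expression comes back roughly as
    # ((a + b) * c) + (a * (b + c)).
    print(CGenerator().visit(ast))
    # With reduce_parentheses=True, parentheses already implied by precedence
    # are dropped and the original spelling (a + b) * c + a * (b + c) is kept.
    print(CGenerator(reduce_parentheses=True).visit(ast))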

View File

@@ -0,0 +1,706 @@
# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple
@dataclass(slots=True)
class _Token:
type: str
value: str
lineno: int
column: int
class CLexer:
"""A standalone lexer for C.
Parameters for construction:
error_func:
Called with (msg, line, column) on lexing errors.
on_lbrace_func:
Called when an LBRACE token is produced (used for scope tracking).
on_rbrace_func:
Called when an RBRACE token is produced (used for scope tracking).
type_lookup_func:
Called with an identifier name; expected to return True if it is
a typedef name and should be tokenized as TYPEID.
Call input(text) to initialize lexing, and then keep calling token() to
get the next token, until it returns None (at end of input).
"""
def __init__(
self,
error_func: Callable[[str, int, int], None],
on_lbrace_func: Callable[[], None],
on_rbrace_func: Callable[[], None],
type_lookup_func: Callable[[str], bool],
) -> None:
self.error_func = error_func
self.on_lbrace_func = on_lbrace_func
self.on_rbrace_func = on_rbrace_func
self.type_lookup_func = type_lookup_func
self._init_state()
def input(self, text: str, filename: str = "") -> None:
"""Initialize the lexer to the given input text.
filename is an optional name identifying the file from which the input
comes. The lexer can modify it if #line directives are encountered.
"""
self._init_state()
self._lexdata = text
self._filename = filename
def _init_state(self) -> None:
self._lexdata = ""
self._filename = ""
self._pos = 0
self._line_start = 0
self._pending_tok: Optional[_Token] = None
self._lineno = 1
@property
def filename(self) -> str:
return self._filename
def token(self) -> Optional[_Token]:
# Lexing strategy overview:
#
# - We maintain a current position (self._pos), line number, and the
# byte offset of the current line start. The lexer is a simple loop
# that skips whitespace/newlines and emits one token per call.
# - A small amount of logic is handled manually before regex matching:
#
# * Preprocessor-style directives: if we see '#', we check whether
# it's a #line or #pragma directive and consume it inline. #line
# updates lineno/filename and produces no tokens. #pragma can yield
# both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
# so we stash the PPPRAGMASTR as _pending_tok to return on the next
# token() call. Otherwise we return PPHASH.
# * Newlines update lineno/line-start tracking so tokens can record
# accurate columns.
#
# - The bulk of tokens are recognized in _match_token:
#
# * _regex_rules: regex patterns for identifiers, literals, and other
# complex tokens (including error-producing patterns). The lexer
# uses a combined _regex_master to scan options at the same time.
# * _fixed_tokens: exact string matches for operators and punctuation,
# resolved by longest match.
#
# - Error patterns call the error callback and advance minimally, which
# keeps lexing resilient while reporting useful diagnostics.
text = self._lexdata
n = len(text)
if self._pending_tok is not None:
tok = self._pending_tok
self._pending_tok = None
return tok
while self._pos < n:
match text[self._pos]:
case " " | "\t":
self._pos += 1
case "\n":
self._lineno += 1
self._pos += 1
self._line_start = self._pos
case "#":
if _line_pattern.match(text, self._pos + 1):
self._pos += 1
self._handle_ppline()
continue
if _pragma_pattern.match(text, self._pos + 1):
self._pos += 1
toks = self._handle_pppragma()
if len(toks) > 1:
self._pending_tok = toks[1]
if len(toks) > 0:
return toks[0]
continue
tok = self._make_token("PPHASH", "#", self._pos)
self._pos += 1
return tok
case _:
if tok := self._match_token():
return tok
else:
continue
def _match_token(self) -> Optional[_Token]:
"""Match one token at the current position.
Returns a Token on success, or None if no token could be matched and
an error was reported. This method always advances _pos by the matched
length, or by 1 on error/no-match.
"""
text = self._lexdata
pos = self._pos
# We pick the longest match between:
# - the master regex (identifiers, literals, error patterns, etc.)
# - fixed operator/punctuator literals from the bucket for text[pos]
#
# The longest match is required to ensure we properly lex something
# like ".123" (a floating-point constant) as a single entity (with
# FLOAT_CONST), rather than a PERIOD followed by a number.
#
# The fixed-literal buckets are already length-sorted, so within that
# bucket we can take the first match. However, we still compare its
# length to the regex match because the regex may have matched a longer
# token that should take precedence.
best = None
if m := _regex_master.match(text, pos):
tok_type = m.lastgroup
# All master-regex alternatives are named; lastgroup shouldn't be None.
assert tok_type is not None
value = m.group(tok_type)
length = len(value)
action, msg = _regex_actions[tok_type]
best = (length, tok_type, value, action, msg)
if bucket := _fixed_tokens_by_first.get(text[pos]):
for entry in bucket:
if text.startswith(entry.literal, pos):
length = len(entry.literal)
if best is None or length > best[0]:
best = (
length,
entry.tok_type,
entry.literal,
_RegexAction.TOKEN,
None,
)
break
if best is None:
msg = f"Illegal character {repr(text[pos])}"
self._error(msg, pos)
self._pos += 1
return None
length, tok_type, value, action, msg = best
if action == _RegexAction.ERROR:
if tok_type == "BAD_CHAR_CONST":
msg = f"Invalid char constant {value}"
# All other ERROR rules provide a message.
assert msg is not None
self._error(msg, pos)
self._pos += max(1, length)
return None
if action == _RegexAction.ID:
tok_type = _keyword_map.get(value, "ID")
if tok_type == "ID" and self.type_lookup_func(value):
tok_type = "TYPEID"
tok = self._make_token(tok_type, value, pos)
self._pos += length
if tok.type == "LBRACE":
self.on_lbrace_func()
elif tok.type == "RBRACE":
self.on_rbrace_func()
return tok
def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
"""Create a Token at an absolute input position.
Expects tok_type/value and the absolute byte offset pos in the current
input. Does not advance lexer state; callers manage _pos themselves.
Returns a Token with lineno/column computed from current line tracking.
"""
column = pos - self._line_start + 1
tok = _Token(tok_type, value, self._lineno, column)
return tok
def _error(self, msg: str, pos: int) -> None:
column = pos - self._line_start + 1
self.error_func(msg, self._lineno, column)
def _handle_ppline(self) -> None:
# Since #line directives aren't supposed to return tokens but should
# only affect the lexer's state (update line/filename for coords), this
# method does a bit of parsing on its own. It doesn't return anything,
# but its side effect is to update self._pos past the directive, and
# potentially update self._lineno and self._filename, based on the
# directive's contents.
#
# Accepted #line forms from preprocessors:
# - "#line 66 \"kwas\\df.h\""
# - "# 9"
# - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
# - "# 1 \"file.h\" 3"
# Errors we must report:
# - "#line \"file.h\"" (filename before line number)
# - "#line df" (garbage instead of number/string)
#
# We scan the directive line once (after an optional 'line' keyword),
# validating the order: NUMBER, optional STRING, then any NUMBERs.
# The NUMBERs tail is only accepted if a filename STRING was present.
text = self._lexdata
n = len(text)
line_end = text.find("\n", self._pos)
if line_end == -1:
line_end = n
line = text[self._pos : line_end]
pos = 0
line_len = len(line)
def skip_ws() -> None:
nonlocal pos
while pos < line_len and line[pos] in " \t":
pos += 1
skip_ws()
if line.startswith("line", pos):
pos += 4
def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
if pp_line is None:
self._error("line number missing in #line", self._pos + line_len)
else:
self._lineno = int(pp_line)
if pp_filename is not None:
self._filename = pp_filename
self._pos = line_end + 1
self._line_start = self._pos
def fail(msg: str, offset: int) -> None:
self._error(msg, self._pos + offset)
self._pos = line_end + 1
self._line_start = self._pos
skip_ws()
if pos >= line_len:
success(None, None)
return
if line[pos] == '"':
fail("filename before line number in #line", pos)
return
m = re.match(_decimal_constant, line[pos:])
if not m:
fail("invalid #line directive", pos)
return
pp_line = m.group(0)
pos += len(pp_line)
skip_ws()
if pos >= line_len:
success(pp_line, None)
return
if line[pos] != '"':
fail("invalid #line directive", pos)
return
m = re.match(_string_literal, line[pos:])
if not m:
fail("invalid #line directive", pos)
return
pp_filename = m.group(0).lstrip('"').rstrip('"')
pos += len(m.group(0))
# Consume arbitrary sequence of numeric flags after the directive
while True:
skip_ws()
if pos >= line_len:
break
m = re.match(_decimal_constant, line[pos:])
if not m:
fail("invalid #line directive", pos)
return
pos += len(m.group(0))
success(pp_line, pp_filename)
def _handle_pppragma(self) -> List[_Token]:
# Parse a full #pragma line; returns a list of tokens with 1 or 2
# tokens - PPPRAGMA and an optional PPPRAGMASTR. If an empty list is
# returned, it means an error occurred, or we're at the end of input.
#
# Examples:
# - "#pragma" -> PPPRAGMA only
# - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
# - "# pragma omp parallel private(th_id)" -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
# - "#\tpragma {pack: 2, smack: 3}" -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
text = self._lexdata
n = len(text)
pos = self._pos
while pos < n and text[pos] in " \t":
pos += 1
if pos >= n:
self._pos = pos
return []
if not text.startswith("pragma", pos):
self._error("invalid #pragma directive", pos)
self._pos = pos + 1
return []
pragma_pos = pos
pos += len("pragma")
toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]
while pos < n and text[pos] in " \t":
pos += 1
start = pos
while pos < n and text[pos] != "\n":
pos += 1
if pos > start:
toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
if pos < n and text[pos] == "\n":
self._lineno += 1
pos += 1
self._line_start = pos
self._pos = pos
return toks
##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
"AUTO",
"BREAK",
"CASE",
"CHAR",
"CONST",
"CONTINUE",
"DEFAULT",
"DO",
"DOUBLE",
"ELSE",
"ENUM",
"EXTERN",
"FLOAT",
"FOR",
"GOTO",
"IF",
"INLINE",
"INT",
"LONG",
"REGISTER",
"OFFSETOF",
"RESTRICT",
"RETURN",
"SHORT",
"SIGNED",
"SIZEOF",
"STATIC",
"STRUCT",
"SWITCH",
"TYPEDEF",
"UNION",
"UNSIGNED",
"VOID",
"VOLATILE",
"WHILE",
"__INT128",
"_BOOL",
"_COMPLEX",
"_NORETURN",
"_THREAD_LOCAL",
"_STATIC_ASSERT",
"_ATOMIC",
"_ALIGNOF",
"_ALIGNAS",
"_PRAGMA",
)
_keyword_map: Dict[str, str] = {}
for keyword in _keywords:
# Keywords from new C standard are mixed-case, like _Bool, _Alignas, etc.
if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
_keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
else:
_keyword_map[keyword.lower()] = keyword
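# The resulting map goes from source spelling to token type, for example:
#     _keyword_map['int'] == 'INT'
#     _keyword_map['_Bool'] == '_BOOL'
#     _keyword_map['__int128'] == '__INT128'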
##
## Regexes for use in tokens
##
# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"
_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"
# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
"(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt
_bad_octal_constant = "0[0-7]*[89]"
# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"
# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
# simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
# decimal_escape = r"""(\d+)"""
# hex_escape = r"""(x[0-9a-fA-F]+)"""
# bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking: (https://github.com/eliben/pycparser/issues/61)
#
# - \x was removed from simple_escape, unless it was not followed by a hex
# digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
# character (if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that the
# next character (if any) is not a decimal digit
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
# permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.
_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
_escape_sequence = (
r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)
# This complicated regex with lookahead might be slow for strings, so, because
# all of the valid escapes (including \x) allow 0 or more non-escaped characters
# after the first character, simple_escape + decimal_escape + hex_escape got
# simplified to
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)
# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'
# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
"(((("
+ _fractional_constant
+ ")"
+ _exponent_part
+ "?)|([0-9]+"
+ _exponent_part
+ "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
"(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
"("
+ _hex_prefix
+ "("
+ _hex_digits
+ "|"
+ _hex_fractional_constant
+ ")"
+ _binary_exponent_part
+ "[FfLl]?)"
)
class _RegexAction(Enum):
TOKEN = 0
ID = 1
ERROR = 2
@dataclass(frozen=True)
class _RegexRule:
# tok_type: name of the token emitted for a match
# regex_pattern: the raw regex (no anchors) to match at the current position
# action: TOKEN for normal tokens, ID for identifiers, ERROR to report
# error_message: message used for ERROR entries
tok_type: str
regex_pattern: str
action: _RegexAction
error_message: Optional[str]
_regex_rules: List[_RegexRule] = [
_RegexRule(
"UNSUPPORTED_C_STYLE_COMMENT",
_unsupported_c_style_comment,
_RegexAction.ERROR,
"Comments are not supported, see https://github.com/eliben/pycparser#3using.",
),
_RegexRule(
"UNSUPPORTED_CXX_STYLE_COMMENT",
_unsupported_cxx_style_comment,
_RegexAction.ERROR,
"Comments are not supported, see https://github.com/eliben/pycparser#3using.",
),
_RegexRule(
"BAD_STRING_LITERAL",
_bad_string_literal,
_RegexAction.ERROR,
"String contains invalid escape code",
),
_RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
_RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
_RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
_RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
_RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
_RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
_RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
_RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
_RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
_RegexRule(
"BAD_CONST_OCT",
_bad_octal_constant,
_RegexAction.ERROR,
"Invalid octal constant",
),
_RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
_RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
_RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
_RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
_RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
_RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
_RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
_RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
_RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
_RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
_RegexRule("ID", _identifier, _RegexAction.ID, None),
]
_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
_regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
_regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
@dataclass(frozen=True)
class _FixedToken:
tok_type: str
literal: str
_fixed_tokens: List[_FixedToken] = [
_FixedToken("ELLIPSIS", "..."),
_FixedToken("LSHIFTEQUAL", "<<="),
_FixedToken("RSHIFTEQUAL", ">>="),
_FixedToken("PLUSPLUS", "++"),
_FixedToken("MINUSMINUS", "--"),
_FixedToken("ARROW", "->"),
_FixedToken("LAND", "&&"),
_FixedToken("LOR", "||"),
_FixedToken("LSHIFT", "<<"),
_FixedToken("RSHIFT", ">>"),
_FixedToken("LE", "<="),
_FixedToken("GE", ">="),
_FixedToken("EQ", "=="),
_FixedToken("NE", "!="),
_FixedToken("TIMESEQUAL", "*="),
_FixedToken("DIVEQUAL", "/="),
_FixedToken("MODEQUAL", "%="),
_FixedToken("PLUSEQUAL", "+="),
_FixedToken("MINUSEQUAL", "-="),
_FixedToken("ANDEQUAL", "&="),
_FixedToken("OREQUAL", "|="),
_FixedToken("XOREQUAL", "^="),
_FixedToken("EQUALS", "="),
_FixedToken("PLUS", "+"),
_FixedToken("MINUS", "-"),
_FixedToken("TIMES", "*"),
_FixedToken("DIVIDE", "/"),
_FixedToken("MOD", "%"),
_FixedToken("OR", "|"),
_FixedToken("AND", "&"),
_FixedToken("NOT", "~"),
_FixedToken("XOR", "^"),
_FixedToken("LNOT", "!"),
_FixedToken("LT", "<"),
_FixedToken("GT", ">"),
_FixedToken("CONDOP", "?"),
_FixedToken("LPAREN", "("),
_FixedToken("RPAREN", ")"),
_FixedToken("LBRACKET", "["),
_FixedToken("RBRACKET", "]"),
_FixedToken("LBRACE", "{"),
_FixedToken("RBRACE", "}"),
_FixedToken("COMMA", ","),
_FixedToken("PERIOD", "."),
_FixedToken("SEMI", ";"),
_FixedToken("COLON", ":"),
]
# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
_fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
_bucket.sort(key=lambda item: len(item.literal), reverse=True)
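# For example, the bucket for '>' is ordered ['>>=', '>>', '>=', '>'], so the
# input '>>=' lexes as a single RSHIFTEQUAL token rather than RSHIFT followed
# by EQUALS.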
_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")
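
A minimal driving sketch for the lexer (illustrative; the real parser supplies callbacks that hook into its scope and typedef tracking):

    def _error(msg, line, col):
        raise RuntimeError(f"{line}:{col}: {msg}")

    lexer = CLexer(
        error_func=_error,
        on_lbrace_func=lambda: None,
        on_rbrace_func=lambda: None,
        type_lookup_func=lambda name: False,
    )
    lexer.input("x >>= .125;")
    while (tok := lexer.token()) is not None:
        print(tok.type, repr(tok.value), tok.lineno, tok.column)
    # Prints ID, RSHIFTEQUAL, FLOAT_CONST and SEMI tokens: '>>=' is matched as
    # a single fixed token and '.125' as one floating constant, thanks to the
    # longest-match rule in _match_token.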

File diff suppressed because it is too large