Update ashboard, dashboard, memory +1 more (+2 ~3)

2026-02-02 22:27:24 +00:00
parent 4f00131184
commit b0c9b254f1
65 changed files with 42112 additions and 53 deletions
--- a/venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
+++ b/venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
@@ -0,0 +1,245 @@
+"""
+Code related to text extraction.
+
+Some parts are still in _page.py. In doubt, they will stay there.
+"""
+
+import math
+from typing import Any, Callable, Optional, Union
+
+from .._font import Font
+from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
+
+# Inclusive code-point range that get_display_str() additionally treats as
+# right-to-left. The default -1/-1 adds no extra range; change the values
+# through set_custom_rtl().
+CUSTOM_RTL_MIN: int = -1
+CUSTOM_RTL_MAX: int = -1
+# Extra code points that get_display_str() inserts in the current writing
+# direction, the same way it handles punctuation.
+CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
+# Layout mode: number of space widths the x position has to jump backwards
+# before recurs_to_target_op() starts a new BTGroup.
+LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
+
+
+class OrientationNotFoundError(Exception):
+    """Raised by crlf_space_check() when the text orientation was not requested."""
+
+
+def set_custom_rtl(
+    _min: Union[str, int, None] = None,
+    _max: Union[str, int, None] = None,
+    specials: Union[str, list[int], None] = None,
+) -> tuple[int, int, list[int]]:
+    """
+    Change the Right-To-Left and special characters custom parameters.
+
+    Args:
+        _min: The new minimum value for the range of custom characters that
+            will be written right to left.
+            If set to ``None``, the value will not be changed.
+            If set to an integer, it is used as the code point as-is.
+            If set to a single-character string, it is converted with ``ord()``.
+            The default value is -1, which sets no additional range to be converted.
+        _max: The new maximum value for the range of custom characters that will
+            be written right to left.
+            If set to ``None``, the value will not be changed.
+            If set to an integer, it is used as the code point as-is.
+            If set to a single-character string, it is converted with ``ord()``.
+            The default value is -1, which sets no additional range to be converted.
+        specials: The new list of special characters to be inserted in the
+            current insertion order.
+            If set to ``None``, the current value will not be changed.
+            If set to a string, it will be converted to a list of code points.
+            If set to a list, a copy of that list is stored.
+            The default value is an empty list.
+
+    Returns:
+        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
+        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.
+
+    Raises:
+        TypeError: If ``_min`` or ``_max`` is a string that is not exactly one
+            character long (raised by ``ord()``).
+
+    """
+    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+    if isinstance(_min, int):
+        CUSTOM_RTL_MIN = _min
+    elif isinstance(_min, str):
+        CUSTOM_RTL_MIN = ord(_min)
+    if isinstance(_max, int):
+        CUSTOM_RTL_MAX = _max
+    elif isinstance(_max, str):
+        CUSTOM_RTL_MAX = ord(_max)
+    if isinstance(specials, str):
+        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
+    elif isinstance(specials, list):
+        # Store a copy: keeping a reference would let later changes to the
+        # caller's list silently alter the module-level configuration.
+        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
+    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+
+
+def mult(m: list[float], n: list[float]) -> list[float]:
+    """
+    Multiply two affine transforms given as ``[a, b, c, d, e, f]`` sequences.
+
+    Only indices 0-5 of each argument are read, so any indexable object
+    providing those entries is accepted.
+
+    Returns:
+        The six coefficients of ``m`` followed by ``n``.
+
+    """
+    m_a, m_b, m_c, m_d, m_e, m_f = (m[i] for i in range(6))
+    n_a, n_b, n_c, n_d, n_e, n_f = (n[i] for i in range(6))
+    a = m_a * n_a + m_b * n_c
+    b = m_a * n_b + m_b * n_d
+    c = m_c * n_a + m_d * n_c
+    d = m_c * n_b + m_d * n_d
+    e = m_e * n_a + m_f * n_c + n_e
+    f = m_e * n_b + m_f * n_d + n_f
+    return [a, b, c, d, e, f]
+
+
+def orient(m: list[float]) -> int:
+    """
+    Classify a transform matrix as one of the orientations 0, 90, 180 or 270.
+
+    ``m[3]`` decides between 0 and 180 when its magnitude exceeds 1e-6;
+    otherwise the sign of ``m[1]`` decides between 90 and 270.
+    """
+    d_coeff = m[3]
+    if abs(d_coeff) > 1e-6:
+        return 0 if d_coeff > 0 else 180
+    return 90 if m[1] > 0 else 270
+
+
+def crlf_space_check(
+    text: str,
+    cmtm_prev: tuple[list[float], list[float]],
+    cmtm_matrix: tuple[list[float], list[float]],
+    memo_cmtm: tuple[list[float], list[float]],
+    font_resource: Optional[DictionaryObject],
+    orientations: tuple[int, ...],
+    output: str,
+    font_size: float,
+    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+    str_widths: float,
+    spacewidth: float,
+    str_height: float,
+) -> tuple[str, str, list[float], list[float]]:
+    """
+    Insert a line break or a space when the text position has moved.
+
+    The previous and current positions are obtained by multiplying each text
+    matrix with its current transformation matrix. A move along the height
+    axis larger than 80 % of the smaller text height flushes ``text`` plus a
+    newline into ``output``; a sufficiently large move along the width axis
+    appends a single space to ``text``.
+
+    Args:
+        text: text collected so far and not yet flushed to ``output``.
+        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
+        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
+        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text``.
+        font_resource: font resource handed to ``visitor_text``.
+        orientations: accepted orientations (values returned by ``orient``).
+        output: text already flushed.
+        font_size: current font size.
+        visitor_text: optional callback receiving every flushed chunk.
+        str_widths: width of the previously shown string.
+        spacewidth: width of a space.
+        str_height: height of the previously shown string.
+
+    Returns:
+        ``(text, output, cm_copy, tm_copy)`` where the last two items are
+        copies of the current cm and tm matrices.
+
+    Raises:
+        OrientationNotFoundError: if the current orientation is not listed in
+            ``orientations``.
+
+    """
+    previous_cm, previous_tm = cmtm_prev
+    current_cm, current_tm = cmtm_matrix
+    memo_cm, memo_tm = memo_cmtm
+
+    previous_pos = mult(previous_tm, previous_cm)
+    current_pos = mult(current_tm, current_cm)
+    orientation = orient(current_pos)
+    delta_x = current_pos[4] - previous_pos[4]
+    delta_y = current_pos[5] - previous_pos[5]
+    # Table 108 of the 1.7 reference ("Text positioning operators")
+    scale_prev_x = math.sqrt(previous_tm[0]**2 + previous_tm[1]**2)
+    scale_prev_y = math.sqrt(previous_tm[2]**2 + previous_tm[3]**2)
+    scale_y = math.sqrt(current_tm[2]**2 + current_tm[3]**2)
+
+    if orientation not in orientations:
+        raise OrientationNotFoundError
+    # For 90/270 the page axes are swapped relative to the text axes.
+    if orientation in (0, 180):
+        moved_height: float = delta_y
+        moved_width: float = delta_x
+    elif orientation in (90, 270):
+        moved_height = delta_x
+        moved_width = delta_y
+    try:
+        # Raises IndexError while nothing has been collected yet; the broad
+        # handler below then leaves text and output untouched.
+        last_char = (output + text)[-1]
+        line_threshold = 0.8 * min(str_height * scale_prev_y, font_size * scale_y)
+        if abs(moved_height) > line_threshold:
+            if last_char != "\n":
+                output += text + "\n"
+                if visitor_text is not None:
+                    visitor_text(
+                        text + "\n",
+                        memo_cm,
+                        memo_tm,
+                        font_resource,
+                        font_size,
+                    )
+                text = ""
+        elif (
+            last_char != " "
+            and moved_width >= (spacewidth + str_widths) * scale_prev_x
+        ):
+            text += " "
+    except Exception:
+        pass
+    return text, output, current_cm.copy(), current_tm.copy()
+
+
+def get_text_operands(
+    operands: list[Union[str, TextStringObject]],
+    cm_matrix: list[float],
+    tm_matrix: list[float],
+    font: Font,
+    orientations: tuple[int, ...]
+) -> tuple[str, bool]:
+    """
+    Decode the first operand of a text show operator into a string.
+
+    Args:
+        operands: operands of the operator; only ``operands[0]`` is read.
+        cm_matrix: current transformation matrix.
+        tm_matrix: current text matrix.
+        font: font whose ``encoding`` (codec name or mapping) decodes bytes.
+        orientations: accepted orientations (values returned by ``orient``).
+
+    Returns:
+        ``(text, is_str_operands)``. ``text`` is empty when the orientation is
+        not accepted or there are no operands; the flag is True only when the
+        operand already was a ``str``.
+
+    """
+    orientation = orient(mult(tm_matrix, cm_matrix))
+    if orientation not in orientations or len(operands) == 0:
+        return ("", False)
+
+    operand = operands[0]
+    if isinstance(operand, str):
+        return (operand, True)
+
+    raw: bytes = operand
+    encoding = font.encoding
+    if not isinstance(encoding, str):
+        # Mapping encoding: translate byte by byte, decoding unmapped bytes.
+        decoded = "".join(
+            [encoding[byte] if byte in encoding else bytes((byte,)).decode() for byte in raw]
+        )
+        return (decoded, False)
+
+    try:
+        decoded = raw.decode(encoding, "surrogatepass")  # apply str encoding
+    except Exception:
+        # the data does not match the expectation,
+        # we use the alternative ;
+        # text extraction may not be good
+        fallback = "utf-16-be" if encoding == "charmap" else "charmap"
+        decoded = raw.decode(fallback, "surrogatepass")  # apply str encoding
+    return (decoded, False)
+
+
+def get_display_str(
+    text: str,
+    cm_matrix: list[float],
+    tm_matrix: list[float],
+    font_resource: Optional[DictionaryObject],
+    font: Font,
+    text_operands: str,
+    font_size: float,
+    rtl_dir: bool,
+    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
+) -> tuple[str, bool, float]:
+    """
+    Append ``text_operands`` to ``text`` honouring the writing direction.
+
+    Every character is first mapped through ``font.character_map``. Neutral
+    characters follow the current direction, right-to-left characters are
+    prepended and all others are appended. Whenever the direction changes, the
+    text collected so far is passed to ``visitor_text`` (if given) and a new
+    run is started.
+
+    Returns:
+        ``(text, rtl_dir, widths)``: the current run, the direction in effect
+        after the last character and the summed width of all characters.
+
+    """
+    # "\u0590 - \u08FF \uFB50 - \uFDFF"
+    widths: float = 0.0
+    mapped_chars = [font.character_map.get(raw, raw) for raw in text_operands]
+    for char in mapped_chars:
+        # A mapped value can be several characters long (ex: habibi.pdf); such
+        # a sequence is classified with the placeholder code 1.
+        code = ord(char) if len(char) == 1 else 1
+        # fmt: off
+        keeps_insert_order = (
+            code <= 0x2F                          # up to "/": controls, punctuation
+            or 0x3A <= code <= 0x40               # ":" to "@" (digits 0x30-0x39 excluded)
+            or 0x2000 <= code <= 0x206F           # upper punctuations..
+            or 0x20A0 <= code <= 0x21FF           # but (numbers) indices/exponents
+            or code in CUSTOM_RTL_SPECIAL_CHARS   # customized....
+        )
+        # fmt: on
+        if keeps_insert_order:
+            text = char + text if rtl_dir else text + char
+        else:
+            char_is_rtl = (
+                0x0590 <= code <= 0x08FF
+                or 0xFB1D <= code <= 0xFDFF
+                or 0xFE70 <= code <= 0xFEFF
+                or CUSTOM_RTL_MIN <= code <= CUSTOM_RTL_MAX
+            )
+            if char_is_rtl != bool(rtl_dir):
+                # Direction switch: hand over the finished run, start a new one.
+                rtl_dir = char_is_rtl
+                if visitor_text is not None:
+                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
+                text = ""
+            text = char + text if char_is_rtl else text + char
+        widths += font.space_width if char == " " else font.text_width(char)
+    return text, rtl_dir, widths
--- a/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py
+++ b/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py
@@ -0,0 +1,16 @@
+"""Layout mode text extraction extension for pypdf"""
+from ..._font import Font
+from ._fixed_width_page import (
+    fixed_char_width,
+    fixed_width_page,
+    text_show_operations,
+    y_coordinate_groups,
+)
+
+__all__ = [
+    "Font",
+    "fixed_char_width",
+    "fixed_width_page",
+    "text_show_operations",
+    "y_coordinate_groups",
+]
--- a/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
+++ b/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -0,0 +1,400 @@
+"""Extract PDF text preserving the layout of the source PDF"""
+
+from collections.abc import Iterator
+from itertools import groupby
+from math import ceil
+from pathlib import Path
+from typing import Any, Literal, Optional, TypedDict
+
+from ..._font import Font
+from ..._utils import logger_warning
+from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
+from ._text_state_manager import TextStateManager
+from ._text_state_params import TextStateParams
+
+
+class BTGroup(TypedDict):
+    """
+    Dict describing a line of text rendered within a BT/ET operator pair.
+    If multiple text show operations render text on the same line, the text
+    will be combined into a single BTGroup dict.
+
+    Instances are built by bt_group() from the first text show operation of
+    the line.
+
+    Keys:
+        tx: x coordinate of first character in BTGroup
+        ty: y coordinate of first character in BTGroup
+        font_size: nominal font size
+        font_height: effective font height
+        text: rendered text
+        displaced_tx: x coordinate of last character in BTGroup
+        flip_sort: -1 if page is upside down, else 1
+    """
+
+    tx: float
+    ty: float
+    font_size: float
+    font_height: float
+    text: str
+    displaced_tx: float
+    # multiplied with ty when groups are sorted or keyed by y coordinate
+    flip_sort: Literal[-1, 1]
+
+
+def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
+    """
+    Build a BTGroup dict for one rendered line of text.
+
+    Position and font metrics are copied from ``tj_op``; the text and the end
+    coordinate are supplied by the caller.
+
+    Args:
+        tj_op (TextStateParams): text show operation that starts the line
+        rendered_text (str): rendered text
+        dispaced_tx (float): x coordinate of last character in BTGroup
+
+    """
+    flip_sort: Literal[-1, 1] = -1 if tj_op.flip_vertical else 1
+    group: BTGroup = {
+        "tx": tj_op.tx,
+        "ty": tj_op.ty,
+        "font_size": tj_op.font_size,
+        "font_height": tj_op.font_height,
+        "text": rendered_text,
+        "displaced_tx": dispaced_tx,
+        "flip_sort": flip_sort,
+    }
+    return group
+
+
+def recurs_to_target_op(
+    ops: Iterator[tuple[list[Any], bytes]],
+    text_state_mgr: TextStateManager,
+    end_target: Literal[b"Q", b"ET"],
+    fonts: dict[str, Font],
+    strip_rotated: bool = True,
+) -> tuple[list[BTGroup], list[TextStateParams]]:
+    """
+    Recurse operators between BT/ET and/or q/Q operators managing the transform
+    stack and capturing text positioning and rendering data.
+
+    The iterator ``ops`` is shared with the caller and with nested calls: every
+    operator consumed here is gone for the caller, which resumes after the
+    matching end target.
+
+    Args:
+        ops: iterator of operators in content stream
+        text_state_mgr: a TextStateManager instance
+        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
+        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
+        strip_rotated: if True, text show operations flagged as rotated are
+            left out of the returned BTGroups. Defaults to True.
+
+    Returns:
+        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.
+
+    """
+    # 1 entry per line of text rendered within each BT/ET operation.
+    bt_groups: list[BTGroup] = []
+
+    # 1 entry per text show operator (Tj/TJ/'/")
+    tj_ops: list[TextStateParams] = []
+
+    if end_target == b"Q":
+        # add new q level. cm's added at this level will be popped at next b'Q'
+        text_state_mgr.add_q()
+
+    for operands, op in ops:
+        # The loop is broken by the end target, or exits normally when there are no more ops.
+        if op == end_target:
+            if op == b"Q":
+                text_state_mgr.remove_q()
+            if op == b"ET":
+                # NOTE(review): this early return skips the reset_tm() call at
+                # the end of this branch, so text transforms added inside a
+                # BT/ET pair without any text show operator appear to stay on
+                # the stack — confirm this is intended.
+                if not tj_ops:
+                    return bt_groups, tj_ops
+                _text = ""
+                bt_idx = 0  # idx of first tj in this bt group
+                last_displaced_tx = tj_ops[bt_idx].displaced_tx
+                last_ty = tj_ops[bt_idx].ty
+                for _idx, _tj in enumerate(
+                    tj_ops
+                ):  # ... build text from new Tj operators
+                    if strip_rotated and _tj.rotated:
+                        continue
+                    if not _tj.font.interpretable:  # generates warning
+                        continue
+                    # if the y position of the text is greater than the font height, assume
+                    # the text is on a new line and start a new group
+                    if abs(_tj.ty - last_ty) > _tj.font_height:
+                        if _text.strip():
+                            bt_groups.append(
+                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
+                            )
+                        bt_idx = _idx
+                        _text = ""
+
+                    # if the x position of the text is less than the last x position by
+                    # more than 5 spaces widths, assume the text order should be flipped
+                    # and start a new group
+                    if (
+                        last_displaced_tx - _tj.tx
+                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
+                    ):
+                        if _text.strip():
+                            bt_groups.append(
+                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
+                            )
+                        bt_idx = _idx
+                        last_displaced_tx = _tj.displaced_tx
+                        _text = ""
+
+                    # calculate excess x translation based on ending tx of previous Tj.
+                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
+                    # applied to the first tj of a BTGroup in fixed_width_page().
+                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
+                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
+                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
+                    new_text = f'{" " * spaces}{_tj.txt}'
+
+                    last_ty = _tj.ty
+                    _text = f"{_text}{new_text}"
+                    last_displaced_tx = _tj.displaced_tx
+                # flush the last line; unlike the in-loop flushes above this one
+                # also keeps whitespace-only text (no .strip() check)
+                if _text:
+                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
+                text_state_mgr.reset_tm()
+            break
+        if op == b"q":
+            # nested graphics state: the recursive call consumes ops up to the
+            # matching Q
+            bts, tjs = recurs_to_target_op(
+                ops, text_state_mgr, b"Q", fonts, strip_rotated
+            )
+            bt_groups.extend(bts)
+            tj_ops.extend(tjs)
+        elif op == b"cm":
+            text_state_mgr.add_cm(*operands)
+        elif op == b"BT":
+            # text object: the recursive call consumes ops up to the matching ET
+            bts, tjs = recurs_to_target_op(
+                ops, text_state_mgr, b"ET", fonts, strip_rotated
+            )
+            bt_groups.extend(bts)
+            tj_ops.extend(tjs)
+        elif op == b"Tj":
+            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
+        elif op == b"TJ":
+            # array operand: bytes items are shown as text, every other item is
+            # applied as an offset through a rendering transform
+            _tj = text_state_mgr.text_state_params()
+            for tj_op in operands[0]:
+                if isinstance(tj_op, bytes):
+                    _tj = text_state_mgr.text_state_params(tj_op)
+                    tj_ops.append(_tj)
+                else:
+                    text_state_mgr.add_trm(_tj.displacement_matrix(td_offset=tj_op))
+        elif op == b"'":
+            # move down by the leading TL, then show the text
+            text_state_mgr.reset_trm()
+            text_state_mgr.add_tm([0, -text_state_mgr.TL])
+            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
+        elif op == b'"':
+            # set Tw and Tc from the first two operands, move down by the
+            # leading TL, then show the third operand
+            text_state_mgr.reset_trm()
+            text_state_mgr.set_state_param(b"Tw", operands[0])
+            text_state_mgr.set_state_param(b"Tc", operands[1])
+            text_state_mgr.add_tm([0, -text_state_mgr.TL])
+            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
+        elif op in (b"Td", b"Tm", b"TD", b"T*"):
+            text_state_mgr.reset_trm()
+            if op == b"Tm":
+                # Tm replaces the text transforms instead of adding to them
+                text_state_mgr.reset_tm()
+            elif op == b"TD":
+                # TD also sets the leading to the negated y operand
+                text_state_mgr.set_state_param(b"TL", -operands[1])
+            elif op == b"T*":
+                operands = [0, -text_state_mgr.TL]
+            text_state_mgr.add_tm(operands)
+        elif op == b"Tf":
+            text_state_mgr.set_font(fonts[operands[0]], operands[1])
+        else:  # handle Tc, Tw, Tz, TL, and Ts operators
+            text_state_mgr.set_state_param(op, operands)
+    else:
+        # for/else: only reached when ops ran out before end_target was seen
+        logger_warning(
+            f"Unbalanced target operations, expected {end_target!r}.",
+            __name__,
+        )
+    return bt_groups, tj_ops
+
+
+def y_coordinate_groups(
+    bt_groups: list[BTGroup], debug_path: Optional[Path] = None
+) -> dict[int, list[BTGroup]]:
+    """
+    Group text operations by rendered y coordinate, i.e. the line number.
+
+    Groups are keyed by ``int(ty * flip_sort)`` and each group is sorted by
+    ``tx``. Only consecutive entries with the same key are grouped together
+    (``itertools.groupby``), so ``bt_groups`` has to arrive ordered by y
+    coordinate as produced by text_show_operations(). Neighbouring groups are
+    then merged when their keys differ by less than the smaller font height
+    and they do not start text at the same integer x position.
+
+    Args:
+        bt_groups: list of dicts as returned by text_show_operations()
+        debug_path (Path, optional): Path to a directory for saving debug output.
+
+    Returns:
+        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
+            keyed by y coordinate. Empty if ``bt_groups`` is empty.
+
+    """
+    ty_groups = {
+        ty: sorted(grp, key=lambda x: x["tx"])
+        for ty, grp in groupby(
+            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
+        )
+    }
+    # Nothing to merge for empty input; without this guard next() below would
+    # raise StopIteration.
+    if ty_groups:
+        # combine groups whose y coordinates differ by less than the effective font height
+        # (accounts for mixed fonts and other minor oddities)
+        last_ty = next(iter(ty_groups))
+        last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
+        # iterate over a snapshot of the keys because merged groups are popped
+        for ty in list(ty_groups)[1:]:
+            fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
+            txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
+            # prevent merge if both groups are rendering in the same x position.
+            no_text_overlap = not (txs & last_txs)
+            offset_less_than_font_height = abs(ty - last_ty) < fsz
+            if no_text_overlap and offset_less_than_font_height:
+                ty_groups[last_ty] = sorted(
+                    ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
+                )
+                last_txs |= txs
+            else:
+                last_ty = ty
+                last_txs = txs
+    if debug_path:  # pragma: no cover
+        import json  # noqa: PLC0415
+
+        debug_path.joinpath("bt_groups.json").write_text(
+            json.dumps(ty_groups, indent=2, default=str), "utf-8"
+        )
+    return ty_groups
+
+
+def text_show_operations(
+    ops: Iterator[tuple[list[Any], bytes]],
+    fonts: dict[str, Font],
+    strip_rotated: bool = True,
+    debug_path: Optional[Path] = None,
+) -> list[BTGroup]:
+    """
+    Extract text from BT/ET operator pairs.
+
+    Walks the content stream, delegating every BT and q section to
+    recurs_to_target_op(), then sorts the collected groups by descending y
+    (ascending x within a line) and shifts them so the smallest ``tx`` is 0.
+
+    Args:
+        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
+        fonts (Dict[str, Font]): font dictionary
+        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
+        debug_path (Path, optional): Path to a directory for saving debug output.
+
+    Returns:
+        List[BTGroup]: list of dicts of text rendered by each BT operator
+
+    """
+    manager = TextStateManager()  # transformation stack manager
+    collected_groups: list[BTGroup] = []  # BT operator dict
+    collected_tjs: list[TextStateParams] = []  # Tj/TJ operator data
+    for operands, op in ops:
+        if op == b"BT" or op == b"q":
+            closing_op: Literal[b"Q", b"ET"] = b"ET" if op == b"BT" else b"Q"
+            # the callee consumes ops from the shared iterator up to closing_op
+            new_groups, new_tjs = recurs_to_target_op(
+                ops, manager, closing_op, fonts, strip_rotated
+            )
+            collected_groups.extend(new_groups)
+            collected_tjs.extend(new_tjs)
+        elif op == b"Tf":
+            manager.set_font(fonts[operands[0]], operands[1])
+        else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
+            manager.set_state_param(op, operands)
+
+    if any(tj.rotated for tj in collected_tjs):
+        rotation_message = (
+            "Rotated text discovered. Output will be incomplete."
+            if strip_rotated
+            else "Rotated text discovered. Layout will be degraded."
+        )
+        logger_warning(rotation_message, __name__)
+    if any(not tj.font.interpretable for tj in collected_tjs):
+        logger_warning(
+            "PDF contains an uninterpretable font. Output will be incomplete.", __name__
+        )
+
+    # left align the data, i.e. decrement all tx values by min(tx)
+    leftmost = min((grp["tx"] for grp in collected_groups), default=0.0)
+    ordered = sorted(
+        collected_groups,
+        key=lambda grp: (grp["ty"] * grp["flip_sort"], -grp["tx"]),
+        reverse=True,
+    )
+    aligned: list[BTGroup] = [
+        dict(grp, tx=grp["tx"] - leftmost, displaced_tx=grp["displaced_tx"] - leftmost)  # type: ignore[misc]
+        for grp in ordered
+    ]
+
+    if debug_path:  # pragma: no cover
+        import json  # noqa: PLC0415
+
+        debug_path.joinpath("bts.json").write_text(
+            json.dumps(aligned, indent=2, default=str), "utf-8"
+        )
+        # NOTE(review): getattr() yields a bound ``to_dict`` when the object
+        # has one, and it is then called with ``x`` as an extra positional
+        # argument. If ``to_dict`` only takes ``self`` this raises TypeError —
+        # confirm against TextStateParams.
+        debug_path.joinpath("tjs.json").write_text(
+            json.dumps(
+                collected_tjs, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
+            ),
+            "utf-8",
+        )
+    return aligned
+
+
+def fixed_char_width(bt_groups: list[BTGroup], scale_weight: float = 1.25) -> float:
+    """
+    Calculate average character width weighted by the length of the rendered
+    text in each sample for conversion to fixed-width layout.
+
+    Each sample contributes its width per weighted character
+    (``(displaced_tx - tx) / (len(text) * scale_weight)``) with weight
+    ``len(text) * scale_weight``. Samples whose weight is zero (empty text)
+    carry no width information and are skipped instead of dividing by zero.
+
+    Args:
+        bt_groups (List[BTGroup]): List of dicts of text rendered by each
+            BT operator
+        scale_weight (float): multiplier applied to the text length of every
+            sample. Defaults to 1.25.
+
+    Returns:
+        float: fixed character width
+
+    Raises:
+        ZeroDivisionError: if no sample with a non-zero weight is available
+            (e.g. ``bt_groups`` is empty or ``scale_weight`` is 0).
+
+    """
+    weighted_width_sum = 0.0
+    weight_sum = 0.0
+    for _bt in bt_groups:
+        weight = len(_bt["text"]) * scale_weight
+        if not weight:
+            continue
+        weighted_width_sum += (_bt["displaced_tx"] - _bt["tx"]) / weight * weight
+        weight_sum += weight
+    if not weight_sum:
+        raise ZeroDivisionError(
+            "cannot compute a fixed character width without rendered text"
+        )
+    return weighted_width_sum / weight_sum
+
+
+def fixed_width_page(
+    ty_groups: dict[int, list[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
+) -> str:
+    """
+    Generate page text from text operations grouped by rendered y coordinate.
+
+    Every group becomes one output line. Each entry is placed at column
+    ``tx // char_width``, padded with spaces when it starts to the right of
+    the text already written. Code points 14-31 are replaced by spaces.
+
+    Args:
+        ty_groups: dict of text show ops as returned by y_coordinate_groups()
+        char_width: fixed character width
+        space_vertically: include blank lines inferred from y distance + font height.
+        font_height_weight: multiplier for font height when calculating blank lines.
+
+    Returns:
+        str: page text in a fixed width format that closely adheres to the rendered
+            layout in the source pdf.
+
+    """
+    control_to_space = str.maketrans({code: " " for code in range(14, 32)})
+    rendered: list[str] = []
+    prev_y = 0
+    for y_coord, groups in ty_groups.items():
+        if space_vertically and rendered:
+            font_height = groups[0]["font_height"]
+            if font_height != 0:
+                # one blank line per (weighted) font height of vertical gap,
+                # not counting the line that is about to be written
+                gap = int(abs(y_coord - prev_y) / (font_height * font_height_weight)) - 1
+                rendered.extend([""] * gap)
+
+        pieces: list[str] = []
+        written = 0  # number of characters already placed on this line
+        prev_end = 0.0
+        for group in groups:
+            start_x = group["tx"]
+            pad = int(start_x // char_width) - written
+            # pad only when this text really starts right of the previous one
+            if pad > 0 and ceil(prev_end) < int(start_x):
+                pieces.append(" " * pad)
+                written += pad
+
+            chunk = group["text"].translate(control_to_space)
+            pieces.append(chunk)
+            written += len(chunk)
+            prev_end = group["displaced_tx"]
+
+        line = "".join(pieces).rstrip()
+        # blank lines are kept only in vertical mode and never as first line
+        if line.strip() or (space_vertically and rendered):
+            rendered.append(line)
+
+        prev_y = y_coord
+
+    return "\n".join(rendered)
--- a/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py
+++ b/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py
@@ -0,0 +1,221 @@
+"""manage the PDF transform stack during "layout" mode text extraction"""
+
+from collections import ChainMap, Counter
+from collections import ChainMap as ChainMapType
+from collections import Counter as CounterType
+from collections.abc import MutableMapping
+from typing import Any, Union
+
+from ..._font import Font
+from ...errors import PdfReadError
+from .. import mult
+from ._text_state_params import TextStateParams
+
+# Stack of transform dicts: the int keys 0-5 hold the a-f matrix values, the
+# str keys "is_text" / "is_render" hold bool flags (see new_transform()).
+TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
+# A single transform dict of that stack.
+TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
+
+
+class TextStateManager:
+    """
+    Tracks the current text state including cm/tm/trm transformation matrices.
+
+    Transforms are kept as a ChainMap whose ``maps`` list acts as a stack: the
+    newest transform is ``maps[0]``. Every entry carries the flags ``is_text``
+    (added by a text positioning operator) and ``is_render`` (added while
+    rendering text) so that these layers can be dropped selectively.
+
+    Attributes:
+        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices
+        q_queue (Counter[int]): Counter of cm operators added per q level
+        q_depth (List[int]): list of q operator nesting levels
+        Tc (float): character spacing
+        Tw (float): word spacing
+        Tz (int): horizontal scaling
+        TL (float): leading
+        Ts (float): text rise
+        font_stack (list): (font, font_size) pairs saved by add_q()
+        font (Font): font object
+        font_size (int | float): font size
+
+    """
+
+    def __init__(self) -> None:
+        self.transform_stack: TextStateManagerChainMapType = ChainMap(
+            self.new_transform()
+        )
+        self.q_queue: CounterType[int] = Counter()
+        self.q_depth = [0]
+        self.Tc: float = 0.0
+        self.Tw: float = 0.0
+        self.Tz: float = 100.0
+        self.TL: float = 0.0
+        self.Ts: float = 0.0
+        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = []
+        self.font: Union[Font, None] = None
+        self.font_size: Union[int, float] = 0
+
+    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None:
+        """
+        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.
+
+        Args:
+            op: operator read from PDF stream as bytes. No action is taken
+                for unsupported operators (see supported operators above).
+            value (float | List[Any]): new parameter value. If a list,
+                value[0] is used.
+
+        """
+        if op not in (b"Tc", b"Tz", b"Tw", b"TL", b"Ts"):
+            return
+        # the attribute carries the same name as the operator
+        setattr(self, op.decode(), value[0] if isinstance(value, list) else value)
+
+    def set_font(self, font: Font, size: float) -> None:
+        """
+        Set the current font and font_size.
+
+        Args:
+            font (Font): a layout mode Font
+            size (float): font size
+
+        """
+        self.font = font
+        self.font_size = size
+
+    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
+        """
+        Create a TextStateParams instance to display a text string. Type[bytes] values
+        will be decoded implicitly.
+
+        Bytes are decoded with the font encoding (codec name or mapping),
+        falling back to UTF-8 with replacement characters on Unicode errors,
+        and are then mapped through the font's ``character_map``. ``str``
+        values are used unchanged.
+
+        Args:
+            value (str | bytes): text to associate with the captured state.
+
+        Raises:
+            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)
+
+        Returns:
+            TextStateParams: current text state parameters
+
+        """
+        if not isinstance(self.font, Font):
+            raise PdfReadError(
+                "font not set: is PDF missing a Tf operator?"
+            )  # pragma: no cover
+        if isinstance(value, bytes):
+            try:
+                if isinstance(self.font.encoding, str):
+                    txt = value.decode(self.font.encoding, "surrogatepass")
+                else:
+                    txt = "".join(
+                        self.font.encoding[x]
+                        if x in self.font.encoding
+                        else bytes((x,)).decode()
+                        for x in value
+                    )
+            except (UnicodeEncodeError, UnicodeDecodeError):
+                txt = value.decode("utf-8", "replace")
+            txt = "".join(
+                self.font.character_map.get(x, x) for x in txt
+            )
+        else:
+            txt = value
+        return TextStateParams(
+            txt,
+            self.font,
+            self.font_size,
+            self.Tc,
+            self.Tw,
+            self.Tz,
+            self.TL,
+            self.Ts,
+            self.effective_transform,
+        )
+
+    @staticmethod
+    def raw_transform(
+        _a: float = 1.0,
+        _b: float = 0.0,
+        _c: float = 0.0,
+        _d: float = 1.0,
+        _e: float = 0.0,
+        _f: float = 0.0,
+    ) -> dict[int, float]:
+        """Only a/b/c/d/e/f matrix params"""
+        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))
+
+    @staticmethod
+    def new_transform(
+        _a: float = 1.0,
+        _b: float = 0.0,
+        _c: float = 0.0,
+        _d: float = 1.0,
+        _e: float = 0.0,
+        _f: float = 0.0,
+        is_text: bool = False,
+        is_render: bool = False,
+    ) -> TextStateManagerDictType:
+        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
+        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
+        result.update({"is_text": is_text, "is_render": is_render})
+        return result
+
+    def reset_tm(self) -> TextStateManagerChainMapType:
+        """Clear all transforms from chainmap having is_text==True or is_render==True"""
+        while (
+            self.transform_stack.maps[0]["is_text"]
+            or self.transform_stack.maps[0]["is_render"]
+        ):
+            self.transform_stack = self.transform_stack.parents
+        return self.transform_stack
+
+    def reset_trm(self) -> TextStateManagerChainMapType:
+        """Clear all transforms from chainmap having is_render==True"""
+        while self.transform_stack.maps[0]["is_render"]:
+            self.transform_stack = self.transform_stack.parents
+        return self.transform_stack
+
+    def remove_q(self) -> TextStateManagerChainMapType:
+        """Rewind to stack prior state after closing a 'q' with internal 'cm' ops"""
+        # restore the font that was active when the matching add_q() ran
+        self.font, self.font_size = self.font_stack.pop(-1)
+        self.transform_stack = self.reset_tm()
+        # drop as many cm transforms as were added at the level being closed
+        self.transform_stack.maps = self.transform_stack.maps[
+            self.q_queue.pop(self.q_depth.pop(), 0) :
+        ]
+        return self.transform_stack
+
+    def add_q(self) -> None:
+        """Add another level to q_queue"""
+        self.font_stack.append((self.font, self.font_size))
+        self.q_depth.append(len(self.q_depth))
+
+    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
+        """Concatenate an additional transform matrix"""
+        self.transform_stack = self.reset_tm()
+        # count this cm against the current q level so remove_q() can drop it
+        self.q_queue.update(self.q_depth[-1:])
+        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
+        return self.transform_stack
+
+    def _complete_matrix(self, operands: list[float]) -> list[float]:
+        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
+        if len(operands) == 2:  # this is a Td operator or equivalent
+            operands = [1.0, 0.0, 0.0, 1.0, *operands]
+        return operands
+
+    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType:
+        """Append a text transform matrix"""
+        self.transform_stack = self.transform_stack.new_child(
+            self.new_transform(  # type: ignore[misc]
+                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
+            )
+        )
+        return self.transform_stack
+
+    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType:
+        """Append a text rendering transform matrix"""
+        self.transform_stack = self.transform_stack.new_child(
+            self.new_transform(  # type: ignore[misc]
+                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
+            )
+        )
+        return self.transform_stack
+
+    @property
+    def effective_transform(self) -> list[float]:
+        """
+        Current effective transform accounting for cm, tm, and trm transforms
+
+        Always returns the six matrix values a-f. The ``is_text`` and
+        ``is_render`` flags stored next to them are left out, also when the
+        stack holds a single transform.
+        """
+        newest = self.transform_stack.maps[0]
+        eff_transform = [newest[idx] for idx in range(6)]
+        for transform in self.transform_stack.maps[1:]:
+            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
+        return eff_transform
--- a/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py
+++ b/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py
@@ -0,0 +1,135 @@
+"""A dataclass that captures the CTM and Text State for a tj operation"""
+
+import math
+from dataclasses import dataclass, field
+from typing import Any, Union
+
+from ..._font import Font
+from .. import mult, orient
+
+
+@dataclass
+class TextStateParams:
+    """
+    Text state parameters and operator values for one text value of a
+    TJ or Tj PDF operation.
+
+    The first nine attributes are constructor arguments; the remaining ones
+    are derived in ``__post_init__`` and cannot be passed to ``__init__``.
+
+    Attributes:
+        txt (str): the text to be rendered.
+        font (Font): font object
+        font_size (int | float): font size
+        Tc (float): character spacing. Defaults to 0.0.
+        Tw (float): word spacing. Defaults to 0.0.
+        Tz (float): horizontal scaling. Defaults to 100.0.
+        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
+        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
+        transform (List[float]): effective transformation matrix. Defaults to
+            the identity matrix.
+        tx (float): x coord of rendered text, i.e. self.transform[4]
+        ty (float): y coord of rendered text, i.e. self.render_transform()[5].
+            May differ from self.transform[5] per self.Ts.
+        displaced_tx (float): x coord immediately following rendered text
+        space_tx (float): tx for a space character, rounded to 3 decimals
+        font_height (float): effective font height accounting for CTM
+        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
+        rotated (bool): True if the text orientation is rotated with respect to the page.
+
+    """
+
+    txt: str
+    font: Font
+    font_size: Union[int, float]
+    Tc: float = 0.0
+    Tw: float = 0.0
+    Tz: float = 100.0
+    TL: float = 0.0
+    Ts: float = 0.0
+    transform: list[float] = field(
+        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+    )
+    tx: float = field(default=0.0, init=False)
+    ty: float = field(default=0.0, init=False)
+    displaced_tx: float = field(default=0.0, init=False)
+    space_tx: float = field(default=0.0, init=False)
+    font_height: float = field(default=0.0, init=False)
+    flip_vertical: bool = field(default=False, init=False)
+    rotated: bool = field(default=False, init=False)
+
+    def __post_init__(self) -> None:
+        """Normalise a rotated transform, then compute the derived attributes."""
+        if orient(self.transform) in (90, 270):
+            counter_matrix = [
+                1.0,
+                -self.transform[1],
+                -self.transform[2],
+                1.0,
+                0.0,
+                0.0,
+            ]
+            self.transform = mult(counter_matrix, self.transform)
+            self.rotated = True
+        # A 180 degree orientation together with transform[0] < 0 indicates a
+        # true rotation. If only transform[3] < 0, the y coords are simply
+        # inverted (see flip_vertical below).
+        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
+            half_turn = [-1.0, 0.0, 0.0, -1.0, 0.0, 0.0]
+            self.transform = mult(half_turn, self.transform)
+            self.rotated = True
+
+        self.tx = self.transform[4]
+        self.displaced_tx = self.displaced_transform()[4]
+        self.ty = self.render_transform()[5]
+
+        space_advance = round(self.word_tx(" "), 3)
+        if space_advance < 1e-6:
+            # The " " char may be assigned 0 width (e.g. for fine tuned
+            # spacing with TJ int operators a la crazyones.pdf). In that case
+            # derive space_tx from a td_offset of -1 * font.space_width, where
+            # font.space_width is the space_width calculated in _font.py.
+            space_advance = round(self.word_tx("", -self.font.space_width), 3)
+        self.space_tx = space_advance
+
+        skew_y, scale_y = self.transform[1], self.transform[3]
+        self.font_height = self.font_size * math.sqrt(skew_y ** 2 + scale_y ** 2)
+        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
+        self.flip_vertical = scale_y < -1e-6  # inverts y axis
+
+    def font_size_matrix(self) -> list[float]:
+        """Font size matrix: font size scaled horizontally by Tz, with Ts as the y offset."""
+        horizontal_size = self.font_size * (self.Tz / 100.0)
+        return [horizontal_size, 0.0, 0.0, self.font_size, 0.0, self.Ts]
+
+    def displaced_transform(self) -> list[float]:
+        """Effective transform matrix after text has been rendered."""
+        displacement = self.displacement_matrix()
+        return mult(displacement, self.transform)
+
+    def render_transform(self) -> list[float]:
+        """Effective transform matrix accounting for font size, Tz, and Ts."""
+        size_matrix = self.font_size_matrix()
+        return mult(size_matrix, self.transform)
+
+    def displacement_matrix(
+        self, word: Union[str, None] = None, td_offset: float = 0.0
+    ) -> list[float]:
+        """
+        Text displacement matrix
+
+        Args:
+            word (str, optional): Defaults to None in which case self.txt displacement is
+                returned.
+            td_offset (float, optional): translation applied by TD operator. Defaults to 0.0.
+
+        Returns:
+            A pure translation matrix whose x component is
+            ``self.word_tx(word, td_offset)``.
+
+        """
+        if word is None:
+            word = self.txt
+        shift_x = self.word_tx(word, td_offset)
+        return [1.0, 0.0, 0.0, 1.0, shift_x, 0.0]
+
+    def word_tx(self, word: str, td_offset: float = 0.0) -> float:
+        """
+        Horizontal text displacement for any word according to this text state.
+
+        Args:
+            word: the text to measure.
+            td_offset: offset subtracted from the summed glyph widths before
+                they are divided by 1000 and scaled by the font size.
+
+        Returns:
+            The scaled width plus ``Tc`` plus ``Tw`` per space character,
+            all multiplied by ``Tz / 100``.
+
+        """
+        glyph_units: float = 0.0
+        for glyph in word:
+            # spaces use the font's dedicated space width
+            glyph_units += (
+                self.font.space_width if glyph == " " else self.font.text_width(glyph)
+            )
+        scaled_width = self.font_size * ((glyph_units - td_offset) / 1000.0)
+        word_spacing = word.count(" ") * self.Tw
+        return (scaled_width + self.Tc + word_spacing) * (self.Tz / 100.0)
+
+    @staticmethod
+    def to_dict(inst: "TextStateParams") -> dict[str, Any]:
+        """Dataclass to dict for json.dumps serialization (the font is left out)."""
+        serialised: dict[str, Any] = {}
+        for field_name in inst.__dataclass_fields__:
+            if field_name == "font":
+                continue
+            serialised[field_name] = getattr(inst, field_name)
+        return serialised
--- a/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_text_extractor.py
+++ b/venv/lib/python3.12/site-packages/pypdf/_text_extraction/_text_extractor.py
@@ -0,0 +1,351 @@
+# Copyright (c) 2006, Mathieu Fenniak
+# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import math
+from typing import Any, Callable, Optional, Union
+
+from .._font import Font, FontDescriptor
+from ..generic import DictionaryObject, TextStringObject
+from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult
+
+
+class TextExtraction:
+    """
+    A class to handle PDF text extraction operations.
+
+    This class encapsulates all the state and operations needed for extracting
+    text from PDF content streams, replacing the nested functions and nonlocal
+    variables in the original implementation.
+
+    ``process_operation`` dispatches each content-stream operator to its
+    ``_handle_*`` method through ``self.operation_handlers``. Finished text is
+    collected in ``self.output``; ``self.text`` holds the run that is still
+    being built.
+    """
+
+    # Operators after which ``_post_process_text_operation`` is run.
+    _POSITIONING_OPERATORS = frozenset({b"Td", b"Tm", b"T*", b"Tj"})
+
+    def __init__(self) -> None:
+        self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {}
+
+        # Text extraction state variables
+        self.cm_matrix: list[float] = self._identity_matrix()
+        self.tm_matrix: list[float] = self._identity_matrix()
+        # Graphics states saved by ``q`` and restored by ``Q``.
+        self.cm_stack: list[
+            tuple[
+                list[float],
+                Optional[DictionaryObject],
+                Font,
+                float,
+                float,
+                float,
+                float,
+            ]
+        ] = []
+
+        # Store the last modified matrices; can be an intermediate position
+        self.cm_prev: list[float] = self._identity_matrix()
+        self.tm_prev: list[float] = self._identity_matrix()
+
+        # Store the position at the beginning of building the text
+        self.memo_cm: list[float] = self._identity_matrix()
+        self.memo_tm: list[float] = self._identity_matrix()
+
+        self.char_scale = 1.0
+        self.space_scale = 1.0
+        self._space_width: float = 500.0  # will be set correctly at first Tf
+        self._actual_str_size: dict[str, float] = {
+            "str_widths": 0.0,
+            "str_height": 0.0,
+        }  # will be set to string length calculation result
+        self.TL = 0.0
+        self.font_size = 12.0  # init just in case of
+
+        # Text extraction variables
+        self.text: str = ""
+        self.output: str = ""
+        self.rtl_dir: bool = False  # right-to-left
+        self.font_resource: Optional[DictionaryObject] = None
+        self.font = Font(
+            name="NotInitialized",
+            sub_type="Unknown",
+            encoding="charmap",
+            font_descriptor=FontDescriptor(),
+        )
+        self.orientations: tuple[int, ...] = (0, 90, 180, 270)
+        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
+        self.font_resources: dict[str, DictionaryObject] = {}
+        self.fonts: dict[str, Font] = {}
+
+        # Operator -> handler dispatch table used by ``process_operation``.
+        self.operation_handlers = {
+            b"BT": self._handle_bt,
+            b"ET": self._handle_et,
+            b"q": self._handle_save_graphics_state,
+            b"Q": self._handle_restore_graphics_state,
+            b"cm": self._handle_cm,
+            b"Tz": self._handle_tz,
+            b"Tw": self._handle_tw,
+            b"TL": self._handle_tl,
+            b"Tf": self._handle_tf,
+            b"Td": self._handle_td,
+            b"Tm": self._handle_tm,
+            b"T*": self._handle_t_star,
+            b"Tj": self._handle_tj_operation,
+        }
+
+    @staticmethod
+    def _identity_matrix() -> list[float]:
+        """Return a fresh identity matrix ``[1, 0, 0, 1, 0, 0]``."""
+        return [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
+    def initialize_extraction(
+        self,
+        orientations: tuple[int, ...] = (0, 90, 180, 270),
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+        font_resources: Optional[dict[str, DictionaryObject]] = None,
+        fonts: Optional[dict[str, Font]] = None
+    ) -> None:
+        """
+        Initialize the extractor with extraction parameters.
+
+        Args:
+            orientations: text orientations to consider.
+            visitor_text: optional callback invoked with
+                ``(text, cm_matrix, tm_matrix, font_resource, font_size)``
+                each time accumulated text is flushed.
+            font_resources: font resource dictionaries keyed by font name;
+                ``None`` is replaced by an empty dict.
+            fonts: ``Font`` objects keyed by font name; ``None`` is replaced
+                by an empty dict.
+
+        Only ``text``, ``output`` and ``rtl_dir`` are reset here; matrices,
+        the graphics-state stack and the font state are left untouched.
+        """
+        self.orientations = orientations
+        self.visitor_text = visitor_text
+        self.font_resources = font_resources or {}
+        self.fonts = fonts or {}
+
+        # Reset state
+        self.text = ""
+        self.output = ""
+        self.rtl_dir = False
+
+    def compute_str_widths(self, str_widths: float) -> float:
+        """Return ``str_widths`` divided by 1000."""
+        return str_widths / 1000
+
+    def process_operation(self, operator: bytes, operands: list[Any]) -> None:
+        """
+        Dispatch one content-stream operation to its handler.
+
+        Operators without an entry in ``self.operation_handlers`` are ignored.
+        After Td, Tm, T* and Tj the text position is re-evaluated with the
+        width returned by the handler (0.0 when it returned nothing).
+        """
+        handler = self.operation_handlers.get(operator)
+        if handler is None:
+            return
+        str_widths = handler(operands)
+
+        # Post-process operations that affect text positioning
+        if operator in self._POSITIONING_OPERATORS:
+            self._post_process_text_operation(str_widths or 0.0)
+
+    def _post_process_text_operation(self, str_widths: float) -> None:
+        """
+        Handle common post-processing for text positioning operations.
+
+        Delegates to ``crlf_space_check`` and stores the text, output and
+        previous matrices it returns. If the text is empty afterwards, the
+        current matrices are recorded as the start of the next text run.
+        An ``OrientationNotFoundError`` is ignored and leaves the state as is.
+        """
+        try:
+            self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
+                self.text,
+                (self.cm_prev, self.tm_prev),
+                (self.cm_matrix, self.tm_matrix),
+                (self.memo_cm, self.memo_tm),
+                self.font_resource,
+                self.orientations,
+                self.output,
+                self.font_size,
+                self.visitor_text,
+                str_widths,
+                self.compute_str_widths(self.font_size * self._space_width),
+                self._actual_str_size["str_height"],
+            )
+            if self.text == "":
+                self._mark_text_start()
+        except OrientationNotFoundError:
+            pass
+
+    def _handle_tj(
+        self,
+        text: str,
+        operands: list[Union[str, TextStringObject]],
+        cm_matrix: list[float],
+        tm_matrix: list[float],
+        font_resource: Optional[DictionaryObject],
+        font: Font,
+        orientations: tuple[int, ...],
+        font_size: float,
+        rtl_dir: bool,
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+        actual_str_size: dict[str, float],
+    ) -> tuple[str, bool, dict[str, float]]:
+        """
+        Append the text shown by a Tj operation to ``text``.
+
+        Returns:
+            ``(text, rtl_dir, actual_str_size)``. ``actual_str_size`` is the
+            dict that was passed in, updated in place: ``"str_widths"`` grows
+            by the width of the new text times ``font_size`` and
+            ``"str_height"`` is set to ``font_size``.
+        """
+        text_operands, is_str_operands = get_text_operands(
+            operands, cm_matrix, tm_matrix, font, orientations
+        )
+        if is_str_operands:
+            text += text_operands
+            font_widths = sum(
+                font.space_width if char == " " else font.text_width(char)
+                for char in text_operands
+            )
+        else:
+            text, rtl_dir, font_widths = get_display_str(
+                text,
+                cm_matrix,
+                tm_matrix,  # text matrix
+                font_resource,
+                font,
+                text_operands,
+                font_size,
+                rtl_dir,
+                visitor_text,
+            )
+        actual_str_size["str_widths"] += font_widths * font_size
+        actual_str_size["str_height"] = font_size
+        return text, rtl_dir, actual_str_size
+
+    def _mark_text_start(self) -> None:
+        """Record copies of the current matrices as the start of the next text run."""
+        self.memo_cm = self.cm_matrix.copy()
+        self.memo_tm = self.tm_matrix.copy()
+
+    def _flush_text(self) -> None:
+        """Flush accumulated text to output and call visitor if present."""
+        self.output += self.text
+        if self.visitor_text is not None:
+            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
+        self.text = ""
+        self._mark_text_start()
+
+    # Operation handlers
+
+    def _handle_bt(self, operands: list[Any]) -> None:
+        """Handle BT (Begin Text) operation - Table 5.4 page 405."""
+        self.tm_matrix = self._identity_matrix()
+        self._flush_text()
+
+    def _handle_et(self, operands: list[Any]) -> None:
+        """Handle ET (End Text) operation - Table 5.4 page 405."""
+        self._flush_text()
+
+    def _handle_save_graphics_state(self, operands: list[Any]) -> None:
+        """Handle q (Save graphics state) operation - Table 4.7 page 219."""
+        self.cm_stack.append(
+            (
+                self.cm_matrix,
+                self.font_resource,
+                self.font,
+                self.font_size,
+                self.char_scale,
+                self.space_scale,
+                self.TL,
+            )
+        )
+
+    def _handle_restore_graphics_state(self, operands: list[Any]) -> None:
+        """
+        Handle Q (Restore graphics state) operation - Table 4.7 page 219.
+
+        A Q without a matching q (empty stack) resets the current matrix to
+        the identity and leaves the remaining state unchanged.
+        """
+        try:
+            saved_state = self.cm_stack.pop()
+        except IndexError:  # unbalanced Q: nothing was saved
+            self.cm_matrix = self._identity_matrix()
+            return
+        (
+            self.cm_matrix,
+            self.font_resource,
+            self.font,
+            self.font_size,
+            self.char_scale,
+            self.space_scale,
+            self.TL,
+        ) = saved_state
+
+    def _handle_cm(self, operands: list[Any]) -> None:
+        """
+        Handle cm (Modify current matrix) operation - Table 4.7 page 219.
+
+        Pending text is flushed first so that it is reported with the
+        matrices it was built under.
+        """
+        self._flush_text()
+        try:
+            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
+        except Exception:
+            # Deliberate best effort: unusable operands fall back to identity.
+            self.cm_matrix = self._identity_matrix()
+        # The flush recorded the old matrix; record the new one instead.
+        self._mark_text_start()
+
+    def _handle_tz(self, operands: list[Any]) -> None:
+        """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
+        self.char_scale = float(operands[0]) / 100 if operands else 1.0
+
+    def _handle_tw(self, operands: list[Any]) -> None:
+        """Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
+        self.space_scale = 1.0 + float(operands[0] if operands else 0.0)
+
+    def _handle_tl(self, operands: list[Any]) -> None:
+        """
+        Handle TL (Set Text Leading) operation - Table 5.2 page 398.
+
+        The operand (0.0 when missing) is multiplied by the font size and by
+        a scale factor derived from ``tm_matrix[0]`` and ``tm_matrix[2]``.
+        """
+        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
+        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x
+
+    def _handle_tf(self, operands: list[Any]) -> None:
+        """
+        Handle Tf (Set font size) operation - Table 5.2 page 398.
+
+        Non-empty pending text is flushed before the font changes. A font
+        name that is missing, unhashable or not found in both
+        ``font_resources`` and ``fonts`` selects a placeholder "Unknown"
+        font. An unreadable size operand keeps the previous font size.
+        """
+        if self.text != "":
+            self._flush_text()
+        else:
+            # Nothing to report to the visitor; only record the run start.
+            self._mark_text_start()
+        try:
+            font_name = operands[0]
+            self.font_resource = self.font_resources[font_name]
+            self.font = self.fonts[font_name]
+        except (KeyError, IndexError, TypeError):  # font not found / no usable operand
+            self.font_resource = None
+            font_descriptor = FontDescriptor()
+            self.font = Font(
+                "Unknown",
+                space_width=250,
+                # Every byte maps to U+FFFD REPLACEMENT CHARACTER.
+                encoding=dict.fromkeys(range(256), "\ufffd"),
+                font_descriptor=font_descriptor,
+                character_map={},
+                character_widths=font_descriptor.character_widths,
+            )
+
+        self._space_width = self.font.space_width / 2  # Actually the width of _half_ a space...
+        try:
+            self.font_size = float(operands[1])
+        except Exception:
+            pass  # keep previous size
+
+    def _take_pending_str_widths(self) -> float:
+        """Return the accumulated string width (scaled by 1/1000) and reset it to 0."""
+        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
+        self._actual_str_size["str_widths"] = 0.0
+        return str_widths
+
+    def _handle_td(self, operands: list[Any]) -> float:
+        """Handle Td (Move text position) operation - Table 5.5 page 406."""
+        # A special case is a translating only tm:
+        # tm = [1, 0, 0, 1, e, f]
+        # i.e. tm[4] += tx, tm[5] += ty.
+        tx, ty = float(operands[0]), float(operands[1])
+        self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
+        self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
+        return self._take_pending_str_widths()
+
+    def _handle_tm(self, operands: list[Any]) -> float:
+        """Handle Tm (Set text matrix) operation - Table 5.5 page 406."""
+        self.tm_matrix = [float(operand) for operand in operands[:6]]
+        return self._take_pending_str_widths()
+
+    def _handle_t_star(self, operands: list[Any]) -> float:
+        """Handle T* (Move to next line) operation - Table 5.5 page 406."""
+        self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
+        self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
+        return self._take_pending_str_widths()
+
+    def _handle_tj_operation(self, operands: list[Any]) -> float:
+        """Handle Tj (Show text) operation - Table 5.5 page 406."""
+        self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
+            self.text,
+            operands,
+            self.cm_matrix,
+            self.tm_matrix,
+            self.font_resource,
+            self.font,
+            self.orientations,
+            self.font_size,
+            self.rtl_dir,
+            self.visitor_text,
+            self._actual_str_size,
+        )
+        return 0.0  # str_widths will be handled in post-processing