Update ashboard, dashboard, memory +1 more (+2 ~3)

This commit is contained in:
Echo
2026-02-02 22:27:24 +00:00
parent 4f00131184
commit b0c9b254f1
65 changed files with 42112 additions and 53 deletions

View File

@@ -0,0 +1,245 @@
"""
Code related to text extraction.
Some parts are still in _page.py. In doubt, they will stay there.
"""
import math
from typing import Any, Callable, Optional, Union
from .._font import Font
from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
# Inclusive code point range that get_display_str() additionally treats as
# right-to-left. With both bounds at -1 no code point can fall inside it.
# Changed through set_custom_rtl().
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Extra code points that get_display_str() inserts in the current writing
# direction (the same handling as punctuation). Changed through set_custom_rtl().
CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
# Layout mode: a text show operation that starts further left than the end of
# the previous one by more than this many space widths starts a new BT group.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
class OrientationNotFoundError(Exception):
    """Raised when text has an orientation outside the requested orientations."""
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, list[int], None] = None,
) -> tuple[int, int, list[int]]:
    """
    Update the custom right-to-left range and the special character list.

    Args:
        _min: Lower bound of the extra range of characters written right to
            left. An integer is stored as is, a one-character string is
            stored as its code point (``ord``). ``None`` keeps the current
            value. The module default of -1 defines no extra range.
        _max: Upper bound of the extra range, with the same conversion rules
            as ``_min``. ``None`` keeps the current value.
        specials: Characters that are inserted in the current insertion
            order. A list of code points is stored as is (not copied), a
            string is converted to the list of its code points. ``None``
            keeps the current value. The module default is an empty list.

    Returns:
        The resulting ``CUSTOM_RTL_MIN``, ``CUSTOM_RTL_MAX`` and
        ``CUSTOM_RTL_SPECIAL_CHARS`` values.

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

    if isinstance(_min, (int, str)):
        CUSTOM_RTL_MIN = _min if isinstance(_min, int) else ord(_min)

    if isinstance(_max, (int, str)):
        CUSTOM_RTL_MAX = _max if isinstance(_max, int) else ord(_max)

    if isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    elif isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(char) for char in specials]

    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Multiply two 2D affine transforms given as ``[a, b, c, d, e, f]``.

    Only the entries at positions 0 to 5 of each operand are read, and they
    are read by index, so operands may carry additional entries.

    Args:
        m: Left operand.
        n: Right operand.

    Returns:
        The six coefficients of the product ``m x n``.

    """
    m_a, m_b, m_c, m_d, m_e, m_f = m[0], m[1], m[2], m[3], m[4], m[5]
    n_a, n_b, n_c, n_d, n_e, n_f = n[0], n[1], n[2], n[3], n[4], n[5]
    linear_part = [
        m_a * n_a + m_b * n_c,
        m_a * n_b + m_b * n_d,
        m_c * n_a + m_d * n_c,
        m_c * n_b + m_d * n_d,
    ]
    translation_part = [
        m_e * n_a + m_f * n_c + n_e,
        m_e * n_b + m_f * n_d + n_f,
    ]
    return linear_part + translation_part
def orient(m: list[float]) -> int:
    """
    Classify the orientation encoded in a transformation matrix.

    Args:
        m: Transformation matrix as ``[a, b, c, d, e, f]``.

    Returns:
        0 or 180 when ``m[3]`` is clearly positive or negative (magnitude
        above 1e-6); otherwise 90 when ``m[1]`` is positive, else 270.

    """
    vertical_scale = m[3]
    if abs(vertical_scale) > 1e-6:
        return 0 if vertical_scale > 0 else 180
    return 90 if m[1] > 0 else 270
def crlf_space_check(
    text: str,
    cmtm_prev: tuple[list[float], list[float]],
    cmtm_matrix: tuple[list[float], list[float]],
    memo_cmtm: tuple[list[float], list[float]],
    font_resource: Optional[DictionaryObject],
    orientations: tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> tuple[str, str, list[float], list[float]]:
    """
    Insert a line break or a space when the text position has moved enough.

    The previous and current positions are obtained by combining each text
    matrix with its current transformation matrix. A move along the line
    height larger than 80 % of the smaller of the scaled string height and
    scaled font size flushes ``text`` into ``output`` followed by a newline.
    Otherwise, a move along the writing direction of at least
    ``spacewidth + str_widths`` (scaled) appends a space to ``text``.

    Args:
        text: Text collected since the last flush.
        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text``.
        font_resource: Font dictionary handed to ``visitor_text``.
        orientations: Orientations (in degrees) that are accepted.
        output: Text that has already been flushed.
        font_size: Current font size.
        visitor_text: Optional callback receiving each flushed line.
        str_widths: Width term added to ``spacewidth`` for the space test.
        spacewidth: Width of a space.
        str_height: Height term used for the line break test.

    Returns:
        The updated ``text`` and ``output`` plus copies of the current cm
        and tm matrices.

    Raises:
        OrientationNotFoundError: If the current orientation is not part of
            ``orientations``.

    """
    cm_before, tm_before = cmtm_prev[0], cmtm_prev[1]
    cm_now, tm_now = cmtm_matrix[0], cmtm_matrix[1]
    memo_cm, memo_tm = memo_cmtm[0], memo_cmtm[1]

    previous_position = mult(tm_before, cm_before)
    current_position = mult(tm_now, cm_now)
    orientation = orient(current_position)
    delta_x = current_position[4] - previous_position[4]
    delta_y = current_position[5] - previous_position[5]

    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_before[0]**2 + tm_before[1]**2)
    scale_prev_y = math.sqrt(tm_before[2]**2 + tm_before[3]**2)
    scale_y = math.sqrt(tm_now[2]**2 + tm_now[3]**2)

    if orientation not in orientations:
        raise OrientationNotFoundError

    if orientation in (0, 180):
        moved_height, moved_width = delta_y, delta_x
    elif orientation in (90, 270):
        moved_height, moved_width = delta_x, delta_y

    try:
        # Raises IndexError while nothing has been produced yet; in that case
        # neither a newline nor a space is inserted.
        last_char = (output + text)[-1]
        line_threshold = 0.8 * min(str_height * scale_prev_y, font_size * scale_y)
        if abs(moved_height) > line_threshold:
            if last_char != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n", memo_cm, memo_tm, font_resource, font_size
                    )
                text = ""
        elif moved_width >= (spacewidth + str_widths) * scale_prev_x and last_char != " ":
            text += " "
    except Exception:
        pass

    tm_snapshot = tm_now.copy()
    cm_snapshot = cm_now.copy()
    return text, output, cm_snapshot, tm_snapshot
def get_text_operands(
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font: Font,
    orientations: tuple[int, ...]
) -> tuple[str, bool]:
    """
    Decode the first operand of a text show operation.

    Args:
        operands: Operands of the operation; only the first one is used.
        cm_matrix: Current transformation matrix.
        tm_matrix: Current text matrix.
        font: Font whose ``encoding`` (codec name or lookup table) is used
            to decode operands that are not ``str``.
        orientations: Accepted orientations in degrees.

    Returns:
        The decoded text and a flag telling whether the operand already was
        a ``str``. The text is empty when the orientation is not accepted
        or when there are no operands.

    """
    orientation = orient(mult(tm_matrix, cm_matrix))
    if orientation not in orientations or len(operands) == 0:
        return ("", False)

    operand = operands[0]
    if isinstance(operand, str):
        return (operand, True)

    raw: bytes = operand
    encoding = font.encoding
    if isinstance(encoding, str):
        try:
            decoded = raw.decode(encoding, "surrogatepass")  # apply str encoding
        except Exception:
            # the data does not match the expectation, so the alternative
            # codec is used; text extraction may not be good
            alternative = "utf-16-be" if encoding == "charmap" else "charmap"
            decoded = raw.decode(alternative, "surrogatepass")
    else:  # apply dict encoding
        pieces = []
        for code in raw:
            pieces.append(encoding[code] if code in encoding else bytes((code,)).decode())
        decoded = "".join(pieces)
    return (decoded, False)
def get_display_str(
    text: str,
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> tuple[str, bool, float]:
    """
    Merge ``text_operands`` into ``text`` honouring the writing direction.

    Every character is first translated through ``font.character_map``.
    Neutral characters keep the current direction, right-to-left characters
    are prepended and all others are appended. Whenever the direction
    changes, the text collected so far is passed to ``visitor_text`` and
    collection restarts from an empty string.

    Args:
        text: Text collected so far.
        cm_matrix: Current transformation matrix, passed to ``visitor_text``.
        tm_matrix: Current text matrix, passed to ``visitor_text``.
        font_resource: Font dictionary, passed to ``visitor_text``.
        font: Font providing ``character_map``, ``space_width`` and
            ``text_width``.
        text_operands: Characters to merge.
        font_size: Current font size, passed to ``visitor_text``.
        rtl_dir: Whether the current direction is right-to-left.
        visitor_text: Optional callback invoked on direction changes.

    Returns:
        The updated text, the updated direction flag and the accumulated
        width of the merged characters.

    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    widths: float = 0.0
    mapped_chars = [font.character_map.get(char, char) for char in text_operands]
    for glyph in mapped_chars:
        # a mapped entry can be a sequence of characters ; ex: habibi.pdf
        code = ord(glyph) if len(glyph) == 1 else 1
        keeps_direction = (
            code <= 0x2F  # punctuations but...
            or 0x3A <= code <= 0x40  # numbers (x30-39)
            or 0x2000 <= code <= 0x206F  # upper punctuations..
            or 0x20A0 <= code <= 0x21FF  # but (numbers) indices/exponents
            or code in CUSTOM_RTL_SPECIAL_CHARS  # customized....
        )
        if keeps_direction:
            text = glyph + text if rtl_dir else text + glyph
        else:
            is_rtl = (
                0x0590 <= code <= 0x08FF
                or 0xFB1D <= code <= 0xFDFF
                or 0xFE70 <= code <= 0xFEFF
                or CUSTOM_RTL_MIN <= code <= CUSTOM_RTL_MAX
            )
            if is_rtl != bool(rtl_dir):
                # direction change: hand over what was collected and restart
                rtl_dir = is_rtl
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                text = ""
            text = glyph + text if is_rtl else text + glyph
        widths += font.space_width if glyph == " " else font.text_width(glyph)
    return text, rtl_dir, widths

View File

@@ -0,0 +1,16 @@
"""Layout mode text extraction extension for pypdf"""
from ..._font import Font
from ._fixed_width_page import (
fixed_char_width,
fixed_width_page,
text_show_operations,
y_coordinate_groups,
)
# Names exported by ``from <this package> import *``.
__all__ = [
    "Font",
    "fixed_char_width",
    "fixed_width_page",
    "text_show_operations",
    "y_coordinate_groups",
]

View File

@@ -0,0 +1,400 @@
"""Extract PDF text preserving the layout of the source PDF"""
from collections.abc import Iterator
from itertools import groupby
from math import ceil
from pathlib import Path
from typing import Any, Literal, Optional, TypedDict
from ..._font import Font
from ..._utils import logger_warning
from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
from ._text_state_manager import TextStateManager
from ._text_state_params import TextStateParams
class BTGroup(TypedDict):
    """
    Dict describing a line of text rendered within a BT/ET operator pair.

    If multiple text show operations render text on the same line, the text
    is combined into a single BTGroup dict.

    Keys:
        tx: x coordinate of the first character in the BTGroup
        ty: y coordinate of the first character in the BTGroup
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of the last character in the BTGroup
        flip_sort: -1 if the y axis is inverted (page is upside down), else 1

    """

    tx: float
    ty: float
    font_size: float
    font_height: float
    text: str
    displaced_tx: float
    flip_sort: Literal[-1, 1]
def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    Build a BTGroup from the text state of its first text show operation.

    Args:
        tj_op (TextStateParams): state supplying position, font size, font
            height and vertical flip information
        rendered_text (str): text rendered by the group
        dispaced_tx (float): x coordinate of the last character in the group

    Returns:
        BTGroup: dict describing the rendered line of text

    """
    group: BTGroup = {
        "tx": tj_op.tx,
        "ty": tj_op.ty,
        "font_size": tj_op.font_size,
        "font_height": tj_op.font_height,
        "text": rendered_text,
        "displaced_tx": dispaced_tx,
        "flip_sort": -1 if tj_op.flip_vertical else 1,
    }
    return group
def recurs_to_target_op(
    ops: Iterator[tuple[list[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
) -> tuple[list[BTGroup], list[TextStateParams]]:
    """
    Recurse operators between BT/ET and/or q/Q operators managing the transform
    stack and capturing text positioning and rendering data.

    The same ``ops`` iterator is handed to every nested call, so operators
    consumed by a nested q/Q or BT/ET section are not seen again by the
    caller.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
        strip_rotated: if True, text show operations flagged as rotated are
            left out of the BTGroups built at b"ET". Defaults to True.

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.

    """
    # 1 entry per line of text rendered within each BT/ET operation.
    bt_groups: list[BTGroup] = []

    # 1 entry per text show operator (Tj/TJ/'/")
    tj_ops: list[TextStateParams] = []

    if end_target == b"Q":
        # add new q level. cm's added at this level will be popped at next b'Q'
        text_state_mgr.add_q()

    for operands, op in ops:
        # The loop is broken by the end target, or exits normally when there are no more ops.
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                if not tj_ops:
                    # NOTE(review): this returns without calling reset_tm();
                    # confirm whether text matrices added inside an empty
                    # BT/ET pair should be cleared here as well.
                    return bt_groups, tj_ops
                _text = ""
                bt_idx = 0  # idx of first tj in this bt group
                last_displaced_tx = tj_ops[bt_idx].displaced_tx
                last_ty = tj_ops[bt_idx].ty
                for _idx, _tj in enumerate(
                    tj_ops
                ):  # ... build text from new Tj operators
                    if strip_rotated and _tj.rotated:
                        continue
                    if not _tj.font.interpretable:  # generates warning
                        continue
                    # if the y position of the text differs from the previous one by
                    # more than the font height, assume the text is on a new line
                    # and start a new group
                    if abs(_tj.ty - last_ty) > _tj.font_height:
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        _text = ""

                    # if the x position of the text is less than the last x position by
                    # more than 5 spaces widths, assume the text order should be flipped
                    # and start a new group
                    if (
                        last_displaced_tx - _tj.tx
                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
                    ):
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        last_displaced_tx = _tj.displaced_tx
                        _text = ""

                    # calculate excess x translation based on ending tx of previous Tj.
                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
                    # applied to the first tj of a BTGroup in fixed_width_page().
                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
                    new_text = f'{" " * spaces}{_tj.txt}'

                    last_ty = _tj.ty
                    _text = f"{_text}{new_text}"
                    last_displaced_tx = _tj.displaced_tx
                if _text:
                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
                text_state_mgr.reset_tm()
            break
        if op == b"q":
            # nested graphics state: consume operators up to the matching Q
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"BT":
            # nested text object: consume operators up to the matching ET
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"ET", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            # state without text, used for offsets that precede the first string
            _tj = text_state_mgr.text_state_params()
            for tj_op in operands[0]:
                if isinstance(tj_op, bytes):
                    _tj = text_state_mgr.text_state_params(tj_op)
                    tj_ops.append(_tj)
                else:
                    # non-bytes entry: applied as an offset relative to the
                    # most recent text state
                    text_state_mgr.add_trm(_tj.displacement_matrix(td_offset=tj_op))
        elif op == b"'":
            # move down by the leading (TL), then show the string
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            # set Tw and Tc from the first two operands, move down by the
            # leading (TL), then show the string given as third operand
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            text_state_mgr.reset_trm()
            if op == b"Tm":
                # Tm drops the text transforms collected so far before adding its own
                text_state_mgr.reset_tm()
            elif op == b"TD":
                # TD additionally sets the leading to the negated y operand
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                # T* carries no operands: move down by the current leading
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
    else:
        # for/else: only reached when the operators ran out before end_target
        logger_warning(
            f"Unbalanced target operations, expected {end_target!r}.",
            __name__,
        )
    return bt_groups, tj_ops
def y_coordinate_groups(
    bt_groups: list[BTGroup], debug_path: Optional[Path] = None
) -> dict[int, list[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Consecutive entries sharing the same integer y coordinate form one
    group. ``itertools.groupby`` only merges neighbouring entries, so
    ``bt_groups`` has to be ordered by y coordinate already, which is the
    case for the output of text_show_operations(). Neighbouring groups are
    then merged when they are closer than the effective font height and do
    not start text at the same x position.

    Args:
        bt_groups: list of dicts as returned by text_show_operations()
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
        keyed by y coordinate. Empty when ``bt_groups`` is empty.

    """
    ty_groups = {
        ty: sorted(grp, key=lambda x: x["tx"])
        for ty, grp in groupby(
            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
        )
    }
    # combine groups whose y coordinates differ by less than the effective font height
    # (accounts for mixed fonts and other minor oddities)
    # The keys are snapshotted because groups are popped while merging.
    ordered_tys = list(ty_groups)
    if ordered_tys:  # nothing to merge for an empty page
        last_ty = ordered_tys[0]
        last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
        for ty in ordered_tys[1:]:
            fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
            txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
            # prevent merge if both groups are rendering in the same x position.
            no_text_overlap = not (txs & last_txs)
            offset_less_than_font_height = abs(ty - last_ty) < fsz
            if no_text_overlap and offset_less_than_font_height:
                ty_groups[last_ty] = sorted(
                    ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
                )
                last_txs |= txs
            else:
                last_ty = ty
                last_txs = txs
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
def text_show_operations(
    ops: Iterator[tuple[list[Any], bytes]],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> list[BTGroup]:
    """
    Collect the text rendered by all BT/ET operator pairs of a content stream.

    The resulting groups are ordered from the top of the page downwards and
    shifted so that the smallest x coordinate becomes 0.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator

    """
    state_mgr = TextStateManager()  # transformation stack manager
    bt_groups: list[BTGroup] = []  # BT operator dict
    tj_ops: list[TextStateParams] = []  # Tj/TJ operator data
    section_end = {b"BT": b"ET", b"q": b"Q"}  # opening op -> closing op

    for operands, op in ops:
        end_target = section_end.get(op)
        if end_target is not None:
            bts, tjs = recurs_to_target_op(
                ops, state_mgr, end_target, fonts, strip_rotated
            )
            bt_groups += bts
            tj_ops += tjs
        elif op == b"Tf":
            state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
            state_mgr.set_state_param(op, operands)

    if any(tj.rotated for tj in tj_ops):
        rotation_message = (
            "Rotated text discovered. Output will be incomplete."
            if strip_rotated
            else "Rotated text discovered. Layout will be degraded."
        )
        logger_warning(rotation_message, __name__)
    if any(not tj.font.interpretable for tj in tj_ops):
        logger_warning(
            "PDF contains an uninterpretable font. Output will be incomplete.", __name__
        )

    # left align the data, i.e. decrement all tx values by min(tx)
    min_x = min((grp["tx"] for grp in bt_groups), default=0.0)
    top_down = sorted(
        bt_groups,
        key=lambda grp: (grp["ty"] * grp["flip_sort"], -grp["tx"]),
        reverse=True,
    )
    bt_groups = [
        {**grp, "tx": grp["tx"] - min_x, "displaced_tx": grp["displaced_tx"] - min_x}  # type: ignore[misc]
        for grp in top_down
    ]

    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        bts_dump = json.dumps(bt_groups, indent=2, default=str)
        debug_path.joinpath("bts.json").write_text(bts_dump, "utf-8")
        tjs_dump = json.dumps(
            tj_ops, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
        )
        debug_path.joinpath("tjs.json").write_text(tjs_dump, "utf-8")
    return bt_groups
def fixed_char_width(bt_groups: list[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate average character width weighted by the length of the rendered
    text in each sample for conversion to fixed-width layout.

    Groups that contribute no weight (empty text, or a ``scale_weight`` of 0)
    are skipped instead of causing a division by zero.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator
        scale_weight (float): factor applied to the text length of every
            group; a larger value yields a smaller character width.
            Defaults to 1.25.

    Returns:
        float: fixed character width, or 0.0 when no group contributes.

    """
    char_widths = []
    for _bt in bt_groups:
        _len = len(_bt["text"]) * scale_weight
        if not _len:
            # no characters to average over; would divide by zero below
            continue
        char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len))
    total_weight = sum(_l for _, _l in char_widths)
    if not total_weight:
        return 0.0
    return sum(_w * _l for _w, _l in char_widths) / total_weight
def fixed_width_page(
    ty_groups: dict[int, list[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
    """
    Render grouped text operations as fixed-width page text.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width
        space_vertically: include blank lines inferred from y distance + font height.
        font_height_weight: multiplier for font height when calculating blank lines.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
        layout in the source pdf.

    """
    # Characters with code points 14 to 31 are rendered as spaces.
    control_to_space = str.maketrans({code: " " for code in range(14, 32)})
    rendered: list[str] = []
    previous_y = 0

    for y_coord, groups in ty_groups.items():
        if space_vertically and rendered:
            height = groups[0]["font_height"]
            if height != 0:
                gap_lines = int(abs(y_coord - previous_y) / (height * font_height_weight)) - 1
                rendered.extend([""] * gap_lines)

        pieces = []  # parts of the line, joined once at the end
        column = 0  # number of characters placed on the line so far
        previous_end = 0.0
        for group in groups:
            start_x = group["tx"]
            missing = int(start_x // char_width) - column
            # pad only if the group starts right of where the previous one ended
            if missing > 0 and ceil(previous_end) < int(start_x):
                pieces.append(" " * missing)
                column += missing
            cleaned = group["text"].translate(control_to_space)
            pieces.append(cleaned)
            column += len(cleaned)
            previous_end = group["displaced_tx"]

        line = "".join(pieces).rstrip()
        if line.strip() or (space_vertically and rendered):
            rendered.append(line)
        previous_y = y_coord

    return "\n".join(rendered)

View File

@@ -0,0 +1,221 @@
"""manage the PDF transform stack during "layout" mode text extraction"""
from collections import ChainMap, Counter
from collections import ChainMap as ChainMapType
from collections import Counter as CounterType
from collections.abc import MutableMapping
from typing import Any, Union
from ..._font import Font
from ...errors import PdfReadError
from .. import mult
from ._text_state_params import TextStateParams
# Type of the transform stack: a ChainMap whose maps are single transforms.
# Integer keys 0-5 hold the matrix coefficients; the string keys "is_text" and
# "is_render" hold boolean flags (see TextStateManager.new_transform).
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
# Type of a single transform as stored in the stack.
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm/trm transformation
            matrices; the most recently added transform is ``maps[0]``
        q_queue (Counter[int]): number of cm transforms added at each q
            nesting level
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (float): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font_stack (List[Tuple[Font | None, int | float]]): font and font
            size saved by each open q operator
        font (Font | None): font object, None until set_font() is called
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        # The stack starts with a single identity transform.
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.

        """
        if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]:
            return
        # The decoded operator is the attribute name, e.g. b"Tc" -> self.Tc
        self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    # encoding given as a codec name
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    # encoding given as a lookup table keyed by byte value;
                    # bytes missing from the table are decoded on their own
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                # fall back to a lossy UTF-8 decode
                txt = value.decode("utf-8", "replace")
            # translate each decoded character through the font's character map
            txt = "".join(
                self.font.character_map.get(x, x) for x in txt
            )
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> dict[int, float]:
        """Only a/b/c/d/e/f matrix params, converted to float and keyed 0 to 5"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        # Only transforms at the front of the chain are removed; the loop
        # stops at the first transform carrying neither flag.
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        # Stops at the first transform at the front that is not a render transform.
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """Rewind to stack prior state after closing a 'q' with internal 'cm' ops"""
        # Restore the font that was active when the matching 'q' was opened.
        self.font, self.font_size = self.font_stack.pop(-1)
        self.transform_stack = self.reset_tm()
        # Drop as many transforms from the front as cm operators were counted
        # for the nesting level being closed (0 if none were counted).
        self.transform_stack.maps = self.transform_stack.maps[
            self.q_queue.pop(self.q_depth.pop(), 0) :
        ]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue"""
        # Save the current font so that remove_q() can restore it.
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # Count this cm against the current q nesting level.
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: list[float]) -> list[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> list[float]:
        """Current effective transform accounting for cm, tm, and trm transforms"""
        # The values of the newest transform also include its two flags after
        # the six coefficients.
        eff_transform = [*self.transform_stack.maps[0].values()]
        # Fold in the remaining transforms from newest to oldest.
        for transform in self.transform_stack.maps[1:]:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform

View File

@@ -0,0 +1,135 @@
"""A dataclass that captures the CTM and Text State for a tj operation"""
import math
from dataclasses import dataclass, field
from typing import Any, Union
from ..._font import Font
from .. import mult, orient
@dataclass
class TextStateParams:
    """
    Snapshot of the text state for one text value of a TJ or Tj PDF operation.

    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.

    """

    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: list[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)

    def __post_init__(self) -> None:
        # Quarter-turn rotations are undone before any coordinate is derived.
        if orient(self.transform) in (90, 270):
            counter_rotation = [
                1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0
            ]
            self.transform = mult(counter_rotation, self.transform)
            self.rotated = True

        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        half_turn = orient(self.transform) == 180 and self.transform[0] < -1e-6
        if half_turn:
            self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform)
            self.rotated = True

        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]
        self.displaced_tx = self.displaced_transform()[4]

        space_advance = round(self.word_tx(" "), 3)
        if space_advance < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a td_offset of -1 * font.space_width where font.space_width is
            # the space_width calculated in _font.py.
            space_advance = round(self.word_tx("", -self.font.space_width), 3)
        self.space_tx = space_advance

        y_axis_length = math.sqrt(self.transform[1] ** 2 + self.transform[3] ** 2)
        self.font_height = self.font_size * y_axis_length
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis

    def font_size_matrix(self) -> list[float]:
        """Matrix scaling by font size (x also by Tz) and shifting y by Ts."""
        horizontal_size = self.font_size * (self.Tz / 100.0)
        return [horizontal_size, 0.0, 0.0, self.font_size, 0.0, self.Ts]

    def displaced_transform(self) -> list[float]:
        """Effective transform matrix after text has been rendered."""
        shift = self.displacement_matrix()
        return mult(shift, self.transform)

    def render_transform(self) -> list[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        sizing = self.font_size_matrix()
        return mult(sizing, self.transform)

    def displacement_matrix(
        self, word: Union[str, None] = None, td_offset: float = 0.0
    ) -> list[float]:
        """
        Translation matrix for the horizontal displacement of a word.

        Args:
            word (str, optional): Defaults to None in which case self.txt displacement is
                returned.
            td_offset (float, optional): translation applied by TD operator. Defaults to 0.0.

        """
        if word is None:
            word = self.txt
        shift_x = self.word_tx(word, td_offset)
        return [1.0, 0.0, 0.0, 1.0, shift_x, 0.0]

    def word_tx(self, word: str, td_offset: float = 0.0) -> float:
        """Horizontal text displacement for any word according this text state"""
        glyph_units: float = 0.0
        for char in word:
            glyph_units += (
                self.font.space_width if char == " " else self.font.text_width(char)
            )
        # widths are divided by 1000 before being scaled by the font size
        scaled_width = self.font_size * ((glyph_units - td_offset) / 1000.0)
        unscaled = scaled_width + self.Tc + word.count(" ") * self.Tw
        return unscaled * (self.Tz / 100.0)

    @staticmethod
    def to_dict(inst: "TextStateParams") -> dict[str, Any]:
        """Dataclass to dict for json.dumps serialization"""
        snapshot: dict[str, Any] = {}
        for name in inst.__dataclass_fields__:
            if name == "font":  # every field except the font is serialized
                continue
            snapshot[name] = getattr(inst, name)
        return snapshot

View File

@@ -0,0 +1,351 @@
# Copyright (c) 2006, Mathieu Fenniak
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import math
from typing import Any, Callable, Optional, Union
from .._font import Font, FontDescriptor
from ..generic import DictionaryObject, TextStringObject
from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult
class TextExtraction:
"""
A class to handle PDF text extraction operations.
This class encapsulates all the state and operations needed for extracting
text from PDF content streams, replacing the nested functions and nonlocal
variables in the original implementation.
"""
def __init__(self) -> None:
    """
    Create an extractor with default state.

    Every matrix starts out as the identity, the current font is a
    placeholder until a ``Tf`` operator is handled, and the operator
    dispatch table is bound to this instance's handler methods.
    """

    def identity() -> list[float]:
        # A new list on every call, so no two attributes share storage.
        return [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {}

    # Current transformation matrix, text matrix and the q/Q save stack.
    self.cm_matrix: list[float] = identity()
    self.tm_matrix: list[float] = identity()
    self.cm_stack: list[
        tuple[
            list[float],
            Optional[DictionaryObject],
            Font,
            float,
            float,
            float,
            float,
        ]
    ] = []

    # Last modified matrices; these can describe an intermediate position.
    self.cm_prev: list[float] = identity()
    self.tm_prev: list[float] = identity()

    # Matrices as they were when the pending text started to be built.
    self.memo_cm: list[float] = identity()
    self.memo_tm: list[float] = identity()

    # Text state parameters.
    self.char_scale: float = 1.0
    self.space_scale: float = 1.0
    self._space_width: float = 500.0  # will be set correctly at first Tf
    # Filled in by the string length calculation.
    self._actual_str_size: dict[str, float] = {
        "str_widths": 0.0,
        "str_height": 0.0,
    }
    self.TL: float = 0.0
    self.font_size: float = 12.0  # default, in case no Tf sets a size

    # Text accumulated so far and the extraction parameters.
    self.text: str = ""
    self.output: str = ""
    self.rtl_dir: bool = False  # right-to-left
    self.font_resource: Optional[DictionaryObject] = None
    placeholder_descriptor = FontDescriptor()
    self.font = Font(
        name="NotInitialized",
        sub_type="Unknown",
        encoding="charmap",
        font_descriptor=placeholder_descriptor,
    )
    self.orientations: tuple[int, ...] = (0, 90, 180, 270)
    self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
    self.font_resources: dict[str, DictionaryObject] = {}
    self.fonts: dict[str, Font] = {}

    # Content-stream operator -> bound handler method.
    self.operation_handlers = {
        # text object delimiters
        b"BT": self._handle_bt,
        b"ET": self._handle_et,
        # graphics state
        b"q": self._handle_save_graphics_state,
        b"Q": self._handle_restore_graphics_state,
        b"cm": self._handle_cm,
        # text state
        b"Tz": self._handle_tz,
        b"Tw": self._handle_tw,
        b"TL": self._handle_tl,
        b"Tf": self._handle_tf,
        # text positioning and showing
        b"Td": self._handle_td,
        b"Tm": self._handle_tm,
        b"T*": self._handle_t_star,
        b"Tj": self._handle_tj_operation,
    }
def initialize_extraction(
    self,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    font_resources: Optional[dict[str, DictionaryObject]] = None,
    fonts: Optional[dict[str, Font]] = None
) -> None:
    """
    Store the extraction parameters and clear the accumulated text.

    Only ``text``, ``output`` and ``rtl_dir`` are reset here; matrices,
    the graphics state stack and the current font are left untouched.

    Args:
        orientations: Orientations to take into account.
        visitor_text: Optional callback invoked with
            ``(text, cm, tm, font_resource, font_size)`` on each flush.
        font_resources: Font resource dictionaries keyed by font name.
            ``None`` or an empty mapping is replaced by a new empty dict.
        fonts: ``Font`` objects keyed by font name. ``None`` or an empty
            mapping is replaced by a new empty dict.
    """
    # Clear whatever a previous run accumulated.
    self.text, self.output = "", ""
    self.rtl_dir = False

    self.orientations = orientations
    self.visitor_text = visitor_text
    self.font_resources = font_resources if font_resources else {}
    self.fonts = fonts if fonts else {}
def compute_str_widths(self, str_widths: float) -> float:
    """
    Return ``str_widths`` divided by 1000.

    Presumably converts widths expressed in thousandths of a text unit
    into text units - TODO confirm against the font width conventions.
    """
    divisor = 1000
    return str_widths / divisor
def process_operation(self, operator: bytes, operands: list[Any]) -> None:
    """
    Dispatch one content-stream operation to its handler.

    Operators without an entry in ``operation_handlers`` are ignored.
    After ``Td``, ``Tm``, ``T*`` and ``Tj`` the common text positioning
    post-processing runs with the handler's return value (``0.0`` when
    the handler returned nothing or a falsy value).

    Args:
        operator: The operator as found in the content stream.
        operands: The operands belonging to that operator.
    """
    handler = self.operation_handlers.get(operator)
    if handler is None:
        return

    str_widths = handler(operands)

    # Only these operators affect text positioning.
    if operator in (b"Td", b"Tm", b"T*", b"Tj"):
        self._post_process_text_operation(str_widths or 0.0)
def _post_process_text_operation(self, str_widths: float) -> None:
    """
    Run the shared post-processing for text positioning operations.

    Passes the current, previous and memorized matrices together with the
    pending text to ``crlf_space_check`` and stores the text, output and
    previous matrices it returns. When no text is pending afterwards, the
    memorized matrices are refreshed from the current ones. If no
    orientation is found, the state is left unchanged.

    Args:
        str_widths: Width reported by the handler that just ran.
    """
    try:
        checked = crlf_space_check(
            self.text,
            (self.cm_prev, self.tm_prev),
            (self.cm_matrix, self.tm_matrix),
            (self.memo_cm, self.memo_tm),
            self.font_resource,
            self.orientations,
            self.output,
            self.font_size,
            self.visitor_text,
            str_widths,
            self.compute_str_widths(self.font_size * self._space_width),
            self._actual_str_size["str_height"],
        )
    except OrientationNotFoundError:
        return

    self.text, self.output, self.cm_prev, self.tm_prev = checked
    if self.text != "":
        return
    # Nothing pending: the next text starts at the current position.
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()
def _handle_tj(
    self,
    text: str,
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    orientations: tuple[int, ...],
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    actual_str_size: dict[str, float],
) -> tuple[str, bool, dict[str, float]]:
    """
    Append the text shown by a Tj operation and record its size.

    Args:
        text: Text accumulated so far.
        operands: Operands of the Tj operation.
        cm_matrix: Current transformation matrix.
        tm_matrix: Current text matrix.
        font_resource: Resource dictionary of the current font, if any.
        font: Current font, used for the width lookups.
        orientations: Orientations to take into account.
        font_size: Current font size.
        rtl_dir: Whether text is currently written right-to-left.
        visitor_text: Optional text visitor callback.
        actual_str_size: Size accumulator; it is updated in place.

    Returns:
        The updated text, the updated right-to-left flag and the same
        ``actual_str_size`` dict, with ``str_widths`` increased by the
        width of the new text times ``font_size`` and ``str_height`` set
        to ``font_size``.
    """
    decoded, operands_are_str = get_text_operands(
        operands, cm_matrix, tm_matrix, font, orientations
    )
    if not operands_are_str:
        text, rtl_dir, width_total = get_display_str(
            text,
            cm_matrix,
            tm_matrix,  # text matrix
            font_resource,
            font,
            decoded,
            font_size,
            rtl_dir,
            visitor_text,
        )
    else:
        text += decoded
        # Spaces use the font's space width, everything else its own width.
        width_total = sum(
            font.space_width if char == " " else font.text_width(char)
            for char in decoded
        )

    actual_str_size["str_widths"] += width_total * font_size
    actual_str_size["str_height"] = font_size
    return text, rtl_dir, actual_str_size
def _flush_text(self) -> None:
"""Flush accumulated text to output and call visitor if present."""
self.output += self.text
if self.visitor_text is not None:
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
self.text = ""
self.memo_cm = self.cm_matrix.copy()
self.memo_tm = self.tm_matrix.copy()
# Operation handlers
def _handle_bt(self, operands: list[Any]) -> None:
"""Handle BT (Begin Text) operation - Table 5.4 page 405."""
self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
self._flush_text()
def _handle_et(self, operands: list[Any]) -> None:
"""Handle ET (End Text) operation - Table 5.4 page 405."""
self._flush_text()
def _handle_save_graphics_state(self, operands: list[Any]) -> None:
"""Handle q (Save graphics state) operation - Table 4.7 page 219."""
self.cm_stack.append(
(
self.cm_matrix,
self.font_resource,
self.font,
self.font_size,
self.char_scale,
self.space_scale,
self.TL,
)
)
def _handle_restore_graphics_state(self, operands: list[Any]) -> None:
"""Handle Q (Restore graphics state) operation - Table 4.7 page 219."""
try:
(
self.cm_matrix,
self.font_resource,
self.font,
self.font_size,
self.char_scale,
self.space_scale,
self.TL,
) = self.cm_stack.pop()
except Exception:
self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
def _handle_cm(self, operands: list[Any]) -> None:
"""Handle cm (Modify current matrix) operation - Table 4.7 page 219."""
self.output += self.text
if self.visitor_text is not None:
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
self.text = ""
try:
self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
except Exception:
self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
self.memo_cm = self.cm_matrix.copy()
self.memo_tm = self.tm_matrix.copy()
def _handle_tz(self, operands: list[Any]) -> None:
"""Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
self.char_scale = float(operands[0]) / 100 if operands else 1.0
def _handle_tw(self, operands: list[Any]) -> None:
"""Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
self.space_scale = 1.0 + float(operands[0] if operands else 0.0)
def _handle_tl(self, operands: list[Any]) -> None:
"""Handle TL (Set Text Leading) operation - Table 5.2 page 398."""
scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x
def _handle_tf(self, operands: list[Any]) -> None:
"""Handle Tf (Set font size) operation - Table 5.2 page 398."""
if self.text != "":
self.output += self.text # .translate(cmap)
if self.visitor_text is not None:
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
self.text = ""
self.memo_cm = self.cm_matrix.copy()
self.memo_tm = self.tm_matrix.copy()
try:
self.font_resource = self.font_resources[operands[0]]
self.font = self.fonts[operands[0]]
except KeyError: # font not found
self.font_resource = None
font_descriptor = FontDescriptor()
self.font = Font(
"Unknown",
space_width=250,
encoding=dict.fromkeys(range(256), "<EFBFBD>"),
font_descriptor=font_descriptor,
character_map={},
character_widths=font_descriptor.character_widths
)
self._space_width = self.font.space_width / 2 # Actually the width of _half_ a space...
try:
self.font_size = float(operands[1])
except Exception:
pass # keep previous size
def _handle_td(self, operands: list[Any]) -> float:
"""Handle Td (Move text position) operation - Table 5.5 page 406."""
# A special case is a translating only tm:
# tm = [1, 0, 0, 1, e, f]
# i.e. tm[4] += tx, tm[5] += ty.
tx, ty = float(operands[0]), float(operands[1])
self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
self._actual_str_size["str_widths"] = 0.0
return str_widths
def _handle_tm(self, operands: list[Any]) -> float:
"""Handle Tm (Set text matrix) operation - Table 5.5 page 406."""
self.tm_matrix = [float(operand) for operand in operands[:6]]
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
self._actual_str_size["str_widths"] = 0.0
return str_widths
def _handle_t_star(self, operands: list[Any]) -> float:
"""Handle T* (Move to next line) operation - Table 5.5 page 406."""
self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
self._actual_str_size["str_widths"] = 0.0
return str_widths
def _handle_tj_operation(self, operands: list[Any]) -> float:
"""Handle Tj (Show text) operation - Table 5.5 page 406."""
self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
self.text,
operands,
self.cm_matrix,
self.tm_matrix,
self.font_resource,
self.font,
self.orientations,
self.font_size,
self.rtl_dir,
self.visitor_text,
self._actual_str_size,
)
return 0.0 # str_widths will be handled in post-processing