Update ashboard, dashboard, memory +1 more (+2 ~3)
This commit is contained in:
@@ -0,0 +1,245 @@
|
||||
"""
|
||||
Code related to text extraction.
|
||||
|
||||
Some parts are still in _page.py. In doubt, they will stay there.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
from .._font import Font
|
||||
from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
|
||||
|
||||
# Inclusive lower bound of an extra code range that get_display_str() treats
# as right-to-left. Changed through set_custom_rtl(); -1 adds no extra range.
CUSTOM_RTL_MIN: int = -1
|
||||
# Inclusive upper bound of that extra right-to-left range.
CUSTOM_RTL_MAX: int = -1
|
||||
# Character codes that get_display_str() inserts in the current insertion
# order, whatever the writing direction. Changed through set_custom_rtl().
CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
|
||||
# Layout mode: a backward x movement of more than this many space widths
# starts a new BT group (see recurs_to_target_op()).
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
|
||||
|
||||
|
||||
class OrientationNotFoundError(Exception):
    """Raised when the orientation of a piece of text is not among the requested orientations."""
|
||||
|
||||
|
||||
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, list[int], None] = None,
) -> tuple[int, int, list[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is used as is; a single-character string is converted
            to its Unicode code point with ``ord()``.
            The default value is -1, which sets no additional range to be converted.
        _max: The new maximum value for the range of custom characters that will
            be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is used as is; a single-character string is converted
            to its Unicode code point with ``ord()``.
            The default value is -1, which sets no additional range to be converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to the list of the code
            points of its characters.
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: if ``_min`` or ``_max`` is a string whose length is not 1
            (raised by ``ord()``).

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping the caller's list would let later mutations of
        # that list silently change the module-wide setting.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
|
||||
|
||||
|
||||
def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Multiply two affine transforms given as ``[a, b, c, d, e, f]`` sequences.

    Only the items at indices 0 to 5 of each argument are read, through
    indexing, so extra trailing items are ignored.

    Args:
        m: left-hand transform.
        n: right-hand transform.

    Returns:
        The six coefficients of ``m x n``, each sequence standing for the
        matrix ``[[a, b, 0], [c, d, 0], [e, f, 1]]``.

    """
    ma, mb, mc, md, me, mf = (m[i] for i in range(6))
    na, nb, nc, nd, ne, nf = (n[i] for i in range(6))
    return [
        ma * na + mb * nc,
        ma * nb + mb * nd,
        mc * na + md * nc,
        mc * nb + md * nd,
        me * na + mf * nc + ne,
        me * nb + mf * nd + nf,
    ]
|
||||
|
||||
|
||||
def orient(m: list[float]) -> int:
    """
    Return the orientation in degrees (0, 90, 180 or 270) encoded by a transform.

    The sign of ``m[3]`` selects 0 or 180 when its magnitude exceeds 1e-6;
    otherwise the sign of ``m[1]`` selects 90 (positive) or 270.

    Args:
        m: transform whose items 1 and 3 are inspected.

    Returns:
        One of 0, 90, 180 or 270.

    """
    vertical = m[3]
    if abs(vertical) > 1e-6:
        return 0 if vertical > 0 else 180
    return 90 if m[1] > 0 else 270
|
||||
|
||||
|
||||
def crlf_space_check(
    text: str,
    cmtm_prev: tuple[list[float], list[float]],
    cmtm_matrix: tuple[list[float], list[float]],
    memo_cmtm: tuple[list[float], list[float]],
    font_resource: Optional[DictionaryObject],
    orientations: tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> tuple[str, str, list[float], list[float]]:
    """
    Insert a line break or a space depending on how far the text position moved.

    The previous and current positions are obtained by combining each text
    matrix with its ``cm`` matrix. A move along the "height" axis of more than
    0.8 times the smaller of the two candidate line heights ends the current
    line; otherwise a sufficiently large move along the "width" axis appends
    a space. Nothing is inserted while ``output + text`` is still empty.

    Args:
        text: text collected since the last flush.
        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
        memo_cmtm: ``(cm, tm)`` matrices passed to ``visitor_text``.
        font_resource: font dictionary passed to ``visitor_text``.
        orientations: accepted orientations, in degrees.
        output: text already flushed.
        font_size: current font size.
        visitor_text: optional callback called with the flushed line.
        str_widths: width of the previously shown string.
        spacewidth: width of a space.
        str_height: height of the previously shown string.

    Returns:
        ``(text, output, cm_prev, tm_prev)``, where the two matrices are
        copies of the current ``cm`` and ``tm`` matrices.

    Raises:
        OrientationNotFoundError: if the current orientation is not listed
            in ``orientations``.

    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)

    if orientation not in orientations:
        raise OrientationNotFoundError
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    elif orientation in (90, 270):
        moved_height = delta_x
        moved_width = delta_y
    try:
        # Last character produced so far; empty when nothing has been emitted
        # yet, in which case neither a newline nor a space is inserted.
        last_char = (output + text)[-1:]
        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
            if last_char and last_char != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        font_resource,
                        font_size,
                    )
                text = ""
        elif (
            (moved_width >= (spacewidth + str_widths) * scale_prev_x)
            and last_char
            and last_char != " "
        ):
            text += " "
    except Exception:
        # Kept for backward compatibility: this heuristic is best effort and
        # an error (e.g. raised by the visitor callback) has never aborted
        # the extraction.
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev
|
||||
|
||||
|
||||
def get_text_operands(
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font: Font,
    orientations: tuple[int, ...]
) -> tuple[str, bool]:
    """
    Return the text of the first operand of a text-showing operator.

    Args:
        operands: operands of the operator; only ``operands[0]`` is read.
        cm_matrix: current ``cm`` matrix.
        tm_matrix: current text matrix.
        font: current font; ``font.encoding`` is either a codec name or a
            mapping from byte value to text.
        orientations: accepted orientations, in degrees.

    Returns:
        ``(text, is_str_operands)``. ``text`` is empty when the orientation is
        not accepted or there is no operand. ``is_str_operands`` is True when
        ``operands[0]`` already was a ``str`` and was returned unchanged.

    """
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        if isinstance(operands[0], str):
            t = operands[0]
            is_str_operands = True
        else:
            # Not a str in this branch, so the operand is used as raw bytes.
            tt: bytes = operands[0]
            if isinstance(font.encoding, str):
                try:
                    t = tt.decode(font.encoding, "surrogatepass")  # apply str encoding
                except (UnicodeError, LookupError):
                    # the data does not match the expectation,
                    # we use the alternative ;
                    # text extraction may not be good
                    t = tt.decode(
                        "utf-16-be" if font.encoding == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                # Bytes missing from the mapping are kept as the character of
                # the same code. chr() also covers bytes >= 0x80, which the
                # default UTF-8 decoding of a single byte cannot decode.
                t = "".join(
                    [font.encoding[x] if x in font.encoding else chr(x) for x in tt]
                )
    return (t, is_str_operands)
|
||||
|
||||
|
||||
def get_display_str(
    text: str,
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> tuple[str, bool, float]:
    """
    Add the characters of ``text_operands`` to ``text`` in display order.

    Each character is first mapped through ``font.character_map``. Characters
    of the right-to-left ranges are prepended, other directional characters
    are appended, and "neutral" characters follow the current direction.
    When the direction changes, the text collected so far is passed to
    ``visitor_text`` (if given) and collection restarts from an empty string.

    Args:
        text: text collected so far.
        cm_matrix: ``cm`` matrix passed to ``visitor_text``.
        tm_matrix: text matrix passed to ``visitor_text``.
        font_resource: font dictionary passed to ``visitor_text``.
        font: current font, providing ``character_map``, ``space_width`` and
            ``text_width()``.
        text_operands: characters to add.
        font_size: font size passed to ``visitor_text``.
        rtl_dir: True if the current direction is right to left.
        visitor_text: optional callback receiving the text collected before a
            change of direction.

    Returns:
        ``(text, rtl_dir, widths)``: the updated text, the direction after the
        last character, and the summed width of the added characters.

    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    total_width: float = 0.0
    mapped_chars = [font.character_map.get(char, char) for char in text_operands]
    for glyph in mapped_chars:
        # A mapped entry can hold several characters (ex: habibi.pdf); it is
        # then given the code 1, which falls in the order-keeping group below.
        code = ord(glyph) if len(glyph) == 1 else 1
        # fmt: off
        keeps_order = (
            code <= 0x2F                         # up to "/": punctuation, but...
            or 0x3A <= code <= 0x40              # ...not the digits (x30-39)
            or 0x2000 <= code <= 0x206F          # upper punctuations..
            or 0x20A0 <= code <= 0x21FF          # but (numbers) indices/exponents
            or code in CUSTOM_RTL_SPECIAL_CHARS  # customized....
        )
        if keeps_order:
            text = glyph + text if rtl_dir else text + glyph
        else:
            is_rtl_char = (  # right-to-left characters set
                0x0590 <= code <= 0x08FF
                or 0xFB1D <= code <= 0xFDFF
                or 0xFE70 <= code <= 0xFEFF
                or CUSTOM_RTL_MIN <= code <= CUSTOM_RTL_MAX
            )
            if is_rtl_char != bool(rtl_dir):
                # Direction change: hand over what was collected and restart.
                rtl_dir = is_rtl_char
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                text = ""
            text = glyph + text if is_rtl_char else text + glyph
        # fmt: on
        total_width += font.space_width if glyph == " " else font.text_width(glyph)
    return text, rtl_dir, total_width
|
||||
@@ -0,0 +1,16 @@
|
||||
"""Layout mode text extraction extension for pypdf"""
|
||||
from ..._font import Font
|
||||
from ._fixed_width_page import (
|
||||
fixed_char_width,
|
||||
fixed_width_page,
|
||||
text_show_operations,
|
||||
y_coordinate_groups,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Font",
|
||||
"fixed_char_width",
|
||||
"fixed_width_page",
|
||||
"text_show_operations",
|
||||
"y_coordinate_groups",
|
||||
]
|
||||
@@ -0,0 +1,400 @@
|
||||
"""Extract PDF text preserving the layout of the source PDF"""
|
||||
|
||||
from collections.abc import Iterator
|
||||
from itertools import groupby
|
||||
from math import ceil
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, Optional, TypedDict
|
||||
|
||||
from ..._font import Font
|
||||
from ..._utils import logger_warning
|
||||
from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
|
||||
from ._text_state_manager import TextStateManager
|
||||
from ._text_state_params import TextStateParams
|
||||
|
||||
|
||||
class BTGroup(TypedDict):
    """
    One line of text rendered between a BT and an ET operator.

    Text show operations that render on the same line are merged into a
    single BTGroup.

    Keys:
        tx: x coordinate of the first character of the group
        ty: y coordinate of the first character of the group
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of the last character of the group
        flip_sort: -1 if the page is upside down, else 1

    """

    # position of the first character
    tx: float
    ty: float
    # font metrics
    font_size: float
    font_height: float
    # content and horizontal extent
    text: str
    displaced_tx: float
    # vertical sort direction
    flip_sort: Literal[-1, 1]
|
||||
|
||||
|
||||
def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    Build the BTGroup describing a line that starts at ``tj_op``.

    Args:
        tj_op (TextStateParams): text state of the first text show operation
            of the line; supplies position, font metrics and vertical flip.
        rendered_text (str): text of the whole line.
        dispaced_tx (float): x coordinate of the last character of the line.

    Returns:
        BTGroup: the new group.

    """
    return {
        "tx": tj_op.tx,
        "ty": tj_op.ty,
        "font_size": tj_op.font_size,
        "font_height": tj_op.font_height,
        "text": rendered_text,
        "displaced_tx": dispaced_tx,
        "flip_sort": -1 if tj_op.flip_vertical else 1,
    }
|
||||
|
||||
|
||||
def recurs_to_target_op(
    ops: Iterator[tuple[list[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
) -> tuple[list[BTGroup], list[TextStateParams]]:
    """
    Consume operators up to ``end_target``, recursing into nested q/Q and
    BT/ET pairs, while maintaining the transform stack and collecting the
    text positioning and rendering data.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
        strip_rotated: if True, rotated text is left out of the BTGroups.

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.

    """
    # One entry per line of text rendered within each BT/ET operation.
    bt_groups: list[BTGroup] = []

    # One entry per text show operator (Tj/TJ/'/")
    tj_ops: list[TextStateParams] = []

    if end_target == b"Q":
        # New q level: the cm's added at this level are popped at the next b'Q'.
        text_state_mgr.add_q()

    for operands, op in ops:
        # The loop is left through ``break`` on the end target; running out of
        # operators instead reaches the ``else`` clause of the loop.
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                if not tj_ops:
                    return bt_groups, tj_ops
                line_text = ""
                group_start = 0  # index of the first tj of the group being built
                prev_end_tx = tj_ops[group_start].displaced_tx
                prev_ty = tj_ops[group_start].ty
                for position, shown in enumerate(tj_ops):
                    if strip_rotated and shown.rotated:
                        continue
                    # text_show_operations() logs a warning for such fonts
                    if not shown.font.interpretable:
                        continue

                    # A vertical move larger than the font height means a new
                    # line: close the current group and start another one.
                    if abs(shown.ty - prev_ty) > shown.font_height:
                        if line_text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[group_start], line_text, prev_end_tx)
                            )
                        group_start = position
                        line_text = ""

                    # A backward horizontal move of more than
                    # LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS space widths means the
                    # text order should be flipped: start a new group as well.
                    backward_move = prev_end_tx - shown.tx
                    if backward_move > shown.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS:
                        if line_text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[group_start], line_text, prev_end_tx)
                            )
                        group_start = position
                        prev_end_tx = shown.displaced_tx
                        line_text = ""

                    # Horizontal gap since the end of the previous tj. It is
                    # forced to zero for the first tj of a group so that the
                    # spaces are not applied a second time in fixed_width_page().
                    if position != group_start:
                        gap = round(shown.tx - prev_end_tx, 3)
                    else:
                        gap = 0.0
                    # space_tx could be 0 if either Tz or font_size was 0 for this tj.
                    spaces = int(gap // shown.space_tx) if shown.space_tx else 0

                    line_text = f"{line_text}{' ' * spaces}{shown.txt}"
                    prev_ty = shown.ty
                    prev_end_tx = shown.displaced_tx
                if line_text:
                    bt_groups.append(bt_group(tj_ops[group_start], line_text, prev_end_tx))
                text_state_mgr.reset_tm()
            break

        if op in (b"q", b"BT"):
            # Nested pair: recurse until the matching closing operator.
            nested_target: Literal[b"Q", b"ET"] = b"Q" if op == b"q" else b"ET"
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, nested_target, fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            current = text_state_mgr.text_state_params()
            for item in operands[0]:
                if isinstance(item, bytes):
                    current = text_state_mgr.text_state_params(item)
                    tj_ops.append(current)
                else:
                    # Non-bytes items are offsets: move the rendering position.
                    text_state_mgr.add_trm(current.displacement_matrix(td_offset=item))
        elif op == b"'":
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            text_state_mgr.reset_trm()
            if op == b"Tm":
                text_state_mgr.reset_tm()
            elif op == b"TD":
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
    else:
        logger_warning(
            f"Unbalanced target operations, expected {end_target!r}.",
            __name__,
        )
    return bt_groups, tj_ops
|
||||
|
||||
|
||||
def y_coordinate_groups(
    bt_groups: list[BTGroup], debug_path: Optional[Path] = None
) -> dict[int, list[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Consecutive BTGroups sharing the same integer y coordinate form a line.
    A line is then merged into the previous one when their y coordinates
    differ by less than the smaller of their font heights and they do not
    start text at the same integer x coordinate.

    Args:
        bt_groups: list of dicts as returned by text_show_operations(). The
            list is expected to be ordered by y coordinate already, because
            only consecutive items are grouped.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
            keyed by y coordinate. Empty if ``bt_groups`` is empty.

    """
    if not bt_groups:
        # Nothing to group; the merge step below needs at least one line.
        return {}
    ty_groups = {
        ty: sorted(grp, key=lambda x: x["tx"])
        for ty, grp in groupby(
            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
        )
    }
    # combine groups whose y coordinates differ by less than the effective font height
    # (accounts for mixed fonts and other minor oddities)
    last_ty = next(iter(ty_groups))
    last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
    # iterate over a snapshot of the keys: merged groups are popped on the way
    for ty in list(ty_groups)[1:]:
        fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
        txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
        # prevent merge if both groups are rendering in the same x position.
        no_text_overlap = not (txs & last_txs)
        offset_less_than_font_height = abs(ty - last_ty) < fsz
        if no_text_overlap and offset_less_than_font_height:
            ty_groups[last_ty] = sorted(
                ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
            )
            last_txs |= txs
        else:
            last_ty = ty
            last_txs = txs
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
|
||||
|
||||
|
||||
def text_show_operations(
    ops: Iterator[tuple[list[Any], bytes]],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> list[BTGroup]:
    """
    Collect the text rendered by the BT/ET operator pairs of a content stream.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator,
            sorted and shifted so that the smallest ``tx`` becomes 0.

    """
    state_mgr = TextStateManager()  # transformation stack manager
    collected_groups: list[BTGroup] = []  # BT operator dict
    tj_ops: list[TextStateParams] = []  # Tj/TJ operator data
    for operands, op in ops:
        if op == b"Tf":
            state_mgr.set_font(fonts[operands[0]], operands[1])
        elif op in (b"BT", b"q"):
            closing_op: Literal[b"Q", b"ET"] = b"ET" if op == b"BT" else b"Q"
            bts, tjs = recurs_to_target_op(
                ops, state_mgr, closing_op, fonts, strip_rotated
            )
            collected_groups.extend(bts)
            tj_ops.extend(tjs)
        else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
            state_mgr.set_state_param(op, operands)

    if any(tj.rotated for tj in tj_ops):
        rotation_message = (
            "Rotated text discovered. Output will be incomplete."
            if strip_rotated
            else "Rotated text discovered. Layout will be degraded."
        )
        logger_warning(rotation_message, __name__)
    if any(not tj.font.interpretable for tj in tj_ops):
        logger_warning(
            "PDF contains an uninterpretable font. Output will be incomplete.", __name__
        )

    # left align the data, i.e. decrement all tx values by min(tx)
    min_x = min((x["tx"] for x in collected_groups), default=0.0)
    ordered_groups = sorted(
        collected_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True
    )
    bt_groups: list[BTGroup] = []
    for ogrp in ordered_groups:
        shifted = dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x)
        bt_groups.append(shifted)  # type: ignore[arg-type]

    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bts.json").write_text(
            json.dumps(bt_groups, indent=2, default=str), "utf-8"
        )
        debug_path.joinpath("tjs.json").write_text(
            json.dumps(
                tj_ops, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )
    return bt_groups
|
||||
|
||||
|
||||
def fixed_char_width(bt_groups: list[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Compute the character width used for the fixed-width layout.

    The result is the average width per character of the groups, each group
    being weighted by the length of its text multiplied by ``scale_weight``.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator
        scale_weight: factor applied to the text length of every group.

    Returns:
        float: fixed character width

    """
    weights = [len(group["text"]) * scale_weight for group in bt_groups]
    widths = [
        (group["displaced_tx"] - group["tx"]) / weight
        for group, weight in zip(bt_groups, weights)
    ]
    weighted_total = sum(width * weight for width, weight in zip(widths, weights))
    return weighted_total / sum(weights)
|
||||
|
||||
|
||||
def fixed_width_page(
    ty_groups: dict[int, list[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
    """
    Generate page text from text operations grouped by rendered y coordinate.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width
        space_vertically: include blank lines inferred from y distance + font height.
        font_height_weight: multiplier for font height when calculating blank lines.
            No blank line is inserted when the weighted font height is zero.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
            layout in the source pdf.

    """
    lines: list[str] = []
    last_y_coord = 0
    # characters with codes 14 to 31 are replaced by a space
    table = str.maketrans(dict.fromkeys(range(14, 32), " "))
    for y_coord, line_data in ty_groups.items():
        if space_vertically and lines:
            fh = line_data[0]["font_height"]
            line_height = fh * font_height_weight
            # Guard the weighted height, not only the font height: a zero
            # weight would otherwise divide by zero.
            blank_lines = 0 if fh == 0 or line_height == 0 else (
                int(abs(y_coord - last_y_coord) / line_height) - 1
            )
            lines.extend([""] * blank_lines)

        line_parts = []  # It uses a list to construct the line, avoiding string concatenation.
        current_len = 0  # Track the size with int instead of len(str) overhead.
        last_disp = 0.0
        for bt_op in line_data:
            tx = bt_op["tx"]
            offset = int(tx // char_width)
            needed_spaces = offset - current_len
            # pad up to the target column, but only if this text starts
            # clearly after the end of the previous one
            if needed_spaces > 0 and ceil(last_disp) < int(tx):
                padding = " " * needed_spaces
                line_parts.append(padding)
                current_len += needed_spaces

            raw_text = bt_op["text"]
            text = raw_text.translate(table)
            line_parts.append(text)
            current_len += len(text)
            last_disp = bt_op["displaced_tx"]

        full_line = "".join(line_parts).rstrip()
        if full_line.strip() or (space_vertically and lines):
            lines.append(full_line)

        last_y_coord = y_coord

    return "\n".join(lines)
|
||||
@@ -0,0 +1,221 @@
|
||||
"""manage the PDF transform stack during "layout" mode text extraction"""
|
||||
|
||||
from collections import ChainMap, Counter
|
||||
from collections import ChainMap as ChainMapType
|
||||
from collections import Counter as CounterType
|
||||
from collections.abc import MutableMapping
|
||||
from typing import Any, Union
|
||||
|
||||
from ..._font import Font
|
||||
from ...errors import PdfReadError
|
||||
from .. import mult
|
||||
from ._text_state_params import TextStateParams
|
||||
|
||||
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
|
||||
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
|
||||
|
||||
|
||||
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm/trm transformation
            matrices, most recent first
        q_queue (Counter[int]): number of cm transforms added at each q level
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (float): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font_stack (List[Tuple[Font | None, int | float]]): fonts and font
            sizes saved by add_q() and restored by remove_q()
        font (Font): font object
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.

        """
        if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]:
            return
        # the attribute carries the same name as the operator
        setattr(self, op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Bytes are decoded with the font encoding (a codec name or a mapping
        from byte value to text), falling back to UTF-8 with replacement when
        that fails, and then mapped through the font's character map.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                txt = value.decode("utf-8", "replace")
            txt = "".join(
                self.font.character_map.get(x, x) for x in txt
            )
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> dict[int, float]:
        """Only a/b/c/d/e/f matrix params, stored as floats under the keys 0 to 5"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """Rewind to stack prior state after closing a 'q' with internal 'cm' ops"""
        self.font, self.font_size = self.font_stack.pop()
        self.transform_stack = self.reset_tm()
        # drop as many leading maps as cm transforms were added at the q
        # level being closed
        self.transform_stack.maps = self.transform_stack.maps[
            self.q_queue.pop(self.q_depth.pop(), 0) :
        ]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue and save the current font and font size"""
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # count this cm against the current q level
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: list[float]) -> list[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> list[float]:
        """Current effective transform accounting for cm, tm, and trm transforms"""
        maps = self.transform_stack.maps
        # Read only the six matrix coefficients: each map also stores the
        # 'is_text' and 'is_render' flags, which are not part of the matrix.
        eff_transform = [maps[0][idx] for idx in range(6)]  # type: ignore[misc]
        for transform in maps[1:]:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform
|
||||
@@ -0,0 +1,135 @@
|
||||
"""A dataclass that captures the CTM and Text State for a tj operation"""
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Union
|
||||
|
||||
from ..._font import Font
|
||||
from .. import mult, orient
|
||||
|
||||
|
||||
@dataclass
class TextStateParams:
    """
    Text state parameters and operator values for a single text value in a
    TJ or Tj PDF operation.

    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.

    """

    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: list[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    # Everything below is derived in __post_init__ and cannot be passed to the constructor.
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)

    def __post_init__(self) -> None:
        """Adjust rotated transforms, then fill in the derived (``init=False``) fields."""
        if orient(self.transform) in (90, 270):
            correction = [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0]
            self.transform = mult(correction, self.transform)
            self.rotated = True

        # orient() is evaluated again because the branch above may have replaced
        # self.transform.
        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
            half_turn = [-1.0, 0.0, 0.0, -1.0, 0.0, 0.0]
            self.transform = mult(half_turn, self.transform)
            self.rotated = True

        self.displaced_tx = self.displaced_transform()[4]
        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]

        space_tx = round(self.word_tx(" "), 3)
        if space_tx < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a td_offset of -1 * font.space_width where font.space_width is
            # the space_width calculated in _font.py.
            space_tx = round(self.word_tx("", -self.font.space_width), 3)
        self.space_tx = space_tx

        vertical_scale = math.sqrt(self.transform[1] ** 2 + self.transform[3] ** 2)
        self.font_height = self.font_size * vertical_scale
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis

    def font_size_matrix(self) -> list[float]:
        """Font size matrix: x scaled by font_size * Tz / 100, y by font_size, y offset Ts."""
        horizontal = self.font_size * (self.Tz / 100.0)
        return [horizontal, 0.0, 0.0, self.font_size, 0.0, self.Ts]

    def displaced_transform(self) -> list[float]:
        """Effective transform matrix after text has been rendered."""
        shift = self.displacement_matrix()
        return mult(shift, self.transform)

    def render_transform(self) -> list[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        sizing = self.font_size_matrix()
        return mult(sizing, self.transform)

    def displacement_matrix(
        self, word: Union[str, None] = None, td_offset: float = 0.0
    ) -> list[float]:
        """
        Text displacement matrix

        Args:
            word (str, optional): Defaults to None in which case self.txt displacement is
                returned.
            td_offset (float, optional): translation applied by TD operator. Defaults to 0.0.

        Returns:
            A pure translation matrix whose x offset is ``word_tx(word, td_offset)``.

        """
        if word is None:
            word = self.txt
        return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, td_offset), 0.0]

    def word_tx(self, word: str, td_offset: float = 0.0) -> float:
        """
        Horizontal text displacement for any word according to this text state

        Args:
            word: Text to measure; a space uses ``font.space_width``, every
                other character ``font.text_width(char)``.
            td_offset: Amount subtracted from the summed width before it is
                scaled by the font size.

        Returns:
            The displacement, scaled by ``Tz / 100``.

        """
        glyph_units: float = 0.0
        for glyph in word:
            glyph_units += (
                self.font.space_width if glyph == " " else self.font.text_width(glyph)
            )
        # NOTE(review): Tc is added once per call here, not once per character;
        # confirm this is intended.
        unscaled = (
            (self.font_size * ((glyph_units - td_offset) / 1000.0))
            + self.Tc
            + word.count(" ") * self.Tw
        )
        return unscaled * (self.Tz / 100.0)

    @staticmethod
    def to_dict(inst: "TextStateParams") -> dict[str, Any]:
        """Dataclass to dict for json.dumps serialization (the ``font`` field is left out)"""
        serialisable: dict[str, Any] = {}
        for name in inst.__dataclass_fields__:
            if name == "font":
                continue
            serialisable[name] = getattr(inst, name)
        return serialisable
|
||||
@@ -0,0 +1,351 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import math
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
from .._font import Font, FontDescriptor
|
||||
from ..generic import DictionaryObject, TextStringObject
|
||||
from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult
|
||||
|
||||
|
||||
class TextExtraction:
|
||||
"""
|
||||
A class to handle PDF text extraction operations.
|
||||
|
||||
This class encapsulates all the state and operations needed for extracting
|
||||
text from PDF content streams, replacing the nested functions and nonlocal
|
||||
variables in the original implementation.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
    """Put the extractor in its default state and register the operator handlers."""

    def identity() -> list[float]:
        # A fresh list per attribute: handlers such as Td modify tm_matrix in place.
        return [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {}

    # Text extraction state variables
    self.cm_matrix: list[float] = identity()
    self.tm_matrix: list[float] = identity()
    # One entry is pushed per "q" operator and popped per "Q" operator.
    self.cm_stack: list[
        tuple[
            list[float],
            Optional[DictionaryObject],
            Font,
            float,
            float,
            float,
            float,
        ]
    ] = []

    # Store the last modified matrices; can be an intermediate position
    self.cm_prev: list[float] = identity()
    self.tm_prev: list[float] = identity()

    # Store the position at the beginning of building the text
    self.memo_cm: list[float] = identity()
    self.memo_tm: list[float] = identity()

    self.char_scale = 1.0
    self.space_scale = 1.0
    self._space_width: float = 500.0  # will be set correctly at first Tf
    # will be set to string length calculation result
    self._actual_str_size: dict[str, float] = {
        "str_widths": 0.0,
        "str_height": 0.0,
    }
    self.TL = 0.0
    self.font_size = 12.0  # init just in case of

    # Text extraction variables
    self.text: str = ""
    self.output: str = ""
    self.rtl_dir: bool = False  # right-to-left
    self.font_resource: Optional[DictionaryObject] = None
    # Placeholder font, replaced by the first Tf operator.
    self.font = Font(
        name="NotInitialized",
        sub_type="Unknown",
        encoding="charmap",
        font_descriptor=FontDescriptor(),
    )
    self.orientations: tuple[int, ...] = (0, 90, 180, 270)
    self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
    self.font_resources: dict[str, DictionaryObject] = {}
    self.fonts: dict[str, Font] = {}

    # Dispatch table used by process_operation: operator bytes -> bound handler.
    self.operation_handlers = {
        b"BT": self._handle_bt,
        b"ET": self._handle_et,
        b"q": self._handle_save_graphics_state,
        b"Q": self._handle_restore_graphics_state,
        b"cm": self._handle_cm,
        b"Tz": self._handle_tz,
        b"Tw": self._handle_tw,
        b"TL": self._handle_tl,
        b"Tf": self._handle_tf,
        b"Td": self._handle_td,
        b"Tm": self._handle_tm,
        b"T*": self._handle_t_star,
        b"Tj": self._handle_tj_operation,
    }
|
||||
|
||||
def initialize_extraction(
    self,
    orientations: tuple[int, ...] = (0, 90, 180, 270),
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    font_resources: Optional[dict[str, DictionaryObject]] = None,
    fonts: Optional[dict[str, Font]] = None
) -> None:
    """
    Initialize the extractor with extraction parameters.

    Args:
        orientations: Stored as ``self.orientations``.
        visitor_text: Optional callback stored as ``self.visitor_text``.
        font_resources: Font resource lookup; ``None`` (or an empty mapping)
            is replaced by a new empty dict.
        fonts: Font lookup; ``None`` (or an empty mapping) is replaced by a
            new empty dict.

    The accumulated ``text`` and ``output`` are cleared and ``rtl_dir`` is
    reset to ``False``; matrices and font state are left as they are.
    """
    self.orientations = orientations
    self.visitor_text = visitor_text
    self.font_resources = font_resources if font_resources else {}
    self.fonts = fonts if fonts else {}

    # Reset state
    self.text, self.output = "", ""
    self.rtl_dir = False
|
||||
|
||||
def compute_str_widths(self, str_widths: float) -> float:
    """Return ``str_widths`` divided by 1000."""
    scaled = str_widths / 1000
    return scaled
|
||||
|
||||
def process_operation(self, operator: bytes, operands: list[Any]) -> None:
    """
    Dispatch one content-stream operation to its registered handler.

    Operators without an entry in ``self.operation_handlers`` are ignored.
    After Td, Tm, T* and Tj the handler's return value (0.0 when it is
    falsy) is passed on to ``_post_process_text_operation``.

    Args:
        operator: Operator name as bytes, e.g. ``b"Tf"``.
        operands: Operands of the operation, handed to the handler as-is.
    """
    handler = self.operation_handlers.get(operator)
    if handler is None:
        return
    str_widths = handler(operands)

    # Post-process operations that affect text positioning
    if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
        self._post_process_text_operation(str_widths or 0.0)
|
||||
|
||||
def _post_process_text_operation(self, str_widths: float) -> None:
    """
    Handle common post-processing for text positioning operations.

    Runs ``crlf_space_check`` with the previous, current and memorised
    matrices and stores the text, output and "previous" matrices it returns.
    When no text is pending afterwards, the memorised matrices are moved to
    the current position. An ``OrientationNotFoundError`` leaves the state
    untouched.

    Args:
        str_widths: Width value forwarded to ``crlf_space_check``.
    """
    try:
        checked = crlf_space_check(
            self.text,
            (self.cm_prev, self.tm_prev),
            (self.cm_matrix, self.tm_matrix),
            (self.memo_cm, self.memo_tm),
            self.font_resource,
            self.orientations,
            self.output,
            self.font_size,
            self.visitor_text,
            str_widths,
            self.compute_str_widths(self.font_size * self._space_width),
            self._actual_str_size["str_height"],
        )
    except OrientationNotFoundError:
        return

    self.text, self.output, self.cm_prev, self.tm_prev = checked
    if self.text == "":
        # Nothing pending: the next text run starts at the current matrices.
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()
|
||||
|
||||
def _handle_tj(
    self,
    text: str,
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    orientations: tuple[int, ...],
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    actual_str_size: dict[str, float],
) -> tuple[str, bool, dict[str, float]]:
    """
    Add the text shown by one Tj operation to ``text`` and record its size.

    ``get_text_operands`` decides whether the operands are appended directly
    (widths summed per character from ``font``) or handed to
    ``get_display_str``, which also returns the updated direction flag.

    Returns:
        The updated text, the updated ``rtl_dir`` and ``actual_str_size``.
        The dict is modified in place: ``"str_widths"`` grows by
        ``font_widths * font_size`` and ``"str_height"`` is set to
        ``font_size``.
    """
    text_operands, is_str_operands = get_text_operands(
        operands, cm_matrix, tm_matrix, font, orientations
    )
    if not is_str_operands:
        text, rtl_dir, font_widths = get_display_str(
            text,
            cm_matrix,
            tm_matrix,  # text matrix
            font_resource,
            font,
            text_operands,
            font_size,
            rtl_dir,
            visitor_text,
        )
    else:
        text += text_operands
        font_widths = sum(
            font.space_width if glyph == " " else font.text_width(glyph)
            for glyph in text_operands
        )

    actual_str_size["str_widths"] += font_widths * font_size
    actual_str_size["str_height"] = font_size
    return text, rtl_dir, actual_str_size
|
||||
|
||||
def _flush_text(self) -> None:
|
||||
"""Flush accumulated text to output and call visitor if present."""
|
||||
self.output += self.text
|
||||
if self.visitor_text is not None:
|
||||
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
|
||||
self.text = ""
|
||||
self.memo_cm = self.cm_matrix.copy()
|
||||
self.memo_tm = self.tm_matrix.copy()
|
||||
|
||||
# Operation handlers
|
||||
|
||||
def _handle_bt(self, operands: list[Any]) -> None:
|
||||
"""Handle BT (Begin Text) operation - Table 5.4 page 405."""
|
||||
self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
|
||||
self._flush_text()
|
||||
|
||||
def _handle_et(self, operands: list[Any]) -> None:
|
||||
"""Handle ET (End Text) operation - Table 5.4 page 405."""
|
||||
self._flush_text()
|
||||
|
||||
def _handle_save_graphics_state(self, operands: list[Any]) -> None:
|
||||
"""Handle q (Save graphics state) operation - Table 4.7 page 219."""
|
||||
self.cm_stack.append(
|
||||
(
|
||||
self.cm_matrix,
|
||||
self.font_resource,
|
||||
self.font,
|
||||
self.font_size,
|
||||
self.char_scale,
|
||||
self.space_scale,
|
||||
self.TL,
|
||||
)
|
||||
)
|
||||
|
||||
def _handle_restore_graphics_state(self, operands: list[Any]) -> None:
|
||||
"""Handle Q (Restore graphics state) operation - Table 4.7 page 219."""
|
||||
try:
|
||||
(
|
||||
self.cm_matrix,
|
||||
self.font_resource,
|
||||
self.font,
|
||||
self.font_size,
|
||||
self.char_scale,
|
||||
self.space_scale,
|
||||
self.TL,
|
||||
) = self.cm_stack.pop()
|
||||
except Exception:
|
||||
self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
|
||||
|
||||
def _handle_cm(self, operands: list[Any]) -> None:
    """
    Handle cm (Modify current matrix) operation - Table 4.7 page 219.

    Pending text is flushed first, so it is reported with the matrices that
    were memorised for it. The first six operands, converted to float, are
    then combined with the current matrix through ``mult``. If that fails,
    the current matrix falls back to identity. Finally the memorised
    matrices are set to copies of the new current matrices.

    Args:
        operands: Matrix operands; only the first six are used.
    """
    # Same output/visitor/reset sequence as every other flush point.
    self._flush_text()
    try:
        self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
    except Exception:
        # Deliberate best effort: unusable operands reset the matrix to identity.
        self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()
|
||||
|
||||
def _handle_tz(self, operands: list[Any]) -> None:
|
||||
"""Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
|
||||
self.char_scale = float(operands[0]) / 100 if operands else 1.0
|
||||
|
||||
def _handle_tw(self, operands: list[Any]) -> None:
|
||||
"""Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
|
||||
self.space_scale = 1.0 + float(operands[0] if operands else 0.0)
|
||||
|
||||
def _handle_tl(self, operands: list[Any]) -> None:
|
||||
"""Handle TL (Set Text Leading) operation - Table 5.2 page 398."""
|
||||
scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
|
||||
self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x
|
||||
|
||||
def _handle_tf(self, operands: list[Any]) -> None:
|
||||
"""Handle Tf (Set font size) operation - Table 5.2 page 398."""
|
||||
if self.text != "":
|
||||
self.output += self.text # .translate(cmap)
|
||||
if self.visitor_text is not None:
|
||||
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
|
||||
self.text = ""
|
||||
self.memo_cm = self.cm_matrix.copy()
|
||||
self.memo_tm = self.tm_matrix.copy()
|
||||
try:
|
||||
self.font_resource = self.font_resources[operands[0]]
|
||||
self.font = self.fonts[operands[0]]
|
||||
except KeyError: # font not found
|
||||
self.font_resource = None
|
||||
font_descriptor = FontDescriptor()
|
||||
self.font = Font(
|
||||
"Unknown",
|
||||
space_width=250,
|
||||
encoding=dict.fromkeys(range(256), "<EFBFBD>"),
|
||||
font_descriptor=font_descriptor,
|
||||
character_map={},
|
||||
character_widths=font_descriptor.character_widths
|
||||
)
|
||||
|
||||
self._space_width = self.font.space_width / 2 # Actually the width of _half_ a space...
|
||||
try:
|
||||
self.font_size = float(operands[1])
|
||||
except Exception:
|
||||
pass # keep previous size
|
||||
|
||||
def _handle_td(self, operands: list[Any]) -> float:
|
||||
"""Handle Td (Move text position) operation - Table 5.5 page 406."""
|
||||
# A special case is a translating only tm:
|
||||
# tm = [1, 0, 0, 1, e, f]
|
||||
# i.e. tm[4] += tx, tm[5] += ty.
|
||||
tx, ty = float(operands[0]), float(operands[1])
|
||||
self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
|
||||
self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
|
||||
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
|
||||
self._actual_str_size["str_widths"] = 0.0
|
||||
return str_widths
|
||||
|
||||
def _handle_tm(self, operands: list[Any]) -> float:
|
||||
"""Handle Tm (Set text matrix) operation - Table 5.5 page 406."""
|
||||
self.tm_matrix = [float(operand) for operand in operands[:6]]
|
||||
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
|
||||
self._actual_str_size["str_widths"] = 0.0
|
||||
return str_widths
|
||||
|
||||
def _handle_t_star(self, operands: list[Any]) -> float:
|
||||
"""Handle T* (Move to next line) operation - Table 5.5 page 406."""
|
||||
self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
|
||||
self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
|
||||
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
|
||||
self._actual_str_size["str_widths"] = 0.0
|
||||
return str_widths
|
||||
|
||||
def _handle_tj_operation(self, operands: list[Any]) -> float:
|
||||
"""Handle Tj (Show text) operation - Table 5.5 page 406."""
|
||||
self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
|
||||
self.text,
|
||||
operands,
|
||||
self.cm_matrix,
|
||||
self.tm_matrix,
|
||||
self.font_resource,
|
||||
self.font,
|
||||
self.orientations,
|
||||
self.font_size,
|
||||
self.rtl_dir,
|
||||
self.visitor_text,
|
||||
self._actual_str_size,
|
||||
)
|
||||
return 0.0 # str_widths will be handled in post-processing
|
||||
Reference in New Issue
Block a user