328 lines
14 KiB
Python
328 lines
14 KiB
Python
from collections.abc import Sequence
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Optional, Union, cast
|
|
|
|
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
|
|
|
|
from ._cmap import get_encoding
|
|
from ._codecs.adobe_glyphs import adobe_glyphs
|
|
from ._utils import logger_warning
|
|
|
|
|
|
@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    Attributes:
        name: Taken from "/FontName" when a font descriptor provides it.
        family: Taken from "/FontFamily" when a font descriptor provides it.
        weight: Taken from "/FontWeight" when a font descriptor provides it.
        ascent: Taken from "/Ascent" when a font descriptor provides it.
        descent: Taken from "/Descent" when a font descriptor provides it.
        cap_height: Taken from "/CapHeight" when a font descriptor provides it.
        x_height: Taken from "/XHeight" when a font descriptor provides it.
        italic_angle: Taken from "/ItalicAngle" when a font descriptor provides it.
        flags: Taken from "/Flags" when a font descriptor provides it.
        bbox: Taken from "/FontBBox", converted to a tuple of four floats.
        character_widths: A mapping of characters to widths. The special key
            "default" holds the width used for characters without an entry.

    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Copy the supported entries of a /FontDescriptor dictionary into ``font_kwargs``.

        Args:
            font_kwargs: Keyword arguments being collected for the constructor.
                The dictionary is updated in place.
            font_descriptor_obj: The font descriptor dictionary, or an indirect
                reference to it (resolved here).

        Returns:
            The same ``font_kwargs`` dictionary that was passed in.

        Raises:
            AssertionError: If a "/FontBBox" entry does not have exactly four values.

        """
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox"),
        ]:
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]
        # A missing bbox is fine: the dataclass default is used instead.
        if "bbox" in font_kwargs:
            bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
            # NOTE: kept as an assertion so the raised exception type stays the
            # same; be aware that it is stripped when Python runs with -O.
            assert len(bbox_tuple) == 4, bbox_tuple
            font_kwargs["bbox"] = bbox_tuple
        return font_kwargs

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update character widths.

        Args:
            pdf_font_dict: The font dictionary; it must contain "/Widths".
                "/FirstChar" defaults to 0 when absent.
            char_map: Mapping from the decoded raw character to the character
                that should be used as key in ``current_widths``.
            encoding: Either a codec name used to decode each single-byte
                character code, or a mapping of character codes to characters.
            current_widths: The mapping of characters to widths, updated in place.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict: look the character up by its
            # code and fall back to the code point itself for unmapped codes.
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Character codes without a matching single-character string key in
        ``char_map`` are skipped. Malformed or truncated width definitions are
        reported through ``logger_warning`` instead of raising.

        Args:
            d_font: The descendant font dictionary; a missing "/W" is treated
                as an empty array.
            char_map: Mapping whose string keys are converted to code points
                and whose values become the keys of ``current_widths``.
            current_widths: The mapping of characters to widths, updated in place.

        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        _w = d_font.get("/W", [])

        def _lookahead(position: int) -> Any:
            """Return the resolved entry at ``position``, or None past the end of /W."""
            try:
                return _w[position].get_object()
            except IndexError:
                return None

        skip_count = 0
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This entry was already consumed as part of the previous definition.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            start_idx = cast(int, w_entry)
            w_next_entry = _lookahead(idx + 1)

            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(start_idx, start_idx + len(w_next_entry), 1),
                            w_next_entry,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
                continue

            # check for format (2): `int int int`
            if isinstance(w_next_entry, (int, float)):
                const_width = _lookahead(idx + 2)
                if isinstance(const_width, (int, float)):
                    current_widths.update(
                        {
                            ord_map[_cidx]: const_width
                            for _cidx in range(start_idx, cast(int, w_next_entry + 1), 1)
                            if _cidx in ord_map
                        }
                    )
                    skip_count = 2
                    continue

            # Neither format matched. This includes reaching the end of the
            # width definitions while expecting more elements.
            logger_warning(
                f"Invalid font width definition. Last element: {w_entry}.",
                __name__
            )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int]) -> None:
        """
        Ensure that ``current_widths`` has a "default" entry, updating it in place.

        An existing "default" is kept. Otherwise the value is, in order of
        preference: 500 for an empty mapping, twice the non-zero width of the
        space character, the integer mean of all positive widths, or 500.
        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if "default" in current_widths:
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to twice the space width
            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Build a font descriptor from a PDF font dictionary.

        Args:
            pdf_font_dict: The font dictionary from the page resources.
            encoding: The font encoding. Determined with ``get_encoding`` when
                it or ``char_map`` is missing and character widths have to be
                collected.
            char_map: The font's character map; see ``encoding``.

        Returns:
            The font descriptor. For a font of one of the handled simple
            subtypes that has no "/Widths" but is named like a core font, the
            shared entry of ``CORE_FONT_METRICS`` is returned. The returned
            ``character_widths`` always contains a "default" entry.

        """
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415
        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/Widths" in pdf_font_dict:
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                cls._collect_tt_t1_character_widths(
                    pdf_font_dict, char_map, encoding, font_kwargs["character_widths"]
                )
            elif font_name in CORE_FONT_METRICS:
                font_descriptor = CORE_FONT_METRICS[font_name]
                cls._add_default_width(font_descriptor.character_widths)

                return font_descriptor

            if "/FontDescriptor" in pdf_font_dict:  # TODO: This does not account for some Type3 fonts;
                # see tests/test_cmap.py::test_ascii_charset
                font_descriptor_resource = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor_obj = cast(DictionaryObject, font_descriptor_resource)
                if "/MissingWidth" in font_descriptor_obj:
                    font_kwargs["character_widths"]["default"] = font_descriptor_obj["/MissingWidth"].get_object()
                # Reuse the descriptor resolved above instead of fetching it again.
                font_kwargs = cls._parse_font_descriptor(font_kwargs, font_descriptor_obj)
            if "default" not in font_kwargs["character_widths"]:
                cls._add_default_width(font_kwargs["character_widths"])

            return cls(**font_kwargs)

        # Composite font or CID font - CID fonts have a /W array mapping character codes
        # to widths stashed in /DescendantFonts. All handled simple font types have
        # been dealt with above, but a malformed font dictionary may still lack
        # /DescendantFonts, so fall back to an empty array instead of raising.
        if not (encoding and char_map):
            encoding, char_map = get_encoding(pdf_font_dict)
        if "/DescendantFonts" in pdf_font_dict:
            descendant_fonts = cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
        else:
            descendant_fonts = ArrayObject()
        d_font: DictionaryObject
        for d_font_idx, d_font in enumerate(descendant_fonts):
            d_font = cast(DictionaryObject, d_font.get_object())
            # Store the resolved dictionary back into the array.
            descendant_fonts[d_font_idx] = d_font
            cls._collect_cid_character_widths(
                d_font, char_map, font_kwargs["character_widths"]
            )
            if "/DW" in d_font:
                font_kwargs["character_widths"]["default"] = d_font["/DW"].get_object()
            else:
                cls._add_default_width(font_kwargs["character_widths"])
            font_kwargs = cls._parse_font_descriptor(
                font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
            )

        # Without any descendant font, nothing above has set the fallback width.
        if "default" not in font_kwargs["character_widths"]:
            cls._add_default_width(font_kwargs["character_widths"])

        return cls(**font_kwargs)
|
|
|
|
|
|
@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Font metrics, including a mapping of characters to widths
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=dict)
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        """
        Build a font from a PDF font dictionary.

        Args:
            pdf_font_dict: The font dictionary from the page resources.

        Returns:
            The font. Its ``character_widths`` is the mapping held by its
            ``font_descriptor``; ``space_width`` is the width of " " or, when
            that is missing or zero, half of the "default" width.

        """
        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)

        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        interpretable = True
        if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
            interpretable = all(
                cname in adobe_glyphs
                for cname in pdf_font_dict.get("/CharProcs") or []
            )

        if interpretable:
            font_descriptor = FontDescriptor.from_font_resource(pdf_font_dict, encoding, character_map)
        else:
            font_descriptor = FontDescriptor()  # Save some overhead if font is not interpretable
        character_widths = font_descriptor.character_widths

        space_width = character_widths.get(" ")
        if not space_width:
            # Missing or zero-width space: approximate it as half of the
            # default width. 500 mirrors the default of FontDescriptor in case
            # the descriptor does not provide a "default" entry.
            space_width = character_widths.get("default", 500) // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an entry in ``character_widths`` use its "default"
        width. If that entry is missing as well (for example when the font was
        constructed without any widths), twice ``space_width`` is used instead
        of raising ``KeyError``.

        Args:
            text: The text to measure.

        Returns:
            The total width as a float; 0.0 for empty text.

        """
        widths = self.character_widths
        # Resolve the fallback once instead of once per character.
        default_width = widths.get("default", 2 * self.space_width)
        return sum((widths.get(char, default_width) for char in text), 0.0)
|