Update ashboard, dashboard, memory +1 more (+2 ~3)
115
venv/lib/python3.12/site-packages/pypdf/generic/__init__.py
Normal file
@@ -0,0 +1,115 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""Implementation of generic PDF objects (dictionary, number, string, ...)."""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

from ..constants import OutlineFontFlag
from ._base import (
    BooleanObject,
    ByteStringObject,
    FloatObject,
    IndirectObject,
    NameObject,
    NullObject,
    NumberObject,
    PdfObject,
    TextStringObject,
    encode_pdfdocencoding,
    is_null_or_none,
)
from ._data_structures import (
    ArrayObject,
    ContentStream,
    DecodedStreamObject,
    Destination,
    DictionaryObject,
    EncodedStreamObject,
    Field,
    StreamObject,
    TreeObject,
    read_object,
)
from ._files import EmbeddedFile
from ._fit import Fit
from ._link import DirectReferenceLink, NamedReferenceLink, ReferenceLink, extract_links
from ._outline import OutlineItem
from ._rectangle import RectangleObject
from ._utils import (
    create_string_object,
    decode_pdfdocencoding,
    hex_to_rgb,
    read_hex_string_from_stream,
    read_string_from_stream,
)
from ._viewerpref import ViewerPreferences

PAGE_FIT = Fit.fit()


__all__ = [
    "PAGE_FIT",
    "ArrayObject",
    "BooleanObject",
    "ByteStringObject",
    "ContentStream",
    "DecodedStreamObject",
    "Destination",
    "DictionaryObject",
    "DirectReferenceLink",
    "EmbeddedFile",
    "EncodedStreamObject",
    "Field",
    "Fit",
    "FloatObject",
    "IndirectObject",
    "NameObject",
    "NamedReferenceLink",
    "NullObject",
    "NumberObject",
    "OutlineFontFlag",
    "OutlineItem",
    "PdfObject",
    "RectangleObject",
    "ReferenceLink",
    "StreamObject",
    "TextStringObject",
    "TreeObject",
    "ViewerPreferences",
    # Utility functions
    "create_string_object",
    "decode_pdfdocencoding",
    "encode_pdfdocencoding",
    "extract_links",
    "hex_to_rgb",
    "is_null_or_none",
    "read_hex_string_from_stream",
    # Data structures core functions
    "read_object",
    "read_string_from_stream",
]
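As a quick orientation for the exports above, here is a minimal sketch (not part of the committed file) of how the generic primitives are typically combined; the dictionary keys and values are illustrative only:

from pypdf.generic import (
    ArrayObject,
    DictionaryObject,
    NameObject,
    NumberObject,
    TextStringObject,
)

# Build a small annotation-like dictionary from the generic primitives.
annotation = DictionaryObject({
    NameObject("/Type"): NameObject("/Annot"),
    NameObject("/Subtype"): NameObject("/Text"),
    NameObject("/Contents"): TextStringObject("Hello"),
    NameObject("/Rect"): ArrayObject(
        [NumberObject(0), NumberObject(0), NumberObject(100), NumberObject(20)]
    ),
})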
@@ -0,0 +1,547 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import IntEnum
|
||||
from typing import Any, Optional, Union, cast
|
||||
|
||||
from .._codecs import fill_from_encoding
|
||||
from .._codecs.core_fontmetrics import CORE_FONT_METRICS
|
||||
from .._font import Font
|
||||
from .._utils import logger_warning
|
||||
from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes
|
||||
from ..generic import (
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
NameObject,
|
||||
NumberObject,
|
||||
RectangleObject,
|
||||
)
|
||||
from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none
|
||||
|
||||
DEFAULT_FONT_SIZE_IN_MULTILINE = 12
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseStreamConfig:
|
||||
"""A container representing the basic layout of an appearance stream."""
|
||||
rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0)
|
||||
border_width: int = 1 # The width of the border in points
|
||||
border_style: str = BorderStyles.SOLID
|
||||
|
||||
|
||||
class BaseStreamAppearance(DecodedStreamObject):
|
||||
"""A class representing the very base of an appearance stream, that is, a rectangle and a border."""
|
||||
|
||||
def __init__(self, layout: Optional[BaseStreamConfig] = None) -> None:
|
||||
"""
|
||||
Takes the appearance stream layout as an argument.
|
||||
|
||||
Args:
|
||||
layout: The basic layout parameters.
|
||||
"""
|
||||
super().__init__()
|
||||
self._layout = layout or BaseStreamConfig()
|
||||
self[NameObject("/Type")] = NameObject("/XObject")
|
||||
self[NameObject("/Subtype")] = NameObject("/Form")
|
||||
self[NameObject("/BBox")] = RectangleObject(self._layout.rectangle)
|
||||
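For orientation (a sketch under the definitions above, not part of the committed file): BaseStreamConfig only bundles the bounding box and border parameters, and BaseStreamAppearance turns them into a Form XObject dictionary.

# Hypothetical values; any rectangle/border combination works the same way.
layout = BaseStreamConfig(rectangle=(0, 0, 120, 20), border_width=1)
appearance = BaseStreamAppearance(layout)
assert appearance["/Type"] == "/XObject"
assert appearance["/Subtype"] == "/Form"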
|
||||
|
||||
class TextAlignment(IntEnum):
|
||||
"""Defines the alignment options for text within a form field's appearance stream."""
|
||||
|
||||
LEFT = 0
|
||||
CENTER = 1
|
||||
RIGHT = 2
|
||||
|
||||
|
||||
class TextStreamAppearance(BaseStreamAppearance):
|
||||
"""
|
||||
A class representing the appearance stream for a text-based form field.
|
||||
|
||||
This class generates the content stream (the `ap_stream_data`) that dictates
|
||||
how text is rendered within a form field's bounding box. It handles properties
|
||||
like font, font size, color, multiline text, and text selection highlighting.
|
||||
"""
|
||||
|
||||
def _scale_text(
|
||||
self,
|
||||
font: Font,
|
||||
font_size: float,
|
||||
leading_factor: float,
|
||||
field_width: float,
|
||||
field_height: float,
|
||||
text: str,
|
||||
min_font_size: float,
|
||||
font_size_step: float = 0.2
|
||||
) -> tuple[list[tuple[float, str]], float]:
|
||||
"""
|
||||
Takes a piece of text and scales it so that it fits within field_width and
field_height for the given font and font_size, wrapping text where necessary.
|
||||
|
||||
Args:
|
||||
font: The font to be used.
|
||||
font_size: The font size in points.
|
||||
leading_factor: The line distance.
|
||||
field_width: The width of the field in which to fit the text.
|
||||
field_height: The height of the field in which to fit the text.
|
||||
text: The text to fit within the field.
|
||||
min_font_size: The minimum font size at which to scale the text.
|
||||
font_size_step: The amount by which to decrement font size per step while scaling.
|
||||
|
||||
Returns:
|
||||
A tuple (lines, font_size): lines is a list of (line_width, line_text) tuples,
and font_size is the size at which these lines fit the field.
|
||||
"""
|
||||
orig_text = text
|
||||
paragraphs = text.replace("\n", "\r").split("\r")
|
||||
wrapped_lines = []
|
||||
current_line_words: list[str] = []
|
||||
current_line_width: float = 0
|
||||
space_width = font.space_width * font_size / 1000
|
||||
for paragraph in paragraphs:
|
||||
if not paragraph.strip():
|
||||
wrapped_lines.append((0.0, ""))
|
||||
continue
|
||||
words = paragraph.split(" ")
|
||||
for i, word in enumerate(words):
|
||||
word_width = font.text_width(word) * font_size / 1000
|
||||
test_width = current_line_width + word_width + (space_width if i else 0)
|
||||
if test_width > field_width and current_line_words:
|
||||
wrapped_lines.append((current_line_width, " ".join(current_line_words)))
|
||||
current_line_words = [word]
|
||||
current_line_width = word_width
|
||||
elif not current_line_words and word_width > field_width:
|
||||
wrapped_lines.append((word_width, word))
|
||||
current_line_words = []
|
||||
current_line_width = 0
|
||||
else:
|
||||
if current_line_words:
|
||||
current_line_width += space_width
|
||||
current_line_words.append(word)
|
||||
current_line_width += word_width
|
||||
if current_line_words:
|
||||
wrapped_lines.append((current_line_width, " ".join(current_line_words)))
|
||||
current_line_words = []
|
||||
current_line_width = 0
|
||||
# Estimate total height.
|
||||
estimated_total_height = font_size + (len(wrapped_lines) - 1) * leading_factor * font_size
|
||||
if estimated_total_height > field_height:
|
||||
# Text overflows height; Retry with smaller font size.
|
||||
new_font_size = font_size - font_size_step
|
||||
if new_font_size >= min_font_size:
|
||||
return self._scale_text(
|
||||
font,
|
||||
new_font_size,
|
||||
leading_factor,
|
||||
field_width,
|
||||
field_height,
|
||||
orig_text,
|
||||
min_font_size,
|
||||
font_size_step
|
||||
)
|
||||
return wrapped_lines, round(font_size, 1)
|
||||
|
||||
def _generate_appearance_stream_data(
|
||||
self,
|
||||
text: str,
|
||||
selection: Union[list[str], None],
|
||||
font: Font,
|
||||
font_glyph_byte_map: Optional[dict[str, bytes]] = None,
|
||||
font_name: str = "/Helv",
|
||||
font_size: float = 0.0,
|
||||
font_color: str = "0 g",
|
||||
is_multiline: bool = False,
|
||||
alignment: TextAlignment = TextAlignment.LEFT,
|
||||
is_comb: bool = False,
|
||||
max_length: Optional[int] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Generates the raw bytes of the PDF appearance stream for a text field.
|
||||
|
||||
This private method assembles the PDF content stream operators to draw
|
||||
the provided text within the specified rectangle. It handles text positioning,
|
||||
font application, color, and special formatting like selected text.
|
||||
|
||||
Args:
|
||||
text: The text to be rendered in the form field.
|
||||
selection: An optional list of strings that should be highlighted as selected.
|
||||
font: The font to use.
|
||||
font_glyph_byte_map: An optional dictionary mapping characters to their
|
||||
byte representation for glyph encoding.
|
||||
font_name: The name of the font resource to use (e.g., "/Helv").
|
||||
font_size: The font size. If 0, it is automatically calculated
|
||||
based on whether the field is multiline or not.
|
||||
font_color: The color to apply to the font, represented as a PDF
|
||||
graphics state string (e.g., "0 g" for black).
|
||||
is_multiline: A boolean indicating if the text field is multiline.
|
||||
alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
|
||||
is_comb: Boolean that designates fixed-length fields, where every character
|
||||
fills one "cell", such as in a postcode.
|
||||
max_length: Used if is_comb is set. The maximum number of characters for a fixed-
|
||||
length field.
|
||||
|
||||
Returns:
|
||||
A byte string containing the PDF content stream data.
|
||||
|
||||
"""
|
||||
rectangle = self._layout.rectangle
|
||||
font_glyph_byte_map = font_glyph_byte_map or {}
|
||||
if isinstance(rectangle, tuple):
|
||||
rectangle = RectangleObject(rectangle)
|
||||
leading_factor = (font.font_descriptor.bbox[3] - font.font_descriptor.bbox[1]) / 1000.0
|
||||
|
||||
# Set margins based on border width and style, but never less than 1 point
|
||||
factor = 2 if self._layout.border_style in {"/B", "/I"} else 1
|
||||
margin = max(self._layout.border_width * factor, 1)
|
||||
field_height = rectangle.height - 2 * margin
|
||||
field_width = rectangle.width - 4 * margin
|
||||
|
||||
# If font_size is 0, apply the logic for multiline or large-as-possible font
|
||||
if font_size == 0:
|
||||
min_font_size = 4.0  # The minimum font size
|
||||
if selection: # Don't wrap text when dealing with a /Ch field, in order to prevent problems
|
||||
is_multiline = False # with matching "selection" with "line" later on.
|
||||
if is_multiline:
|
||||
font_size = DEFAULT_FONT_SIZE_IN_MULTILINE
|
||||
lines, font_size = self._scale_text(
|
||||
font,
|
||||
font_size,
|
||||
leading_factor,
|
||||
field_width,
|
||||
field_height,
|
||||
text,
|
||||
min_font_size
|
||||
)
|
||||
else:
|
||||
max_vertical_size = field_height / leading_factor
|
||||
text_width_unscaled = font.text_width(text) / 1000
|
||||
max_horizontal_size = field_width / (text_width_unscaled or 1)
|
||||
font_size = round(max(min(max_vertical_size, max_horizontal_size), min_font_size), 1)
|
||||
lines = [(text_width_unscaled * font_size, text)]
|
||||
elif is_comb:
|
||||
if max_length and len(text) > max_length:
|
||||
logger_warning(
|
||||
f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.",
|
||||
__name__
|
||||
)
|
||||
# We act as if each character is one line, because we draw it separately later on
|
||||
lines = [(
|
||||
font.text_width(char) * font_size / 1000,
|
||||
char
|
||||
) for index, char in enumerate(text) if index < (max_length or len(text))]
|
||||
else:
|
||||
lines = [(
|
||||
font.text_width(line) * font_size / 1000,
|
||||
line
|
||||
) for line in text.replace("\n", "\r").split("\r")]
|
||||
|
||||
# Set the vertical offset
|
||||
if is_multiline:
|
||||
y_offset = rectangle.height + margin - font.font_descriptor.bbox[3] * font_size / 1000.0
|
||||
else:
|
||||
y_offset = margin + ((field_height - font.font_descriptor.ascent * font_size / 1000) / 2)
|
||||
default_appearance = f"{font_name} {font_size} Tf {font_color}"
|
||||
|
||||
ap_stream = (
|
||||
f"q\n/Tx BMC \nq\n{2 * margin} {margin} {field_width} {field_height} "
|
||||
f"re\nW\nBT\n{default_appearance}\n"
|
||||
).encode()
|
||||
current_x_pos: float = 0 # Initial virtual position within the text object.
|
||||
|
||||
for line_number, (line_width, line) in enumerate(lines):
|
||||
if selection and line in selection:
|
||||
# Might be improved, but cannot find how to get fill working => replaced with lined box
|
||||
ap_stream += (
|
||||
f"1 {y_offset - (line_number * font_size * leading_factor) - 1} "
|
||||
f"{rectangle.width - 2} {font_size + 2} re\n"
|
||||
f"0.5 0.5 0.5 rg s\n{default_appearance}\n"
|
||||
).encode()
|
||||
|
||||
# Calculate the desired absolute starting X for the current line
|
||||
desired_abs_x_start: float = 0
|
||||
if is_comb and max_length:
|
||||
# Calculate the width of a cell for one character
|
||||
cell_width = rectangle.width / max_length
|
||||
# Space from the left edge of the cell to the character's baseline start
|
||||
# line_width here is the *actual* character width in points for the single character 'line'
|
||||
centering_offset_in_cell = (cell_width - line_width) / 2
|
||||
# Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset
|
||||
desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell
|
||||
elif alignment == TextAlignment.RIGHT:
|
||||
desired_abs_x_start = rectangle.width - margin * 2 - line_width
|
||||
elif alignment == TextAlignment.CENTER:
|
||||
desired_abs_x_start = (rectangle.width - line_width) / 2
|
||||
else: # Left aligned; default
|
||||
desired_abs_x_start = margin * 2
|
||||
# Calculate x_rel_offset: how much to move from the current_x_pos
|
||||
# to reach the desired_abs_x_start.
|
||||
x_rel_offset = desired_abs_x_start - current_x_pos
|
||||
|
||||
# Y-offset:
|
||||
y_rel_offset: float = 0
|
||||
if line_number == 0:
|
||||
y_rel_offset = y_offset # Initial vertical position
|
||||
elif is_comb:
|
||||
y_rel_offset = 0.0 # DO NOT move vertically for subsequent characters
|
||||
else:
|
||||
y_rel_offset = -font_size * leading_factor  # Move down by line height
|
||||
|
||||
# Td is a relative translation (Tx and Ty).
|
||||
# It updates the current text position.
|
||||
ap_stream += f"{x_rel_offset} {y_rel_offset} Td\n".encode()
|
||||
# Update current_x_pos based on the Td operation for the next iteration.
|
||||
# This is the X position where the *current line* will start.
|
||||
current_x_pos = desired_abs_x_start
|
||||
|
||||
encoded_line: list[bytes] = [
|
||||
font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line
|
||||
]
|
||||
if any(len(c) >= 2 for c in encoded_line):
|
||||
ap_stream += b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n"
|
||||
else:
|
||||
ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n"
|
||||
ap_stream += b"ET\nQ\nEMC\nQ\n"
|
||||
return ap_stream
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
layout: Optional[BaseStreamConfig] = None,
|
||||
text: str = "",
|
||||
selection: Optional[list[str]] = None,
|
||||
font_resource: Optional[DictionaryObject] = None,
|
||||
font_name: str = "/Helv",
|
||||
font_size: float = 0.0,
|
||||
font_color: str = "0 g",
|
||||
is_multiline: bool = False,
|
||||
alignment: TextAlignment = TextAlignment.LEFT,
|
||||
is_comb: bool = False,
|
||||
max_length: Optional[int] = None
|
||||
) -> None:
|
||||
"""
|
||||
Initializes a TextStreamAppearance object.
|
||||
|
||||
This constructor creates a new PDF stream object configured as an XObject
|
||||
of subtype Form. It uses the `_appearance_stream_data` method to generate
|
||||
the content for the stream.
|
||||
|
||||
Args:
|
||||
layout: The basic layout parameters.
|
||||
text: The text to be rendered in the form field.
|
||||
selection: An optional list of strings that should be highlighted as selected.
|
||||
font_resource: An optional variable that represents a PDF font dictionary.
|
||||
font_name: The name of the font resource, e.g., "/Helv".
|
||||
font_size: The font size. If 0, it's auto-calculated.
|
||||
font_color: The font color string.
|
||||
is_multiline: A boolean indicating if the text field is multiline.
|
||||
alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
|
||||
is_comb: Boolean that designates fixed-length fields, where every character
|
||||
fills one "cell", such as in a postcode.
|
||||
max_length: Used if is_comb is set. The maximum number of characters for a fixed-
|
||||
length field.
|
||||
|
||||
"""
|
||||
super().__init__(layout)
|
||||
|
||||
# If a font resource was added, get the font character map
|
||||
if font_resource:
|
||||
font_resource = cast(DictionaryObject, font_resource.get_object())
|
||||
font = Font.from_font_resource(font_resource)
|
||||
else:
|
||||
logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
|
||||
font_name = "/Helv"
|
||||
font_resource = DictionaryObject({
|
||||
NameObject("/Subtype"): NameObject("/Type1"),
|
||||
NameObject("/Name"): NameObject("/Helv"),
|
||||
NameObject("/Type"): NameObject("/Font"),
|
||||
NameObject("/BaseFont"): NameObject("/Helvetica"),
|
||||
NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
|
||||
})
|
||||
font_descriptor = CORE_FONT_METRICS["Helvetica"]
|
||||
font_descriptor.character_widths["default"] = 2 * font_descriptor.character_widths[" "]
|
||||
font = Font(
|
||||
name="Helvetica",
|
||||
character_map={},
|
||||
encoding=dict(zip(range(256), fill_from_encoding("cp1252"))), # WinAnsiEncoding
|
||||
sub_type="Type1",
|
||||
font_descriptor=font_descriptor,
character_widths=font_descriptor.character_widths
|
||||
)
|
||||
|
||||
font_glyph_byte_map: dict[str, bytes]
|
||||
if isinstance(font.encoding, str):
|
||||
font_glyph_byte_map = {
|
||||
v: k.encode(font.encoding) for k, v in font.character_map.items()
|
||||
}
|
||||
else:
|
||||
font_glyph_byte_map = {v: bytes((k,)) for k, v in font.encoding.items()}
|
||||
font_encoding_rev = {v: bytes((k,)) for k, v in font.encoding.items()}
|
||||
for key, value in font.character_map.items():
|
||||
font_glyph_byte_map[value] = font_encoding_rev.get(key, key)
|
||||
|
||||
ap_stream_data = self._generate_appearance_stream_data(
|
||||
text,
|
||||
selection,
|
||||
font,
|
||||
font_glyph_byte_map,
|
||||
font_name=font_name,
|
||||
font_size=font_size,
|
||||
font_color=font_color,
|
||||
is_multiline=is_multiline,
|
||||
alignment=alignment,
|
||||
is_comb=is_comb,
|
||||
max_length=max_length
|
||||
)
|
||||
|
||||
self.set_data(ByteStringObject(ap_stream_data))
|
||||
self[NameObject("/Length")] = NumberObject(len(ap_stream_data))
|
||||
# Update Resources with font information
|
||||
self[NameObject("/Resources")] = DictionaryObject({
|
||||
NameObject("/Font"): DictionaryObject({
|
||||
NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource)
|
||||
})
|
||||
})
|
||||
|
||||
@classmethod
|
||||
def from_text_annotation(
|
||||
cls,
|
||||
acro_form: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]
|
||||
field: DictionaryObject,
|
||||
annotation: DictionaryObject,
|
||||
user_font_name: str = "",
|
||||
user_font_size: float = -1,
|
||||
) -> "TextStreamAppearance":
|
||||
"""
|
||||
Creates a TextStreamAppearance object from a text field annotation.
|
||||
|
||||
This class method is a factory for creating a `TextStreamAppearance`
|
||||
instance by extracting all necessary information (bounding box, font,
|
||||
text content, etc.) from the PDF field and annotation dictionaries.
|
||||
It respects inheritance for properties like default appearance (`/DA`).
|
||||
|
||||
Args:
|
||||
acro_form: The root AcroForm dictionary from the PDF catalog.
|
||||
field: The field dictionary object.
|
||||
annotation: The widget annotation dictionary object associated with the field.
|
||||
user_font_name: An optional user-provided font name to override the
|
||||
default. Defaults to an empty string.
|
||||
user_font_size: An optional user-provided font size to override the
|
||||
default. A value of -1 indicates no override.
|
||||
|
||||
Returns:
|
||||
A new `TextStreamAppearance` instance configured for the given field.
|
||||
|
||||
"""
|
||||
# Calculate rectangle dimensions
|
||||
_rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect])
|
||||
rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1])))
|
||||
|
||||
# Get default appearance dictionary from annotation
|
||||
default_appearance = annotation.get_inherited(
|
||||
AnnotationDictionaryAttributes.DA,
|
||||
acro_form.get(AnnotationDictionaryAttributes.DA, None),
|
||||
)
|
||||
if not default_appearance:
|
||||
# Create a default appearance if none was found in the annotation
|
||||
default_appearance = TextStringObject("/Helv 0 Tf 0 g")
|
||||
else:
|
||||
default_appearance = default_appearance.get_object()
|
||||
|
||||
# Derive font name, size and color from the default appearance. Also set
|
||||
# user-provided font name and font size in the default appearance, if given.
|
||||
# For a font name, this presumes that we can find an associated font resource
|
||||
# dictionary. Uses the variable font_properties as an intermediate.
|
||||
# As per the PDF spec:
|
||||
# "At a minimum, the string [that is, default_appearance] shall include a Tf (text
|
||||
# font) operator along with its two operands, font and size" (Section 12.7.4.3
|
||||
# "Variable text" of the PDF 2.0 specification).
|
||||
font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop]
|
||||
font_name = font_properties.pop(font_properties.index("Tf") - 2)
|
||||
font_size = float(font_properties.pop(font_properties.index("Tf") - 1))
|
||||
font_properties.remove("Tf")
|
||||
font_color = " ".join(font_properties)
|
||||
# Determine the font name to use, prioritizing the user's input
|
||||
if user_font_name:
|
||||
font_name = user_font_name
|
||||
# Determine the font size to use, prioritizing the user's input
|
||||
if user_font_size > 0:
|
||||
font_size = user_font_size
|
||||
|
||||
# Try to find a resource dictionary for the font
|
||||
document_resources: Any = cast(
|
||||
DictionaryObject,
|
||||
cast(
|
||||
DictionaryObject,
|
||||
annotation.get_inherited(
|
||||
"/DR",
|
||||
acro_form.get("/DR", DictionaryObject()),
|
||||
),
|
||||
).get_object(),
|
||||
)
|
||||
document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
|
||||
# CORE_FONT_METRICS is the dict with Standard font metrics
|
||||
if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
|
||||
# ...or AcroForm dictionary
|
||||
document_resources = cast(
|
||||
dict[Any, Any],
|
||||
acro_form.get("/DR", {}),
|
||||
)
|
||||
document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
|
||||
font_resource = document_font_resources.get(font_name, None)
|
||||
if not is_null_or_none(font_resource):
|
||||
font_resource = cast(DictionaryObject, font_resource.get_object())
|
||||
|
||||
# Retrieve field text and selected values
|
||||
field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
|
||||
if (
|
||||
field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
|
||||
field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
|
||||
):
|
||||
text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
|
||||
selection = field.get("/V", [])
|
||||
if not isinstance(selection, list):
|
||||
selection = [selection]
|
||||
else: # /Tx
|
||||
text = field.get("/V", "")
|
||||
selection = []
|
||||
|
||||
# Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
|
||||
text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
|
||||
|
||||
# Retrieve formatting information
|
||||
is_comb = False
|
||||
max_length = None
|
||||
if field_flags & FieldDictionaryAttributes.FfBits.Comb:
|
||||
is_comb = True
|
||||
max_length = annotation.get("/MaxLen")
|
||||
is_multiline = False
|
||||
if field_flags & FieldDictionaryAttributes.FfBits.Multiline:
|
||||
is_multiline = True
|
||||
alignment = field.get("/Q", TextAlignment.LEFT)
|
||||
border_width = 1
|
||||
border_style = BorderStyles.SOLID
|
||||
if "/BS" in field:
|
||||
border_width = cast(DictionaryObject, field["/BS"]).get("/W", border_width)
|
||||
border_style = cast(DictionaryObject, field["/BS"]).get("/S", border_style)
|
||||
|
||||
# Create the TextStreamAppearance instance
|
||||
layout = BaseStreamConfig(rectangle=rectangle, border_width=border_width, border_style=border_style)
|
||||
new_appearance_stream = cls(
|
||||
layout,
|
||||
text,
|
||||
selection,
|
||||
font_resource,
|
||||
font_name=font_name,
|
||||
font_size=font_size,
|
||||
font_color=font_color,
|
||||
is_multiline=is_multiline,
|
||||
alignment=alignment,
|
||||
is_comb=is_comb,
|
||||
max_length=max_length
|
||||
)
|
||||
if AnnotationDictionaryAttributes.AP in annotation:
|
||||
for key, value in (
|
||||
cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
|
||||
):
|
||||
if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
|
||||
new_appearance_stream[key] = value
|
||||
|
||||
return new_appearance_stream
|
||||
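A hedged usage sketch of the factory above (assuming a document whose first page carries a text-field widget annotation merged with its field dictionary; the file name is a placeholder, and the import location of TextStreamAppearance is not shown in this diff):

from pypdf import PdfReader
from pypdf.constants import CatalogDictionary

reader = PdfReader("form.pdf")  # hypothetical input
acro_form = reader.trailer["/Root"][CatalogDictionary.ACRO_FORM].get_object()
annotation = reader.pages[0]["/Annots"][0].get_object()
# For a merged field/widget, the same dictionary serves as field and annotation.
appearance = TextStreamAppearance.from_text_annotation(acro_form, annotation, annotation)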
937
venv/lib/python3.12/site-packages/pypdf/generic/_base.py
Normal file
@@ -0,0 +1,937 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
import binascii
|
||||
import codecs
|
||||
import hashlib
|
||||
import re
|
||||
import sys
|
||||
from binascii import unhexlify
|
||||
from collections.abc import Sequence
|
||||
from math import log10
|
||||
from struct import iter_unpack
|
||||
from typing import Any, Callable, ClassVar, Optional, Union, cast
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
from typing import TypeGuard
|
||||
else:
|
||||
from typing_extensions import TypeGuard # PEP 647
|
||||
|
||||
from .._codecs import _pdfdoc_encoding_rev
|
||||
from .._protocols import PdfObjectProtocol, PdfWriterProtocol
|
||||
from .._utils import (
|
||||
StreamType,
|
||||
classproperty,
|
||||
deprecation_no_replacement,
|
||||
deprecation_with_replacement,
|
||||
logger_warning,
|
||||
read_non_whitespace,
|
||||
read_until_regex,
|
||||
)
|
||||
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
|
||||
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
|
||||
class PdfObject(PdfObjectProtocol):
|
||||
# function for calculating a hash value
|
||||
hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
|
||||
indirect_reference: Optional["IndirectObject"]
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
f"{self.__class__.__name__} does not implement .hash_bin() so far"
|
||||
)
|
||||
|
||||
def hash_value_data(self) -> bytes:
|
||||
return f"{self}".encode()
|
||||
|
||||
def hash_value(self) -> bytes:
|
||||
return (
|
||||
f"{self.__class__.__name__}:"
|
||||
f"{self.hash_func(self.hash_value_data()).hexdigest()}"
|
||||
).encode()
|
||||
|
||||
def replicate(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
) -> "PdfObject":
|
||||
"""
|
||||
Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
|
||||
without ensuring links. This is used in clone_document_from_root with incremental = True.
|
||||
|
||||
Args:
|
||||
pdf_dest: Target to clone to.
|
||||
|
||||
Returns:
|
||||
The cloned PdfObject
|
||||
|
||||
"""
|
||||
return self.clone(pdf_dest)
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "PdfObject":
|
||||
"""
|
||||
Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).
|
||||
|
||||
By default, this method will call ``_reference_clone`` (see ``_reference``).
|
||||
|
||||
|
||||
Args:
|
||||
pdf_dest: Target to clone to.
|
||||
force_duplicate: By default, if the object has already been cloned and referenced,
|
||||
the copy will be returned; when ``True``, a new copy will be created.
|
||||
(Default value = ``False``)
|
||||
ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
|
||||
during cloning (applies to children duplication as well). If fields are to be
|
||||
considered for a limited number of levels, you have to add it as integer, for
|
||||
example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
|
||||
level only but ``"/TOTO"`` on all levels.
|
||||
|
||||
Returns:
|
||||
The cloned PdfObject
|
||||
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
f"{self.__class__.__name__} does not implement .clone so far"
|
||||
)
|
||||
|
||||
def _reference_clone(
|
||||
self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
|
||||
) -> PdfObjectProtocol:
|
||||
"""
|
||||
Reference the object within the _objects of pdf_dest only if the
indirect_reference attribute exists (which means the object was
already identified in xref/xobjstm). If the object has already been
referenced, do nothing.
|
||||
|
||||
Args:
|
||||
clone:
|
||||
pdf_dest:
|
||||
|
||||
Returns:
|
||||
The clone
|
||||
|
||||
"""
|
||||
try:
|
||||
if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
|
||||
return clone
|
||||
except Exception:
|
||||
pass
|
||||
# if hasattr(clone, "indirect_reference"):
|
||||
try:
|
||||
ind = self.indirect_reference
|
||||
except AttributeError:
|
||||
return clone
|
||||
if (
|
||||
pdf_dest.incremental
|
||||
and ind is not None
|
||||
and ind.pdf == pdf_dest._reader
|
||||
and ind.idnum <= len(pdf_dest._objects)
|
||||
):
|
||||
i = ind.idnum
|
||||
else:
|
||||
i = len(pdf_dest._objects) + 1
|
||||
if ind is not None:
|
||||
if id(ind.pdf) not in pdf_dest._id_translated:
|
||||
pdf_dest._id_translated[id(ind.pdf)] = {}
|
||||
pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf # type: ignore[index]
|
||||
if (
|
||||
not force_duplicate
|
||||
and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
|
||||
):
|
||||
obj = pdf_dest.get_object(
|
||||
pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
|
||||
)
|
||||
assert obj is not None
|
||||
return obj
|
||||
pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
|
||||
try:
|
||||
pdf_dest._objects[i - 1] = clone
|
||||
except IndexError:
|
||||
pdf_dest._objects.append(clone)
|
||||
i = len(pdf_dest._objects)
|
||||
clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
|
||||
return clone
|
||||
|
||||
def get_object(self) -> Optional["PdfObject"]:
|
||||
"""Resolve indirect references."""
|
||||
return self
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class NullObject(PdfObject):
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "NullObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
return cast(
|
||||
"NullObject", self._reference_clone(NullObject(), pdf_dest, force_duplicate)
|
||||
)
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__,))
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
stream.write(b"null")
|
||||
|
||||
@staticmethod
|
||||
def read_from_stream(stream: StreamType) -> "NullObject":
|
||||
nulltxt = stream.read(4)
|
||||
if nulltxt != b"null":
|
||||
raise PdfReadError("Could not read Null object")
|
||||
return NullObject()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return "NullObject"
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
return isinstance(other, NullObject)
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return self.hash_bin()
|
||||
|
||||
|
||||
class BooleanObject(PdfObject):
|
||||
def __init__(self, value: Any) -> None:
|
||||
self.value = value
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "BooleanObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
return cast(
|
||||
"BooleanObject",
|
||||
self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate),
|
||||
)
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__, self.value))
|
||||
|
||||
def __eq__(self, o: object, /) -> bool:
|
||||
if isinstance(o, BooleanObject):
|
||||
return self.value == o.value
|
||||
if isinstance(o, bool):
|
||||
return self.value == o
|
||||
return False
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return self.hash_bin()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return "True" if self.value else "False"
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
if self.value:
|
||||
stream.write(b"true")
|
||||
else:
|
||||
stream.write(b"false")
|
||||
|
||||
@staticmethod
|
||||
def read_from_stream(stream: StreamType) -> "BooleanObject":
|
||||
word = stream.read(4)
|
||||
if word == b"true":
|
||||
return BooleanObject(True)
|
||||
if word == b"fals":
|
||||
stream.read(1)
|
||||
return BooleanObject(False)
|
||||
raise PdfReadError("Could not read Boolean object")
|
||||
|
||||
|
||||
class IndirectObject(PdfObject):
|
||||
def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfReader
|
||||
self.idnum = idnum
|
||||
self.generation = generation
|
||||
self.pdf = pdf
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return hash((self.idnum, self.generation, id(self.pdf)))
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))
|
||||
|
||||
def replicate(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
) -> "PdfObject":
|
||||
return IndirectObject(self.idnum, self.generation, pdf_dest)
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "IndirectObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
if self.pdf == pdf_dest and not force_duplicate:
|
||||
# Already duplicated and no extra duplication required
|
||||
return self
|
||||
if id(self.pdf) not in pdf_dest._id_translated:
|
||||
pdf_dest._id_translated[id(self.pdf)] = {}
|
||||
pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf # type: ignore[index]
|
||||
|
||||
if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
|
||||
dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
|
||||
if force_duplicate:
|
||||
assert dup is not None
|
||||
assert dup.indirect_reference is not None
|
||||
idref = dup.indirect_reference
|
||||
return IndirectObject(idref.idnum, idref.generation, idref.pdf)
|
||||
else:
|
||||
obj = self.get_object()
|
||||
# Case observed: a pointed object cannot be found.
if obj is None:
# This normally should not happen; fall back to a NullObject.
obj = NullObject()
|
||||
assert isinstance(self, (IndirectObject,))
|
||||
obj.indirect_reference = self
|
||||
dup = pdf_dest._add_object(
|
||||
obj.clone(pdf_dest, force_duplicate, ignore_fields)
|
||||
)
|
||||
assert dup is not None, "mypy"
|
||||
assert dup.indirect_reference is not None, "mypy"
|
||||
return dup.indirect_reference
|
||||
|
||||
@property
|
||||
def indirect_reference(self) -> "IndirectObject": # type: ignore[override]
|
||||
return self
|
||||
|
||||
def get_object(self) -> Optional["PdfObject"]:
|
||||
return self.pdf.get_object(self)
|
||||
|
||||
def __deepcopy__(self, memo: Any) -> "IndirectObject":
|
||||
return IndirectObject(self.idnum, self.generation, self.pdf)
|
||||
|
||||
def _get_object_with_check(self) -> Optional["PdfObject"]:
|
||||
o = self.get_object()
|
||||
# the check is done here to not slow down get_object()
|
||||
if isinstance(o, IndirectObject):
|
||||
raise PdfStreamError(
|
||||
f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
|
||||
)
|
||||
return o
|
||||
|
||||
def __getattr__(self, name: str) -> Any:
|
||||
# Attribute not found in object: look in pointed object
|
||||
try:
|
||||
return getattr(self._get_object_with_check(), name)
|
||||
except AttributeError:
|
||||
raise AttributeError(
|
||||
f"No attribute {name} found in IndirectObject or pointed object"
|
||||
)
|
||||
|
||||
def __getitem__(self, key: Any) -> Any:
|
||||
# items should be extracted from pointed Object
|
||||
return self._get_object_with_check()[key] # type: ignore
|
||||
|
||||
def __contains__(self, key: Any) -> bool:
|
||||
return key in self._get_object_with_check() # type: ignore
|
||||
|
||||
def __iter__(self) -> Any:
|
||||
return self._get_object_with_check().__iter__() # type: ignore
|
||||
|
||||
def __float__(self) -> float:
|
||||
# in this case we are looking for the pointed data
|
||||
return self.get_object().__float__() # type: ignore
|
||||
|
||||
def __int__(self) -> int:
|
||||
# in this case we are looking for the pointed data
|
||||
return self.get_object().__int__() # type: ignore
|
||||
|
||||
def __str__(self) -> str:
|
||||
# in this case we are looking for the pointed data
|
||||
return self.get_object().__str__()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
return (
|
||||
other is not None
|
||||
and isinstance(other, IndirectObject)
|
||||
and self.idnum == other.idnum
|
||||
and self.generation == other.generation
|
||||
and self.pdf is other.pdf
|
||||
)
|
||||
|
||||
def __ne__(self, other: object) -> bool:
|
||||
return not self.__eq__(other)
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
stream.write(f"{self.idnum} {self.generation} R".encode())
|
||||
|
||||
@staticmethod
|
||||
def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject": # PdfReader
|
||||
idnum = b""
|
||||
while True:
|
||||
tok = stream.read(1)
|
||||
if not tok:
|
||||
raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
|
||||
if tok.isspace():
|
||||
break
|
||||
idnum += tok
|
||||
generation = b""
|
||||
while True:
|
||||
tok = stream.read(1)
|
||||
if not tok:
|
||||
raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
|
||||
if tok.isspace():
|
||||
if not generation:
|
||||
continue
|
||||
break
|
||||
generation += tok
|
||||
r = read_non_whitespace(stream)
|
||||
if r != b"R":
|
||||
raise PdfReadError(
|
||||
f"Error reading indirect object reference at byte {hex(stream.tell())}"
|
||||
)
|
||||
return IndirectObject(int(idnum), int(generation), pdf)
|
||||
|
||||
|
||||
FLOAT_WRITE_PRECISION = 8 # shall be min 5 digits max, allow user adj
|
||||
|
||||
|
||||
class FloatObject(float, PdfObject):
|
||||
def __new__(
|
||||
cls, value: Any = "0.0", context: Optional[Any] = None
|
||||
) -> "FloatObject":
|
||||
try:
|
||||
value = float(value)
|
||||
return float.__new__(cls, value)
|
||||
except Exception as e:
|
||||
# If this isn't a valid decimal (happens in malformed PDFs)
|
||||
# fallback to 0
|
||||
logger_warning(
|
||||
f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
|
||||
)
|
||||
return float.__new__(cls, 0.0)
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: Any,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "FloatObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
return cast(
|
||||
"FloatObject",
|
||||
self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
|
||||
)
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__, self.as_numeric))
|
||||
|
||||
def myrepr(self) -> str:
|
||||
if self == 0:
|
||||
return "0.0"
|
||||
nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
|
||||
return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.myrepr() # repr(float(self))
|
||||
|
||||
def as_numeric(self) -> float:
|
||||
return float(self)
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
stream.write(self.myrepr().encode("utf8"))
|
||||
|
||||
|
||||
class NumberObject(int, PdfObject):
|
||||
NumberPattern = re.compile(b"[^+-.0-9]")
|
||||
|
||||
def __new__(cls, value: Any) -> "NumberObject":
|
||||
try:
|
||||
return int.__new__(cls, int(value))
|
||||
except ValueError:
|
||||
logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
|
||||
return int.__new__(cls, 0)
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: Any,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "NumberObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
return cast(
|
||||
"NumberObject",
|
||||
self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
|
||||
)
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__, self.as_numeric()))
|
||||
|
||||
def as_numeric(self) -> int:
|
||||
return int(repr(self).encode("utf8"))
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
stream.write(repr(self).encode("utf8"))
|
||||
|
||||
@staticmethod
|
||||
def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
|
||||
num = read_until_regex(stream, NumberObject.NumberPattern)
|
||||
if b"." in num:
|
||||
return FloatObject(num)
|
||||
return NumberObject(num)
|
||||
|
||||
|
||||
class ByteStringObject(bytes, PdfObject):
|
||||
"""
|
||||
Represents a string object where the text encoding could not be determined.
|
||||
|
||||
This occurs quite often, as the PDF spec doesn't provide an alternate way to
|
||||
represent strings -- for example, the encryption data stored in files (like
|
||||
/O) is clearly not text, but is still stored in a "String" object.
|
||||
"""
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: Any,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "ByteStringObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
return cast(
|
||||
"ByteStringObject",
|
||||
self._reference_clone(
|
||||
ByteStringObject(bytes(self)), pdf_dest, force_duplicate
|
||||
),
|
||||
)
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__, bytes(self)))
|
||||
|
||||
@property
|
||||
def original_bytes(self) -> bytes:
|
||||
"""For compatibility with TextStringObject.original_bytes."""
|
||||
return self
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
stream.write(b"<")
|
||||
stream.write(binascii.hexlify(self))
|
||||
stream.write(b">")
|
||||
|
||||
def __str__(self) -> str:
|
||||
charset_to_try = ["utf-16", *list(NameObject.CHARSETS)]
|
||||
for enc in charset_to_try:
|
||||
try:
|
||||
return self.decode(enc)
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
raise PdfReadError("Cannot decode ByteStringObject.")
|
||||
|
||||
|
||||
class TextStringObject(str, PdfObject): # noqa: SLOT000
|
||||
"""
|
||||
A string object that has been decoded into a real unicode string.
|
||||
|
||||
If read from a PDF document, this string appeared to match the
|
||||
PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
|
||||
to occur.
|
||||
"""
|
||||
|
||||
autodetect_pdfdocencoding: bool
|
||||
autodetect_utf16: bool
|
||||
utf16_bom: bytes
|
||||
_original_bytes: Optional[bytes] = None
|
||||
|
||||
def __new__(cls, value: Any) -> "TextStringObject":
|
||||
original_bytes = None
|
||||
if isinstance(value, bytes):
|
||||
original_bytes = value
|
||||
value = value.decode("charmap")
|
||||
text_string_object = str.__new__(cls, value)
|
||||
text_string_object._original_bytes = original_bytes
|
||||
text_string_object.autodetect_utf16 = False
|
||||
text_string_object.autodetect_pdfdocencoding = False
|
||||
text_string_object.utf16_bom = b""
|
||||
if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
|
||||
# The value of `original_bytes` is only set for inputs being `bytes`.
|
||||
# If this is UTF-16 data according to the BOM (first two characters),
|
||||
# perform special handling. All other cases should not need any special conversion
|
||||
# due to already being a string.
|
||||
try:
|
||||
text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
|
||||
except UnicodeDecodeError as exception:
|
||||
logger_warning(
|
||||
f"{exception!s}\ninitial string:{exception.object!r}",
|
||||
__name__,
|
||||
)
|
||||
text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
|
||||
text_string_object._original_bytes = original_bytes
|
||||
text_string_object.autodetect_utf16 = True
|
||||
text_string_object.utf16_bom = original_bytes[:2]
|
||||
else:
|
||||
try:
|
||||
encode_pdfdocencoding(text_string_object)
|
||||
text_string_object.autodetect_pdfdocencoding = True
|
||||
except UnicodeEncodeError:
|
||||
text_string_object.autodetect_utf16 = True
|
||||
text_string_object.utf16_bom = codecs.BOM_UTF16_BE
|
||||
return text_string_object
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: Any,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "TextStringObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
obj = TextStringObject(self)
|
||||
obj._original_bytes = self._original_bytes
|
||||
obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
|
||||
obj.autodetect_utf16 = self.autodetect_utf16
|
||||
obj.utf16_bom = self.utf16_bom
|
||||
return cast(
|
||||
"TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
|
||||
)
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__, self.original_bytes))
|
||||
|
||||
@property
|
||||
def original_bytes(self) -> bytes:
|
||||
"""
|
||||
It is occasionally possible that a text string object gets created where
|
||||
a byte string object was expected due to the autodetection mechanism --
|
||||
if that occurs, this "original_bytes" property can be used to
|
||||
back-calculate what the original encoded bytes were.
|
||||
"""
|
||||
if self._original_bytes is not None:
|
||||
return self._original_bytes
|
||||
return self.get_original_bytes()
|
||||
|
||||
def get_original_bytes(self) -> bytes:
|
||||
# We're a text string object, but the library is trying to get our raw
|
||||
# bytes. This can happen if we auto-detected this string as text, but
|
||||
# we were wrong. It's pretty common. Return the original bytes that
|
||||
# would have been used to create this object, based upon the autodetect
|
||||
# method.
|
||||
if self.autodetect_utf16:
|
||||
if self.utf16_bom == codecs.BOM_UTF16_LE:
|
||||
return codecs.BOM_UTF16_LE + self.encode("utf-16le")
|
||||
if self.utf16_bom == codecs.BOM_UTF16_BE:
|
||||
return codecs.BOM_UTF16_BE + self.encode("utf-16be")
|
||||
return self.encode("utf-16be")
|
||||
if self.autodetect_pdfdocencoding:
|
||||
return encode_pdfdocencoding(self)
|
||||
raise Exception("no information about original bytes") # pragma: no cover
|
||||
|
||||
def get_encoded_bytes(self) -> bytes:
|
||||
# Try to write the string out as a PDFDocEncoding encoded string. It's
|
||||
# nicer to look at in the PDF file. Sadly, we take a performance hit
|
||||
# here for trying...
|
||||
try:
|
||||
if self._original_bytes is not None:
|
||||
return self._original_bytes
|
||||
if self.autodetect_utf16:
|
||||
raise UnicodeEncodeError("", "forced", -1, -1, "")
|
||||
bytearr = encode_pdfdocencoding(self)
|
||||
except UnicodeEncodeError:
|
||||
if self.utf16_bom == codecs.BOM_UTF16_LE:
|
||||
bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
|
||||
elif self.utf16_bom == codecs.BOM_UTF16_BE:
|
||||
bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
|
||||
else:
|
||||
bytearr = self.encode("utf-16be")
|
||||
return bytearr
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
bytearr = self.get_encoded_bytes()
|
||||
stream.write(b"(")
|
||||
for c_ in iter_unpack("c", bytearr):
|
||||
c = cast(bytes, c_[0])
|
||||
if not c.isalnum() and c != b" ":
|
||||
# This:
|
||||
# stream.write(rf"\{c:0>3o}".encode())
|
||||
# gives
|
||||
# https://github.com/davidhalter/parso/issues/207
|
||||
stream.write(b"\\%03o" % ord(c))
|
||||
else:
|
||||
stream.write(c)
|
||||
stream.write(b")")
|
||||
|
||||
|
||||
class NameObject(str, PdfObject): # noqa: SLOT000
|
||||
delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
|
||||
prefix = b"/"
|
||||
renumber_table: ClassVar[dict[str, bytes]] = {
|
||||
**{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
|
||||
**{chr(i): f"#{i:02X}".encode() for i in range(33)},
|
||||
}
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: Any,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "NameObject":
|
||||
"""Clone object into pdf_dest."""
|
||||
return cast(
|
||||
"NameObject",
|
||||
self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
|
||||
)
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
return hash((self.__class__, self))
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
stream.write(self.renumber())
|
||||
|
||||
def renumber(self) -> bytes:
|
||||
out = self[0].encode("utf-8")
|
||||
if out != b"/":
|
||||
deprecation_no_replacement(
|
||||
f"Incorrect first char in NameObject, should start with '/': ({self})",
|
||||
"5.0.0",
|
||||
)
|
||||
for c in self[1:]:
|
||||
if c > "~":
|
||||
for x in c.encode("utf-8"):
|
||||
out += f"#{x:02X}".encode()
|
||||
else:
|
||||
try:
|
||||
out += self.renumber_table[c]
|
||||
except KeyError:
|
||||
out += c.encode("utf-8")
|
||||
return out
|
||||
|
||||
def _sanitize(self) -> "NameObject":
|
||||
"""
|
||||
Sanitize the NameObject's name to be a valid PDF name part
|
||||
(alphanumeric, underscore, hyphen). The _sanitize method replaces
|
||||
spaces and any non-alphanumeric/non-underscore/non-hyphen with
|
||||
underscores.
|
||||
|
||||
Returns:
|
||||
NameObject with sanitized name.
|
||||
"""
|
||||
name = str(self).removeprefix("/")
|
||||
name = re.sub(r"\ ", "_", name)
|
||||
name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
|
||||
return NameObject("/" + name)
|
||||
|
||||
@classproperty
|
||||
def surfix(cls) -> bytes: # noqa: N805
|
||||
deprecation_with_replacement("surfix", "prefix", "5.0.0")
|
||||
return b"/"
|
||||
|
||||
@staticmethod
|
||||
def unnumber(sin: bytes) -> bytes:
|
||||
i = sin.find(b"#", 0)
|
||||
while i >= 0:
|
||||
try:
|
||||
sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :]
|
||||
i = sin.find(b"#", i + 1)
|
||||
except ValueError:
|
||||
# if the two characters after "#" cannot be converted to hex,
|
||||
# we change nothing and carry on
|
||||
i = i + 1
|
||||
return sin
|
||||
|
||||
CHARSETS = ("utf-8", "gbk", "latin1")
|
||||
|
||||
@staticmethod
|
||||
def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
|
||||
name = stream.read(1)
|
||||
if name != NameObject.prefix:
|
||||
raise PdfReadError("Name read error")
|
||||
name += read_until_regex(stream, NameObject.delimiter_pattern)
|
||||
try:
|
||||
# Name objects should represent irregular characters
|
||||
# with a '#' followed by the symbol's hex number
|
||||
name = NameObject.unnumber(name)
|
||||
for enc in NameObject.CHARSETS:
|
||||
try:
|
||||
ret = name.decode(enc)
|
||||
return NameObject(ret)
|
||||
except Exception:
|
||||
pass
|
||||
raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
|
||||
except (UnicodeEncodeError, UnicodeDecodeError) as e:
|
||||
if not pdf.strict:
|
||||
logger_warning(
|
||||
f"Illegal character in NameObject ({name!r}), "
|
||||
"you may need to adjust NameObject.CHARSETS",
|
||||
__name__,
|
||||
)
|
||||
return NameObject(name.decode("charmap"))
|
||||
raise PdfReadError(
|
||||
f"Illegal character in NameObject ({name!r}). "
|
||||
"You may need to adjust NameObject.CHARSETS.",
|
||||
) from e
|
||||
|
||||
|
||||
def encode_pdfdocencoding(unicode_string: str) -> bytes:
|
||||
try:
|
||||
return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string])
|
||||
except KeyError:
|
||||
raise UnicodeEncodeError(
|
||||
"pdfdocencoding",
|
||||
unicode_string,
|
||||
-1,
|
||||
-1,
|
||||
"does not exist in translation table",
|
||||
)
|
||||
|
||||
|
||||
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
|
||||
"""
|
||||
Returns:
|
||||
True if x is None or NullObject.
|
||||
|
||||
"""
|
||||
return x is None or (
|
||||
isinstance(x, PdfObject)
|
||||
and (x.get_object() is None or isinstance(x.get_object(), NullObject))
|
||||
)
|
||||
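To make the #XX escaping above concrete, here is a minimal round-trip sketch (assuming NameObject is imported from pypdf.generic): renumber() hex-escapes delimiters, "#" and control/space characters when a name is written, and unnumber() reverses that escaping when a name is read back.

from pypdf.generic import NameObject

name = NameObject("/My Attachment (v2)")
encoded = name.renumber()
print(encoded)  # b'/My#20Attachment#20#28v2#29'

decoded = NameObject.unnumber(encoded)
assert decoded.decode("utf-8") == str(name)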
1757
venv/lib/python3.12/site-packages/pypdf/generic/_data_structures.py
Normal file
1757
venv/lib/python3.12/site-packages/pypdf/generic/_data_structures.py
Normal file
File diff suppressed because it is too large
401
venv/lib/python3.12/site-packages/pypdf/generic/_files.py
Normal file
401
venv/lib/python3.12/site-packages/pypdf/generic/_files.py
Normal file
@@ -0,0 +1,401 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import bisect
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING, cast
|
||||
|
||||
from pypdf._utils import format_iso8824_date, parse_iso8824_date
|
||||
from pypdf.constants import CatalogAttributes as CA
|
||||
from pypdf.constants import FileSpecificationDictionaryEntries
|
||||
from pypdf.constants import PageAttributes as PG
|
||||
from pypdf.errors import PdfReadError, PyPdfError
|
||||
from pypdf.generic import (
|
||||
ArrayObject,
|
||||
ByteStringObject,
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
NameObject,
|
||||
NullObject,
|
||||
NumberObject,
|
||||
StreamObject,
|
||||
TextStringObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from collections.abc import Generator
|
||||
|
||||
from pypdf._writer import PdfWriter
|
||||
|
||||
|
||||
class EmbeddedFile:
|
||||
"""
|
||||
Container holding the information on an embedded file.
|
||||
|
||||
Attributes are evaluated lazily if possible.
|
||||
|
||||
Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
|
||||
"""
|
||||
def __init__(self, name: str, pdf_object: DictionaryObject, parent: ArrayObject | None = None) -> None:
|
||||
"""
|
||||
Args:
|
||||
name: The (primary) name as provided in the name tree.
|
||||
pdf_object: The corresponding PDF object to allow retrieving further data.
|
||||
parent: The parent list.
|
||||
"""
|
||||
self._name = name
|
||||
self.pdf_object = pdf_object
|
||||
self._parent = parent
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
"""The (primary) name of the embedded file as provided in the name tree."""
|
||||
return self._name
|
||||
|
||||
@classmethod
|
||||
def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile:
|
||||
"""
|
||||
Create a new embedded file and add it to the PdfWriter.
|
||||
|
||||
Args:
|
||||
writer: The PdfWriter instance to add the embedded file to.
|
||||
name: The filename to display.
|
||||
content: The data in the file.
|
||||
|
||||
Returns:
|
||||
EmbeddedFile instance for the newly created embedded file.
|
||||
"""
|
||||
# Convert string content to bytes if needed
|
||||
if isinstance(content, str):
|
||||
content = content.encode("latin-1")
|
||||
|
||||
# Create the file entry (the actual embedded file stream)
|
||||
file_entry = DecodedStreamObject()
|
||||
file_entry.set_data(content)
|
||||
file_entry.update({NameObject(PG.TYPE): NameObject("/EmbeddedFile")})
|
||||
|
||||
# Create the /EF entry
|
||||
ef_entry = DictionaryObject()
|
||||
ef_entry.update({NameObject("/F"): writer._add_object(file_entry)})
|
||||
|
||||
# Create the filespec dictionary
|
||||
from pypdf.generic import create_string_object # noqa: PLC0415
|
||||
filespec = DictionaryObject()
|
||||
filespec_reference = writer._add_object(filespec)
|
||||
name_object = cast(TextStringObject, create_string_object(name))
|
||||
filespec.update(
|
||||
{
|
||||
NameObject(PG.TYPE): NameObject("/Filespec"),
|
||||
NameObject(FileSpecificationDictionaryEntries.F): name_object,
|
||||
NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
|
||||
}
|
||||
)
|
||||
|
||||
# Add the name and filespec to the names array.
|
||||
# We use the inverse order for insertion, as this allows us to re-use the
|
||||
# same index.
|
||||
names_array = cls._get_names_array(writer)
|
||||
insertion_index = cls._get_insertion_index(names_array, name_object)
|
||||
names_array.insert(insertion_index, filespec_reference)
|
||||
names_array.insert(insertion_index, name_object)
|
||||
|
||||
# Return an EmbeddedFile instance
|
||||
return cls(name=name, pdf_object=filespec, parent=names_array)
|
||||
|
||||
@classmethod
|
||||
def _get_names_array(cls, writer: PdfWriter) -> ArrayObject:
|
||||
"""Get the names array for embedded files, possibly creating and flattening it."""
|
||||
if CA.NAMES not in writer.root_object:
|
||||
# Add the /Names entry to the catalog.
|
||||
writer.root_object[NameObject(CA.NAMES)] = writer._add_object(DictionaryObject())
|
||||
|
||||
names_dict = cast(DictionaryObject, writer.root_object[CA.NAMES])
|
||||
if "/EmbeddedFiles" not in names_dict:
|
||||
# We do not yet have an entry for embedded files. Create and return it.
|
||||
names = ArrayObject()
|
||||
embedded_files_names_dictionary = DictionaryObject(
|
||||
{NameObject(CA.NAMES): names}
|
||||
)
|
||||
names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
|
||||
return names
|
||||
|
||||
# We have an existing embedded files entry.
|
||||
embedded_files_names_tree = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
|
||||
if "/Names" in embedded_files_names_tree:
|
||||
# Simple case: We already have a flat list.
|
||||
return cast(ArrayObject, embedded_files_names_tree[NameObject(CA.NAMES)])
|
||||
if "/Kids" not in embedded_files_names_tree:
|
||||
# Invalid case: This is no name tree.
|
||||
raise PdfReadError("Got neither Names nor Kids in embedded files tree.")
|
||||
|
||||
# Complex case: Convert a /Kids-based name tree to a /Names-based one.
|
||||
# /Names-based ones are much easier to handle and allow us to simplify the
|
||||
# actual insertion logic by only having to consider one case.
|
||||
names = ArrayObject()
|
||||
kids = cast(ArrayObject, embedded_files_names_tree["/Kids"].get_object())
|
||||
embedded_files_names_dictionary = DictionaryObject(
|
||||
{NameObject(CA.NAMES): names}
|
||||
)
|
||||
names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
|
||||
for kid in kids:
|
||||
# Write the flattened file entries. As we do not change the actual files,
|
||||
# this should not have any impact on references to them.
|
||||
# There might be further (nested) kids here.
|
||||
# Wait for an example before evaluating an implementation.
|
||||
for name in kid.get_object().get("/Names", []):
|
||||
names.append(name)
|
||||
return names
|
||||
|
||||
@classmethod
|
||||
def _get_insertion_index(cls, names_array: ArrayObject, name: str) -> int:
|
||||
keys = [names_array[i].encode("utf-8") for i in range(0, len(names_array), 2)]
|
||||
name_bytes = name.encode("utf-8")
|
||||
|
||||
start = bisect.bisect_left(keys, name_bytes)
|
||||
end = bisect.bisect_right(keys, name_bytes)
|
||||
|
||||
if start != end:
|
||||
return end * 2
|
||||
if start == 0:
|
||||
return 0
|
||||
if start == (key_count := len(keys)):
|
||||
return key_count * 2
|
||||
return end * 2
|
||||
|
||||
@property
|
||||
def alternative_name(self) -> str | None:
|
||||
"""Retrieve the alternative name (file specification)."""
|
||||
for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
|
||||
# PDF 2.0 reference, table 43:
|
||||
# > A PDF reader shall use the value of the UF key, when present, instead of the F key.
|
||||
if key in self.pdf_object:
|
||||
value = self.pdf_object[key].get_object()
|
||||
if not is_null_or_none(value):
|
||||
return cast(str, value)
|
||||
return None
|
||||
|
||||
@alternative_name.setter
|
||||
def alternative_name(self, value: TextStringObject | None) -> None:
|
||||
"""Set the alternative name (file specification)."""
|
||||
if value is None:
|
||||
if FileSpecificationDictionaryEntries.UF in self.pdf_object:
|
||||
self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = NullObject()
|
||||
if FileSpecificationDictionaryEntries.F in self.pdf_object:
|
||||
self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = NullObject()
|
||||
else:
|
||||
self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = value
|
||||
self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = value
|
||||
|
||||
@property
|
||||
def description(self) -> str | None:
|
||||
"""Retrieve the description."""
|
||||
value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC)
|
||||
if is_null_or_none(value):
|
||||
return None
|
||||
return value
|
||||
|
||||
@description.setter
|
||||
def description(self, value: TextStringObject | None) -> None:
|
||||
"""Set the description."""
|
||||
if value is None:
|
||||
self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = NullObject()
|
||||
else:
|
||||
self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = value
|
||||
|
||||
@property
|
||||
def associated_file_relationship(self) -> str:
|
||||
"""Retrieve the relationship of the referring document to this embedded file."""
|
||||
return self.pdf_object.get("/AFRelationship", "/Unspecified")
|
||||
|
||||
@associated_file_relationship.setter
|
||||
def associated_file_relationship(self, value: NameObject) -> None:
|
||||
"""Set the relationship of the referring document to this embedded file."""
|
||||
self.pdf_object[NameObject("/AFRelationship")] = value
|
||||
|
||||
@property
|
||||
def _embedded_file(self) -> StreamObject:
|
||||
"""Retrieve the actual embedded file stream."""
|
||||
if "/EF" not in self.pdf_object:
|
||||
raise PdfReadError(f"/EF entry not found: {self.pdf_object}")
|
||||
ef = cast(DictionaryObject, self.pdf_object["/EF"])
|
||||
for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
|
||||
if key in ef:
|
||||
return cast(StreamObject, ef[key].get_object())
|
||||
raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")
|
||||
|
||||
@property
|
||||
def _params(self) -> DictionaryObject:
|
||||
"""Retrieve the file-specific parameters."""
|
||||
return self._embedded_file.get("/Params", DictionaryObject()).get_object()
|
||||
|
||||
@cached_property
|
||||
def _ensure_params(self) -> DictionaryObject:
|
||||
"""Ensure the /Params dictionary exists and return it."""
|
||||
embedded_file = self._embedded_file
|
||||
if "/Params" not in embedded_file:
|
||||
embedded_file[NameObject("/Params")] = DictionaryObject()
|
||||
return cast(DictionaryObject, embedded_file["/Params"])
|
||||
|
||||
@property
|
||||
def subtype(self) -> str | None:
|
||||
"""Retrieve the subtype. This is a MIME media type, prefixed by a slash."""
|
||||
value = self._embedded_file.get("/Subtype")
|
||||
if is_null_or_none(value):
|
||||
return None
|
||||
return value
|
||||
|
||||
@subtype.setter
|
||||
def subtype(self, value: NameObject | None) -> None:
|
||||
"""Set the subtype. This should be a MIME media type, prefixed by a slash."""
|
||||
embedded_file = self._embedded_file
|
||||
if value is None:
|
||||
embedded_file[NameObject("/Subtype")] = NullObject()
|
||||
else:
|
||||
embedded_file[NameObject("/Subtype")] = value
|
||||
|
||||
@property
|
||||
def content(self) -> bytes:
|
||||
"""Retrieve the actual file content."""
|
||||
return self._embedded_file.get_data()
|
||||
|
||||
@content.setter
|
||||
def content(self, value: str | bytes) -> None:
|
||||
"""Set the file content."""
|
||||
if isinstance(value, str):
|
||||
value = value.encode("latin-1")
|
||||
self._embedded_file.set_data(value)
|
||||
|
||||
@property
|
||||
def size(self) -> int | None:
|
||||
"""Retrieve the size of the uncompressed file in bytes."""
|
||||
value = self._params.get("/Size")
|
||||
if is_null_or_none(value):
|
||||
return None
|
||||
return value
|
||||
|
||||
@size.setter
|
||||
def size(self, value: NumberObject | None) -> None:
|
||||
"""Set the size of the uncompressed file in bytes."""
|
||||
params = self._ensure_params
|
||||
if value is None:
|
||||
params[NameObject("/Size")] = NullObject()
|
||||
else:
|
||||
params[NameObject("/Size")] = value
|
||||
|
||||
@property
|
||||
def creation_date(self) -> datetime.datetime | None:
|
||||
"""Retrieve the file creation datetime."""
|
||||
return parse_iso8824_date(self._params.get("/CreationDate"))
|
||||
|
||||
@creation_date.setter
|
||||
def creation_date(self, value: datetime.datetime | None) -> None:
|
||||
"""Set the file creation datetime."""
|
||||
params = self._ensure_params
|
||||
if value is None:
|
||||
params[NameObject("/CreationDate")] = NullObject()
|
||||
else:
|
||||
date_str = format_iso8824_date(value)
|
||||
params[NameObject("/CreationDate")] = TextStringObject(date_str)
|
||||
|
||||
@property
|
||||
def modification_date(self) -> datetime.datetime | None:
|
||||
"""Retrieve the datetime of the last file modification."""
|
||||
return parse_iso8824_date(self._params.get("/ModDate"))
|
||||
|
||||
@modification_date.setter
|
||||
def modification_date(self, value: datetime.datetime | None) -> None:
|
||||
"""Set the datetime of the last file modification."""
|
||||
params = self._ensure_params
|
||||
if value is None:
|
||||
params[NameObject("/ModDate")] = NullObject()
|
||||
else:
|
||||
date_str = format_iso8824_date(value)
|
||||
params[NameObject("/ModDate")] = TextStringObject(date_str)
|
||||
|
||||
@property
|
||||
def checksum(self) -> bytes | None:
|
||||
"""Retrieve the MD5 checksum of the (uncompressed) file."""
|
||||
value = self._params.get("/CheckSum")
|
||||
if is_null_or_none(value):
|
||||
return None
|
||||
return value
|
||||
|
||||
@checksum.setter
|
||||
def checksum(self, value: ByteStringObject | None) -> None:
|
||||
"""Set the MD5 checksum of the (uncompressed) file."""
|
||||
params = self._ensure_params
|
||||
if value is None:
|
||||
params[NameObject("/CheckSum")] = NullObject()
|
||||
else:
|
||||
params[NameObject("/CheckSum")] = value
|
||||
|
||||
def delete(self) -> None:
|
||||
"""Delete the file from the document."""
|
||||
if not self._parent:
|
||||
raise PyPdfError("Parent required to delete file from document.")
|
||||
if self.pdf_object in self._parent:
|
||||
index = self._parent.index(self.pdf_object)
|
||||
elif (
|
||||
(indirect_reference := getattr(self.pdf_object, "indirect_reference", None)) is not None
|
||||
and indirect_reference in self._parent
|
||||
):
|
||||
index = self._parent.index(indirect_reference)
|
||||
else:
|
||||
raise PyPdfError("File not found in parent object.")
|
||||
self._parent.pop(index) # Reference.
|
||||
self._parent.pop(index - 1) # Name.
|
||||
self.pdf_object = DictionaryObject() # Invalidate.
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<{self.__class__.__name__} name={self.name!r}>"
|
||||
|
||||
@classmethod
|
||||
def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
|
||||
"""
|
||||
Convert the given name tree into class instances.
|
||||
|
||||
Args:
|
||||
names: The name tree to load the data from.
|
||||
|
||||
Returns:
|
||||
Iterable of class instances for the files found.
|
||||
"""
|
||||
# This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
|
||||
for i, name in enumerate(names):
|
||||
if not isinstance(name, str):
|
||||
# Skip plain strings and retrieve them as `direct_name` by index.
|
||||
file_dictionary = name.get_object()
|
||||
direct_name = names[i - 1].get_object()
|
||||
yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary, parent=names)
|
||||
|
||||
@classmethod
|
||||
def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:
|
||||
"""
|
||||
Load the embedded files for the given document catalog.
|
||||
|
||||
This method and its signature are considered internal API and thus not exposed publicly for now.
|
||||
|
||||
Args:
|
||||
catalog: The document catalog to load from.
|
||||
|
||||
Returns:
|
||||
Iterable of class instances for the files found.
|
||||
"""
|
||||
try:
|
||||
container = cast(
|
||||
DictionaryObject,
|
||||
cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
|
||||
)
|
||||
except KeyError:
|
||||
return
|
||||
|
||||
if "/Kids" in container:
|
||||
for kid in cast(ArrayObject, container["/Kids"].get_object()):
|
||||
# There might be further (nested) kids here.
|
||||
# Wait for an example before evaluating an implementation.
|
||||
kid = kid.get_object()
|
||||
if "/Names" in kid:
|
||||
yield from cls._load_from_names(cast(ArrayObject, kid["/Names"]))
|
||||
if "/Names" in container:
|
||||
yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))
|
||||
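The EmbeddedFile plumbing above is normally reached through the public attachment API rather than instantiated directly. A minimal sketch, assuming PdfWriter.add_attachment and the PdfReader.attachments property behave as in recent pypdf releases; the file name and payload are made up for illustration:

from io import BytesIO

from pypdf import PdfReader, PdfWriter

writer = PdfWriter()
writer.add_blank_page(width=200, height=200)
# add_attachment creates the /Filespec and /EmbeddedFile objects that the
# EmbeddedFile class above models, and registers them in the name tree.
writer.add_attachment("notes.txt", b"hello attachment")

buffer = BytesIO()
writer.write(buffer)
buffer.seek(0)

reader = PdfReader(buffer)
# attachments maps each name to the list of payloads found for it.
print(reader.attachments["notes.txt"])  # [b'hello attachment']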
174
venv/lib/python3.12/site-packages/pypdf/generic/_fit.py
Normal file
174
venv/lib/python3.12/site-packages/pypdf/generic/_fit.py
Normal file
@@ -0,0 +1,174 @@
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from ._base import is_null_or_none
|
||||
|
||||
|
||||
class Fit:
|
||||
def __init__(
|
||||
self, fit_type: str, fit_args: tuple[Union[None, float, Any], ...] = ()
|
||||
) -> None:
|
||||
from ._base import FloatObject, NameObject, NullObject, NumberObject # noqa: PLC0415
|
||||
|
||||
self.fit_type = NameObject(fit_type)
|
||||
self.fit_args: list[Union[NullObject, FloatObject, NumberObject]] = [
|
||||
NullObject() if is_null_or_none(a) else FloatObject(a) for a in fit_args
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def xyz(
|
||||
cls,
|
||||
left: Optional[float] = None,
|
||||
top: Optional[float] = None,
|
||||
zoom: Optional[float] = None,
|
||||
) -> "Fit":
|
||||
"""
|
||||
Display the page designated by page, with the coordinates (left, top)
|
||||
positioned at the upper-left corner of the window and the contents
|
||||
of the page magnified by the factor zoom.
|
||||
|
||||
A null value for any of the parameters left, top, or zoom specifies
|
||||
that the current value of that parameter is to be retained unchanged.
|
||||
|
||||
A zoom value of 0 has the same meaning as a null value.
|
||||
|
||||
Args:
|
||||
left:
|
||||
top:
|
||||
zoom:
|
||||
|
||||
Returns:
|
||||
The created fit object.
|
||||
|
||||
"""
|
||||
return Fit(fit_type="/XYZ", fit_args=(left, top, zoom))
|
||||
|
||||
@classmethod
|
||||
def fit(cls) -> "Fit":
|
||||
"""
|
||||
Display the page designated by page, with its contents magnified just
|
||||
enough to fit the entire page within the window both horizontally and
|
||||
vertically.
|
||||
|
||||
If the required horizontal and vertical magnification factors are
|
||||
different, use the smaller of the two, centering the page within the
|
||||
window in the other dimension.
|
||||
"""
|
||||
return Fit(fit_type="/Fit")
|
||||
|
||||
@classmethod
|
||||
def fit_horizontally(cls, top: Optional[float] = None) -> "Fit":
|
||||
"""
|
||||
Display the page designated by page, with the vertical coordinate top
|
||||
positioned at the top edge of the window and the contents of the page
|
||||
magnified just enough to fit the entire width of the page within the
|
||||
window.
|
||||
|
||||
A null value for ``top`` specifies that the current value of that
|
||||
parameter is to be retained unchanged.
|
||||
|
||||
Args:
|
||||
top:
|
||||
|
||||
Returns:
|
||||
The created fit object.
|
||||
|
||||
"""
|
||||
return Fit(fit_type="/FitH", fit_args=(top,))
|
||||
|
||||
@classmethod
|
||||
def fit_vertically(cls, left: Optional[float] = None) -> "Fit":
|
||||
return Fit(fit_type="/FitV", fit_args=(left,))
|
||||
|
||||
@classmethod
|
||||
def fit_rectangle(
|
||||
cls,
|
||||
left: Optional[float] = None,
|
||||
bottom: Optional[float] = None,
|
||||
right: Optional[float] = None,
|
||||
top: Optional[float] = None,
|
||||
) -> "Fit":
|
||||
"""
|
||||
Display the page designated by page, with its contents magnified
|
||||
just enough to fit the rectangle specified by the coordinates
|
||||
left, bottom, right, and top entirely within the window
|
||||
both horizontally and vertically.
|
||||
|
||||
If the required horizontal and vertical magnification factors are
|
||||
different, use the smaller of the two, centering the rectangle within
|
||||
the window in the other dimension.
|
||||
|
||||
A null value for any of the parameters may result in unpredictable
|
||||
behavior.
|
||||
|
||||
Args:
|
||||
left:
|
||||
bottom:
|
||||
right:
|
||||
top:
|
||||
|
||||
Returns:
|
||||
The created fit object.
|
||||
|
||||
"""
|
||||
return Fit(fit_type="/FitR", fit_args=(left, bottom, right, top))
|
||||
|
||||
@classmethod
|
||||
def fit_box(cls) -> "Fit":
|
||||
"""
|
||||
Display the page designated by page, with its contents magnified just
|
||||
enough to fit its bounding box entirely within the window both
|
||||
horizontally and vertically.
|
||||
|
||||
If the required horizontal and vertical magnification factors are
|
||||
different, use the smaller of the two, centering the bounding box
|
||||
within the window in the other dimension.
|
||||
"""
|
||||
return Fit(fit_type="/FitB")
|
||||
|
||||
@classmethod
|
||||
def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit":
|
||||
"""
|
||||
Display the page designated by page, with the vertical coordinate top
|
||||
positioned at the top edge of the window and the contents of the page
|
||||
magnified just enough to fit the entire width of its bounding box
|
||||
within the window.
|
||||
|
||||
A null value for top specifies that the current value of that parameter
|
||||
is to be retained unchanged.
|
||||
|
||||
Args:
|
||||
top:
|
||||
|
||||
Returns:
|
||||
The created fit object.
|
||||
|
||||
"""
|
||||
return Fit(fit_type="/FitBH", fit_args=(top,))
|
||||
|
||||
@classmethod
|
||||
def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit":
|
||||
"""
|
||||
Display the page designated by page, with the horizontal coordinate
|
||||
left positioned at the left edge of the window and the contents of the
|
||||
page magnified just enough to fit the entire height of its bounding box
|
||||
within the window.
|
||||
|
||||
A null value for left specifies that the current value of that
|
||||
parameter is to be retained unchanged.
|
||||
|
||||
Args:
|
||||
left:
|
||||
|
||||
Returns:
|
||||
The created fit object.
|
||||
|
||||
"""
|
||||
return Fit(fit_type="/FitBV", fit_args=(left,))
|
||||
|
||||
def __str__(self) -> str:
|
||||
if not self.fit_args:
|
||||
return f"Fit({self.fit_type})"
|
||||
return f"Fit({self.fit_type}, {self.fit_args})"
|
||||
|
||||
|
||||
DEFAULT_FIT = Fit.fit()
|
||||
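Fit objects are mostly consumed through destination-creating APIs rather than used on their own. A minimal sketch of the two most common cases, assuming PdfWriter.add_outline_item accepts a fit keyword as in recent pypdf releases:

from pypdf import PdfWriter
from pypdf.generic import Fit

writer = PdfWriter()
writer.add_blank_page(width=612, height=792)

# /XYZ destination: scroll to the top-left corner; zoom 0 keeps the current zoom.
writer.add_outline_item("Top of page", 0, fit=Fit.xyz(left=0, top=792, zoom=0))

# /FitH destination: magnify so the full page width fits, top edge at y=792.
writer.add_outline_item("Fit width", 0, fit=Fit.fit_horizontally(top=792))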
314
venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
Normal file
314
venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
Normal file
@@ -0,0 +1,314 @@
|
||||
# Copyright (c) 2024, pypdf contributors
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from typing import IO
|
||||
|
||||
from .._utils import (
|
||||
WHITESPACES,
|
||||
WHITESPACES_AS_BYTES,
|
||||
StreamType,
|
||||
logger_warning,
|
||||
read_non_whitespace,
|
||||
)
|
||||
from ..errors import PdfReadError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# An inline image should be used only for small images (4096 bytes or less),
|
||||
# but allow twice this size for cases where that limit has been exceeded.
|
||||
BUFFER_SIZE = 8192
|
||||
|
||||
|
||||
def _check_end_image_marker(stream: StreamType) -> bool:
|
||||
ei_tok = read_non_whitespace(stream)
|
||||
ei_tok += stream.read(2)
|
||||
stream.seek(-3, 1)
|
||||
return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES)
|
||||
|
||||
|
||||
def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
|
||||
"""
|
||||
Extract HexEncoded stream from inline image.
|
||||
The stream will be moved onto the EI.
|
||||
"""
|
||||
data_out: bytes = b""
|
||||
# Read data until delimiter > and EI as backup.
|
||||
while True:
|
||||
data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
|
||||
if not data_buffered:
|
||||
raise PdfReadError("Unexpected end of stream")
|
||||
pos_tok = data_buffered.find(b">")
|
||||
if pos_tok >= 0: # found >
|
||||
data_out += data_buffered[: pos_tok + 1]
|
||||
stream.seek(-len(data_buffered) + pos_tok + 1, 1)
|
||||
break
|
||||
pos_ei = data_buffered.find(b"EI")
|
||||
if pos_ei >= 0: # found EI
|
||||
stream.seek(-len(data_buffered) + pos_ei - 1, 1)
|
||||
c = stream.read(1)
|
||||
while c in WHITESPACES:
|
||||
stream.seek(-2, 1)
|
||||
c = stream.read(1)
|
||||
pos_ei -= 1
|
||||
data_out += data_buffered[:pos_ei]
|
||||
break
|
||||
if len(data_buffered) == 2:
|
||||
data_out += data_buffered
|
||||
raise PdfReadError("Unexpected end of stream")
|
||||
# Neither > nor EI found
|
||||
data_out += data_buffered[:-2]
|
||||
stream.seek(-2, 1)
|
||||
|
||||
if not _check_end_image_marker(stream):
|
||||
raise PdfReadError("EI stream not found")
|
||||
return data_out
|
||||
|
||||
|
||||
def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
|
||||
"""
|
||||
Extract A85 stream from inline image.
|
||||
The stream will be moved onto the EI.
|
||||
"""
|
||||
data_out: bytes = b""
|
||||
# Read data until delimiter ~>
|
||||
while True:
|
||||
data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
|
||||
if not data_buffered:
|
||||
raise PdfReadError("Unexpected end of stream")
|
||||
pos_tok = data_buffered.find(b"~>")
|
||||
if pos_tok >= 0: # found!
|
||||
data_out += data_buffered[: pos_tok + 2]
|
||||
stream.seek(-len(data_buffered) + pos_tok + 2, 1)
|
||||
break
|
||||
if len(data_buffered) == 2: # end of buffer
|
||||
data_out += data_buffered
|
||||
raise PdfReadError("Unexpected end of stream")
|
||||
data_out += data_buffered[
|
||||
:-2
|
||||
] # back by one char in case of in the middle of ~>
|
||||
stream.seek(-2, 1)
|
||||
|
||||
if not _check_end_image_marker(stream):
|
||||
raise PdfReadError("EI stream not found")
|
||||
return data_out
|
||||
|
||||
|
||||
def extract_inline__run_length_decode(stream: StreamType) -> bytes:
|
||||
"""
|
||||
Extract RL (RunLengthDecode) stream from inline image.
|
||||
The stream will be moved onto the EI.
|
||||
"""
|
||||
data_out: bytes = b""
|
||||
# Read data until delimiter 128
|
||||
while True:
|
||||
data_buffered = stream.read(BUFFER_SIZE)
|
||||
if not data_buffered:
|
||||
raise PdfReadError("Unexpected end of stream")
|
||||
pos_tok = data_buffered.find(b"\x80")
|
||||
if pos_tok >= 0: # found
|
||||
# Ideally, we could just use plain run-length decoding here, where 80_16 = 128_10
|
||||
# marks the EOD. But there apparently are cases like in issue #3517, where we have
|
||||
# an inline image with up to 51 EOD markers. In these cases, be resilient here and
|
||||
# use the default `EI` marker detection instead. Please note that this fallback
|
||||
# still omits special `EI` handling within the stream, but for now assume that having
|
||||
# both of these cases occur at the same time is very unlikely (and the image stream
|
||||
# is broken anyway).
|
||||
# For now, do not skip over more than one whitespace character.
|
||||
after_token = data_buffered[pos_tok + 1 : pos_tok + 4]
|
||||
if after_token.startswith(b"EI") or after_token.endswith(b"EI"):
|
||||
data_out += data_buffered[: pos_tok + 1]
|
||||
stream.seek(-len(data_buffered) + pos_tok + 1, 1)
|
||||
else:
|
||||
logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
|
||||
ei_marker = data_buffered.find(b"EI")
|
||||
if ei_marker > 0:
|
||||
data_out += data_buffered[: ei_marker]
|
||||
stream.seek(-len(data_buffered) + ei_marker - 1, 1)
|
||||
break
|
||||
data_out += data_buffered
|
||||
|
||||
if not _check_end_image_marker(stream):
|
||||
raise PdfReadError("EI stream not found")
|
||||
return data_out
|
||||
|
||||
|
||||
def extract_inline__dct_decode(stream: StreamType) -> bytes:
|
||||
"""
|
||||
Extract DCT (JPEG) stream from inline image.
|
||||
The stream will be moved onto the EI.
|
||||
"""
|
||||
def read(length: int) -> bytes:
|
||||
# If 0 bytes are returned, and *size* was not 0, this indicates end of file.
|
||||
# If the object is in non-blocking mode and no bytes are available, `None` is returned.
|
||||
_result = stream.read(length)
|
||||
if _result is None or len(_result) != length:
|
||||
raise PdfReadError("Unexpected end of stream")
|
||||
return _result
|
||||
|
||||
data_out: bytes = b""
|
||||
# Read Blocks of data (ID/Size/data) up to ID=FF/D9
|
||||
# https://www.digicamsoft.com/itu/itu-t81-36.html
|
||||
not_first = False
|
||||
while True:
|
||||
c = read(1)
|
||||
if not_first or (c == b"\xff"):
|
||||
data_out += c
|
||||
if c != b"\xff":
|
||||
continue
|
||||
not_first = True
|
||||
c = read(1)
|
||||
data_out += c
|
||||
if c == b"\xff":
|
||||
stream.seek(-1, 1) # pragma: no cover
|
||||
elif c == b"\x00": # stuffing
|
||||
pass
|
||||
elif c == b"\xd9": # end
|
||||
break
|
||||
elif c in (
|
||||
b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
|
||||
b"\xda\xdb\xdc\xdd\xde\xdf"
|
||||
b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
|
||||
):
|
||||
c = read(2)
|
||||
data_out += c
|
||||
sz = c[0] * 256 + c[1]
|
||||
data_out += read(sz - 2)
|
||||
|
||||
if not _check_end_image_marker(stream):
|
||||
raise PdfReadError("EI stream not found")
|
||||
return data_out
|
||||
|
||||
|
||||
def extract_inline_default(stream: StreamType) -> bytes:
|
||||
"""Legacy method, used by default"""
|
||||
stream_out = BytesIO()
|
||||
# Read the inline image, while checking for EI (End Image) operator.
|
||||
while True:
|
||||
data_buffered = stream.read(BUFFER_SIZE)
|
||||
if not data_buffered:
|
||||
raise PdfReadError("Unexpected end of stream")
|
||||
pos_ei = data_buffered.find(
|
||||
b"E"
|
||||
)  # We cannot look for "EI" directly because it may not have been fully loaded into the buffer
|
||||
|
||||
if pos_ei == -1:
|
||||
stream_out.write(data_buffered)
|
||||
else:
|
||||
# Write out everything including E (the one from EI to be removed)
|
||||
stream_out.write(data_buffered[0 : pos_ei + 1])
|
||||
sav_pos_ei = stream_out.tell() - 1
|
||||
# Seek back in the stream to read the E next
|
||||
stream.seek(pos_ei + 1 - len(data_buffered), 1)
|
||||
saved_pos = stream.tell()
|
||||
# Check for End Image
|
||||
tok2 = stream.read(1) # I of "EI"
|
||||
if tok2 != b"I":
|
||||
stream.seek(saved_pos, 0)
|
||||
continue
|
||||
tok3 = stream.read(1) # possible space after "EI"
|
||||
if tok3 not in WHITESPACES:
|
||||
stream.seek(saved_pos, 0)
|
||||
continue
|
||||
while tok3 in WHITESPACES:
|
||||
tok3 = stream.read(1)
|
||||
if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in {
|
||||
b"Q",
|
||||
b"E",
|
||||
}: # for Q or EMC
|
||||
stream.seek(saved_pos, 0)
|
||||
continue
|
||||
if is_followed_by_binary_data(stream):
|
||||
# Inline image contains `EI ` sequence usually marking the end of it, but
|
||||
# is followed by binary data which does not make sense for the actual end.
|
||||
stream.seek(saved_pos, 0)
|
||||
continue
|
||||
# Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient
|
||||
# remove E(I) wrongly inserted earlier
|
||||
stream.seek(saved_pos - 1, 0)
|
||||
stream_out.truncate(sav_pos_ei)
|
||||
break
|
||||
|
||||
return stream_out.getvalue()
|
||||
|
||||
|
||||
def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
|
||||
"""
|
||||
Check if the next bytes of the stream look like binary image data or regular page content.
|
||||
|
||||
This is just some heuristics due to the PDF specification being too imprecise about
|
||||
inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
|
||||
we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
|
||||
everywhere, we should not expect to be able to remove such hacks in the near future - especially
|
||||
considering legacy documents as well.
|
||||
|
||||
The actual implementation draws some inspiration from
|
||||
https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java
|
||||
"""
|
||||
position = stream.tell()
|
||||
data = stream.read(length)
|
||||
stream.seek(position)
|
||||
if not data:
|
||||
return False
|
||||
operator_start = None
|
||||
operator_end = None
|
||||
|
||||
for index, byte in enumerate(data):
|
||||
if byte < 32 and byte not in WHITESPACES_AS_BYTES:
|
||||
# This covers all characters not being displayable directly, although omitting whitespace
|
||||
# to allow for operator detection.
|
||||
return True
|
||||
is_whitespace = byte in WHITESPACES_AS_BYTES
|
||||
if operator_start is None and not is_whitespace:
|
||||
# Interpret all other non-whitespace characters as the start of an operation.
|
||||
operator_start = index
|
||||
if operator_start is not None and is_whitespace:
|
||||
# A whitespace stops an operation.
|
||||
# Assume that having an inline image with tons of whitespace is rather unlikely.
|
||||
operator_end = index
|
||||
break
|
||||
|
||||
if operator_start is None:
|
||||
# Inline images should not have tons of whitespaces, which would lead to no operator start.
|
||||
return False
|
||||
if operator_end is None:
|
||||
# We probably are inside an operation.
|
||||
operator_end = length
|
||||
operator_length = operator_end - operator_start
|
||||
operator = data[operator_start:operator_end]
|
||||
if operator.startswith(b"/") and operator_length > 1:
|
||||
# Name object.
|
||||
return False
|
||||
if operator.replace(b".", b"").isdigit():
|
||||
# Graphics operator, for example a move. A number (integer or float).
|
||||
return False
|
||||
if operator_length > 3: # noqa: SIM103
|
||||
# Usually, the operators inside a content stream should not have more than three characters,
|
||||
# especially after an inline image.
|
||||
return True
|
||||
return False
|
||||
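The heuristic above can be exercised directly, keeping in mind that pypdf.generic._image_inline is an internal module and not a stable import path. A minimal sketch with two hand-made buffers:

from io import BytesIO

from pypdf.generic._image_inline import is_followed_by_binary_data

# Page-content operators after an "EI" marker look like short text tokens,
# so this is treated as the real end of the inline image.
assert is_followed_by_binary_data(BytesIO(b"Q\nBT /F1 12 Tf")) is False

# Control bytes right after the marker look like more image data, so the
# "EI" that preceded them is assumed to be part of the image stream.
assert is_followed_by_binary_data(BytesIO(b"\x01\x8f\xaa\x00\x10binary")) is True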
118
venv/lib/python3.12/site-packages/pypdf/generic/_link.py
Normal file
118
venv/lib/python3.12/site-packages/pypdf/generic/_link.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
# This module contains code used by _writer.py to track links in pages
|
||||
# being added to the writer until the links can be resolved.
|
||||
|
||||
from typing import TYPE_CHECKING, Optional, Union, cast
|
||||
|
||||
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .._page import PageObject
|
||||
from .._reader import PdfReader
|
||||
from .._writer import PdfWriter
|
||||
|
||||
|
||||
class NamedReferenceLink:
|
||||
"""Named reference link being preserved until we can resolve it correctly."""
|
||||
|
||||
def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
|
||||
"""reference: TextStringObject with named reference"""
|
||||
self._reference = reference
|
||||
self._source_pdf = source_pdf
|
||||
|
||||
def find_referenced_page(self) -> Union[IndirectObject, None]:
|
||||
destination = self._source_pdf.named_destinations.get(str(self._reference))
|
||||
return destination.page if destination else None
|
||||
|
||||
def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
|
||||
"""target_pdf: PdfWriter which the new link went into"""
|
||||
# point named destination in new PDF to the new page
|
||||
if str(self._reference) not in target_pdf.named_destinations:
|
||||
target_pdf.add_named_destination(str(self._reference), new_page.page_number)
|
||||
|
||||
|
||||
class DirectReferenceLink:
|
||||
"""Direct reference link being preserved until we can resolve it correctly."""
|
||||
|
||||
def __init__(self, reference: ArrayObject) -> None:
|
||||
"""reference: an ArrayObject whose first element is the Page indirect object"""
|
||||
self._reference = reference
|
||||
|
||||
def find_referenced_page(self) -> IndirectObject:
|
||||
return self._reference[0]
|
||||
|
||||
def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
|
||||
"""target_pdf: PdfWriter which the new link went into"""
|
||||
self._reference[0] = new_page
|
||||
|
||||
|
||||
ReferenceLink = Union[NamedReferenceLink, DirectReferenceLink]
|
||||
|
||||
|
||||
def extract_links(new_page: "PageObject", old_page: "PageObject") -> list[tuple[ReferenceLink, ReferenceLink]]:
|
||||
"""Extracts links from two pages on the assumption that the two pages are
|
||||
the same. Produces one list of (new link, old link) tuples.
|
||||
"""
|
||||
new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])]
|
||||
old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])]
|
||||
|
||||
return [
|
||||
(new_link, old_link) for (new_link, old_link)
|
||||
in zip(new_links, old_links)
|
||||
if new_link and old_link
|
||||
]
|
||||
|
||||
|
||||
def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
|
||||
src = cast("PdfReader", page.pdf)
|
||||
link = cast(DictionaryObject, indirect_object.get_object())
|
||||
if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link":
|
||||
return None
|
||||
|
||||
if "/A" in link:
|
||||
action = cast(DictionaryObject, link["/A"])
|
||||
if action.get("/S") != "/GoTo":
|
||||
return None
|
||||
|
||||
if "/D" not in action:
|
||||
return None
|
||||
return _create_link(action["/D"], src)
|
||||
|
||||
if "/Dest" in link:
|
||||
return _create_link(link["/Dest"], src)
|
||||
|
||||
return None # Nothing to do here
|
||||
|
||||
|
||||
def _create_link(reference: PdfObject, source_pdf: "PdfReader")-> Optional[ReferenceLink]:
|
||||
if isinstance(reference, TextStringObject):
|
||||
return NamedReferenceLink(reference, source_pdf)
|
||||
if isinstance(reference, ArrayObject):
|
||||
return DirectReferenceLink(reference)
|
||||
return None
|
||||
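These link helpers are internal plumbing used while copying pages into a writer; at the public level, their effect is that intra-document /GoTo links keep pointing at the right pages after a copy. A minimal sketch, where sample.pdf is a made-up input document containing internal links:

from pypdf import PdfReader, PdfWriter

reader = PdfReader("sample.pdf")
writer = PdfWriter()
# While appending, the writer records each link (named or direct) and later
# patches its reference so it targets the newly written page objects.
writer.append(reader)
with open("copy.pdf", "wb") as fh:
    writer.write(fh)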
33
venv/lib/python3.12/site-packages/pypdf/generic/_outline.py
Normal file
33
venv/lib/python3.12/site-packages/pypdf/generic/_outline.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from typing import Union
|
||||
|
||||
from .._utils import StreamType, deprecation_no_replacement
|
||||
from ._base import NameObject
|
||||
from ._data_structures import Destination
|
||||
|
||||
|
||||
class OutlineItem(Destination):
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
if encryption_key is not None: # deprecated
|
||||
deprecation_no_replacement(
|
||||
"the encryption_key parameter of write_to_stream", "5.0.0"
|
||||
)
|
||||
stream.write(b"<<\n")
|
||||
for key in [
|
||||
NameObject(x)
|
||||
for x in ["/Title", "/Parent", "/First", "/Last", "/Next", "/Prev"]
|
||||
if x in self
|
||||
]:
|
||||
key.write_to_stream(stream)
|
||||
stream.write(b" ")
|
||||
value = self.raw_get(key)
|
||||
value.write_to_stream(stream)
|
||||
stream.write(b"\n")
|
||||
key = NameObject("/Dest")
|
||||
key.write_to_stream(stream)
|
||||
stream.write(b" ")
|
||||
value = self.dest_array
|
||||
value.write_to_stream(stream)
|
||||
stream.write(b"\n")
|
||||
stream.write(b">>")
|
||||
132
venv/lib/python3.12/site-packages/pypdf/generic/_rectangle.py
Normal file
132
venv/lib/python3.12/site-packages/pypdf/generic/_rectangle.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import FloatObject, NumberObject
|
||||
from ._data_structures import ArrayObject
|
||||
|
||||
|
||||
class RectangleObject(ArrayObject):
|
||||
"""
|
||||
This class is used to represent *page boxes* in pypdf.
|
||||
|
||||
These boxes include:
|
||||
|
||||
* :attr:`artbox <pypdf._page.PageObject.artbox>`
|
||||
* :attr:`bleedbox <pypdf._page.PageObject.bleedbox>`
|
||||
* :attr:`cropbox <pypdf._page.PageObject.cropbox>`
|
||||
* :attr:`mediabox <pypdf._page.PageObject.mediabox>`
|
||||
* :attr:`trimbox <pypdf._page.PageObject.trimbox>`
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, arr: Union["RectangleObject", tuple[float, float, float, float]]
|
||||
) -> None:
|
||||
# must have four points
|
||||
assert len(arr) == 4
|
||||
# automatically convert arr[x] into NumberObject(arr[x]) if necessary
|
||||
ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr])
|
||||
|
||||
def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
|
||||
if not isinstance(value, (FloatObject, NumberObject)):
|
||||
value = FloatObject(value)
|
||||
return value
|
||||
|
||||
def scale(self, sx: float, sy: float) -> "RectangleObject":
|
||||
return RectangleObject(
|
||||
(
|
||||
float(self.left) * sx,
|
||||
float(self.bottom) * sy,
|
||||
float(self.right) * sx,
|
||||
float(self.top) * sy,
|
||||
)
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"RectangleObject({list(self)!r})"
|
||||
|
||||
@property
|
||||
def left(self) -> FloatObject:
|
||||
return self[0]
|
||||
|
||||
@left.setter
|
||||
def left(self, f: float) -> None:
|
||||
self[0] = FloatObject(f)
|
||||
|
||||
@property
|
||||
def bottom(self) -> FloatObject:
|
||||
return self[1]
|
||||
|
||||
@bottom.setter
|
||||
def bottom(self, f: float) -> None:
|
||||
self[1] = FloatObject(f)
|
||||
|
||||
@property
|
||||
def right(self) -> FloatObject:
|
||||
return self[2]
|
||||
|
||||
@right.setter
|
||||
def right(self, f: float) -> None:
|
||||
self[2] = FloatObject(f)
|
||||
|
||||
@property
|
||||
def top(self) -> FloatObject:
|
||||
return self[3]
|
||||
|
||||
@top.setter
|
||||
def top(self, f: float) -> None:
|
||||
self[3] = FloatObject(f)
|
||||
|
||||
@property
|
||||
def lower_left(self) -> tuple[float, float]:
|
||||
"""
|
||||
Property to read and modify the lower left coordinate of this box
|
||||
in (x,y) form.
|
||||
"""
|
||||
return self.left, self.bottom
|
||||
|
||||
@lower_left.setter
|
||||
def lower_left(self, value: tuple[float, float]) -> None:
|
||||
self[0], self[1] = (self._ensure_is_number(x) for x in value)
|
||||
|
||||
@property
|
||||
def lower_right(self) -> tuple[float, float]:
|
||||
"""
|
||||
Property to read and modify the lower right coordinate of this box
|
||||
in (x,y) form.
|
||||
"""
|
||||
return self.right, self.bottom
|
||||
|
||||
@lower_right.setter
|
||||
def lower_right(self, value: tuple[float, float]) -> None:
|
||||
self[2], self[1] = (self._ensure_is_number(x) for x in value)
|
||||
|
||||
@property
|
||||
def upper_left(self) -> tuple[float, float]:
|
||||
"""
|
||||
Property to read and modify the upper left coordinate of this box
|
||||
in (x,y) form.
|
||||
"""
|
||||
return self.left, self.top
|
||||
|
||||
@upper_left.setter
|
||||
def upper_left(self, value: tuple[float, float]) -> None:
|
||||
self[0], self[3] = (self._ensure_is_number(x) for x in value)
|
||||
|
||||
@property
|
||||
def upper_right(self) -> tuple[float, float]:
|
||||
"""
|
||||
Property to read and modify the upper right coordinate of this box
|
||||
in (x,y) form.
|
||||
"""
|
||||
return self.right, self.top
|
||||
|
||||
@upper_right.setter
|
||||
def upper_right(self, value: tuple[float, float]) -> None:
|
||||
self[2], self[3] = (self._ensure_is_number(x) for x in value)
|
||||
|
||||
@property
|
||||
def width(self) -> float:
|
||||
return self.right - self.left
|
||||
|
||||
@property
|
||||
def height(self) -> float:
|
||||
return self.top - self.bottom
|
||||
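RectangleObject is the type backing the page-box properties listed in the docstring above; it can also be built and inspected on its own. A minimal sketch:

from pypdf.generic import RectangleObject

box = RectangleObject((0, 0, 612, 792))  # US Letter media box, in points
assert box.width == 612 and box.height == 792

# The corner properties read and write (x, y) pairs.
box.upper_right = (306, 396)
assert box.right == 306 and box.top == 396

# scale() returns a new box and leaves the original untouched.
half = RectangleObject((0, 0, 612, 792)).scale(0.5, 0.5)
assert half.width == 306 and half.height == 396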
208
venv/lib/python3.12/site-packages/pypdf/generic/_utils.py
Normal file
208
venv/lib/python3.12/site-packages/pypdf/generic/_utils.py
Normal file
@@ -0,0 +1,208 @@
|
||||
import codecs
|
||||
from typing import Union
|
||||
|
||||
from .._codecs import _pdfdoc_encoding
|
||||
from .._utils import StreamType, logger_warning, read_non_whitespace
|
||||
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
|
||||
from ._base import ByteStringObject, TextStringObject
|
||||
|
||||
|
||||
def hex_to_rgb(value: str) -> tuple[float, float, float]:
|
||||
return tuple(int(value.lstrip("#")[i : i + 2], 16) / 255.0 for i in (0, 2, 4)) # type: ignore
|
||||
|
||||
|
||||
def read_hex_string_from_stream(
|
||||
stream: StreamType,
|
||||
forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
|
||||
) -> Union["TextStringObject", "ByteStringObject"]:
|
||||
stream.read(1)
|
||||
arr = []
|
||||
x = b""
|
||||
while True:
|
||||
tok = read_non_whitespace(stream)
|
||||
if not tok:
|
||||
raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
|
||||
if tok == b">":
|
||||
break
|
||||
x += tok
|
||||
if len(x) == 2:
|
||||
arr.append(int(x, base=16))
|
||||
x = b""
|
||||
if len(x) == 1:
|
||||
x += b"0"
|
||||
if x != b"":
|
||||
arr.append(int(x, base=16))
|
||||
return create_string_object(bytes(arr), forced_encoding)
|
||||
|
||||
|
||||
__ESCAPE_DICT__ = {
|
||||
b"n": ord(b"\n"),
|
||||
b"r": ord(b"\r"),
|
||||
b"t": ord(b"\t"),
|
||||
b"b": ord(b"\b"),
|
||||
b"f": ord(b"\f"),
|
||||
b"(": ord(b"("),
|
||||
b")": ord(b")"),
|
||||
b"/": ord(b"/"),
|
||||
b"\\": ord(b"\\"),
|
||||
b" ": ord(b" "),
|
||||
b"%": ord(b"%"),
|
||||
b"<": ord(b"<"),
|
||||
b">": ord(b">"),
|
||||
b"[": ord(b"["),
|
||||
b"]": ord(b"]"),
|
||||
b"#": ord(b"#"),
|
||||
b"_": ord(b"_"),
|
||||
b"&": ord(b"&"),
|
||||
b"$": ord(b"$"),
|
||||
}
|
||||
__BACKSLASH_CODE__ = 92
|
||||
|
||||
|
||||
def read_string_from_stream(
|
||||
stream: StreamType,
|
||||
forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
|
||||
) -> Union["TextStringObject", "ByteStringObject"]:
|
||||
tok = stream.read(1)
|
||||
parens = 1
|
||||
txt = []
|
||||
while True:
|
||||
tok = stream.read(1)
|
||||
if not tok:
|
||||
raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
|
||||
if tok == b"(":
|
||||
parens += 1
|
||||
elif tok == b")":
|
||||
parens -= 1
|
||||
if parens == 0:
|
||||
break
|
||||
elif tok == b"\\":
|
||||
tok = stream.read(1)
|
||||
try:
|
||||
txt.append(__ESCAPE_DICT__[tok])
|
||||
continue
|
||||
except KeyError:
|
||||
if b"0" <= tok <= b"7":
|
||||
# "The number ddd may consist of one, two, or three
|
||||
# octal digits; high-order overflow shall be ignored.
|
||||
# Three octal digits shall be used, with leading zeros
|
||||
# as needed, if the next character of the string is also
|
||||
# a digit." (PDF reference 7.3.4.2, p 16)
|
||||
sav = stream.tell() - 1
|
||||
for _ in range(2):
|
||||
ntok = stream.read(1)
|
||||
if b"0" <= ntok <= b"7":
|
||||
tok += ntok
|
||||
else:
|
||||
stream.seek(-1, 1) # ntok has to be analyzed
|
||||
break
|
||||
i = int(tok, base=8)
|
||||
if i > 255:
|
||||
txt.append(__BACKSLASH_CODE__)
|
||||
stream.seek(sav)
|
||||
else:
|
||||
txt.append(i)
|
||||
continue
|
||||
if tok in b"\n\r":
|
||||
# This case is hit when a backslash followed by a line
|
||||
# break occurs. If it's a multi-char EOL, consume the
|
||||
# second character:
|
||||
tok = stream.read(1)
|
||||
if tok not in b"\n\r":
|
||||
stream.seek(-1, 1)
|
||||
# Then don't add anything to the actual string, since this
|
||||
# line break was escaped:
|
||||
continue
|
||||
msg = f"Unexpected escaped string: {tok.decode('utf-8', 'ignore')}"
|
||||
logger_warning(msg, __name__)
|
||||
txt.append(__BACKSLASH_CODE__)
|
||||
txt.append(ord(tok))
|
||||
return create_string_object(bytes(txt), forced_encoding)
|
||||
|
||||
|
||||
def create_string_object(
|
||||
string: Union[str, bytes],
|
||||
forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
|
||||
) -> Union[TextStringObject, ByteStringObject]:
|
||||
"""
|
||||
Create a ByteStringObject or a TextStringObject from a string to represent the string.
|
||||
|
||||
Args:
|
||||
string: The data being used
|
||||
forced_encoding: Typically None, or an encoding string
|
||||
|
||||
Returns:
|
||||
A TextStringObject or ByteStringObject, depending on the input.
|
||||
|
||||
Raises:
|
||||
TypeError: If string is not of type str or bytes.
|
||||
|
||||
"""
|
||||
if isinstance(string, str):
|
||||
return TextStringObject(string)
|
||||
if isinstance(string, bytes):
|
||||
if isinstance(forced_encoding, (list, dict)):
|
||||
out = ""
|
||||
for x in string:
|
||||
try:
|
||||
out += forced_encoding[x]
|
||||
except Exception:
|
||||
out += bytes((x,)).decode("charmap")
|
||||
obj = TextStringObject(out)
|
||||
obj._original_bytes = string
|
||||
return obj
|
||||
if isinstance(forced_encoding, str):
|
||||
if forced_encoding == "bytes":
|
||||
return ByteStringObject(string)
|
||||
obj = TextStringObject(string.decode(forced_encoding))
|
||||
obj._original_bytes = string
|
||||
return obj
|
||||
try:
|
||||
if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
|
||||
retval = TextStringObject(string.decode("utf-16"))
|
||||
retval._original_bytes = string
|
||||
retval.autodetect_utf16 = True
|
||||
retval.utf16_bom = string[:2]
|
||||
return retval
|
||||
if string.startswith(b"\x00"):
|
||||
retval = TextStringObject(string.decode("utf-16be"))
|
||||
retval._original_bytes = string
|
||||
retval.autodetect_utf16 = True
|
||||
retval.utf16_bom = codecs.BOM_UTF16_BE
|
||||
return retval
|
||||
if string[1:2] == b"\x00":
|
||||
retval = TextStringObject(string.decode("utf-16le"))
|
||||
retval._original_bytes = string
|
||||
retval.autodetect_utf16 = True
|
||||
retval.utf16_bom = codecs.BOM_UTF16_LE
|
||||
return retval
|
||||
|
||||
# This is probably a big performance hit here, but we need
|
||||
# to convert string objects into the text/unicode-aware
|
||||
# version if possible... and the only way to check if that's
|
||||
# possible is to try.
|
||||
# Some strings are strings, some are just byte arrays.
|
||||
retval = TextStringObject(decode_pdfdocencoding(string))
|
||||
retval._original_bytes = string
|
||||
retval.autodetect_pdfdocencoding = True
|
||||
return retval
|
||||
except UnicodeDecodeError:
|
||||
return ByteStringObject(string)
|
||||
else:
|
||||
raise TypeError("create_string_object should have str or unicode arg")
|
||||
|
||||
|
||||
def decode_pdfdocencoding(byte_array: bytes) -> str:
|
||||
retval = ""
|
||||
for b in byte_array:
|
||||
c = _pdfdoc_encoding[b]
|
||||
if c == "\u0000":
|
||||
raise UnicodeDecodeError(
|
||||
"pdfdocencoding",
|
||||
bytearray(b),
|
||||
-1,
|
||||
-1,
|
||||
"does not exist in translation table",
|
||||
)
|
||||
retval += c
|
||||
return retval
|
||||
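create_string_object above decides between PDFDocEncoding and UTF-16 based on the raw bytes it receives. A minimal sketch of both branches, assuming the public pypdf.generic exports:

import codecs

from pypdf.generic import TextStringObject, create_string_object

# Plain ASCII fits into PDFDocEncoding and comes back as a text string.
simple = create_string_object(b"Hello")
assert isinstance(simple, TextStringObject) and str(simple) == "Hello"

# A UTF-16BE BOM switches to the UTF-16 branch; the original bytes are kept
# so the string can later be rewritten exactly as it was read.
utf16 = create_string_object(codecs.BOM_UTF16_BE + "Héllo".encode("utf-16be"))
assert str(utf16) == "Héllo"
assert utf16.get_original_bytes().startswith(codecs.BOM_UTF16_BE)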
163
venv/lib/python3.12/site-packages/pypdf/generic/_viewerpref.py
Normal file
163
venv/lib/python3.12/site-packages/pypdf/generic/_viewerpref.py
Normal file
@@ -0,0 +1,163 @@
# Copyright (c) 2023, Pubpub-ZZ
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from typing import (
    Any,
    Optional,
)

from ._base import BooleanObject, NameObject, NumberObject, is_null_or_none
from ._data_structures import ArrayObject, DictionaryObject

f_obj = BooleanObject(False)


class ViewerPreferences(DictionaryObject):
    def __init__(self, obj: Optional[DictionaryObject] = None) -> None:
        super().__init__(self)
        if not is_null_or_none(obj):
            self.update(obj.items())  # type: ignore
            try:
                self.indirect_reference = obj.indirect_reference  # type: ignore
            except AttributeError:
                pass

    def _get_bool(self, key: str, default: Optional[BooleanObject]) -> Optional[BooleanObject]:
        return self.get(key, default)

    def _set_bool(self, key: str, v: bool) -> None:
        self[NameObject(key)] = BooleanObject(v is True)

    def _get_name(self, key: str, default: Optional[NameObject]) -> Optional[NameObject]:
        return self.get(key, default)

    def _set_name(self, key: str, lst: list[str], v: NameObject) -> None:
        if v[0] != "/":
            raise ValueError(f"{v} does not start with '/'")
        if lst != [] and v not in lst:
            raise ValueError(f"{v} is an unacceptable value")
        self[NameObject(key)] = NameObject(v)

    def _get_arr(self, key: str, default: Optional[list[Any]]) -> Optional[ArrayObject]:
        return self.get(key, None if default is None else ArrayObject(default))

    def _set_arr(self, key: str, v: Optional[ArrayObject]) -> None:
        if v is None:
            try:
                del self[NameObject(key)]
            except KeyError:
                pass
            return
        if not isinstance(v, ArrayObject):
            raise ValueError("ArrayObject is expected")
        self[NameObject(key)] = v

    def _get_int(self, key: str, default: Optional[NumberObject]) -> Optional[NumberObject]:
        return self.get(key, default)

    def _set_int(self, key: str, v: int) -> None:
        self[NameObject(key)] = NumberObject(v)

    @property
    def PRINT_SCALING(self) -> NameObject:
        return NameObject("/PrintScaling")

    def __new__(cls: Any, value: Any = None) -> "ViewerPreferences":
        def _add_prop_bool(key: str, default: Optional[BooleanObject]) -> property:
            return property(
                lambda self: self._get_bool(key, default),
                lambda self, v: self._set_bool(key, v),
                None,
                f"""
                Return or modify the value of {key}; returns {default} if not defined.
                """,
            )

        def _add_prop_name(
            key: str, lst: list[str], default: Optional[NameObject]
        ) -> property:
            return property(
                lambda self: self._get_name(key, default),
                lambda self, v: self._set_name(key, lst, v),
                None,
                f"""
                Return or modify the value of {key}; returns {default} if not defined.
                Acceptable values: {lst}
                """,
            )

        def _add_prop_arr(key: str, default: Optional[ArrayObject]) -> property:
            return property(
                lambda self: self._get_arr(key, default),
                lambda self, v: self._set_arr(key, v),
                None,
                f"""
                Return or modify the value of {key}; returns {default} if not defined.
                """,
            )

        def _add_prop_int(key: str, default: Optional[int]) -> property:
            return property(
                lambda self: self._get_int(key, default),
                lambda self, v: self._set_int(key, v),
                None,
                f"""
                Return or modify the value of {key}; returns {default} if not defined.
                """,
            )

        cls.hide_toolbar = _add_prop_bool("/HideToolbar", f_obj)
        cls.hide_menubar = _add_prop_bool("/HideMenubar", f_obj)
        cls.hide_windowui = _add_prop_bool("/HideWindowUI", f_obj)
        cls.fit_window = _add_prop_bool("/FitWindow", f_obj)
        cls.center_window = _add_prop_bool("/CenterWindow", f_obj)
        cls.display_doctitle = _add_prop_bool("/DisplayDocTitle", f_obj)

        cls.non_fullscreen_pagemode = _add_prop_name(
            "/NonFullScreenPageMode",
            ["/UseNone", "/UseOutlines", "/UseThumbs", "/UseOC"],
            NameObject("/UseNone"),
        )
        cls.direction = _add_prop_name(
            "/Direction", ["/L2R", "/R2L"], NameObject("/L2R")
        )
        cls.view_area = _add_prop_name("/ViewArea", [], None)
        cls.view_clip = _add_prop_name("/ViewClip", [], None)
        cls.print_area = _add_prop_name("/PrintArea", [], None)
        cls.print_clip = _add_prop_name("/PrintClip", [], None)
        cls.print_scaling = _add_prop_name("/PrintScaling", [], None)
        cls.duplex = _add_prop_name(
            "/Duplex", ["/Simplex", "/DuplexFlipShortEdge", "/DuplexFlipLongEdge"], None
        )
        cls.pick_tray_by_pdfsize = _add_prop_bool("/PickTrayByPDFSize", None)
        cls.print_pagerange = _add_prop_arr("/PrintPageRange", None)
        cls.num_copies = _add_prop_int("/NumCopies", None)

        cls.enforce = _add_prop_arr("/Enforce", ArrayObject())

        return DictionaryObject.__new__(cls)
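

if __name__ == "__main__":  # pragma: no cover
    # Minimal usage sketch (editor's addition, not in upstream pypdf); run
    # with ``python -m pypdf.generic._viewerpref``. The properties installed
    # in __new__ above validate and store the underlying dictionary entries.
    vp = ViewerPreferences()
    vp.hide_toolbar = True             # stored under /HideToolbar as BooleanObject(True)
    vp.direction = NameObject("/R2L")  # checked against the allowed ["/L2R", "/R2L"]
    print(vp.hide_toolbar, vp.direction)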