Update ashboard, dashboard, memory +1 more (+2 ~3)
This commit is contained in:
@@ -687,6 +687,11 @@
|
||||
border-color: var(--accent);
|
||||
}
|
||||
|
||||
.issue-checkbox.in-progress {
|
||||
background: rgba(59, 130, 246, 0.3);
|
||||
border-color: #3b82f6;
|
||||
}
|
||||
|
||||
.issue-checkbox svg {
|
||||
width: 12px;
|
||||
height: 12px;
|
||||
@@ -698,6 +703,14 @@
|
||||
display: block;
|
||||
}
|
||||
|
||||
.issue-checkbox.in-progress::after {
|
||||
content: '';
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 50%;
|
||||
background: #3b82f6;
|
||||
}
|
||||
|
||||
.issue-content {
|
||||
flex: 1;
|
||||
min-width: 0;
|
||||
@@ -738,6 +751,27 @@
|
||||
.issue-owner.marius { color: #22c55e; }
|
||||
.issue-owner.robert { color: #f59e0b; }
|
||||
|
||||
.issue-status {
|
||||
padding: 2px 8px;
|
||||
border-radius: var(--radius-sm);
|
||||
font-size: 11px;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.3px;
|
||||
}
|
||||
.issue-status.todo {
|
||||
background: rgba(156, 163, 175, 0.2);
|
||||
color: #9ca3af;
|
||||
}
|
||||
.issue-status.in-progress {
|
||||
background: rgba(59, 130, 246, 0.2);
|
||||
color: #3b82f6;
|
||||
}
|
||||
.issue-status.done {
|
||||
background: rgba(34, 197, 94, 0.2);
|
||||
color: #22c55e;
|
||||
}
|
||||
|
||||
/* Todo's Panel */
|
||||
.todos-panel { border-left: 3px solid #8b5cf6; }
|
||||
.todo-section { margin-bottom: 16px; }
|
||||
@@ -1266,10 +1300,21 @@
|
||||
<option value="backlog">⚪ Backlog</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="form-label">Status</label>
|
||||
<select class="input" id="issueStatus">
|
||||
<option value="todo">Todo</option>
|
||||
<option value="in-progress">In Progress</option>
|
||||
<option value="done">Done</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<div class="form-group">
|
||||
<label class="form-label">Deadline</label>
|
||||
<input type="date" class="input" id="issueDeadline">
|
||||
</div>
|
||||
<div class="form-group"></div>
|
||||
</div>
|
||||
<div class="modal-actions">
|
||||
<button class="btn btn-danger" id="issueDeleteBtn" onclick="deleteIssue()" style="margin-right: auto; display: none;">Șterge</button>
|
||||
@@ -2129,7 +2174,6 @@
|
||||
<div class="priority-group">
|
||||
<div class="priority-header ${isCollapsed ? 'collapsed' : ''}" onclick="togglePriority('${priority}')">
|
||||
<i data-lucide="chevron-down"></i>
|
||||
<span class="priority-dot ${priority}"></span>
|
||||
<span>${priorityLabels[priority]}</span>
|
||||
<span style="margin-left: auto; opacity: 0.7">${todoCount}/${issues.length}</span>
|
||||
</div>
|
||||
@@ -2146,18 +2190,23 @@
|
||||
|
||||
function renderIssueItem(issue) {
|
||||
const isDone = issue.status === 'done';
|
||||
const isInProgress = issue.status === 'in-progress';
|
||||
const ownerIcons = { 'clawdbot': '🤖', 'robert': '👷', 'marius': '👤' };
|
||||
const ownerIcon = ownerIcons[issue.owner] || '👤';
|
||||
const dateStr = new Date(issue.created).toLocaleDateString('ro-RO', { day: 'numeric', month: 'short' });
|
||||
const statusLabels = { 'todo': 'Todo', 'in-progress': 'In Progress', 'done': 'Done' };
|
||||
const statusLabel = statusLabels[issue.status] || 'Todo';
|
||||
const checkboxClass = isDone ? 'checked' : (isInProgress ? 'in-progress' : '');
|
||||
|
||||
return `
|
||||
<div class="issue-item ${isDone ? 'done' : ''}" data-id="${issue.id}">
|
||||
<div class="issue-checkbox ${isDone ? 'checked' : ''}" onclick="toggleIssue('${issue.id}')">
|
||||
<div class="issue-checkbox ${checkboxClass}" onclick="toggleIssue('${issue.id}')" title="Click pentru a schimba statusul">
|
||||
<i data-lucide="check"></i>
|
||||
</div>
|
||||
<div class="issue-content" onclick="editIssue('${issue.id}')">
|
||||
<div class="issue-title">${issue.title}</div>
|
||||
<div class="issue-meta">
|
||||
<span class="issue-status ${issue.status || 'todo'}">${statusLabel}</span>
|
||||
${issue.program ? `<span class="issue-tag program">${issue.program}</span>` : ''}
|
||||
<span class="issue-owner ${issue.owner}">${ownerIcon} ${issue.owner === 'clawdbot' ? 'Clawdbot' : (issue.owner === 'robert' ? 'Robert' : 'Marius')}</span>
|
||||
<span class="issue-date">${dateStr}</span>
|
||||
@@ -2180,17 +2229,27 @@
|
||||
const issue = issuesData.issues.find(i => i.id === id);
|
||||
if (!issue) return;
|
||||
|
||||
issue.status = issue.status === 'done' ? 'todo' : 'done';
|
||||
// Cycle: todo → in-progress → done → todo
|
||||
const statusCycle = { 'todo': 'in-progress', 'in-progress': 'done', 'done': 'todo' };
|
||||
const currentStatus = issue.status || 'todo';
|
||||
issue.status = statusCycle[currentStatus] || 'in-progress';
|
||||
|
||||
if (issue.status === 'done') {
|
||||
issue.completed = new Date().toISOString();
|
||||
} else {
|
||||
delete issue.completed;
|
||||
}
|
||||
|
||||
const statusMessages = {
|
||||
'in-progress': '🔄 In Progress',
|
||||
'done': '✅ Done!',
|
||||
'todo': '📋 Todo'
|
||||
};
|
||||
|
||||
renderIssues();
|
||||
updateIssuesCount();
|
||||
await saveIssues();
|
||||
showToast(issue.status === 'done' ? 'Issue finalizat! ✓' : 'Issue redeschis');
|
||||
showToast(statusMessages[issue.status]);
|
||||
}
|
||||
|
||||
// Filters
|
||||
@@ -2212,6 +2271,7 @@
|
||||
document.getElementById('issueProgram').value = '';
|
||||
document.getElementById('issueOwner').value = 'marius';
|
||||
document.getElementById('issuePriority').value = 'urgent-important';
|
||||
document.getElementById('issueStatus').value = 'todo';
|
||||
document.getElementById('issueDeadline').value = '';
|
||||
document.getElementById('issueDeleteBtn').style.display = 'none';
|
||||
document.getElementById('issueSaveBtn').textContent = 'Adaugă';
|
||||
@@ -2230,6 +2290,7 @@
|
||||
document.getElementById('issueProgram').value = issue.program || '';
|
||||
document.getElementById('issueOwner').value = issue.owner || 'marius';
|
||||
document.getElementById('issuePriority').value = issue.priority || 'backlog';
|
||||
document.getElementById('issueStatus').value = issue.status || 'todo';
|
||||
document.getElementById('issueDeadline').value = issue.deadline || '';
|
||||
document.getElementById('issueDeleteBtn').style.display = 'block';
|
||||
document.getElementById('issueSaveBtn').textContent = 'Salvează';
|
||||
@@ -2272,6 +2333,13 @@
|
||||
issue.program = document.getElementById('issueProgram').value;
|
||||
issue.owner = document.getElementById('issueOwner').value;
|
||||
issue.priority = document.getElementById('issuePriority').value;
|
||||
const newStatus = document.getElementById('issueStatus').value;
|
||||
if (newStatus === 'done' && issue.status !== 'done') {
|
||||
issue.completed = new Date().toISOString();
|
||||
} else if (newStatus !== 'done') {
|
||||
delete issue.completed;
|
||||
}
|
||||
issue.status = newStatus;
|
||||
issue.deadline = document.getElementById('issueDeadline').value || null;
|
||||
issue.updated = new Date().toISOString();
|
||||
}
|
||||
@@ -2285,7 +2353,7 @@
|
||||
program: document.getElementById('issueProgram').value,
|
||||
owner: document.getElementById('issueOwner').value,
|
||||
priority: document.getElementById('issuePriority').value,
|
||||
status: 'todo',
|
||||
status: document.getElementById('issueStatus').value || 'todo',
|
||||
created: new Date().toISOString(),
|
||||
deadline: document.getElementById('issueDeadline').value || null
|
||||
};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"lastUpdated": "2026-02-02T11:25:18.119Z",
|
||||
"lastUpdated": "2026-02-02T22:27:06.452Z",
|
||||
"programs": [
|
||||
"ROACONT",
|
||||
"ROAGEST",
|
||||
@@ -23,7 +23,8 @@
|
||||
"priority": "urgent-important",
|
||||
"status": "todo",
|
||||
"created": "2026-02-02T11:25:18.115Z",
|
||||
"deadline": "2026-02-02"
|
||||
"deadline": "2026-02-02",
|
||||
"updated": "2026-02-02T22:27:06.428Z"
|
||||
},
|
||||
{
|
||||
"id": "ROA-001",
|
||||
@@ -31,10 +32,11 @@
|
||||
"description": "RD 49 = în urma inspecției fiscale\nRD 50 = impozit precedent\nFormularul nu recalculează impozitul de 16%\nRD 40 se modifică și la 4.1",
|
||||
"program": "ROACONT",
|
||||
"owner": "marius",
|
||||
"priority": "urgent-important",
|
||||
"priority": "important",
|
||||
"status": "todo",
|
||||
"created": "2026-01-30T15:10:00Z",
|
||||
"deadline": null
|
||||
"deadline": "2026-02-06",
|
||||
"updated": "2026-02-02T22:26:59.690Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,52 +1,33 @@
|
||||
# 2026-02-02 - Note de sesiune
|
||||
# 2 Februarie 2026
|
||||
|
||||
## Decizii
|
||||
- Marius aprobă TOATE propunerile din raportul de seară ("Da")
|
||||
- A0 + A3 executate imediat
|
||||
- A1 + A2 (sesiuni TU+EU) de programat luni-joi 15:00-16:00
|
||||
|
||||
### Rapoarte pe EMAIL (nu Discord)
|
||||
- Morning-report și evening-report merg acum pe **email** (mmarius28@gmail.com)
|
||||
- Format nou cu două secțiuni:
|
||||
- **📚 Sinteză** - modele/concepte → fișier separat + link
|
||||
- **⚡ Acționabile** - task-uri cu CINE/CE/EFORT/REZULTAT clar
|
||||
- 3 răspunsuri predefinite (1/2/3) pentru 80/20
|
||||
- Job-uri actualizate: `morning-report`, `evening-report`
|
||||
## Executat
|
||||
- **A0:** Git commit și push (2 commits: TOOLS.md, KB index, coaching, email tool)
|
||||
- **A3:** Integrată întrebarea "Ce poveste despre tine ar trebui să renunți?" în insights pentru coaching dimineață
|
||||
|
||||
### Fix email_send.py
|
||||
- Problema: MailChannels + Gmail respingeau emailurile
|
||||
- Cauza: Emoji în FROM_NAME + header-e non-RFC
|
||||
- Fix:
|
||||
- `FROM_NAME = "Echo"` (fără emoji)
|
||||
- `Header(subject, 'utf-8')` pentru encoding
|
||||
- `formataddr((FROM_NAME, SMTP_USER))` pentru RFC compliance
|
||||
## De programat
|
||||
- **A1:** Sesiune "Dizolvarea lui Nu Merit" (30 min) - exercițiu Monica Ion
|
||||
- **A2:** Sistemul 5 pași pentru frici (15 min) - Zoltan Vereș
|
||||
|
||||
### Reguli sub-agenți (AGENTS.md)
|
||||
- Când lansez sub-agent, TREBUIE să-i dau tot contextul: AGENTS.md, SOUL.md, USER.md, memory relevant
|
||||
- Sub-agentul rulează izolat, nu are acces automat la fișierele mele
|
||||
## Feedback Marius
|
||||
1. **Email replies:** Nu primește email-urile de confirmare - de verificat flux
|
||||
2. **Insights → Rapoarte:** Raportul de seară a fost prea conservator - 22 insights extrase dar doar 4 propuneri în raport. De ajustat job-ul evening-report să propună mai multe.
|
||||
|
||||
## Fișiere create/modificate
|
||||
## Stats azi
|
||||
- 23 note YouTube în KB (20 procesate azi - Zoltan Vereș workshop)
|
||||
- 22 insights extrase în `memory/kb/insights/2026-02-02.md`
|
||||
- Job insights-extract funcționează, dar rapoartele nu folosesc toate
|
||||
|
||||
- `memory/kb/insights/2026-02-02.md` - 22 insights din 20 video-uri
|
||||
- `memory/kb/insights/sinteza-2026-02-02.md` - 16 modele/concepte (sinteză)
|
||||
- `tools/email_send.py` - fix RFC compliance
|
||||
- `AGENTS.md` - reguli sub-agenți
|
||||
- `TOOLS.md` - documentație joburi actualizată
|
||||
## De făcut
|
||||
- [x] Ajustez evening-report și morning-report să propună cu ZI și ORĂ concrete
|
||||
- [x] Adăugat listare insights disponibile în rapoarte
|
||||
- [ ] Programez A1 și A2 cu Marius
|
||||
|
||||
## Aprobat și executat (răspuns email: DA)
|
||||
|
||||
**Executat:**
|
||||
- ✅ A0: Git commit + push (54 fișiere)
|
||||
- ✅ A4: Template seară "10 lucruri" → memory/kb/projects/templates/template-seara-merit.md
|
||||
|
||||
**Programat mâine (job grup-sprijin-pregatire):**
|
||||
- A3: Fișă grup sprijin - starea de victimă (tema pregătită din insights)
|
||||
|
||||
**Programat miercuri-joi 15-16:**
|
||||
- A1: Lista eforturilor pt clienți noi (template + completăm împreună)
|
||||
- A2: Template valoare adusă clienți (template + completăm împreună)
|
||||
- A5: Sesiune film interior (30 min conversație)
|
||||
|
||||
## Învățat
|
||||
|
||||
- Email deliverability: MailChannels poate bloca emailuri de la hosting shared
|
||||
- Gmail e strict pe RFC 5322 - header-ele trebuie corect formatate
|
||||
- Rapoarte pe email > Discord pentru decizii care necesită gândire
|
||||
- Format "sinteză + acționabile + răspunsuri predefinite" = 80/20 friendly
|
||||
## Lecții învățate
|
||||
- **Rapoarte:** TOATE propunerile TU+EU/FAC TU trebuie să aibă zi și oră concrete
|
||||
- **Email flow:** Reply #1 imediat (confirmare primire), Reply #2 după execuție (ce s-a făcut)
|
||||
- **Insights:** Listează TOATE insight-urile disponibile, nu doar câteva
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
pip
|
||||
170
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/METADATA
Normal file
170
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/METADATA
Normal file
@@ -0,0 +1,170 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: pypdf
|
||||
Version: 6.6.2
|
||||
Summary: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
|
||||
Author-email: Mathieu Fenniak <biziqe@mathieu.fenniak.net>
|
||||
Maintainer: stefan6419846
|
||||
Maintainer-email: Martin Thoma <info@martin-thoma.de>
|
||||
Requires-Python: >=3.9
|
||||
Description-Content-Type: text/markdown
|
||||
License-Expression: BSD-3-Clause
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Classifier: Programming Language :: Python :: 3.14
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Typing :: Typed
|
||||
License-File: LICENSE
|
||||
Requires-Dist: typing_extensions >= 4.0; python_version < '3.11'
|
||||
Requires-Dist: cryptography ; extra == "crypto"
|
||||
Requires-Dist: PyCryptodome ; extra == "cryptodome"
|
||||
Requires-Dist: black ; extra == "dev"
|
||||
Requires-Dist: flit ; extra == "dev"
|
||||
Requires-Dist: pip-tools ; extra == "dev"
|
||||
Requires-Dist: pre-commit ; extra == "dev"
|
||||
Requires-Dist: pytest-cov ; extra == "dev"
|
||||
Requires-Dist: pytest-socket ; extra == "dev"
|
||||
Requires-Dist: pytest-timeout ; extra == "dev"
|
||||
Requires-Dist: pytest-xdist ; extra == "dev"
|
||||
Requires-Dist: wheel ; extra == "dev"
|
||||
Requires-Dist: myst_parser ; extra == "docs"
|
||||
Requires-Dist: sphinx ; extra == "docs"
|
||||
Requires-Dist: sphinx_rtd_theme ; extra == "docs"
|
||||
Requires-Dist: cryptography ; extra == "full"
|
||||
Requires-Dist: Pillow>=8.0.0 ; extra == "full"
|
||||
Requires-Dist: Pillow>=8.0.0 ; extra == "image"
|
||||
Project-URL: Bug Reports, https://github.com/py-pdf/pypdf/issues
|
||||
Project-URL: Changelog, https://pypdf.readthedocs.io/en/latest/meta/CHANGELOG.html
|
||||
Project-URL: Documentation, https://pypdf.readthedocs.io/en/latest/
|
||||
Project-URL: Source, https://github.com/py-pdf/pypdf
|
||||
Provides-Extra: crypto
|
||||
Provides-Extra: cryptodome
|
||||
Provides-Extra: dev
|
||||
Provides-Extra: docs
|
||||
Provides-Extra: full
|
||||
Provides-Extra: image
|
||||
|
||||
[](https://badge.fury.io/py/pypdf)
|
||||
[](https://pypi.org/project/pypdf/)
|
||||
[](https://pypdf.readthedocs.io/en/stable/)
|
||||
[](https://github.com/py-pdf/pypdf)
|
||||
[](https://codecov.io/gh/py-pdf/pypdf)
|
||||
|
||||
# pypdf
|
||||
|
||||
pypdf is a free and open-source pure-python PDF library capable of splitting,
|
||||
[merging](https://pypdf.readthedocs.io/en/stable/user/merging-pdfs.html),
|
||||
[cropping, and transforming](https://pypdf.readthedocs.io/en/stable/user/cropping-and-transforming.html)
|
||||
the pages of PDF files. It can also add
|
||||
custom data, viewing options, and
|
||||
[passwords](https://pypdf.readthedocs.io/en/stable/user/encryption-decryption.html)
|
||||
to PDF files. pypdf can
|
||||
[retrieve text](https://pypdf.readthedocs.io/en/stable/user/extract-text.html)
|
||||
and
|
||||
[metadata](https://pypdf.readthedocs.io/en/stable/user/metadata.html)
|
||||
from PDFs as well.
|
||||
|
||||
See [pdfly](https://github.com/py-pdf/pdfly) for a CLI application that uses pypdf to interact with PDFs.
|
||||
|
||||
## Installation
|
||||
|
||||
Install pypdf using pip:
|
||||
|
||||
```
|
||||
pip install pypdf
|
||||
```
|
||||
|
||||
For using pypdf with AES encryption or decryption, install extra dependencies:
|
||||
|
||||
```
|
||||
pip install pypdf[crypto]
|
||||
```
|
||||
|
||||
> **NOTE**: `pypdf` 3.1.0 and above include significant improvements compared to
|
||||
> previous versions. Please refer to [the migration
|
||||
> guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html) for
|
||||
> more information.
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from pypdf import PdfReader
|
||||
|
||||
reader = PdfReader("example.pdf")
|
||||
number_of_pages = len(reader.pages)
|
||||
page = reader.pages[0]
|
||||
text = page.extract_text()
|
||||
```
|
||||
|
||||
pypdf can do a lot more, e.g. splitting, merging, reading and creating annotations, decrypting and encrypting. Check out the
|
||||
[documentation](https://pypdf.readthedocs.io/en/stable/) for additional usage
|
||||
examples!
|
||||
|
||||
For questions and answers, visit
|
||||
[StackOverflow](https://stackoverflow.com/questions/tagged/pypdf)
|
||||
(tagged with [pypdf](https://stackoverflow.com/questions/tagged/pypdf)).
|
||||
|
||||
## Contributions
|
||||
|
||||
Maintaining pypdf is a collaborative effort. You can support the project by
|
||||
writing documentation, helping to narrow down issues, and submitting code.
|
||||
See the [CONTRIBUTING.md](https://github.com/py-pdf/pypdf/blob/main/CONTRIBUTING.md) file for more information.
|
||||
|
||||
### Q&A
|
||||
|
||||
The experience pypdf users have covers the whole range from beginner to expert. You can contribute to the pypdf community by answering questions
|
||||
on [StackOverflow](https://stackoverflow.com/questions/tagged/pypdf),
|
||||
helping in [discussions](https://github.com/py-pdf/pypdf/discussions),
|
||||
and asking users who report issues for [MCVE](https://stackoverflow.com/help/minimal-reproducible-example)'s (Code + example PDF!).
|
||||
|
||||
|
||||
### Issues
|
||||
|
||||
A good bug ticket includes a MCVE - a minimal complete verifiable example.
|
||||
For pypdf, this means that you must upload a PDF that causes the bug to occur
|
||||
as well as the code you're executing with all of the output. Use
|
||||
`print(pypdf.__version__)` to tell us which version you're using.
|
||||
|
||||
### Code
|
||||
|
||||
All code contributions are welcome, but smaller ones have a better chance to
|
||||
get included in a timely manner. Adding unit tests for new features or test
|
||||
cases for bugs you've fixed help us to ensure that the Pull Request (PR) is fine.
|
||||
|
||||
pypdf includes a test suite which can be executed with `pytest`:
|
||||
|
||||
```bash
|
||||
$ pytest
|
||||
===================== test session starts =====================
|
||||
platform linux -- Python 3.6.15, pytest-7.0.1, pluggy-1.0.0
|
||||
rootdir: /home/moose/GitHub/Martin/pypdf
|
||||
plugins: cov-3.0.0
|
||||
collected 233 items
|
||||
|
||||
tests/test_basic_features.py .. [ 0%]
|
||||
tests/test_constants.py . [ 1%]
|
||||
tests/test_filters.py .................x..... [ 11%]
|
||||
tests/test_generic.py ................................. [ 25%]
|
||||
............. [ 30%]
|
||||
tests/test_javascript.py .. [ 31%]
|
||||
tests/test_merger.py . [ 32%]
|
||||
tests/test_page.py ......................... [ 42%]
|
||||
tests/test_pagerange.py ................ [ 49%]
|
||||
tests/test_papersizes.py .................. [ 57%]
|
||||
tests/test_reader.py .................................. [ 72%]
|
||||
............... [ 78%]
|
||||
tests/test_utils.py .................... [ 87%]
|
||||
tests/test_workflows.py .......... [ 91%]
|
||||
tests/test_writer.py ................. [ 98%]
|
||||
tests/test_xmp.py ... [100%]
|
||||
|
||||
========== 232 passed, 1 xfailed, 1 warning in 4.52s ==========
|
||||
```
|
||||
|
||||
117
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/RECORD
Normal file
117
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/RECORD
Normal file
@@ -0,0 +1,117 @@
|
||||
pypdf-6.6.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
pypdf-6.6.2.dist-info/METADATA,sha256=1Vu0OgjW3amj2S_YMUmD0Lj_7_GEw-f5VaIM-_9niK8,7149
|
||||
pypdf-6.6.2.dist-info/RECORD,,
|
||||
pypdf-6.6.2.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
pypdf-6.6.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
||||
pypdf-6.6.2.dist-info/licenses/LICENSE,sha256=qXrCMOXzPvEKU2eoUOsB-R8aCwZONHQsd5TSKUVX9SQ,1605
|
||||
pypdf/__init__.py,sha256=YS_1ZrQ3jBPHsRgMstqJrAts3lUApj_lMOMK5qiLG5w,1283
|
||||
pypdf/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_cmap.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_doc_common.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_encryption.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_font.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_page.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_page_labels.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_protocols.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_reader.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_utils.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_version.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_writer.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_xobj_image_helpers.cpython-312.pyc,,
|
||||
pypdf/__pycache__/constants.cpython-312.pyc,,
|
||||
pypdf/__pycache__/errors.cpython-312.pyc,,
|
||||
pypdf/__pycache__/filters.cpython-312.pyc,,
|
||||
pypdf/__pycache__/pagerange.cpython-312.pyc,,
|
||||
pypdf/__pycache__/papersizes.cpython-312.pyc,,
|
||||
pypdf/__pycache__/types.cpython-312.pyc,,
|
||||
pypdf/__pycache__/xmp.cpython-312.pyc,,
|
||||
pypdf/_cmap.py,sha256=iaAvJQQKBxkqMj5-WdD4vZV-Zdz-Sba5j6q3oPQyLT0,11713
|
||||
pypdf/_codecs/__init__.py,sha256=PF1KlsLWCOF0cgdqns7G4X-l3zq5_OnZePw7RFIn1bE,1645
|
||||
pypdf/_codecs/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/_codecs.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/adobe_glyphs.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/core_fontmetrics.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/pdfdoc.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/std.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/symbol.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/zapfding.cpython-312.pyc,,
|
||||
pypdf/_codecs/_codecs.py,sha256=46oRZJySwGxCJp1kjIer7js_TYSjj4Gs2i2Uce3v-eE,10555
|
||||
pypdf/_codecs/adobe_glyphs.py,sha256=t3cDFPDqwIz1w9B0gdVzjdc8eEK9AuRjk5f7laEw_fY,447213
|
||||
pypdf/_codecs/core_fontmetrics.py,sha256=qQvNRQi8V8FOBmSwGcsak4qyl9cQ80cDjbpD5TvhuBg,113269
|
||||
pypdf/_codecs/pdfdoc.py,sha256=xfSvMFYsvxuaSQ0Uu9vZDKaB0Wu85h1uCiB1i9rAcUU,4269
|
||||
pypdf/_codecs/std.py,sha256=DyQMuEpAGEpS9uy1jWf4cnj-kqShPOAij5sI7Q1YD8E,2630
|
||||
pypdf/_codecs/symbol.py,sha256=nIaGQIlhWCJiPMHrwUlmGHH-_fOXyEKvguRmuKXcGAk,3734
|
||||
pypdf/_codecs/zapfding.py,sha256=PQxjxRC616d41xF3exVxP1W8nM4QrZfjO3lmtLxpE_s,3742
|
||||
pypdf/_crypt_providers/__init__.py,sha256=K3Z6AuXhXVeXgLet-Tukq2gt9H66OgdupsvxIS1CmkI,3054
|
||||
pypdf/_crypt_providers/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_base.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_cryptography.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_fallback.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_pycryptodome.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/_base.py,sha256=_f53Mj6vivhEZMQ4vNxN5G0IOgFY-n5_leke0c_qiNU,1711
|
||||
pypdf/_crypt_providers/_cryptography.py,sha256=zT3WmbPzesvgHRkGcKAldqJ24MY3BwZViVbSc55Zxhw,4557
|
||||
pypdf/_crypt_providers/_fallback.py,sha256=vsYoowR1YCAV_q-HrdIZhkUcrCb6HvRBNMYm03QtCU8,3334
|
||||
pypdf/_crypt_providers/_pycryptodome.py,sha256=U1aQZ9iYBrZo-hKCjJUhGOPhwEFToiitowQ316TNrrA,3381
|
||||
pypdf/_doc_common.py,sha256=Cbsc2uczFhAi2JRioaICx0ISC4lCBkRdo_tKRGw3bpc,53243
|
||||
pypdf/_encryption.py,sha256=-LwFEKfhL3B10afkco6fXx-EqtjoXf67pAUgH2VBfDw,48762
|
||||
pypdf/_font.py,sha256=R5jQsBYa_eMrK7VezyoWCmbBARZyS5xp8jzD2XRvKeE,14146
|
||||
pypdf/_page.py,sha256=Tp2GyjjOHLFwQ1tw8bO-poyZA65PJn3k94BymXMmurw,89909
|
||||
pypdf/_page_labels.py,sha256=_HXqgEhSLTH_mMhy8m4QAOzIOHRQLV6_lYvg81-l9hI,8546
|
||||
pypdf/_protocols.py,sha256=7qz92LVdPrYkSpdUPpAp9U4GW5jxNBTfVcpUWwUhEOo,2123
|
||||
pypdf/_reader.py,sha256=KyeDHVEI5n4cZBHGVzbGIfhaPC1nZMiIU0W_ZNb0w_Y,55079
|
||||
pypdf/_text_extraction/__init__.py,sha256=a3Z33rQVTiMKGtwt7_bfXlPosbST8rzELoNnt053_Qw,8515
|
||||
pypdf/_text_extraction/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/__pycache__/_text_extractor.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__init__.py,sha256=RUQIwiUwzneNtcljnVM6jkRaem6pgP7mOD2-MBmtpvw,340
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_fixed_width_page.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_manager.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_params.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/_fixed_width_page.py,sha256=eJveDbyMooG970qJOhM5Rwb9ZoyyJDynzWpV9a7IS20,15370
|
||||
pypdf/_text_extraction/_layout_mode/_text_state_manager.py,sha256=XVrIjeTd5jSdMexBQxs0tL5I5RUOitRmN1mELOcKYm4,8221
|
||||
pypdf/_text_extraction/_layout_mode/_text_state_params.py,sha256=hyw6pnC8upBkoFVUJ3LH8hBIIHrNwiqaqcYyzIIyr6Y,5481
|
||||
pypdf/_text_extraction/_text_extractor.py,sha256=wRmFtgMYTbJFbZRJVG3j1-lQWhb6mUC5uiE73DLRhIo,14454
|
||||
pypdf/_utils.py,sha256=v579jJEHn-JophTC4Ej2MBFTEoQGitPWs_d507pyS6g,20194
|
||||
pypdf/_version.py,sha256=S2Qku7VqFDmWPW_O3fID47IPC76TVFqesX1qVVa575w,22
|
||||
pypdf/_writer.py,sha256=K7ANMEgNz-tPngYVMW9j07SEcksk5tFf1_tgi0JDRIg,129793
|
||||
pypdf/_xobj_image_helpers.py,sha256=y7EMrXlYqwbIeUtdQS2XH9nO_2R73DOLf9-T1IyHMIA,21450
|
||||
pypdf/annotations/__init__.py,sha256=f2k_-jAn39CCB27KxQ_e93GinnzkAHbUnnSeGJl1jyE,990
|
||||
pypdf/annotations/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/annotations/__pycache__/_base.cpython-312.pyc,,
|
||||
pypdf/annotations/__pycache__/_markup_annotations.cpython-312.pyc,,
|
||||
pypdf/annotations/__pycache__/_non_markup_annotations.cpython-312.pyc,,
|
||||
pypdf/annotations/_base.py,sha256=eeoc9v2w15jAUhKXj48l1bB66YgBgV-2v5IIUJH-vws,961
|
||||
pypdf/annotations/_markup_annotations.py,sha256=PLDCbsEWSgOmk6HTxepolEzj-Q3EE5J4hXMgnTDFaqc,9590
|
||||
pypdf/annotations/_non_markup_annotations.py,sha256=Z2IUvcCOcTcpJhSXrex_9riYM2D64XxFQ_vac10BNRU,3649
|
||||
pypdf/constants.py,sha256=_U_xkH1REx2rsgtx3jCOaKivhmyqPA25PLL7Z4A1_ZI,23260
|
||||
pypdf/errors.py,sha256=Bw1W9hxOsDgwqwU6YoQ2l0-JiUyTq6l5QjVCr-W4GFA,1947
|
||||
pypdf/filters.py,sha256=FzfrqdZK9bs3MjU75KJ2uIMPpx6VcxYQ4oV9wLh3j-w,29210
|
||||
pypdf/generic/__init__.py,sha256=VrqdYftQECePDU2rXVMgEqRaYFR8zOV_fvJgo19x_uw,3468
|
||||
pypdf/generic/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_appearance_stream.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_base.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_data_structures.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_files.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_fit.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_image_inline.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_link.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_outline.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_rectangle.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_utils.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_viewerpref.cpython-312.pyc,,
|
||||
pypdf/generic/_appearance_stream.py,sha256=ofXHlJC4-jSBCLOhkKztoeFiYlD-zi8QMdvRrMm3rdE,24867
|
||||
pypdf/generic/_base.py,sha256=N8O_NcqK5y5O70OF8-p6vsac9R1ykTDcBIksBY_9rnA,32531
|
||||
pypdf/generic/_data_structures.py,sha256=g1Jy5tpPSTHIhOme6HFXdMvxV2HuxbZx-HOsF2Awdc0,63602
|
||||
pypdf/generic/_files.py,sha256=NtSkRo6JBgisi4QOyrVneO891boVsuY25hRwij6X9RA,16238
|
||||
pypdf/generic/_fit.py,sha256=X_iADJj1YY4PUStS7rFWC2xR2LUVSvKtUAky0AFAIDM,5515
|
||||
pypdf/generic/_image_inline.py,sha256=4cADiCeaCYq2kgJu0wOYXRn5YZ27cCHb3hGFqFFT5D4,12787
|
||||
pypdf/generic/_link.py,sha256=ibdLhdU0mP_phneaJs-CzUDErkJuqnMT6TsQoHNOYiE,4951
|
||||
pypdf/generic/_outline.py,sha256=qKbMX42OWfqnopIiE6BUy6EvdTLGe3ZtjaiWN85JpaY,1094
|
||||
pypdf/generic/_rectangle.py,sha256=lOqSfFivQxgBN9LU9aqHoxPH8aCPTDUNgRZsNEUd6fc,3785
|
||||
pypdf/generic/_utils.py,sha256=vTDAesfG-cJNDKilz_kbgFodAITzd5ejppWHGjvConk,7258
|
||||
pypdf/generic/_viewerpref.py,sha256=6a_s0Avm9-XvV0wqxiW23cE92qK98ry3y6EPjfsFSdo,6758
|
||||
pypdf/pagerange.py,sha256=2bt21jQZm-9aq2bVf3TXuH8_wGVx7b9T6UrMFXCEJhQ,7108
|
||||
pypdf/papersizes.py,sha256=6Tz5sfNN_3JOUapY83U-lakohnpXYA0hSEQNmOVLFL8,1413
|
||||
pypdf/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
pypdf/types.py,sha256=sJ7wHzk7ER_CJ7kP-s8u9axFnkCXnFpr8nzcj1AxTas,1915
|
||||
pypdf/xmp.py,sha256=gqh3IlgTNP7ZuyhvE59p2tsMvu4adGkq0G8RDg0OtQw,29238
|
||||
@@ -0,0 +1,4 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: flit 3.12.0
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
@@ -0,0 +1,29 @@
|
||||
Copyright (c) 2006-2008, Mathieu Fenniak
|
||||
Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
|
||||
Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* The name of the author may not be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
48
venv/lib/python3.12/site-packages/pypdf/__init__.py
Normal file
48
venv/lib/python3.12/site-packages/pypdf/__init__.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
pypdf is a free and open-source pure-python PDF library capable of splitting,
|
||||
merging, cropping, and transforming the pages of PDF files. It can also add
|
||||
custom data, viewing options, and passwords to PDF files. pypdf can retrieve
|
||||
text and metadata from PDFs as well.
|
||||
|
||||
You can read the full docs at https://pypdf.readthedocs.io/.
|
||||
"""
|
||||
|
||||
from ._crypt_providers import crypt_provider
|
||||
from ._doc_common import DocumentInformation
|
||||
from ._encryption import PasswordType
|
||||
from ._page import PageObject, Transformation
|
||||
from ._reader import PdfReader
|
||||
from ._text_extraction import mult
|
||||
from ._version import __version__
|
||||
from ._writer import ObjectDeletionFlag, PdfWriter
|
||||
from .constants import ImageType
|
||||
from .pagerange import PageRange, parse_filename_page_ranges
|
||||
from .papersizes import PaperSize
|
||||
|
||||
try:
|
||||
import PIL
|
||||
|
||||
pil_version = PIL.__version__
|
||||
except ImportError:
|
||||
pil_version = "none"
|
||||
|
||||
_debug_versions = (
|
||||
f"pypdf=={__version__}, {crypt_provider=}, PIL={pil_version}"
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"DocumentInformation",
|
||||
"ImageType",
|
||||
"ObjectDeletionFlag",
|
||||
"PageObject",
|
||||
"PageRange",
|
||||
"PaperSize",
|
||||
"PasswordType",
|
||||
"PdfReader",
|
||||
"PdfWriter",
|
||||
"Transformation",
|
||||
"__version__",
|
||||
"_debug_versions",
|
||||
"mult",
|
||||
"parse_filename_page_ranges",
|
||||
]
|
||||
338
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
338
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
@@ -0,0 +1,338 @@
|
||||
import binascii
|
||||
from binascii import Error as BinasciiError
|
||||
from binascii import unhexlify
|
||||
from math import ceil
|
||||
from typing import Any, Union, cast
|
||||
|
||||
from ._codecs import adobe_glyphs, charset_encoding
|
||||
from ._utils import logger_error, logger_warning
|
||||
from .generic import (
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
NullObject,
|
||||
StreamObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
_predefined_cmap: dict[str, str] = {
|
||||
"/Identity-H": "utf-16-be",
|
||||
"/Identity-V": "utf-16-be",
|
||||
"/GB-EUC-H": "gbk",
|
||||
"/GB-EUC-V": "gbk",
|
||||
"/GBpc-EUC-H": "gb2312",
|
||||
"/GBpc-EUC-V": "gb2312",
|
||||
"/GBK-EUC-H": "gbk",
|
||||
"/GBK-EUC-V": "gbk",
|
||||
"/GBK2K-H": "gb18030",
|
||||
"/GBK2K-V": "gb18030",
|
||||
"/ETen-B5-H": "cp950",
|
||||
"/ETen-B5-V": "cp950",
|
||||
"/ETenms-B5-H": "cp950",
|
||||
"/ETenms-B5-V": "cp950",
|
||||
"/UniCNS-UTF16-H": "utf-16-be",
|
||||
"/UniCNS-UTF16-V": "utf-16-be",
|
||||
"/UniGB-UTF16-H": "gb18030",
|
||||
"/UniGB-UTF16-V": "gb18030",
|
||||
# UCS2 in code
|
||||
}
|
||||
|
||||
|
||||
def get_encoding(
|
||||
ft: DictionaryObject
|
||||
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
|
||||
encoding = _parse_encoding(ft)
|
||||
map_dict, int_entry = _parse_to_unicode(ft)
|
||||
|
||||
# Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
|
||||
# if cmap not empty encoding should be discarded
|
||||
# (here transformed into identity for those characters)
|
||||
# If encoding is a string, it is expected to be an identity translation.
|
||||
if isinstance(encoding, dict):
|
||||
for x in int_entry:
|
||||
if x <= 255:
|
||||
encoding[x] = chr(x)
|
||||
|
||||
return encoding, map_dict
|
||||
|
||||
|
||||
def _parse_encoding(
|
||||
ft: DictionaryObject
|
||||
) -> Union[str, dict[int, str]]:
|
||||
encoding: Union[str, list[str], dict[int, str]] = []
|
||||
if "/Encoding" not in ft:
|
||||
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
|
||||
encoding = dict(
|
||||
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
|
||||
)
|
||||
else:
|
||||
encoding = "charmap"
|
||||
return encoding
|
||||
enc: Union[str, DictionaryObject, NullObject] = cast(
|
||||
Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
|
||||
)
|
||||
if isinstance(enc, str):
|
||||
try:
|
||||
# already done : enc = NameObject.unnumber(enc.encode()).decode()
|
||||
# for #xx decoding
|
||||
if enc in charset_encoding:
|
||||
encoding = charset_encoding[enc].copy()
|
||||
elif enc in _predefined_cmap:
|
||||
encoding = _predefined_cmap[enc]
|
||||
elif "-UCS2-" in enc:
|
||||
encoding = "utf-16-be"
|
||||
else:
|
||||
raise Exception("not found")
|
||||
except Exception:
|
||||
logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
|
||||
encoding = enc
|
||||
elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
|
||||
try:
|
||||
encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
|
||||
except Exception:
|
||||
logger_error(
|
||||
f"Advanced encoding {encoding} not implemented yet",
|
||||
__name__,
|
||||
)
|
||||
encoding = charset_encoding["/StandardEncoding"].copy()
|
||||
else:
|
||||
encoding = charset_encoding["/StandardEncoding"].copy()
|
||||
if isinstance(enc, DictionaryObject) and "/Differences" in enc:
|
||||
x: int = 0
|
||||
o: Union[int, str]
|
||||
for o in cast(DictionaryObject, enc["/Differences"]):
|
||||
if isinstance(o, int):
|
||||
x = o
|
||||
else: # isinstance(o, str):
|
||||
try:
|
||||
if x < len(encoding):
|
||||
encoding[x] = adobe_glyphs[o] # type: ignore
|
||||
except Exception:
|
||||
encoding[x] = o # type: ignore
|
||||
x += 1
|
||||
if isinstance(encoding, list):
|
||||
encoding = dict(zip(range(256), encoding))
|
||||
return encoding
|
||||
|
||||
|
||||
def _parse_to_unicode(
|
||||
ft: DictionaryObject
|
||||
) -> tuple[dict[Any, Any], list[int]]:
|
||||
# will store all translation code
|
||||
# and map_dict[-1] we will have the number of bytes to convert
|
||||
map_dict: dict[Any, Any] = {}
|
||||
|
||||
# will provide the list of cmap keys as int to correct encoding
|
||||
int_entry: list[int] = []
|
||||
|
||||
if "/ToUnicode" not in ft:
|
||||
if ft.get("/Subtype", "") == "/Type1":
|
||||
return _type1_alternative(ft, map_dict, int_entry)
|
||||
return {}, []
|
||||
process_rg: bool = False
|
||||
process_char: bool = False
|
||||
multiline_rg: Union[
|
||||
None, tuple[int, int]
|
||||
] = None # tuple = (current_char, remaining size) ; cf #1285 for example of file
|
||||
cm = prepare_cm(ft)
|
||||
for line in cm.split(b"\n"):
|
||||
process_rg, process_char, multiline_rg = process_cm_line(
|
||||
line.strip(b" \t"),
|
||||
process_rg,
|
||||
process_char,
|
||||
multiline_rg,
|
||||
map_dict,
|
||||
int_entry,
|
||||
)
|
||||
|
||||
return map_dict, int_entry
|
||||
|
||||
|
||||
def prepare_cm(ft: DictionaryObject) -> bytes:
|
||||
tu = ft["/ToUnicode"]
|
||||
cm: bytes
|
||||
if isinstance(tu, StreamObject):
|
||||
cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
|
||||
else: # if (tu is None) or cast(str, tu).startswith("/Identity"):
|
||||
# the full range 0000-FFFF will be processed
|
||||
cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
|
||||
if isinstance(cm, str):
|
||||
cm = cm.encode()
|
||||
# we need to prepare cm before due to missing return line in pdf printed
|
||||
# to pdf from word
|
||||
cm = (
|
||||
cm.strip()
|
||||
.replace(b"beginbfchar", b"\nbeginbfchar\n")
|
||||
.replace(b"endbfchar", b"\nendbfchar\n")
|
||||
.replace(b"beginbfrange", b"\nbeginbfrange\n")
|
||||
.replace(b"endbfrange", b"\nendbfrange\n")
|
||||
.replace(b"<<", b"\n{\n") # text between << and >> not used but
|
||||
.replace(b">>", b"\n}\n") # some solution to find it back
|
||||
)
|
||||
ll = cm.split(b"<")
|
||||
for i in range(len(ll)):
|
||||
j = ll[i].find(b">")
|
||||
if j >= 0:
|
||||
if j == 0:
|
||||
# string is empty: stash a placeholder here (see below)
|
||||
# see https://github.com/py-pdf/pypdf/issues/1111
|
||||
content = b"."
|
||||
else:
|
||||
content = ll[i][:j].replace(b" ", b"")
|
||||
ll[i] = content + b" " + ll[i][j + 1 :]
|
||||
cm = (
|
||||
(b" ".join(ll))
|
||||
.replace(b"[", b" [ ")
|
||||
.replace(b"]", b" ]\n ")
|
||||
.replace(b"\r", b"\n")
|
||||
)
|
||||
return cm
|
||||
|
||||
|
||||
def process_cm_line(
|
||||
line: bytes,
|
||||
process_rg: bool,
|
||||
process_char: bool,
|
||||
multiline_rg: Union[None, tuple[int, int]],
|
||||
map_dict: dict[Any, Any],
|
||||
int_entry: list[int],
|
||||
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
|
||||
if line == b"" or line[0] == 37: # 37 = %
|
||||
return process_rg, process_char, multiline_rg
|
||||
line = line.replace(b"\t", b" ")
|
||||
if b"beginbfrange" in line:
|
||||
process_rg = True
|
||||
elif b"endbfrange" in line:
|
||||
process_rg = False
|
||||
elif b"beginbfchar" in line:
|
||||
process_char = True
|
||||
elif b"endbfchar" in line:
|
||||
process_char = False
|
||||
elif process_rg:
|
||||
try:
|
||||
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
|
||||
except binascii.Error as error:
|
||||
logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
|
||||
elif process_char:
|
||||
parse_bfchar(line, map_dict, int_entry)
|
||||
return process_rg, process_char, multiline_rg
|
||||
|
||||
|
||||
def parse_bfrange(
|
||||
line: bytes,
|
||||
map_dict: dict[Any, Any],
|
||||
int_entry: list[int],
|
||||
multiline_rg: Union[None, tuple[int, int]],
|
||||
) -> Union[None, tuple[int, int]]:
|
||||
lst = [x for x in line.split(b" ") if x]
|
||||
closure_found = False
|
||||
if multiline_rg is not None:
|
||||
fmt = b"%%0%dX" % (map_dict[-1] * 2)
|
||||
a = multiline_rg[0] # a, b not in the current line
|
||||
b = multiline_rg[1]
|
||||
for sq in lst:
|
||||
if sq == b"]":
|
||||
closure_found = True
|
||||
break
|
||||
map_dict[
|
||||
unhexlify(fmt % a).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||
"surrogatepass",
|
||||
)
|
||||
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
|
||||
int_entry.append(a)
|
||||
a += 1
|
||||
else:
|
||||
a = int(lst[0], 16)
|
||||
b = int(lst[1], 16)
|
||||
nbi = max(len(lst[0]), len(lst[1]))
|
||||
map_dict[-1] = ceil(nbi / 2)
|
||||
fmt = b"%%0%dX" % (map_dict[-1] * 2)
|
||||
if lst[2] == b"[":
|
||||
for sq in lst[3:]:
|
||||
if sq == b"]":
|
||||
closure_found = True
|
||||
break
|
||||
map_dict[
|
||||
unhexlify(fmt % a).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||
"surrogatepass",
|
||||
)
|
||||
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
|
||||
int_entry.append(a)
|
||||
a += 1
|
||||
else: # case without list
|
||||
c = int(lst[2], 16)
|
||||
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
|
||||
closure_found = True
|
||||
while a <= b:
|
||||
map_dict[
|
||||
unhexlify(fmt % a).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||
"surrogatepass",
|
||||
)
|
||||
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
|
||||
int_entry.append(a)
|
||||
a += 1
|
||||
c += 1
|
||||
return None if closure_found else (a, b)
|
||||
|
||||
|
||||
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
|
||||
lst = [x for x in line.split(b" ") if x]
|
||||
map_dict[-1] = len(lst[0]) // 2
|
||||
while len(lst) > 1:
|
||||
map_to = ""
|
||||
# placeholder (see above) means empty string
|
||||
if lst[1] != b".":
|
||||
try:
|
||||
map_to = unhexlify(lst[1]).decode(
|
||||
"charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
|
||||
) # join is here as some cases where the code was split
|
||||
except BinasciiError as exception:
|
||||
logger_warning(f"Got invalid hex string: {exception!s} ({lst[1]!r})", __name__)
|
||||
map_dict[
|
||||
unhexlify(lst[0]).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
|
||||
)
|
||||
] = map_to
|
||||
int_entry.append(int(lst[0], 16))
|
||||
lst = lst[2:]
|
||||
|
||||
|
||||
def _type1_alternative(
|
||||
ft: DictionaryObject,
|
||||
map_dict: dict[Any, Any],
|
||||
int_entry: list[int],
|
||||
) -> tuple[dict[Any, Any], list[int]]:
|
||||
if "/FontDescriptor" not in ft:
|
||||
return map_dict, int_entry
|
||||
ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
|
||||
if is_null_or_none(ft_desc):
|
||||
return map_dict, int_entry
|
||||
assert ft_desc is not None, "mypy"
|
||||
txt = ft_desc.get_object().get_data()
|
||||
txt = txt.split(b"eexec\n")[0] # only clear part
|
||||
txt = txt.split(b"/Encoding")[1] # to get the encoding part
|
||||
lines = txt.replace(b"\r", b"\n").split(b"\n")
|
||||
for li in lines:
|
||||
if li.startswith(b"dup"):
|
||||
words = [_w for _w in li.split(b" ") if _w != b""]
|
||||
if len(words) > 3 and words[3] != b"put":
|
||||
continue
|
||||
try:
|
||||
i = int(words[1])
|
||||
except ValueError: # pragma: no cover
|
||||
continue
|
||||
try:
|
||||
v = adobe_glyphs[words[2].decode()]
|
||||
except KeyError:
|
||||
if words[2].startswith(b"/uni"):
|
||||
try:
|
||||
v = chr(int(words[2][4:], 16))
|
||||
except ValueError: # pragma: no cover
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
map_dict[chr(i)] = v
|
||||
int_entry.append(i)
|
||||
return map_dict, int_entry
|
||||
59
venv/lib/python3.12/site-packages/pypdf/_codecs/__init__.py
Normal file
59
venv/lib/python3.12/site-packages/pypdf/_codecs/__init__.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from .adobe_glyphs import adobe_glyphs
|
||||
from .pdfdoc import _pdfdoc_encoding
|
||||
from .std import _std_encoding
|
||||
from .symbol import _symbol_encoding
|
||||
from .zapfding import _zapfding_encoding
|
||||
|
||||
|
||||
def fill_from_encoding(enc: str) -> list[str]:
|
||||
lst: list[str] = []
|
||||
for x in range(256):
|
||||
try:
|
||||
lst += (bytes((x,)).decode(enc),)
|
||||
except Exception:
|
||||
lst += (chr(x),)
|
||||
return lst
|
||||
|
||||
|
||||
def rev_encoding(enc: list[str]) -> dict[str, int]:
|
||||
rev: dict[str, int] = {}
|
||||
for i in range(256):
|
||||
char = enc[i]
|
||||
if char == "\u0000":
|
||||
continue
|
||||
assert char not in rev, f"{char} at {i} already at {rev[char]}"
|
||||
rev[char] = i
|
||||
return rev
|
||||
|
||||
|
||||
_win_encoding = fill_from_encoding("cp1252")
|
||||
_mac_encoding = fill_from_encoding("mac_roman")
|
||||
|
||||
|
||||
_win_encoding_rev: dict[str, int] = rev_encoding(_win_encoding)
|
||||
_mac_encoding_rev: dict[str, int] = rev_encoding(_mac_encoding)
|
||||
_symbol_encoding_rev: dict[str, int] = rev_encoding(_symbol_encoding)
|
||||
_zapfding_encoding_rev: dict[str, int] = rev_encoding(_zapfding_encoding)
|
||||
_pdfdoc_encoding_rev: dict[str, int] = rev_encoding(_pdfdoc_encoding)
|
||||
|
||||
|
||||
charset_encoding: dict[str, list[str]] = {
|
||||
"/StandardEncoding": _std_encoding,
|
||||
"/WinAnsiEncoding": _win_encoding,
|
||||
"/MacRomanEncoding": _mac_encoding,
|
||||
"/PDFDocEncoding": _pdfdoc_encoding,
|
||||
"/Symbol": _symbol_encoding,
|
||||
"/ZapfDingbats": _zapfding_encoding,
|
||||
}
|
||||
|
||||
__all__ = [
|
||||
"_mac_encoding",
|
||||
"_pdfdoc_encoding",
|
||||
"_pdfdoc_encoding_rev",
|
||||
"_std_encoding",
|
||||
"_symbol_encoding",
|
||||
"_win_encoding",
|
||||
"_zapfding_encoding",
|
||||
"adobe_glyphs",
|
||||
"charset_encoding",
|
||||
]
|
||||
281
venv/lib/python3.12/site-packages/pypdf/_codecs/_codecs.py
Normal file
281
venv/lib/python3.12/site-packages/pypdf/_codecs/_codecs.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
This module is for codecs only.
|
||||
|
||||
While the codec implementation can contain details of the PDF specification,
|
||||
the module should not do any PDF parsing.
|
||||
"""
|
||||
|
||||
import io
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from pypdf._utils import logger_warning
|
||||
from pypdf.errors import LimitReachedError
|
||||
|
||||
|
||||
class Codec(ABC):
|
||||
"""Abstract base class for all codecs."""
|
||||
|
||||
@abstractmethod
|
||||
def encode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
Encode the input data.
|
||||
|
||||
Args:
|
||||
data: Data to encode.
|
||||
|
||||
Returns:
|
||||
Encoded data.
|
||||
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def decode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
Decode the input data.
|
||||
|
||||
Args:
|
||||
data: Data to decode.
|
||||
|
||||
Returns:
|
||||
Decoded data.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class LzwCodec(Codec):
|
||||
"""Lempel-Ziv-Welch (LZW) adaptive compression codec."""
|
||||
|
||||
CLEAR_TABLE_MARKER = 256 # Special code to indicate table reset
|
||||
EOD_MARKER = 257 # End-of-data marker
|
||||
INITIAL_BITS_PER_CODE = 9 # Initial code bit width
|
||||
MAX_BITS_PER_CODE = 12 # Maximum code bit width
|
||||
|
||||
def __init__(self, max_output_length: int = 75_000_000) -> None:
|
||||
self.max_output_length = max_output_length
|
||||
|
||||
def _initialize_encoding_table(self) -> None:
|
||||
"""Initialize the encoding table and state to initial conditions."""
|
||||
self.encoding_table: dict[bytes, int] = {bytes([i]): i for i in range(256)}
|
||||
self.next_code = self.EOD_MARKER + 1
|
||||
self.bits_per_code = self.INITIAL_BITS_PER_CODE
|
||||
self.max_code_value = (1 << self.bits_per_code) - 1
|
||||
|
||||
def _increase_next_code(self) -> None:
|
||||
"""Update bits_per_code and max_code_value if necessary."""
|
||||
self.next_code += 1
|
||||
if (
|
||||
self.next_code > self.max_code_value
|
||||
and self.bits_per_code < self.MAX_BITS_PER_CODE
|
||||
):
|
||||
self.bits_per_code += 1
|
||||
self.max_code_value = (1 << self.bits_per_code) - 1
|
||||
|
||||
def encode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
Encode data using the LZW compression algorithm.
|
||||
|
||||
Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
|
||||
"""
|
||||
result_codes: list[int] = []
|
||||
|
||||
# The encoder shall begin by issuing a clear-table code
|
||||
result_codes.append(self.CLEAR_TABLE_MARKER)
|
||||
self._initialize_encoding_table()
|
||||
|
||||
current_sequence = b""
|
||||
for byte in data:
|
||||
next_sequence = current_sequence + bytes([byte])
|
||||
|
||||
if next_sequence in self.encoding_table:
|
||||
# Extend current sequence if already in the table
|
||||
current_sequence = next_sequence
|
||||
else:
|
||||
# Output code for the current sequence
|
||||
result_codes.append(self.encoding_table[current_sequence])
|
||||
|
||||
# Add the new sequence to the table if there's room
|
||||
if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
|
||||
self.encoding_table[next_sequence] = self.next_code
|
||||
self._increase_next_code()
|
||||
else:
|
||||
# If the table is full, emit a clear-table command
|
||||
result_codes.append(self.CLEAR_TABLE_MARKER)
|
||||
self._initialize_encoding_table()
|
||||
|
||||
# Start new sequence
|
||||
current_sequence = bytes([byte])
|
||||
|
||||
# Ensure everything actually is encoded
|
||||
if current_sequence:
|
||||
result_codes.append(self.encoding_table[current_sequence])
|
||||
result_codes.append(self.EOD_MARKER)
|
||||
|
||||
return self._pack_codes_into_bytes(result_codes)
|
||||
|
||||
def _pack_codes_into_bytes(self, codes: list[int]) -> bytes:
|
||||
"""
|
||||
Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.
|
||||
The bit-width starts at 9 bits and expands as needed.
|
||||
"""
|
||||
self._initialize_encoding_table()
|
||||
buffer = 0
|
||||
bits_in_buffer = 0
|
||||
output = bytearray()
|
||||
|
||||
for code in codes:
|
||||
buffer = (buffer << self.bits_per_code) | code
|
||||
bits_in_buffer += self.bits_per_code
|
||||
|
||||
# Codes shall be packed into a continuous bit stream, high-order bit
|
||||
# first. This stream shall then be divided into bytes, high-order bit
|
||||
# first.
|
||||
while bits_in_buffer >= 8:
|
||||
bits_in_buffer -= 8
|
||||
output.append((buffer >> bits_in_buffer) & 0xFF)
|
||||
|
||||
if code == self.CLEAR_TABLE_MARKER:
|
||||
self._initialize_encoding_table()
|
||||
elif code == self.EOD_MARKER:
|
||||
continue
|
||||
else:
|
||||
self._increase_next_code()
|
||||
|
||||
# Flush any remaining bits in the buffer
|
||||
if bits_in_buffer > 0:
|
||||
output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
|
||||
|
||||
return bytes(output)
|
||||
|
||||
def _initialize_decoding_table(self) -> None:
|
||||
self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1
|
||||
self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
|
||||
b""
|
||||
] * (self.max_code_value - self.CLEAR_TABLE_MARKER + 1)
|
||||
self._table_index = self.EOD_MARKER + 1
|
||||
self._bits_to_get = 9
|
||||
|
||||
def _next_code_decode(self, data: bytes) -> int:
|
||||
self._next_data: int
|
||||
try:
|
||||
while self._next_bits < self._bits_to_get:
|
||||
self._next_data = (self._next_data << 8) | (
|
||||
data[self._byte_pointer]
|
||||
)
|
||||
self._byte_pointer += 1
|
||||
self._next_bits += 8
|
||||
|
||||
code = (
|
||||
self._next_data >> (self._next_bits - self._bits_to_get)
|
||||
) & self._and_table[self._bits_to_get - 9]
|
||||
self._next_bits -= self._bits_to_get
|
||||
|
||||
# Reduce data to get rid of the overhead,
|
||||
# which increases performance on large streams significantly.
|
||||
self._next_data = self._next_data & 0xFFFFF
|
||||
|
||||
return code
|
||||
except IndexError:
|
||||
return self.EOD_MARKER
|
||||
|
||||
# The following method has been converted to Python from PDFsharp:
|
||||
# https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
|
||||
#
|
||||
# Original license:
|
||||
#
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
|
||||
# Germany
|
||||
#
|
||||
# http://docs.pdfsharp.net
|
||||
#
|
||||
# MIT License
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included
|
||||
# in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
# --------------------------------------------------------------------------
|
||||
def decode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
The following code was converted to Python from the following code:
|
||||
https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
|
||||
"""
|
||||
self._and_table = [511, 1023, 2047, 4095]
|
||||
self._table_index = 0
|
||||
self._bits_to_get = 9
|
||||
self._byte_pointer = 0
|
||||
self._next_data = 0
|
||||
self._next_bits = 0
|
||||
|
||||
output_stream = io.BytesIO()
|
||||
output_length = 0
|
||||
|
||||
self._initialize_decoding_table()
|
||||
self._byte_pointer = 0
|
||||
self._next_data = 0
|
||||
self._next_bits = 0
|
||||
old_code = self.CLEAR_TABLE_MARKER
|
||||
|
||||
while True:
|
||||
code = self._next_code_decode(data)
|
||||
if code == self.EOD_MARKER:
|
||||
break
|
||||
|
||||
if code == self.CLEAR_TABLE_MARKER:
|
||||
self._initialize_decoding_table()
|
||||
code = self._next_code_decode(data)
|
||||
if code == self.EOD_MARKER:
|
||||
break
|
||||
output_stream.write(decoded := self.decoding_table[code])
|
||||
old_code = code
|
||||
elif code < self._table_index:
|
||||
decoded = self.decoding_table[code]
|
||||
output_stream.write(decoded)
|
||||
if old_code != self.CLEAR_TABLE_MARKER:
|
||||
self._add_entry_decode(self.decoding_table[old_code], decoded[0])
|
||||
old_code = code
|
||||
else:
|
||||
# The code is not in the table and not one of the special codes
|
||||
decoded = (
|
||||
self.decoding_table[old_code] + self.decoding_table[old_code][:1]
|
||||
)
|
||||
output_stream.write(decoded)
|
||||
self._add_entry_decode(self.decoding_table[old_code], decoded[0])
|
||||
old_code = code
|
||||
|
||||
output_length += len(decoded)
|
||||
if output_length > self.max_output_length:
|
||||
raise LimitReachedError(
|
||||
f"Limit reached while decompressing: {output_length} > {self.max_output_length}"
|
||||
)
|
||||
|
||||
return output_stream.getvalue()
|
||||
|
||||
def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
|
||||
new_string = old_string + bytes([new_char])
|
||||
if self._table_index > self.max_code_value:
|
||||
logger_warning("Ignoring too large LZW table index.", __name__)
|
||||
return
|
||||
self.decoding_table[self._table_index] = new_string
|
||||
self._table_index += 1
|
||||
|
||||
# Update the number of bits to get based on the table index
|
||||
if self._table_index == 511:
|
||||
self._bits_to_get = 10
|
||||
elif self._table_index == 1023:
|
||||
self._bits_to_get = 11
|
||||
elif self._table_index == 2047:
|
||||
self._bits_to_get = 12
|
||||
13969
venv/lib/python3.12/site-packages/pypdf/_codecs/adobe_glyphs.py
Normal file
13969
venv/lib/python3.12/site-packages/pypdf/_codecs/adobe_glyphs.py
Normal file
File diff suppressed because it is too large
Load Diff
4441
venv/lib/python3.12/site-packages/pypdf/_codecs/core_fontmetrics.py
Normal file
4441
venv/lib/python3.12/site-packages/pypdf/_codecs/core_fontmetrics.py
Normal file
File diff suppressed because it is too large
Load Diff
264
venv/lib/python3.12/site-packages/pypdf/_codecs/pdfdoc.py
Normal file
264
venv/lib/python3.12/site-packages/pypdf/_codecs/pdfdoc.py
Normal file
@@ -0,0 +1,264 @@
|
||||
# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
|
||||
# C.1 Predefined encodings sorted by character name of another PDF reference
|
||||
# Some indices have '\u0000' although they should have something else:
|
||||
# 22: should be '\u0017'
|
||||
_pdfdoc_encoding = [
|
||||
"\u0000",
|
||||
"\u0001",
|
||||
"\u0002",
|
||||
"\u0003",
|
||||
"\u0004",
|
||||
"\u0005",
|
||||
"\u0006",
|
||||
"\u0007", # 0 - 7
|
||||
"\u0008",
|
||||
"\u0009",
|
||||
"\u000a",
|
||||
"\u000b",
|
||||
"\u000c",
|
||||
"\u000d",
|
||||
"\u000e",
|
||||
"\u000f", # 8 - 15
|
||||
"\u0010",
|
||||
"\u0011",
|
||||
"\u0012",
|
||||
"\u0013",
|
||||
"\u0014",
|
||||
"\u0015",
|
||||
"\u0000",
|
||||
"\u0017", # 16 - 23
|
||||
"\u02d8",
|
||||
"\u02c7",
|
||||
"\u02c6",
|
||||
"\u02d9",
|
||||
"\u02dd",
|
||||
"\u02db",
|
||||
"\u02da",
|
||||
"\u02dc", # 24 - 31
|
||||
"\u0020",
|
||||
"\u0021",
|
||||
"\u0022",
|
||||
"\u0023",
|
||||
"\u0024",
|
||||
"\u0025",
|
||||
"\u0026",
|
||||
"\u0027", # 32 - 39
|
||||
"\u0028",
|
||||
"\u0029",
|
||||
"\u002a",
|
||||
"\u002b",
|
||||
"\u002c",
|
||||
"\u002d",
|
||||
"\u002e",
|
||||
"\u002f", # 40 - 47
|
||||
"\u0030",
|
||||
"\u0031",
|
||||
"\u0032",
|
||||
"\u0033",
|
||||
"\u0034",
|
||||
"\u0035",
|
||||
"\u0036",
|
||||
"\u0037", # 48 - 55
|
||||
"\u0038",
|
||||
"\u0039",
|
||||
"\u003a",
|
||||
"\u003b",
|
||||
"\u003c",
|
||||
"\u003d",
|
||||
"\u003e",
|
||||
"\u003f", # 56 - 63
|
||||
"\u0040",
|
||||
"\u0041",
|
||||
"\u0042",
|
||||
"\u0043",
|
||||
"\u0044",
|
||||
"\u0045",
|
||||
"\u0046",
|
||||
"\u0047", # 64 - 71
|
||||
"\u0048",
|
||||
"\u0049",
|
||||
"\u004a",
|
||||
"\u004b",
|
||||
"\u004c",
|
||||
"\u004d",
|
||||
"\u004e",
|
||||
"\u004f", # 72 - 79
|
||||
"\u0050",
|
||||
"\u0051",
|
||||
"\u0052",
|
||||
"\u0053",
|
||||
"\u0054",
|
||||
"\u0055",
|
||||
"\u0056",
|
||||
"\u0057", # 80 - 87
|
||||
"\u0058",
|
||||
"\u0059",
|
||||
"\u005a",
|
||||
"\u005b",
|
||||
"\u005c",
|
||||
"\u005d",
|
||||
"\u005e",
|
||||
"\u005f", # 88 - 95
|
||||
"\u0060",
|
||||
"\u0061",
|
||||
"\u0062",
|
||||
"\u0063",
|
||||
"\u0064",
|
||||
"\u0065",
|
||||
"\u0066",
|
||||
"\u0067", # 96 - 103
|
||||
"\u0068",
|
||||
"\u0069",
|
||||
"\u006a",
|
||||
"\u006b",
|
||||
"\u006c",
|
||||
"\u006d",
|
||||
"\u006e",
|
||||
"\u006f", # 104 - 111
|
||||
"\u0070",
|
||||
"\u0071",
|
||||
"\u0072",
|
||||
"\u0073",
|
||||
"\u0074",
|
||||
"\u0075",
|
||||
"\u0076",
|
||||
"\u0077", # 112 - 119
|
||||
"\u0078",
|
||||
"\u0079",
|
||||
"\u007a",
|
||||
"\u007b",
|
||||
"\u007c",
|
||||
"\u007d",
|
||||
"\u007e",
|
||||
"\u0000", # 120 - 127
|
||||
"\u2022",
|
||||
"\u2020",
|
||||
"\u2021",
|
||||
"\u2026",
|
||||
"\u2014",
|
||||
"\u2013",
|
||||
"\u0192",
|
||||
"\u2044", # 128 - 135
|
||||
"\u2039",
|
||||
"\u203a",
|
||||
"\u2212",
|
||||
"\u2030",
|
||||
"\u201e",
|
||||
"\u201c",
|
||||
"\u201d",
|
||||
"\u2018", # 136 - 143
|
||||
"\u2019",
|
||||
"\u201a",
|
||||
"\u2122",
|
||||
"\ufb01",
|
||||
"\ufb02",
|
||||
"\u0141",
|
||||
"\u0152",
|
||||
"\u0160", # 144 - 151
|
||||
"\u0178",
|
||||
"\u017d",
|
||||
"\u0131",
|
||||
"\u0142",
|
||||
"\u0153",
|
||||
"\u0161",
|
||||
"\u017e",
|
||||
"\u0000", # 152 - 159
|
||||
"\u20ac",
|
||||
"\u00a1",
|
||||
"\u00a2",
|
||||
"\u00a3",
|
||||
"\u00a4",
|
||||
"\u00a5",
|
||||
"\u00a6",
|
||||
"\u00a7", # 160 - 167
|
||||
"\u00a8",
|
||||
"\u00a9",
|
||||
"\u00aa",
|
||||
"\u00ab",
|
||||
"\u00ac",
|
||||
"\u0000",
|
||||
"\u00ae",
|
||||
"\u00af", # 168 - 175
|
||||
"\u00b0",
|
||||
"\u00b1",
|
||||
"\u00b2",
|
||||
"\u00b3",
|
||||
"\u00b4",
|
||||
"\u00b5",
|
||||
"\u00b6",
|
||||
"\u00b7", # 176 - 183
|
||||
"\u00b8",
|
||||
"\u00b9",
|
||||
"\u00ba",
|
||||
"\u00bb",
|
||||
"\u00bc",
|
||||
"\u00bd",
|
||||
"\u00be",
|
||||
"\u00bf", # 184 - 191
|
||||
"\u00c0",
|
||||
"\u00c1",
|
||||
"\u00c2",
|
||||
"\u00c3",
|
||||
"\u00c4",
|
||||
"\u00c5",
|
||||
"\u00c6",
|
||||
"\u00c7", # 192 - 199
|
||||
"\u00c8",
|
||||
"\u00c9",
|
||||
"\u00ca",
|
||||
"\u00cb",
|
||||
"\u00cc",
|
||||
"\u00cd",
|
||||
"\u00ce",
|
||||
"\u00cf", # 200 - 207
|
||||
"\u00d0",
|
||||
"\u00d1",
|
||||
"\u00d2",
|
||||
"\u00d3",
|
||||
"\u00d4",
|
||||
"\u00d5",
|
||||
"\u00d6",
|
||||
"\u00d7", # 208 - 215
|
||||
"\u00d8",
|
||||
"\u00d9",
|
||||
"\u00da",
|
||||
"\u00db",
|
||||
"\u00dc",
|
||||
"\u00dd",
|
||||
"\u00de",
|
||||
"\u00df", # 216 - 223
|
||||
"\u00e0",
|
||||
"\u00e1",
|
||||
"\u00e2",
|
||||
"\u00e3",
|
||||
"\u00e4",
|
||||
"\u00e5",
|
||||
"\u00e6",
|
||||
"\u00e7", # 224 - 231
|
||||
"\u00e8",
|
||||
"\u00e9",
|
||||
"\u00ea",
|
||||
"\u00eb",
|
||||
"\u00ec",
|
||||
"\u00ed",
|
||||
"\u00ee",
|
||||
"\u00ef", # 232 - 239
|
||||
"\u00f0",
|
||||
"\u00f1",
|
||||
"\u00f2",
|
||||
"\u00f3",
|
||||
"\u00f4",
|
||||
"\u00f5",
|
||||
"\u00f6",
|
||||
"\u00f7", # 240 - 247
|
||||
"\u00f8",
|
||||
"\u00f9",
|
||||
"\u00fa",
|
||||
"\u00fb",
|
||||
"\u00fc",
|
||||
"\u00fd",
|
||||
"\u00fe",
|
||||
"\u00ff", # 248 - 255
|
||||
]
|
||||
|
||||
assert len(_pdfdoc_encoding) == 256
|
||||
258
venv/lib/python3.12/site-packages/pypdf/_codecs/std.py
Normal file
258
venv/lib/python3.12/site-packages/pypdf/_codecs/std.py
Normal file
@@ -0,0 +1,258 @@
|
||||
_std_encoding = [
|
||||
"\x00",
|
||||
"\x01",
|
||||
"\x02",
|
||||
"\x03",
|
||||
"\x04",
|
||||
"\x05",
|
||||
"\x06",
|
||||
"\x07",
|
||||
"\x08",
|
||||
"\t",
|
||||
"\n",
|
||||
"\x0b",
|
||||
"\x0c",
|
||||
"\r",
|
||||
"\x0e",
|
||||
"\x0f",
|
||||
"\x10",
|
||||
"\x11",
|
||||
"\x12",
|
||||
"\x13",
|
||||
"\x14",
|
||||
"\x15",
|
||||
"\x16",
|
||||
"\x17",
|
||||
"\x18",
|
||||
"\x19",
|
||||
"\x1a",
|
||||
"\x1b",
|
||||
"\x1c",
|
||||
"\x1d",
|
||||
"\x1e",
|
||||
"\x1f",
|
||||
" ",
|
||||
"!",
|
||||
'"',
|
||||
"#",
|
||||
"$",
|
||||
"%",
|
||||
"&",
|
||||
"’",
|
||||
"(",
|
||||
")",
|
||||
"*",
|
||||
"+",
|
||||
",",
|
||||
"-",
|
||||
".",
|
||||
"/",
|
||||
"0",
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
":",
|
||||
";",
|
||||
"<",
|
||||
"=",
|
||||
">",
|
||||
"?",
|
||||
"@",
|
||||
"A",
|
||||
"B",
|
||||
"C",
|
||||
"D",
|
||||
"E",
|
||||
"F",
|
||||
"G",
|
||||
"H",
|
||||
"I",
|
||||
"J",
|
||||
"K",
|
||||
"L",
|
||||
"M",
|
||||
"N",
|
||||
"O",
|
||||
"P",
|
||||
"Q",
|
||||
"R",
|
||||
"S",
|
||||
"T",
|
||||
"U",
|
||||
"V",
|
||||
"W",
|
||||
"X",
|
||||
"Y",
|
||||
"Z",
|
||||
"[",
|
||||
"\\",
|
||||
"]",
|
||||
"^",
|
||||
"_",
|
||||
"‘",
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d",
|
||||
"e",
|
||||
"f",
|
||||
"g",
|
||||
"h",
|
||||
"i",
|
||||
"j",
|
||||
"k",
|
||||
"l",
|
||||
"m",
|
||||
"n",
|
||||
"o",
|
||||
"p",
|
||||
"q",
|
||||
"r",
|
||||
"s",
|
||||
"t",
|
||||
"u",
|
||||
"v",
|
||||
"w",
|
||||
"x",
|
||||
"y",
|
||||
"z",
|
||||
"{",
|
||||
"|",
|
||||
"}",
|
||||
"~",
|
||||
"\x7f",
|
||||
"\x80",
|
||||
"\x81",
|
||||
"\x82",
|
||||
"\x83",
|
||||
"\x84",
|
||||
"\x85",
|
||||
"\x86",
|
||||
"\x87",
|
||||
"\x88",
|
||||
"\x89",
|
||||
"\x8a",
|
||||
"\x8b",
|
||||
"\x8c",
|
||||
"\x8d",
|
||||
"\x8e",
|
||||
"\x8f",
|
||||
"\x90",
|
||||
"\x91",
|
||||
"\x92",
|
||||
"\x93",
|
||||
"\x94",
|
||||
"\x95",
|
||||
"\x96",
|
||||
"\x97",
|
||||
"\x98",
|
||||
"\x99",
|
||||
"\x9a",
|
||||
"\x9b",
|
||||
"\x9c",
|
||||
"\x9d",
|
||||
"\x9e",
|
||||
"\x9f",
|
||||
"\xa0",
|
||||
"¡",
|
||||
"¢",
|
||||
"£",
|
||||
"⁄",
|
||||
"¥",
|
||||
"ƒ",
|
||||
"§",
|
||||
"¤",
|
||||
"'",
|
||||
"“",
|
||||
"«",
|
||||
"‹",
|
||||
"›",
|
||||
"fi",
|
||||
"fl",
|
||||
"°",
|
||||
"–",
|
||||
"†",
|
||||
"‡",
|
||||
"·",
|
||||
"µ",
|
||||
"¶",
|
||||
"•",
|
||||
"‚",
|
||||
"„",
|
||||
"”",
|
||||
"»",
|
||||
"…",
|
||||
"‰",
|
||||
"¾",
|
||||
"¿",
|
||||
"À",
|
||||
"`",
|
||||
"´",
|
||||
"ˆ",
|
||||
"˜",
|
||||
"¯",
|
||||
"˘",
|
||||
"˙",
|
||||
"¨",
|
||||
"É",
|
||||
"˚",
|
||||
"¸",
|
||||
"Ì",
|
||||
"˝",
|
||||
"˛",
|
||||
"ˇ",
|
||||
"—",
|
||||
"Ñ",
|
||||
"Ò",
|
||||
"Ó",
|
||||
"Ô",
|
||||
"Õ",
|
||||
"Ö",
|
||||
"×",
|
||||
"Ø",
|
||||
"Ù",
|
||||
"Ú",
|
||||
"Û",
|
||||
"Ü",
|
||||
"Ý",
|
||||
"Þ",
|
||||
"ß",
|
||||
"à",
|
||||
"Æ",
|
||||
"â",
|
||||
"ª",
|
||||
"ä",
|
||||
"å",
|
||||
"æ",
|
||||
"ç",
|
||||
"Ł",
|
||||
"Ø",
|
||||
"Œ",
|
||||
"º",
|
||||
"ì",
|
||||
"í",
|
||||
"î",
|
||||
"ï",
|
||||
"ð",
|
||||
"æ",
|
||||
"ò",
|
||||
"ó",
|
||||
"ô",
|
||||
"ı",
|
||||
"ö",
|
||||
"÷",
|
||||
"ł",
|
||||
"ø",
|
||||
"œ",
|
||||
"ß",
|
||||
"ü",
|
||||
"ý",
|
||||
"þ",
|
||||
"ÿ",
|
||||
]
|
||||
260
venv/lib/python3.12/site-packages/pypdf/_codecs/symbol.py
Normal file
260
venv/lib/python3.12/site-packages/pypdf/_codecs/symbol.py
Normal file
@@ -0,0 +1,260 @@
|
||||
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt
|
||||
_symbol_encoding = [
|
||||
"\u0000",
|
||||
"\u0001",
|
||||
"\u0002",
|
||||
"\u0003",
|
||||
"\u0004",
|
||||
"\u0005",
|
||||
"\u0006",
|
||||
"\u0007",
|
||||
"\u0008",
|
||||
"\u0009",
|
||||
"\u000A",
|
||||
"\u000B",
|
||||
"\u000C",
|
||||
"\u000D",
|
||||
"\u000E",
|
||||
"\u000F",
|
||||
"\u0010",
|
||||
"\u0011",
|
||||
"\u0012",
|
||||
"\u0013",
|
||||
"\u0014",
|
||||
"\u0015",
|
||||
"\u0016",
|
||||
"\u0017",
|
||||
"\u0018",
|
||||
"\u0019",
|
||||
"\u001A",
|
||||
"\u001B",
|
||||
"\u001C",
|
||||
"\u001D",
|
||||
"\u001E",
|
||||
"\u001F",
|
||||
"\u0020",
|
||||
"\u0021",
|
||||
"\u2200",
|
||||
"\u0023",
|
||||
"\u2203",
|
||||
"\u0025",
|
||||
"\u0026",
|
||||
"\u220B",
|
||||
"\u0028",
|
||||
"\u0029",
|
||||
"\u2217",
|
||||
"\u002B",
|
||||
"\u002C",
|
||||
"\u2212",
|
||||
"\u002E",
|
||||
"\u002F",
|
||||
"\u0030",
|
||||
"\u0031",
|
||||
"\u0032",
|
||||
"\u0033",
|
||||
"\u0034",
|
||||
"\u0035",
|
||||
"\u0036",
|
||||
"\u0037",
|
||||
"\u0038",
|
||||
"\u0039",
|
||||
"\u003A",
|
||||
"\u003B",
|
||||
"\u003C",
|
||||
"\u003D",
|
||||
"\u003E",
|
||||
"\u003F",
|
||||
"\u2245",
|
||||
"\u0391",
|
||||
"\u0392",
|
||||
"\u03A7",
|
||||
"\u0394",
|
||||
"\u0395",
|
||||
"\u03A6",
|
||||
"\u0393",
|
||||
"\u0397",
|
||||
"\u0399",
|
||||
"\u03D1",
|
||||
"\u039A",
|
||||
"\u039B",
|
||||
"\u039C",
|
||||
"\u039D",
|
||||
"\u039F",
|
||||
"\u03A0",
|
||||
"\u0398",
|
||||
"\u03A1",
|
||||
"\u03A3",
|
||||
"\u03A4",
|
||||
"\u03A5",
|
||||
"\u03C2",
|
||||
"\u03A9",
|
||||
"\u039E",
|
||||
"\u03A8",
|
||||
"\u0396",
|
||||
"\u005B",
|
||||
"\u2234",
|
||||
"\u005D",
|
||||
"\u22A5",
|
||||
"\u005F",
|
||||
"\uF8E5",
|
||||
"\u03B1",
|
||||
"\u03B2",
|
||||
"\u03C7",
|
||||
"\u03B4",
|
||||
"\u03B5",
|
||||
"\u03C6",
|
||||
"\u03B3",
|
||||
"\u03B7",
|
||||
"\u03B9",
|
||||
"\u03D5",
|
||||
"\u03BA",
|
||||
"\u03BB",
|
||||
"\u00B5",
|
||||
"\u03BD",
|
||||
"\u03BF",
|
||||
"\u03C0",
|
||||
"\u03B8",
|
||||
"\u03C1",
|
||||
"\u03C3",
|
||||
"\u03C4",
|
||||
"\u03C5",
|
||||
"\u03D6",
|
||||
"\u03C9",
|
||||
"\u03BE",
|
||||
"\u03C8",
|
||||
"\u03B6",
|
||||
"\u007B",
|
||||
"\u007C",
|
||||
"\u007D",
|
||||
"\u223C",
|
||||
"\u007F",
|
||||
"\u0080",
|
||||
"\u0081",
|
||||
"\u0082",
|
||||
"\u0083",
|
||||
"\u0084",
|
||||
"\u0085",
|
||||
"\u0086",
|
||||
"\u0087",
|
||||
"\u0088",
|
||||
"\u0089",
|
||||
"\u008A",
|
||||
"\u008B",
|
||||
"\u008C",
|
||||
"\u008D",
|
||||
"\u008E",
|
||||
"\u008F",
|
||||
"\u0090",
|
||||
"\u0091",
|
||||
"\u0092",
|
||||
"\u0093",
|
||||
"\u0094",
|
||||
"\u0095",
|
||||
"\u0096",
|
||||
"\u0097",
|
||||
"\u0098",
|
||||
"\u0099",
|
||||
"\u009A",
|
||||
"\u009B",
|
||||
"\u009C",
|
||||
"\u009D",
|
||||
"\u009E",
|
||||
"\u009F",
|
||||
"\u20AC",
|
||||
"\u03D2",
|
||||
"\u2032",
|
||||
"\u2264",
|
||||
"\u2044",
|
||||
"\u221E",
|
||||
"\u0192",
|
||||
"\u2663",
|
||||
"\u2666",
|
||||
"\u2665",
|
||||
"\u2660",
|
||||
"\u2194",
|
||||
"\u2190",
|
||||
"\u2191",
|
||||
"\u2192",
|
||||
"\u2193",
|
||||
"\u00B0",
|
||||
"\u00B1",
|
||||
"\u2033",
|
||||
"\u2265",
|
||||
"\u00D7",
|
||||
"\u221D",
|
||||
"\u2202",
|
||||
"\u2022",
|
||||
"\u00F7",
|
||||
"\u2260",
|
||||
"\u2261",
|
||||
"\u2248",
|
||||
"\u2026",
|
||||
"\uF8E6",
|
||||
"\uF8E7",
|
||||
"\u21B5",
|
||||
"\u2135",
|
||||
"\u2111",
|
||||
"\u211C",
|
||||
"\u2118",
|
||||
"\u2297",
|
||||
"\u2295",
|
||||
"\u2205",
|
||||
"\u2229",
|
||||
"\u222A",
|
||||
"\u2283",
|
||||
"\u2287",
|
||||
"\u2284",
|
||||
"\u2282",
|
||||
"\u2286",
|
||||
"\u2208",
|
||||
"\u2209",
|
||||
"\u2220",
|
||||
"\u2207",
|
||||
"\uF6DA",
|
||||
"\uF6D9",
|
||||
"\uF6DB",
|
||||
"\u220F",
|
||||
"\u221A",
|
||||
"\u22C5",
|
||||
"\u00AC",
|
||||
"\u2227",
|
||||
"\u2228",
|
||||
"\u21D4",
|
||||
"\u21D0",
|
||||
"\u21D1",
|
||||
"\u21D2",
|
||||
"\u21D3",
|
||||
"\u25CA",
|
||||
"\u2329",
|
||||
"\uF8E8",
|
||||
"\uF8E9",
|
||||
"\uF8EA",
|
||||
"\u2211",
|
||||
"\uF8EB",
|
||||
"\uF8EC",
|
||||
"\uF8ED",
|
||||
"\uF8EE",
|
||||
"\uF8EF",
|
||||
"\uF8F0",
|
||||
"\uF8F1",
|
||||
"\uF8F2",
|
||||
"\uF8F3",
|
||||
"\uF8F4",
|
||||
"\u00F0",
|
||||
"\u232A",
|
||||
"\u222B",
|
||||
"\u2320",
|
||||
"\uF8F5",
|
||||
"\u2321",
|
||||
"\uF8F6",
|
||||
"\uF8F7",
|
||||
"\uF8F8",
|
||||
"\uF8F9",
|
||||
"\uF8FA",
|
||||
"\uF8FB",
|
||||
"\uF8FC",
|
||||
"\uF8FD",
|
||||
"\uF8FE",
|
||||
"\u00FF",
|
||||
]
|
||||
assert len(_symbol_encoding) == 256
|
||||
261
venv/lib/python3.12/site-packages/pypdf/_codecs/zapfding.py
Normal file
261
venv/lib/python3.12/site-packages/pypdf/_codecs/zapfding.py
Normal file
@@ -0,0 +1,261 @@
|
||||
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt
|
||||
|
||||
_zapfding_encoding = [
|
||||
"\u0000",
|
||||
"\u0001",
|
||||
"\u0002",
|
||||
"\u0003",
|
||||
"\u0004",
|
||||
"\u0005",
|
||||
"\u0006",
|
||||
"\u0007",
|
||||
"\u0008",
|
||||
"\u0009",
|
||||
"\u000A",
|
||||
"\u000B",
|
||||
"\u000C",
|
||||
"\u000D",
|
||||
"\u000E",
|
||||
"\u000F",
|
||||
"\u0010",
|
||||
"\u0011",
|
||||
"\u0012",
|
||||
"\u0013",
|
||||
"\u0014",
|
||||
"\u0015",
|
||||
"\u0016",
|
||||
"\u0017",
|
||||
"\u0018",
|
||||
"\u0019",
|
||||
"\u001A",
|
||||
"\u001B",
|
||||
"\u001C",
|
||||
"\u001D",
|
||||
"\u001E",
|
||||
"\u001F",
|
||||
"\u0020",
|
||||
"\u2701",
|
||||
"\u2702",
|
||||
"\u2703",
|
||||
"\u2704",
|
||||
"\u260E",
|
||||
"\u2706",
|
||||
"\u2707",
|
||||
"\u2708",
|
||||
"\u2709",
|
||||
"\u261B",
|
||||
"\u261E",
|
||||
"\u270C",
|
||||
"\u270D",
|
||||
"\u270E",
|
||||
"\u270F",
|
||||
"\u2710",
|
||||
"\u2711",
|
||||
"\u2712",
|
||||
"\u2713",
|
||||
"\u2714",
|
||||
"\u2715",
|
||||
"\u2716",
|
||||
"\u2717",
|
||||
"\u2718",
|
||||
"\u2719",
|
||||
"\u271A",
|
||||
"\u271B",
|
||||
"\u271C",
|
||||
"\u271D",
|
||||
"\u271E",
|
||||
"\u271F",
|
||||
"\u2720",
|
||||
"\u2721",
|
||||
"\u2722",
|
||||
"\u2723",
|
||||
"\u2724",
|
||||
"\u2725",
|
||||
"\u2726",
|
||||
"\u2727",
|
||||
"\u2605",
|
||||
"\u2729",
|
||||
"\u272A",
|
||||
"\u272B",
|
||||
"\u272C",
|
||||
"\u272D",
|
||||
"\u272E",
|
||||
"\u272F",
|
||||
"\u2730",
|
||||
"\u2731",
|
||||
"\u2732",
|
||||
"\u2733",
|
||||
"\u2734",
|
||||
"\u2735",
|
||||
"\u2736",
|
||||
"\u2737",
|
||||
"\u2738",
|
||||
"\u2739",
|
||||
"\u273A",
|
||||
"\u273B",
|
||||
"\u273C",
|
||||
"\u273D",
|
||||
"\u273E",
|
||||
"\u273F",
|
||||
"\u2740",
|
||||
"\u2741",
|
||||
"\u2742",
|
||||
"\u2743",
|
||||
"\u2744",
|
||||
"\u2745",
|
||||
"\u2746",
|
||||
"\u2747",
|
||||
"\u2748",
|
||||
"\u2749",
|
||||
"\u274A",
|
||||
"\u274B",
|
||||
"\u25CF",
|
||||
"\u274D",
|
||||
"\u25A0",
|
||||
"\u274F",
|
||||
"\u2750",
|
||||
"\u2751",
|
||||
"\u2752",
|
||||
"\u25B2",
|
||||
"\u25BC",
|
||||
"\u25C6",
|
||||
"\u2756",
|
||||
"\u25D7",
|
||||
"\u2758",
|
||||
"\u2759",
|
||||
"\u275A",
|
||||
"\u275B",
|
||||
"\u275C",
|
||||
"\u275D",
|
||||
"\u275E",
|
||||
"\u007F",
|
||||
"\uF8D7",
|
||||
"\uF8D8",
|
||||
"\uF8D9",
|
||||
"\uF8DA",
|
||||
"\uF8DB",
|
||||
"\uF8DC",
|
||||
"\uF8DD",
|
||||
"\uF8DE",
|
||||
"\uF8DF",
|
||||
"\uF8E0",
|
||||
"\uF8E1",
|
||||
"\uF8E2",
|
||||
"\uF8E3",
|
||||
"\uF8E4",
|
||||
"\u008E",
|
||||
"\u008F",
|
||||
"\u0090",
|
||||
"\u0091",
|
||||
"\u0092",
|
||||
"\u0093",
|
||||
"\u0094",
|
||||
"\u0095",
|
||||
"\u0096",
|
||||
"\u0097",
|
||||
"\u0098",
|
||||
"\u0099",
|
||||
"\u009A",
|
||||
"\u009B",
|
||||
"\u009C",
|
||||
"\u009D",
|
||||
"\u009E",
|
||||
"\u009F",
|
||||
"\u00A0",
|
||||
"\u2761",
|
||||
"\u2762",
|
||||
"\u2763",
|
||||
"\u2764",
|
||||
"\u2765",
|
||||
"\u2766",
|
||||
"\u2767",
|
||||
"\u2663",
|
||||
"\u2666",
|
||||
"\u2665",
|
||||
"\u2660",
|
||||
"\u2460",
|
||||
"\u2461",
|
||||
"\u2462",
|
||||
"\u2463",
|
||||
"\u2464",
|
||||
"\u2465",
|
||||
"\u2466",
|
||||
"\u2467",
|
||||
"\u2468",
|
||||
"\u2469",
|
||||
"\u2776",
|
||||
"\u2777",
|
||||
"\u2778",
|
||||
"\u2779",
|
||||
"\u277A",
|
||||
"\u277B",
|
||||
"\u277C",
|
||||
"\u277D",
|
||||
"\u277E",
|
||||
"\u277F",
|
||||
"\u2780",
|
||||
"\u2781",
|
||||
"\u2782",
|
||||
"\u2783",
|
||||
"\u2784",
|
||||
"\u2785",
|
||||
"\u2786",
|
||||
"\u2787",
|
||||
"\u2788",
|
||||
"\u2789",
|
||||
"\u278A",
|
||||
"\u278B",
|
||||
"\u278C",
|
||||
"\u278D",
|
||||
"\u278E",
|
||||
"\u278F",
|
||||
"\u2790",
|
||||
"\u2791",
|
||||
"\u2792",
|
||||
"\u2793",
|
||||
"\u2794",
|
||||
"\u2192",
|
||||
"\u2194",
|
||||
"\u2195",
|
||||
"\u2798",
|
||||
"\u2799",
|
||||
"\u279A",
|
||||
"\u279B",
|
||||
"\u279C",
|
||||
"\u279D",
|
||||
"\u279E",
|
||||
"\u279F",
|
||||
"\u27A0",
|
||||
"\u27A1",
|
||||
"\u27A2",
|
||||
"\u27A3",
|
||||
"\u27A4",
|
||||
"\u27A5",
|
||||
"\u27A6",
|
||||
"\u27A7",
|
||||
"\u27A8",
|
||||
"\u27A9",
|
||||
"\u27AA",
|
||||
"\u27AB",
|
||||
"\u27AC",
|
||||
"\u27AD",
|
||||
"\u27AE",
|
||||
"\u27AF",
|
||||
"\u00F0",
|
||||
"\u27B1",
|
||||
"\u27B2",
|
||||
"\u27B3",
|
||||
"\u27B4",
|
||||
"\u27B5",
|
||||
"\u27B6",
|
||||
"\u27B7",
|
||||
"\u27B8",
|
||||
"\u27B9",
|
||||
"\u27BA",
|
||||
"\u27BB",
|
||||
"\u27BC",
|
||||
"\u27BD",
|
||||
"\u27BE",
|
||||
"\u00FF",
|
||||
]
|
||||
assert len(_zapfding_encoding) == 256
|
||||
@@ -0,0 +1,86 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase, CryptIdentity
|
||||
|
||||
try:
|
||||
from pypdf._crypt_providers._cryptography import (
|
||||
CryptAES,
|
||||
CryptRC4,
|
||||
aes_cbc_decrypt,
|
||||
aes_cbc_encrypt,
|
||||
aes_ecb_decrypt,
|
||||
aes_ecb_encrypt,
|
||||
crypt_provider,
|
||||
rc4_decrypt,
|
||||
rc4_encrypt,
|
||||
)
|
||||
from pypdf._utils import Version
|
||||
|
||||
if Version(crypt_provider[1]) <= Version("3.0"):
|
||||
# This is due to the backend parameter being required back then:
|
||||
# https://cryptography.io/en/latest/changelog/#v3-1
|
||||
raise ImportError("cryptography<=3.0 is not supported") # pragma: no cover
|
||||
except ImportError:
|
||||
try:
|
||||
from pypdf._crypt_providers._pycryptodome import ( # type: ignore
|
||||
CryptAES,
|
||||
CryptRC4,
|
||||
aes_cbc_decrypt,
|
||||
aes_cbc_encrypt,
|
||||
aes_ecb_decrypt,
|
||||
aes_ecb_encrypt,
|
||||
crypt_provider,
|
||||
rc4_decrypt,
|
||||
rc4_encrypt,
|
||||
)
|
||||
except ImportError:
|
||||
from pypdf._crypt_providers._fallback import ( # type: ignore
|
||||
CryptAES,
|
||||
CryptRC4,
|
||||
aes_cbc_decrypt,
|
||||
aes_cbc_encrypt,
|
||||
aes_ecb_decrypt,
|
||||
aes_ecb_encrypt,
|
||||
crypt_provider,
|
||||
rc4_decrypt,
|
||||
rc4_encrypt,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CryptAES",
|
||||
"CryptBase",
|
||||
"CryptIdentity",
|
||||
"CryptRC4",
|
||||
"aes_cbc_decrypt",
|
||||
"aes_cbc_encrypt",
|
||||
"aes_ecb_decrypt",
|
||||
"aes_ecb_encrypt",
|
||||
"crypt_provider",
|
||||
"rc4_decrypt",
|
||||
"rc4_encrypt",
|
||||
]
|
||||
@@ -0,0 +1,38 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
class CryptBase:
|
||||
def encrypt(self, data: bytes) -> bytes: # pragma: no cover
|
||||
return data
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes: # pragma: no cover
|
||||
return data
|
||||
|
||||
|
||||
class CryptIdentity(CryptBase):
|
||||
pass
|
||||
@@ -0,0 +1,118 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import secrets
|
||||
|
||||
from cryptography import __version__
|
||||
from cryptography.hazmat.primitives import padding
|
||||
from cryptography.hazmat.primitives.ciphers.algorithms import AES
|
||||
|
||||
try:
|
||||
# 43.0.0 - https://cryptography.io/en/latest/changelog/#v43-0-0
|
||||
from cryptography.hazmat.decrepit.ciphers.algorithms import ARC4
|
||||
except ImportError:
|
||||
from cryptography.hazmat.primitives.ciphers.algorithms import ARC4
|
||||
from cryptography.hazmat.primitives.ciphers.base import Cipher
|
||||
from cryptography.hazmat.primitives.ciphers.modes import CBC, ECB
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase
|
||||
|
||||
crypt_provider = ("cryptography", __version__)
|
||||
|
||||
|
||||
class CryptRC4(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.cipher = Cipher(ARC4(key), mode=None)
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
encryptor = self.cipher.encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
decryptor = self.cipher.decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
|
||||
|
||||
class CryptAES(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.alg = AES(key)
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
iv = secrets.token_bytes(16)
|
||||
pad = padding.PKCS7(128).padder()
|
||||
data = pad.update(data) + pad.finalize()
|
||||
|
||||
cipher = Cipher(self.alg, CBC(iv))
|
||||
encryptor = cipher.encryptor()
|
||||
return iv + encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
iv = data[:16]
|
||||
data = data[16:]
|
||||
# for empty encrypted data
|
||||
if not data:
|
||||
return data
|
||||
|
||||
# just for robustness, it does not happen under normal circumstances
|
||||
if len(data) % 16 != 0:
|
||||
pad = padding.PKCS7(128).padder()
|
||||
data = pad.update(data) + pad.finalize()
|
||||
|
||||
cipher = Cipher(self.alg, CBC(iv))
|
||||
decryptor = cipher.decryptor()
|
||||
d = decryptor.update(data) + decryptor.finalize()
|
||||
return d[: -d[-1]]
|
||||
|
||||
|
||||
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
encryptor = Cipher(ARC4(key), mode=None).encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
|
||||
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
decryptor = Cipher(ARC4(key), mode=None).decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
|
||||
|
||||
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
encryptor = Cipher(AES(key), mode=ECB()).encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
|
||||
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
decryptor = Cipher(AES(key), mode=ECB()).decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
|
||||
|
||||
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
encryptor = Cipher(AES(key), mode=CBC(iv)).encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
|
||||
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
decryptor = Cipher(AES(key), mode=CBC(iv)).decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
@@ -0,0 +1,93 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase
|
||||
from pypdf.errors import DependencyError
|
||||
|
||||
_DEPENDENCY_ERROR_STR = "cryptography>=3.1 is required for AES algorithm"
|
||||
|
||||
|
||||
crypt_provider = ("local_crypt_fallback", "0.0.0")
|
||||
|
||||
|
||||
class CryptRC4(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.s = bytearray(range(256))
|
||||
j = 0
|
||||
for i in range(256):
|
||||
j = (j + self.s[i] + key[i % len(key)]) % 256
|
||||
self.s[i], self.s[j] = self.s[j], self.s[i]
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
s = bytearray(self.s)
|
||||
out = [0 for _ in range(len(data))]
|
||||
i, j = 0, 0
|
||||
for k in range(len(data)):
|
||||
i = (i + 1) % 256
|
||||
j = (j + s[i]) % 256
|
||||
s[i], s[j] = s[j], s[i]
|
||||
x = s[(s[i] + s[j]) % 256]
|
||||
out[k] = data[k] ^ x
|
||||
return bytes(out)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
return self.encrypt(data)
|
||||
|
||||
|
||||
class CryptAES(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
pass
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
return CryptRC4(key).encrypt(data)
|
||||
|
||||
|
||||
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
return CryptRC4(key).decrypt(data)
|
||||
|
||||
|
||||
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
@@ -0,0 +1,97 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import secrets
|
||||
|
||||
from Crypto import __version__
|
||||
from Crypto.Cipher import AES, ARC4
|
||||
from Crypto.Util.Padding import pad
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase
|
||||
|
||||
crypt_provider = ("pycryptodome", __version__)
|
||||
|
||||
|
||||
class CryptRC4(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.key = key
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(self.key).encrypt(data)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(self.key).decrypt(data)
|
||||
|
||||
|
||||
class CryptAES(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.key = key
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
iv = secrets.token_bytes(16)
|
||||
data = pad(data, 16)
|
||||
aes = AES.new(self.key, AES.MODE_CBC, iv)
|
||||
return iv + aes.encrypt(data)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
iv = data[:16]
|
||||
data = data[16:]
|
||||
# for empty encrypted data
|
||||
if not data:
|
||||
return data
|
||||
|
||||
# just for robustness, it does not happen under normal circumstances
|
||||
if len(data) % 16 != 0:
|
||||
data = pad(data, 16)
|
||||
|
||||
aes = AES.new(self.key, AES.MODE_CBC, iv)
|
||||
d = aes.decrypt(data)
|
||||
return d[: -d[-1]]
|
||||
|
||||
|
||||
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(key).encrypt(data)
|
||||
|
||||
|
||||
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(key).decrypt(data)
|
||||
|
||||
|
||||
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_ECB).encrypt(data)
|
||||
|
||||
|
||||
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_ECB).decrypt(data)
|
||||
|
||||
|
||||
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_CBC, iv).encrypt(data)
|
||||
|
||||
|
||||
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_CBC, iv).decrypt(data)
|
||||
1461
venv/lib/python3.12/site-packages/pypdf/_doc_common.py
Normal file
1461
venv/lib/python3.12/site-packages/pypdf/_doc_common.py
Normal file
File diff suppressed because it is too large
Load Diff
1178
venv/lib/python3.12/site-packages/pypdf/_encryption.py
Normal file
1178
venv/lib/python3.12/site-packages/pypdf/_encryption.py
Normal file
File diff suppressed because it is too large
Load Diff
327
venv/lib/python3.12/site-packages/pypdf/_font.py
Normal file
327
venv/lib/python3.12/site-packages/pypdf/_font.py
Normal file
@@ -0,0 +1,327 @@
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional, Union, cast
|
||||
|
||||
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
|
||||
|
||||
from ._cmap import get_encoding
|
||||
from ._codecs.adobe_glyphs import adobe_glyphs
|
||||
from ._utils import logger_warning
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FontDescriptor:
|
||||
"""
|
||||
Represents the FontDescriptor dictionary as defined in the PDF specification.
|
||||
This contains both descriptive and metric information.
|
||||
|
||||
The defaults are derived from the mean values of the 14 core fonts, rounded
|
||||
to 100.
|
||||
"""
|
||||
|
||||
name: str = "Unknown"
|
||||
family: str = "Unknown"
|
||||
weight: str = "Unknown"
|
||||
|
||||
ascent: float = 700.0
|
||||
descent: float = -200.0
|
||||
cap_height: float = 600.0
|
||||
x_height: float = 500.0
|
||||
italic_angle: float = 0.0 # Non-italic
|
||||
flags: int = 32 # Non-serif, non-symbolic, not fixed width
|
||||
bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
|
||||
|
||||
character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
|
||||
|
||||
@staticmethod
|
||||
def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
|
||||
font_descriptor_dict: DictionaryObject = (
|
||||
font_descriptor_obj.get_object()
|
||||
if isinstance(font_descriptor_obj, IndirectObject)
|
||||
else font_descriptor_obj
|
||||
)
|
||||
for source_key, target_key in [
|
||||
("/FontName", "name"),
|
||||
("/FontFamily", "family"),
|
||||
("/FontWeight", "weight"),
|
||||
("/Ascent", "ascent"),
|
||||
("/Descent", "descent"),
|
||||
("/CapHeight", "cap_height"),
|
||||
("/XHeight", "x_height"),
|
||||
("/ItalicAngle", "italic_angle"),
|
||||
("/Flags", "flags"),
|
||||
("/FontBBox", "bbox")
|
||||
]:
|
||||
if source_key in font_descriptor_dict:
|
||||
font_kwargs[target_key] = font_descriptor_dict[source_key]
|
||||
# Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
|
||||
if "bbox" in font_kwargs:
|
||||
bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
|
||||
assert len(bbox_tuple) == 4, bbox_tuple
|
||||
font_kwargs["bbox"] = bbox_tuple
|
||||
return font_kwargs
|
||||
|
||||
@staticmethod
|
||||
def _collect_tt_t1_character_widths(
|
||||
pdf_font_dict: DictionaryObject,
|
||||
char_map: dict[Any, Any],
|
||||
encoding: Union[str, dict[int, str]],
|
||||
current_widths: dict[str, int]
|
||||
) -> None:
|
||||
"""Parses a TrueType or Type1 font's /Widths array from a font dictionary and updates character widths"""
|
||||
widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
|
||||
first_char = pdf_font_dict.get("/FirstChar", 0)
|
||||
if not isinstance(encoding, str):
|
||||
# This means that encoding is a dict
|
||||
current_widths.update({
|
||||
encoding.get(idx + first_char, chr(idx + first_char)): width
|
||||
for idx, width in enumerate(widths_array)
|
||||
})
|
||||
return
|
||||
|
||||
# We map the character code directly to the character
|
||||
# using the string encoding
|
||||
for idx, width in enumerate(widths_array):
|
||||
# Often "idx == 0" will denote the .notdef character, but we add it anyway
|
||||
char_code = idx + first_char # This is a raw code
|
||||
# Get the "raw" character or byte representation
|
||||
raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
|
||||
# Translate raw_char to the REAL Unicode character using the char_map
|
||||
unicode_char = char_map.get(raw_char)
|
||||
if unicode_char:
|
||||
current_widths[unicode_char] = int(width)
|
||||
else:
|
||||
current_widths[raw_char] = int(width)
|
||||
|
||||
@staticmethod
|
||||
def _collect_cid_character_widths(
|
||||
d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
|
||||
) -> None:
|
||||
"""Parses the /W array from a DescendantFont dictionary and updates character widths."""
|
||||
ord_map = {
|
||||
ord(_target): _surrogate
|
||||
for _target, _surrogate in char_map.items()
|
||||
if isinstance(_target, str)
|
||||
}
|
||||
# /W width definitions have two valid formats which can be mixed and matched:
|
||||
# (1) A character start index followed by a list of widths, e.g.
|
||||
# `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
|
||||
# (2) A character start index, a character stop index, and a width, e.g.
|
||||
# `45 65 500` applies width 500 to characters 45-65.
|
||||
skip_count = 0
|
||||
_w = d_font.get("/W", [])
|
||||
for idx, w_entry in enumerate(_w):
|
||||
w_entry = w_entry.get_object()
|
||||
if skip_count:
|
||||
skip_count -= 1
|
||||
continue
|
||||
if not isinstance(w_entry, (int, float)):
|
||||
# We should never get here due to skip_count above. But
|
||||
# sometimes we do.
|
||||
logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
|
||||
continue
|
||||
# check for format (1): `int [int int int int ...]`
|
||||
w_next_entry = _w[idx + 1].get_object()
|
||||
if isinstance(w_next_entry, Sequence):
|
||||
start_idx, width_list = w_entry, w_next_entry
|
||||
current_widths.update(
|
||||
{
|
||||
ord_map[_cidx]: _width
|
||||
for _cidx, _width in zip(
|
||||
range(
|
||||
cast(int, start_idx),
|
||||
cast(int, start_idx) + len(width_list),
|
||||
1,
|
||||
),
|
||||
width_list,
|
||||
)
|
||||
if _cidx in ord_map
|
||||
}
|
||||
)
|
||||
skip_count = 1
|
||||
# check for format (2): `int int int`
|
||||
elif isinstance(w_next_entry, (int, float)) and isinstance(
|
||||
_w[idx + 2].get_object(), (int, float)
|
||||
):
|
||||
start_idx, stop_idx, const_width = (
|
||||
w_entry,
|
||||
w_next_entry,
|
||||
_w[idx + 2].get_object(),
|
||||
)
|
||||
current_widths.update(
|
||||
{
|
||||
ord_map[_cidx]: const_width
|
||||
for _cidx in range(
|
||||
cast(int, start_idx), cast(int, stop_idx + 1), 1
|
||||
)
|
||||
if _cidx in ord_map
|
||||
}
|
||||
)
|
||||
skip_count = 2
|
||||
else:
|
||||
# This handles the case of out of bounds (reaching the end of the width definitions
|
||||
# while expecting more elements).
|
||||
logger_warning(
|
||||
f"Invalid font width definition. Last element: {w_entry}.",
|
||||
__name__
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _add_default_width(current_widths: dict[str, int]) -> None:
|
||||
if not current_widths:
|
||||
current_widths["default"] = 500
|
||||
return
|
||||
|
||||
if "default" in current_widths:
|
||||
return
|
||||
|
||||
if " " in current_widths and current_widths[" "] != 0:
|
||||
# Setting default to twice the space width
|
||||
current_widths["default"] = int(2 * current_widths[" "])
|
||||
return
|
||||
|
||||
# Use the average width of existing glyph widths
|
||||
valid_widths = [w for w in current_widths.values() if w > 0]
|
||||
current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500
|
||||
|
||||
@classmethod
|
||||
def from_font_resource(
|
||||
cls,
|
||||
pdf_font_dict: DictionaryObject,
|
||||
encoding: Optional[Union[str, dict[int, str]]] = None,
|
||||
char_map: Optional[dict[Any, Any]] = None
|
||||
) -> "FontDescriptor":
|
||||
from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS # noqa: PLC0415
|
||||
# Prioritize information from the PDF font dictionary
|
||||
font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
|
||||
font_kwargs: dict[str, Any] = {"character_widths": {}}
|
||||
|
||||
# Deal with fonts by type; Type1, TrueType and certain Type3
|
||||
if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
|
||||
if "/Widths" in pdf_font_dict:
|
||||
if not (encoding and char_map):
|
||||
encoding, char_map = get_encoding(pdf_font_dict)
|
||||
cls._collect_tt_t1_character_widths(
|
||||
pdf_font_dict, char_map, encoding, font_kwargs["character_widths"]
|
||||
)
|
||||
elif font_name in CORE_FONT_METRICS:
|
||||
font_descriptor = CORE_FONT_METRICS[font_name]
|
||||
cls._add_default_width(font_descriptor.character_widths)
|
||||
|
||||
return font_descriptor
|
||||
|
||||
if "/FontDescriptor" in pdf_font_dict: # TODO: This does not account for some Type3 fonts;
|
||||
# see tests/test_cmap.py::test_ascii_charset
|
||||
font_descriptor_resource = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
|
||||
font_descriptor_obj = cast(DictionaryObject, font_descriptor_resource)
|
||||
if "/MissingWidth" in font_descriptor_obj:
|
||||
font_kwargs["character_widths"]["default"] = font_descriptor_obj["/MissingWidth"].get_object()
|
||||
font_kwargs = cls._parse_font_descriptor(
|
||||
font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())
|
||||
)
|
||||
if "default" not in font_kwargs["character_widths"]:
|
||||
cls._add_default_width(font_kwargs["character_widths"])
|
||||
|
||||
return cls(**font_kwargs)
|
||||
|
||||
# Composite font or CID font - CID fonts have a /W array mapping character codes
|
||||
# to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
|
||||
# because all other fonts have already been dealt with.
|
||||
if not (encoding and char_map):
|
||||
encoding, char_map = get_encoding(pdf_font_dict)
|
||||
d_font: DictionaryObject
|
||||
for d_font_idx, d_font in enumerate(
|
||||
cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
|
||||
):
|
||||
d_font = cast(DictionaryObject, d_font.get_object())
|
||||
cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
|
||||
cls._collect_cid_character_widths(
|
||||
d_font, char_map, font_kwargs["character_widths"]
|
||||
)
|
||||
if "/DW" in d_font:
|
||||
font_kwargs["character_widths"]["default"] = d_font["/DW"].get_object()
|
||||
else:
|
||||
cls._add_default_width(font_kwargs["character_widths"])
|
||||
font_kwargs = cls._parse_font_descriptor(
|
||||
font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
|
||||
)
|
||||
|
||||
return cls(**font_kwargs)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Font:
|
||||
"""
|
||||
A font object for use during text extraction and for producing
|
||||
text appearance streams.
|
||||
|
||||
Attributes:
|
||||
name: Font name, derived from font["/BaseFont"]
|
||||
character_map: The font's character map
|
||||
encoding: Font encoding
|
||||
sub_type: The font type, such as Type1, TrueType, or Type3.
|
||||
font_descriptor: Font metrics, including a mapping of characters to widths
|
||||
character_widths: A mapping of characters to widths
|
||||
space_width: The width of a space, or an approximation
|
||||
interpretable: Default True. If False, the font glyphs cannot
|
||||
be translated to characters, e.g. Type3 fonts that do not define
|
||||
a '/ToUnicode' mapping.
|
||||
|
||||
"""
|
||||
|
||||
name: str
|
||||
encoding: Union[str, dict[int, str]]
|
||||
character_map: dict[Any, Any] = field(default_factory=dict)
|
||||
sub_type: str = "Unknown"
|
||||
font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
|
||||
character_widths: dict[str, int] = field(default_factory=dict)
|
||||
space_width: Union[float, int] = 250
|
||||
interpretable: bool = True
|
||||
|
||||
@classmethod
|
||||
def from_font_resource(
|
||||
cls,
|
||||
pdf_font_dict: DictionaryObject,
|
||||
) -> "Font":
|
||||
# Can collect base_font, name and encoding directly from font resource
|
||||
name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
|
||||
sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
|
||||
encoding, character_map = get_encoding(pdf_font_dict)
|
||||
|
||||
# Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
|
||||
# reliably converted into character codes unless all named chars
|
||||
# in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
|
||||
# PDF 1.7 standard.
|
||||
interpretable = True
|
||||
if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
|
||||
interpretable = all(
|
||||
cname in adobe_glyphs
|
||||
for cname in pdf_font_dict.get("/CharProcs") or []
|
||||
)
|
||||
|
||||
if interpretable:
|
||||
font_descriptor = FontDescriptor.from_font_resource(pdf_font_dict, encoding, character_map)
|
||||
else:
|
||||
font_descriptor = FontDescriptor() # Save some overhead if font is not interpretable
|
||||
character_widths = font_descriptor.character_widths
|
||||
|
||||
space_width = font_descriptor.character_widths.get(" ")
|
||||
if not space_width or space_width == 0:
|
||||
space_width = font_descriptor.character_widths["default"] // 2
|
||||
|
||||
return cls(
|
||||
name=name,
|
||||
sub_type=sub_type,
|
||||
encoding=encoding,
|
||||
font_descriptor=font_descriptor,
|
||||
character_map=character_map,
|
||||
character_widths=character_widths,
|
||||
space_width=space_width,
|
||||
interpretable=interpretable
|
||||
)
|
||||
|
||||
def text_width(self, text: str = "") -> float:
|
||||
"""Sum of character widths specified in PDF font for the supplied text."""
|
||||
return sum(
|
||||
[self.character_widths.get(char, self.character_widths["default"]) for char in text], 0.0
|
||||
)
|
||||
2353
venv/lib/python3.12/site-packages/pypdf/_page.py
Normal file
2353
venv/lib/python3.12/site-packages/pypdf/_page.py
Normal file
File diff suppressed because it is too large
Load Diff
289
venv/lib/python3.12/site-packages/pypdf/_page_labels.py
Normal file
289
venv/lib/python3.12/site-packages/pypdf/_page_labels.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""
|
||||
Page labels are shown by PDF viewers as "the page number".
|
||||
|
||||
A page has a numeric index, starting at 0. Additionally, the page
|
||||
has a label. In the most simple case:
|
||||
|
||||
label = index + 1
|
||||
|
||||
However, the title page and the table of contents might have Roman numerals as
|
||||
page labels. This makes things more complicated.
|
||||
|
||||
Example 1
|
||||
---------
|
||||
|
||||
>>> reader.root_object["/PageLabels"]["/Nums"]
|
||||
[0, IndirectObject(18, 0, 139929798197504),
|
||||
8, IndirectObject(19, 0, 139929798197504)]
|
||||
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
|
||||
{'/S': '/r'}
|
||||
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
|
||||
{'/S': '/D'}
|
||||
|
||||
Example 2
|
||||
---------
|
||||
The following is a document with pages labeled
|
||||
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
|
||||
|
||||
1 0 obj
|
||||
<< /Type /Catalog
|
||||
/PageLabels << /Nums [
|
||||
0 << /S /r >>
|
||||
4 << /S /D >>
|
||||
7 << /S /D
|
||||
/P ( A- )
|
||||
/St 8
|
||||
>>
|
||||
% A number tree containing
|
||||
% three page label dictionaries
|
||||
]
|
||||
>>
|
||||
...
|
||||
>>
|
||||
endobj
|
||||
|
||||
|
||||
§12.4.2 PDF Specification 1.7 and 2.0
|
||||
=====================================
|
||||
|
||||
Entries in a page label dictionary
|
||||
----------------------------------
|
||||
The /S key:
|
||||
D Decimal Arabic numerals
|
||||
R Uppercase Roman numerals
|
||||
r Lowercase Roman numerals
|
||||
A Uppercase letters (A to Z for the first 26 pages,
|
||||
AA to ZZ for the next 26, and so on)
|
||||
a Lowercase letters (a to z for the first 26 pages,
|
||||
aa to zz for the next 26, and so on)
|
||||
"""
|
||||
|
||||
from collections.abc import Iterator
|
||||
from typing import Optional, cast
|
||||
|
||||
from ._protocols import PdfCommonDocProtocol
|
||||
from ._utils import logger_warning
|
||||
from .generic import (
|
||||
ArrayObject,
|
||||
DictionaryObject,
|
||||
NullObject,
|
||||
NumberObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
|
||||
def number2uppercase_roman_numeral(num: int) -> str:
|
||||
roman = [
|
||||
(1000, "M"),
|
||||
(900, "CM"),
|
||||
(500, "D"),
|
||||
(400, "CD"),
|
||||
(100, "C"),
|
||||
(90, "XC"),
|
||||
(50, "L"),
|
||||
(40, "XL"),
|
||||
(10, "X"),
|
||||
(9, "IX"),
|
||||
(5, "V"),
|
||||
(4, "IV"),
|
||||
(1, "I"),
|
||||
]
|
||||
|
||||
def roman_num(num: int) -> Iterator[str]:
|
||||
for decimal, roman_repr in roman:
|
||||
x, _ = divmod(num, decimal)
|
||||
yield roman_repr * x
|
||||
num -= decimal * x
|
||||
if num <= 0:
|
||||
break
|
||||
|
||||
return "".join(list(roman_num(num)))
|
||||
|
||||
|
||||
def number2lowercase_roman_numeral(number: int) -> str:
|
||||
return number2uppercase_roman_numeral(number).lower()
|
||||
|
||||
|
||||
def number2uppercase_letter(number: int) -> str:
|
||||
if number <= 0:
|
||||
raise ValueError("Expecting a positive number")
|
||||
alphabet = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
|
||||
rep = ""
|
||||
while number > 0:
|
||||
remainder = number % 26
|
||||
if remainder == 0:
|
||||
remainder = 26
|
||||
rep = alphabet[remainder - 1] + rep
|
||||
# update
|
||||
number -= remainder
|
||||
number = number // 26
|
||||
return rep
|
||||
|
||||
|
||||
def number2lowercase_letter(number: int) -> str:
|
||||
return number2uppercase_letter(number).lower()
|
||||
|
||||
|
||||
def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
|
||||
# [Nums] shall be an array of the form
|
||||
# [ key_1 value_1 key_2 value_2 ... key_n value_n ]
|
||||
# where each key_i is an integer and the corresponding
|
||||
# value_i shall be the object associated with that key.
|
||||
# The keys shall be sorted in numerical order,
|
||||
# analogously to the arrangement of keys in a name tree
|
||||
# as described in 7.9.6, "Name Trees."
|
||||
nums = cast(ArrayObject, dictionary_object["/Nums"])
|
||||
i = 0
|
||||
value = None
|
||||
start_index = 0
|
||||
while i < len(nums):
|
||||
start_index = nums[i]
|
||||
value = nums[i + 1].get_object()
|
||||
if i + 2 == len(nums):
|
||||
break
|
||||
if nums[i + 2] > index:
|
||||
break
|
||||
i += 2
|
||||
m = {
|
||||
None: lambda _: "",
|
||||
"/D": lambda n: str(n),
|
||||
"/R": number2uppercase_roman_numeral,
|
||||
"/r": number2lowercase_roman_numeral,
|
||||
"/A": number2uppercase_letter,
|
||||
"/a": number2lowercase_letter,
|
||||
}
|
||||
# if /Nums array is not following the specification or if /Nums is empty
|
||||
if not isinstance(value, dict):
|
||||
return str(index + 1) # Fallback
|
||||
start = value.get("/St", 1)
|
||||
prefix = value.get("/P", "")
|
||||
return prefix + m[value.get("/S")](index - start_index + start)
|
||||
|
||||
|
||||
def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
|
||||
"""
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
reader: The PdfReader
|
||||
index: The index of the page
|
||||
|
||||
Returns:
|
||||
The label of the page, e.g. "iv" or "4".
|
||||
|
||||
"""
|
||||
root = cast(DictionaryObject, reader.root_object)
|
||||
if "/PageLabels" not in root:
|
||||
return str(index + 1) # Fallback
|
||||
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
|
||||
if "/Nums" in number_tree:
|
||||
return get_label_from_nums(number_tree, index)
|
||||
if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
|
||||
# number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
|
||||
# Limit maximum depth.
|
||||
level = 0
|
||||
while level < 100:
|
||||
kids = cast(list[DictionaryObject], number_tree["/Kids"])
|
||||
for kid in kids:
|
||||
# kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
|
||||
limits = cast(list[int], kid["/Limits"])
|
||||
if limits[0] <= index <= limits[1]:
|
||||
if not is_null_or_none(kid.get("/Kids", None)):
|
||||
# Recursive definition.
|
||||
level += 1
|
||||
if level == 100: # pragma: no cover
|
||||
raise NotImplementedError(
|
||||
"Too deep nesting is not supported."
|
||||
)
|
||||
number_tree = kid
|
||||
# Exit the inner `for` loop and continue at the next level with the
|
||||
# next iteration of the `while` loop.
|
||||
break
|
||||
return get_label_from_nums(kid, index)
|
||||
else:
|
||||
# When there are no kids, make sure to exit the `while` loop directly
|
||||
# and continue with the fallback.
|
||||
break
|
||||
|
||||
logger_warning(f"Could not reliably determine page label for {index}.", __name__)
|
||||
return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree
|
||||
|
||||
|
||||
def nums_insert(
|
||||
key: NumberObject,
|
||||
value: DictionaryObject,
|
||||
nums: ArrayObject,
|
||||
) -> None:
|
||||
"""
|
||||
Insert a key, value pair in a Nums array.
|
||||
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
key: number key of the entry
|
||||
value: value of the entry
|
||||
nums: Nums array to modify
|
||||
|
||||
"""
|
||||
if len(nums) % 2 != 0:
|
||||
raise ValueError("A nums like array must have an even number of elements")
|
||||
|
||||
i = len(nums)
|
||||
while i != 0 and key <= nums[i - 2]:
|
||||
i = i - 2
|
||||
|
||||
if i < len(nums) and key == nums[i]:
|
||||
nums[i + 1] = value
|
||||
else:
|
||||
nums.insert(i, key)
|
||||
nums.insert(i + 1, value)
|
||||
|
||||
|
||||
def nums_clear_range(
|
||||
key: NumberObject,
|
||||
page_index_to: int,
|
||||
nums: ArrayObject,
|
||||
) -> None:
|
||||
"""
|
||||
Remove all entries in a number tree in a range after an entry.
|
||||
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
key: number key of the entry before the range
|
||||
page_index_to: The page index of the upper limit of the range
|
||||
nums: Nums array to modify
|
||||
|
||||
"""
|
||||
if len(nums) % 2 != 0:
|
||||
raise ValueError("A nums like array must have an even number of elements")
|
||||
if page_index_to < key:
|
||||
raise ValueError("page_index_to must be greater or equal than key")
|
||||
|
||||
i = nums.index(key) + 2
|
||||
while i < len(nums) and nums[i] <= page_index_to:
|
||||
nums.pop(i)
|
||||
nums.pop(i)
|
||||
|
||||
|
||||
def nums_next(
|
||||
key: NumberObject,
|
||||
nums: ArrayObject,
|
||||
) -> tuple[Optional[NumberObject], Optional[DictionaryObject]]:
|
||||
"""
|
||||
Return the (key, value) pair of the entry after the given one.
|
||||
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
key: number key of the entry
|
||||
nums: Nums array
|
||||
|
||||
"""
|
||||
if len(nums) % 2 != 0:
|
||||
raise ValueError("A nums like array must have an even number of elements")
|
||||
|
||||
i = nums.index(key) + 2
|
||||
if i < len(nums):
|
||||
return (nums[i], nums[i + 1])
|
||||
return (None, None)
|
||||
86
venv/lib/python3.12/site-packages/pypdf/_protocols.py
Normal file
86
venv/lib/python3.12/site-packages/pypdf/_protocols.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Helpers for working with PDF types."""
|
||||
|
||||
from abc import abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, Optional, Protocol, Union
|
||||
|
||||
from ._utils import StrByteType, StreamType
|
||||
|
||||
|
||||
class PdfObjectProtocol(Protocol):
|
||||
indirect_reference: Any
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: Any,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Union[tuple[str, ...], list[str], None] = (),
|
||||
) -> Any:
|
||||
... # pragma: no cover
|
||||
|
||||
def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any:
|
||||
... # pragma: no cover
|
||||
|
||||
def get_object(self) -> Optional["PdfObjectProtocol"]:
|
||||
... # pragma: no cover
|
||||
|
||||
def hash_value(self) -> bytes:
|
||||
... # pragma: no cover
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
... # pragma: no cover
|
||||
|
||||
|
||||
class XmpInformationProtocol(PdfObjectProtocol):
|
||||
pass
|
||||
|
||||
|
||||
class PdfCommonDocProtocol(Protocol):
|
||||
@property
|
||||
def pdf_header(self) -> str:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
def pages(self) -> list[Any]:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
def root_object(self) -> PdfObjectProtocol:
|
||||
... # pragma: no cover
|
||||
|
||||
def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
def strict(self) -> bool:
|
||||
... # pragma: no cover
|
||||
|
||||
|
||||
class PdfReaderProtocol(PdfCommonDocProtocol, Protocol):
|
||||
@property
|
||||
@abstractmethod
|
||||
def xref(self) -> dict[int, dict[int, Any]]:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def trailer(self) -> dict[str, Any]:
|
||||
... # pragma: no cover
|
||||
|
||||
|
||||
class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
|
||||
_objects: list[Any]
|
||||
_id_translated: dict[int, dict[int, int]]
|
||||
|
||||
incremental: bool
|
||||
_reader: Any # PdfReader
|
||||
|
||||
@abstractmethod
|
||||
def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
|
||||
... # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
def _add_object(self, obj: Any) -> Any:
|
||||
... # pragma: no cover
|
||||
1352
venv/lib/python3.12/site-packages/pypdf/_reader.py
Normal file
1352
venv/lib/python3.12/site-packages/pypdf/_reader.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,245 @@
|
||||
"""
|
||||
Code related to text extraction.
|
||||
|
||||
Some parts are still in _page.py. In doubt, they will stay there.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
from .._font import Font
|
||||
from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
|
||||
|
||||
CUSTOM_RTL_MIN: int = -1
|
||||
CUSTOM_RTL_MAX: int = -1
|
||||
CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
|
||||
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
|
||||
|
||||
|
||||
class OrientationNotFoundError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def set_custom_rtl(
|
||||
_min: Union[str, int, None] = None,
|
||||
_max: Union[str, int, None] = None,
|
||||
specials: Union[str, list[int], None] = None,
|
||||
) -> tuple[int, int, list[int]]:
|
||||
"""
|
||||
Change the Right-To-Left and special characters custom parameters.
|
||||
|
||||
Args:
|
||||
_min: The new minimum value for the range of custom characters that
|
||||
will be written right to left.
|
||||
If set to ``None``, the value will not be changed.
|
||||
If set to an integer or string, it will be converted to its ASCII code.
|
||||
The default value is -1, which sets no additional range to be converted.
|
||||
_max: The new maximum value for the range of custom characters that will
|
||||
be written right to left.
|
||||
If set to ``None``, the value will not be changed.
|
||||
If set to an integer or string, it will be converted to its ASCII code.
|
||||
The default value is -1, which sets no additional range to be converted.
|
||||
specials: The new list of special characters to be inserted in the
|
||||
current insertion order.
|
||||
If set to ``None``, the current value will not be changed.
|
||||
If set to a string, it will be converted to a list of ASCII codes.
|
||||
The default value is an empty list.
|
||||
|
||||
Returns:
|
||||
A tuple containing the new values for ``CUSTOM_RTL_MIN``,
|
||||
``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.
|
||||
|
||||
"""
|
||||
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
|
||||
if isinstance(_min, int):
|
||||
CUSTOM_RTL_MIN = _min
|
||||
elif isinstance(_min, str):
|
||||
CUSTOM_RTL_MIN = ord(_min)
|
||||
if isinstance(_max, int):
|
||||
CUSTOM_RTL_MAX = _max
|
||||
elif isinstance(_max, str):
|
||||
CUSTOM_RTL_MAX = ord(_max)
|
||||
if isinstance(specials, str):
|
||||
CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
|
||||
elif isinstance(specials, list):
|
||||
CUSTOM_RTL_SPECIAL_CHARS = specials
|
||||
return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
|
||||
|
||||
|
||||
def mult(m: list[float], n: list[float]) -> list[float]:
|
||||
return [
|
||||
m[0] * n[0] + m[1] * n[2],
|
||||
m[0] * n[1] + m[1] * n[3],
|
||||
m[2] * n[0] + m[3] * n[2],
|
||||
m[2] * n[1] + m[3] * n[3],
|
||||
m[4] * n[0] + m[5] * n[2] + n[4],
|
||||
m[4] * n[1] + m[5] * n[3] + n[5],
|
||||
]
|
||||
|
||||
|
||||
def orient(m: list[float]) -> int:
|
||||
if m[3] > 1e-6:
|
||||
return 0
|
||||
if m[3] < -1e-6:
|
||||
return 180
|
||||
if m[1] > 0:
|
||||
return 90
|
||||
return 270
|
||||
|
||||
|
||||
def crlf_space_check(
|
||||
text: str,
|
||||
cmtm_prev: tuple[list[float], list[float]],
|
||||
cmtm_matrix: tuple[list[float], list[float]],
|
||||
memo_cmtm: tuple[list[float], list[float]],
|
||||
font_resource: Optional[DictionaryObject],
|
||||
orientations: tuple[int, ...],
|
||||
output: str,
|
||||
font_size: float,
|
||||
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
|
||||
str_widths: float,
|
||||
spacewidth: float,
|
||||
str_height: float,
|
||||
) -> tuple[str, str, list[float], list[float]]:
|
||||
cm_prev = cmtm_prev[0]
|
||||
tm_prev = cmtm_prev[1]
|
||||
cm_matrix = cmtm_matrix[0]
|
||||
tm_matrix = cmtm_matrix[1]
|
||||
memo_cm = memo_cmtm[0]
|
||||
memo_tm = memo_cmtm[1]
|
||||
|
||||
m_prev = mult(tm_prev, cm_prev)
|
||||
m = mult(tm_matrix, cm_matrix)
|
||||
orientation = orient(m)
|
||||
delta_x = m[4] - m_prev[4]
|
||||
delta_y = m[5] - m_prev[5]
|
||||
# Table 108 of the 1.7 reference ("Text positioning operators")
|
||||
scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
|
||||
scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
|
||||
scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)
|
||||
cm_prev = m
|
||||
|
||||
if orientation not in orientations:
|
||||
raise OrientationNotFoundError
|
||||
if orientation in (0, 180):
|
||||
moved_height: float = delta_y
|
||||
moved_width: float = delta_x
|
||||
elif orientation in (90, 270):
|
||||
moved_height = delta_x
|
||||
moved_width = delta_y
|
||||
try:
|
||||
if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
|
||||
if (output + text)[-1] != "\n":
|
||||
output += text + "\n"
|
||||
if visitor_text is not None:
|
||||
visitor_text(
|
||||
text + "\n",
|
||||
memo_cm,
|
||||
memo_tm,
|
||||
font_resource,
|
||||
font_size,
|
||||
)
|
||||
text = ""
|
||||
elif (
|
||||
(moved_width >= (spacewidth + str_widths) * scale_prev_x)
|
||||
and (output + text)[-1] != " "
|
||||
):
|
||||
text += " "
|
||||
except Exception:
|
||||
pass
|
||||
tm_prev = tm_matrix.copy()
|
||||
cm_prev = cm_matrix.copy()
|
||||
return text, output, cm_prev, tm_prev
|
||||
|
||||
|
||||
def get_text_operands(
|
||||
operands: list[Union[str, TextStringObject]],
|
||||
cm_matrix: list[float],
|
||||
tm_matrix: list[float],
|
||||
font: Font,
|
||||
orientations: tuple[int, ...]
|
||||
) -> tuple[str, bool]:
|
||||
t: str = ""
|
||||
is_str_operands = False
|
||||
m = mult(tm_matrix, cm_matrix)
|
||||
orientation = orient(m)
|
||||
if orientation in orientations and len(operands) > 0:
|
||||
if isinstance(operands[0], str):
|
||||
t = operands[0]
|
||||
is_str_operands = True
|
||||
else:
|
||||
t = ""
|
||||
tt: bytes = (
|
||||
encode_pdfdocencoding(operands[0])
|
||||
if isinstance(operands[0], str)
|
||||
else operands[0]
|
||||
)
|
||||
if isinstance(font.encoding, str):
|
||||
try:
|
||||
t = tt.decode(font.encoding, "surrogatepass") # apply str encoding
|
||||
except Exception:
|
||||
# the data does not match the expectation,
|
||||
# we use the alternative ;
|
||||
# text extraction may not be good
|
||||
t = tt.decode(
|
||||
"utf-16-be" if font.encoding == "charmap" else "charmap",
|
||||
"surrogatepass",
|
||||
) # apply str encoding
|
||||
else: # apply dict encoding
|
||||
t = "".join(
|
||||
[font.encoding[x] if x in font.encoding else bytes((x,)).decode() for x in tt]
|
||||
)
|
||||
return (t, is_str_operands)
|
||||
|
||||
|
||||
def get_display_str(
    text: str,
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> tuple[str, bool, float]:
    """
    Accumulate decoded characters into ``text``, handling mixed
    left-to-right / right-to-left runs, and sum the rendered glyph widths.

    Whenever the writing direction flips, the accumulated run is flushed to
    ``visitor_text`` (if provided) and a new run is started.

    Args:
        text: text already accumulated for the current run.
        cm_matrix: current transformation matrix (forwarded to visitor_text).
        tm_matrix: current text matrix (forwarded to visitor_text).
        font_resource: font resource dictionary (forwarded to visitor_text).
        font: active font; supplies character_map and glyph width data.
        text_operands: decoded characters from get_text_operands().
        font_size: current font size (forwarded to visitor_text).
        rtl_dir: True when the current run is right-to-left.
        visitor_text: optional callback invoked on each direction change.

    Returns:
        Tuple of (updated text, updated rtl_dir, accumulated width).

    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    widths: float = 0.0
    for x in [font.character_map.get(x, x) for x in text_operands]:
        # x can be a sequence of bytes ; ex: habibi.pdf
        if len(x) == 1:
            xx = ord(x)
        else:
            # multi-character mapping: use a neutral code point so the
            # current insertion order is preserved
            xx = 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)  # punctuations but...
            or 0x3A <= xx <= 0x40  # numbers (x30-39)
            or 0x2000 <= xx <= 0x206F  # upper punctuations..
            or 0x20A0 <= xx <= 0x21FF  # but (numbers) indices/exponents
            or xx in CUSTOM_RTL_SPECIAL_CHARS  # customized....
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                # direction change LTR -> RTL: flush the finished LTR run
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                # direction change RTL -> LTR: flush the finished RTL run
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                text = ""
            text = text + x
        widths += font.space_width if x == " " else font.text_width(x)
    # fmt: on
    return text, rtl_dir, widths
|
||||
@@ -0,0 +1,16 @@
|
||||
"""Layout mode text extraction extension for pypdf"""
|
||||
from ..._font import Font
|
||||
from ._fixed_width_page import (
|
||||
fixed_char_width,
|
||||
fixed_width_page,
|
||||
text_show_operations,
|
||||
y_coordinate_groups,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Font",
|
||||
"fixed_char_width",
|
||||
"fixed_width_page",
|
||||
"text_show_operations",
|
||||
"y_coordinate_groups",
|
||||
]
|
||||
@@ -0,0 +1,400 @@
|
||||
"""Extract PDF text preserving the layout of the source PDF"""
|
||||
|
||||
from collections.abc import Iterator
|
||||
from itertools import groupby
|
||||
from math import ceil
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, Optional, TypedDict
|
||||
|
||||
from ..._font import Font
|
||||
from ..._utils import logger_warning
|
||||
from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
|
||||
from ._text_state_manager import TextStateManager
|
||||
from ._text_state_params import TextStateParams
|
||||
|
||||
|
||||
class BTGroup(TypedDict):
    """
    Dict describing a line of text rendered within a BT/ET operator pair.
    If multiple text show operations render text on the same line, the text
    will be combined into a single BTGroup dict.

    Keys:
        tx: x coordinate of first character in BTGroup
        ty: y coordinate of first character in BTGroup
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of last character in BTGroup
        flip_sort: -1 if page is upside down, else 1
    """

    tx: float
    ty: float
    font_size: float
    font_height: float
    text: str
    displaced_tx: float
    # Multiplier applied to ty when sorting/grouping so flipped (upside
    # down) pages sort in the correct reading order.
    flip_sort: Literal[-1, 1]
|
||||
|
||||
|
||||
def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    Build a BTGroup dict from a text state snapshot and its rendered text.

    Args:
        tj_op (TextStateParams): state captured for the text show operation
        rendered_text (str): text rendered by the operation
        dispaced_tx (float): x coordinate of the last character in the group

    Returns:
        BTGroup: dict describing one rendered line of text

    """
    # An inverted y axis sorts in the opposite direction during grouping.
    group: BTGroup = {
        "tx": tj_op.tx,
        "ty": tj_op.ty,
        "font_size": tj_op.font_size,
        "font_height": tj_op.font_height,
        "text": rendered_text,
        "displaced_tx": dispaced_tx,
        "flip_sort": -1 if tj_op.flip_vertical else 1,
    }
    return group
|
||||
|
||||
|
||||
def recurs_to_target_op(
    ops: Iterator[tuple[list[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
) -> tuple[list[BTGroup], list[TextStateParams]]:
    """
    Recurse operators between BT/ET and/or q/Q operators managing the transform
    stack and capturing text positioning and rendering data.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
        strip_rotated: drop text rotated w.r.t. the page. Defaults to True.

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.

    """
    # 1 entry per line of text rendered within each BT/ET operation.
    bt_groups: list[BTGroup] = []

    # 1 entry per text show operator (Tj/TJ/'/")
    tj_ops: list[TextStateParams] = []

    if end_target == b"Q":
        # add new q level. cm's added at this level will be popped at next b'Q'
        text_state_mgr.add_q()

    for operands, op in ops:
        # The loop is broken by the end target, or exits normally when there are no more ops.
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                if not tj_ops:
                    return bt_groups, tj_ops
                _text = ""
                bt_idx = 0  # idx of first tj in this bt group
                last_displaced_tx = tj_ops[bt_idx].displaced_tx
                last_ty = tj_ops[bt_idx].ty
                for _idx, _tj in enumerate(
                    tj_ops
                ):  # ... build text from new Tj operators
                    if strip_rotated and _tj.rotated:
                        continue
                    if not _tj.font.interpretable:  # generates warning
                        continue
                    # if the y position of the text is greater than the font height, assume
                    # the text is on a new line and start a new group
                    if abs(_tj.ty - last_ty) > _tj.font_height:
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        _text = ""

                    # if the x position of the text is less than the last x position by
                    # more than 5 spaces widths, assume the text order should be flipped
                    # and start a new group
                    if (
                        last_displaced_tx - _tj.tx
                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
                    ):
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        last_displaced_tx = _tj.displaced_tx
                        _text = ""

                    # calculate excess x translation based on ending tx of previous Tj.
                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
                    # applied to the first tj of a BTGroup in fixed_width_page().
                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
                    new_text = f'{" " * spaces}{_tj.txt}'

                    last_ty = _tj.ty
                    _text = f"{_text}{new_text}"
                    last_displaced_tx = _tj.displaced_tx
                # flush the trailing group accumulated after the last split
                if _text:
                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
                text_state_mgr.reset_tm()
            break
        if op == b"q":
            # nested graphics state: recurse until the matching b"Q"
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"BT":
            # nested text object: recurse until the matching b"ET"
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"ET", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            # TJ interleaves byte strings with numeric kerning offsets;
            # offsets adjust the rendering transform between strings.
            _tj = text_state_mgr.text_state_params()
            for tj_op in operands[0]:
                if isinstance(tj_op, bytes):
                    _tj = text_state_mgr.text_state_params(tj_op)
                    tj_ops.append(_tj)
                else:
                    text_state_mgr.add_trm(_tj.displacement_matrix(td_offset=tj_op))
        elif op == b"'":
            # ' = move to next line (by leading TL), then show text
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            # " = set word/char spacing, move to next line, then show text
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            text_state_mgr.reset_trm()
            if op == b"Tm":
                text_state_mgr.reset_tm()
            elif op == b"TD":
                # TD also sets the leading to -ty before translating
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
    else:
        # for/else: the iterator was exhausted without hitting end_target
        logger_warning(
            f"Unbalanced target operations, expected {end_target!r}.",
            __name__,
        )
    return bt_groups, tj_ops
|
||||
|
||||
|
||||
def y_coordinate_groups(
    bt_groups: list[BTGroup], debug_path: Optional[Path] = None
) -> dict[int, list[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Args:
        bt_groups: list of dicts as returned by text_show_operations()
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
        keyed by y coordinate

    """
    # Guard the empty case (page with no extractable text): without it,
    # next(iter(ty_groups)) below raises StopIteration.
    if not bt_groups:
        return {}
    # groupby() only merges *consecutive* equal keys; bt_groups arrives
    # pre-sorted by (ty, tx) from text_show_operations(), so this is safe.
    ty_groups = {
        ty: sorted(grp, key=lambda x: x["tx"])
        for ty, grp in groupby(
            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
        )
    }
    # combine groups whose y coordinates differ by less than the effective font height
    # (accounts for mixed fonts and other minor oddities)
    last_ty = next(iter(ty_groups))
    last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
    for ty in list(ty_groups)[1:]:
        fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
        txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
        # prevent merge if both groups are rendering in the same x position.
        no_text_overlap = not (txs & last_txs)
        offset_less_than_font_height = abs(ty - last_ty) < fsz
        if no_text_overlap and offset_less_than_font_height:
            ty_groups[last_ty] = sorted(
                ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
            )
            last_txs |= txs
        else:
            last_ty = ty
            last_txs = txs
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
|
||||
|
||||
|
||||
def text_show_operations(
    ops: Iterator[tuple[list[Any], bytes]],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> list[BTGroup]:
    """
    Extract text from BT/ET operator pairs.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator,
        sorted top-to-bottom then left-to-right and left-aligned at tx=0.

    """
    state_mgr = TextStateManager()  # transformation stack manager
    bt_groups: list[BTGroup] = []  # BT operator dict
    tj_ops: list[TextStateParams] = []  # Tj/TJ operator data
    for operands, op in ops:
        if op in (b"BT", b"q"):
            # recursion consumes ops up to (and including) the matching ET/Q
            bts, tjs = recurs_to_target_op(
                ops, state_mgr, b"ET" if op == b"BT" else b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tf":
            state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
            state_mgr.set_state_param(op, operands)

    # warn once per page about conditions that degrade or truncate output
    if any(tj.rotated for tj in tj_ops):
        if strip_rotated:
            logger_warning(
                "Rotated text discovered. Output will be incomplete.", __name__
            )
        else:
            logger_warning(
                "Rotated text discovered. Layout will be degraded.", __name__
            )
    if not all(tj.font.interpretable for tj in tj_ops):
        logger_warning(
            "PDF contains an uninterpretable font. Output will be incomplete.", __name__
        )

    # left align the data, i.e. decrement all tx values by min(tx)
    # (dict(ogrp, ...) produces shallow copies, leaving the originals intact)
    min_x = min((x["tx"] for x in bt_groups), default=0.0)
    bt_groups = [
        dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x)  # type: ignore[misc]
        for ogrp in sorted(
            bt_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True
        )
    ]

    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bts.json").write_text(
            json.dumps(bt_groups, indent=2, default=str), "utf-8"
        )
        debug_path.joinpath("tjs.json").write_text(
            json.dumps(
                tj_ops, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )
    return bt_groups
|
||||
|
||||
|
||||
def fixed_char_width(bt_groups: list[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate average character width weighted by the length of the rendered
    text in each sample for conversion to fixed-width layout.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator
        scale_weight (float): multiplier applied to each sample's character
            count when weighting. Defaults to 1.25.

    Returns:
        float: fixed character width

    """
    weighted_width_sum = 0.0
    weight_sum = 0.0
    for grp in bt_groups:
        # weight each sample by its (scaled) character count
        weight = len(grp["text"]) * scale_weight
        per_char_width = (grp["displaced_tx"] - grp["tx"]) / weight
        weighted_width_sum += per_char_width * weight
        weight_sum += weight
    return weighted_width_sum / weight_sum
|
||||
|
||||
|
||||
def fixed_width_page(
    ty_groups: dict[int, list[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
    """
    Generate page text from text operations grouped by rendered y coordinate.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width
        space_vertically: include blank lines inferred from y distance + font height.
        font_height_weight: multiplier for font height when calculating blank lines.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
        layout in the source pdf.

    """
    lines: list[str] = []
    last_y_coord = 0
    # translation table mapping control characters 14-31 to spaces
    table = str.maketrans(dict.fromkeys(range(14, 32), " "))
    for y_coord, line_data in ty_groups.items():
        if space_vertically and lines:
            fh = line_data[0]["font_height"]
            # number of blank lines implied by the vertical gap; a negative
            # or zero count makes the extend() below a no-op
            blank_lines = 0 if fh == 0 else (
                int(abs(y_coord - last_y_coord) / (fh * font_height_weight)) - 1
            )
            lines.extend([""] * blank_lines)

        line_parts = []  # It uses a list to construct the line, avoiding string concatenation.
        current_len = 0  # Track the size with int instead of len(str) overhead.
        last_disp = 0.0
        for bt_op in line_data:
            tx = bt_op["tx"]
            # column this fragment should start at, in fixed-width units
            offset = int(tx // char_width)
            needed_spaces = offset - current_len
            # pad only when the previous fragment's end does not already
            # reach this fragment's start position
            if needed_spaces > 0 and ceil(last_disp) < int(tx):
                padding = " " * needed_spaces
                line_parts.append(padding)
                current_len += needed_spaces

            raw_text = bt_op["text"]
            text = raw_text.translate(table)
            line_parts.append(text)
            current_len += len(text)
            last_disp = bt_op["displaced_tx"]

        full_line = "".join(line_parts).rstrip()
        # blank leading lines are suppressed; interior blanks only kept
        # when vertical spacing is requested
        if full_line.strip() or (space_vertically and lines):
            lines.append(full_line)

        last_y_coord = y_coord

    return "\n".join(lines)
|
||||
@@ -0,0 +1,221 @@
|
||||
"""manage the PDF transform stack during "layout" mode text extraction"""
|
||||
|
||||
from collections import ChainMap, Counter
|
||||
from collections import ChainMap as ChainMapType
|
||||
from collections import Counter as CounterType
|
||||
from collections.abc import MutableMapping
|
||||
from typing import Any, Union
|
||||
|
||||
from ..._font import Font
|
||||
from ...errors import PdfReadError
|
||||
from .. import mult
|
||||
from ._text_state_params import TextStateParams
|
||||
|
||||
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
|
||||
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
|
||||
|
||||
|
||||
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices
        q_queue (Counter[int]): Counter of q operators
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (int): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font (Font): font object
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        # transform_stack grows via new_child() for each cm/tm/trm and is
        # unwound by reset_tm()/reset_trm()/remove_q().
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        # (font, font_size) snapshots saved by add_q / restored by remove_q
        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.

        """
        if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]:
            return
        # attribute names intentionally mirror the PDF operator names
        self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    # dict encoding: unmapped bytes fall back to decoding
                    # the single raw byte directly
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                txt = value.decode("utf-8", "replace")
            # map decoded characters through the font's character map
            txt = "".join(
                self.font.character_map.get(x, x) for x in txt
            )
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> dict[int, float]:
        """Only a/b/c/d/e/f matrix params"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            # .parents drops the top map; cm entries are never flagged,
            # so the loop stops at the innermost cm transform
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """Rewind to stack prior state after closing a 'q' with internal 'cm' ops"""
        # restore the font state saved by the matching add_q()
        self.font, self.font_size = self.font_stack.pop(-1)
        self.transform_stack = self.reset_tm()
        # drop as many cm maps as were added at this q depth (0 if none)
        self.transform_stack.maps = self.transform_stack.maps[
            self.q_queue.pop(self.q_depth.pop(), 0) :
        ]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue"""
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # count this cm against the current q depth so remove_q can pop it
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: list[float]) -> list[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> list[float]:
        """Current effective transform accounting for cm, tm, and trm transforms"""
        # fold the stack from innermost (most recent) map outward
        eff_transform = [*self.transform_stack.maps[0].values()]
        for transform in self.transform_stack.maps[1:]:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type] # dict has int keys 0-5
        return eff_transform
|
||||
@@ -0,0 +1,135 @@
|
||||
"""A dataclass that captures the CTM and Text State for a tj operation"""
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Union
|
||||
|
||||
from ..._font import Font
|
||||
from .. import mult, orient
|
||||
|
||||
|
||||
@dataclass
class TextStateParams:
    """
    Text state parameters and operator values for a single text value in a
    TJ or Tj PDF operation.

    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x cood of rendered text, i.e. self.transform[4]
        ty (float): y cood of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.

    """

    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: list[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    # the fields below are all derived in __post_init__, never caller-set
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)

    def __post_init__(self) -> None:
        # 90/270 degree orientation: cancel the rotation components so the
        # derived coordinates below are computed in page space
        if orient(self.transform) in (90, 270):
            self.transform = mult(
                [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0],
                self.transform,
            )
            self.rotated = True
        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
            self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform)
            self.rotated = True
        self.displaced_tx = self.displaced_transform()[4]
        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]
        self.space_tx = round(self.word_tx(" "), 3)
        if self.space_tx < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a td_offset of -1 * font.space_width where font.space_width is
            # the space_width calculated in _font.py.
            self.space_tx = round(self.word_tx("", -self.font.space_width), 3)
        self.font_height = self.font_size * math.sqrt(
            self.transform[1] ** 2 + self.transform[3] ** 2
        )
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis

    def font_size_matrix(self) -> list[float]:
        """Font size matrix"""
        return [
            self.font_size * (self.Tz / 100.0),
            0.0,
            0.0,
            self.font_size,
            0.0,
            self.Ts,
        ]

    def displaced_transform(self) -> list[float]:
        """Effective transform matrix after text has been rendered."""
        return mult(self.displacement_matrix(), self.transform)

    def render_transform(self) -> list[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        return mult(self.font_size_matrix(), self.transform)

    def displacement_matrix(
        self, word: Union[str, None] = None, td_offset: float = 0.0
    ) -> list[float]:
        """
        Text displacement matrix

        Args:
            word (str, optional): Defaults to None in which case self.txt displacement is
                returned.
            td_offset (float, optional): translation applied by TD operator. Defaults to 0.0.

        """
        word = word if word is not None else self.txt
        return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, td_offset), 0.0]

    def word_tx(self, word: str, td_offset: float = 0.0) -> float:
        """Horizontal text displacement for any word according this text state"""
        width: float = 0.0
        for char in word:
            if char == " ":
                width += self.font.space_width
            else:
                width += self.font.text_width(char)
        # glyph widths are in 1/1000 text-space units; apply char spacing,
        # word spacing per space char, then horizontal scaling
        return (
            (self.font_size * ((width - td_offset) / 1000.0))
            + self.Tc
            + word.count(" ") * self.Tw
        ) * (self.Tz / 100.0)

    @staticmethod
    def to_dict(inst: "TextStateParams") -> dict[str, Any]:
        """Dataclass to dict for json.dumps serialization"""
        return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"}
|
||||
@@ -0,0 +1,351 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import math
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
from .._font import Font, FontDescriptor
|
||||
from ..generic import DictionaryObject, TextStringObject
|
||||
from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult
|
||||
|
||||
|
||||
class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.

    State falls into three groups:
    - matrices (``cm_*``, ``tm_*``, ``memo_*``) tracking the current
      transformation and text positions;
    - font state (``font``, ``font_resource``, ``font_size`` and the
      Tz/Tw/TL scaling parameters);
    - accumulated text (``text`` for the fragment being built, ``output``
      for completed output).
    """

    def __init__(self) -> None:
        # Cache of per-font width data keyed by font identifier.
        self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {}

        # Text extraction state variables
        # Current transformation matrix (cm) and text matrix (tm), stored as
        # the six free coefficients [a, b, c, d, e, f] of the affine matrix.
        self.cm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.tm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        # Stack used by q/Q operators: saves the matrix plus the font and
        # spacing state that must be restored on Q.
        self.cm_stack: list[
            tuple[
                list[float],
                Optional[DictionaryObject],
                Font,
                float,
                float,
                float,
                float,
            ]
        ] = []

        # Store the last modified matrices; can be an intermediate position
        self.cm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.tm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

        # Store the position at the beginning of building the text
        self.memo_cm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.memo_tm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

        self.char_scale = 1.0  # horizontal scaling factor (Tz operator)
        self.space_scale = 1.0  # word-spacing factor (Tw operator)
        self._space_width: float = 500.0  # will be set correctly at first Tf
        self._actual_str_size: dict[str, float] = {
            "str_widths": 0.0,
            "str_height": 0.0,
        }  # will be set to string length calculation result
        self.TL = 0.0  # text leading (TL operator), scaled to text space
        self.font_size = 12.0  # init just in case of

        # Text extraction variables
        self.text: str = ""  # fragment accumulated since the last flush
        self.output: str = ""  # completed extracted text
        self.rtl_dir: bool = False  # right-to-left
        self.font_resource: Optional[DictionaryObject] = None
        # Placeholder font until the first Tf operator selects a real one.
        self.font = Font(
            name = "NotInitialized",
            sub_type="Unknown",
            encoding="charmap",
            font_descriptor=FontDescriptor(),
        )
        self.orientations: tuple[int, ...] = (0, 90, 180, 270)
        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
        self.font_resources: dict[str, DictionaryObject] = {}
        self.fonts: dict[str, Font] = {}

        # Dispatch table: content-stream operator bytes -> handler method.
        # Handlers for Td/Tm/T* return the pending string width; the others
        # return None (see process_operation).
        self.operation_handlers = {
            b"BT": self._handle_bt,
            b"ET": self._handle_et,
            b"q": self._handle_save_graphics_state,
            b"Q": self._handle_restore_graphics_state,
            b"cm": self._handle_cm,
            b"Tz": self._handle_tz,
            b"Tw": self._handle_tw,
            b"TL": self._handle_tl,
            b"Tf": self._handle_tf,
            b"Td": self._handle_td,
            b"Tm": self._handle_tm,
            b"T*": self._handle_t_star,
            b"Tj": self._handle_tj_operation,
        }

    def initialize_extraction(
        self,
        orientations: tuple[int, ...] = (0, 90, 180, 270),
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
        font_resources: Optional[dict[str, DictionaryObject]] = None,
        fonts: Optional[dict[str, Font]] = None
    ) -> None:
        """
        Initialize the extractor with extraction parameters.

        Args:
            orientations: Text orientations (in degrees) to extract.
            visitor_text: Optional callback invoked for each text fragment.
            font_resources: Mapping of font resource name to its dictionary.
            fonts: Mapping of font resource name to its Font object.
        """
        self.orientations = orientations
        self.visitor_text = visitor_text
        self.font_resources = font_resources or {}
        self.fonts = fonts or {}

        # Reset state
        self.text = ""
        self.output = ""
        self.rtl_dir = False

    def compute_str_widths(self, str_widths: float) -> float:
        # Widths are accumulated in thousandths of a text-space unit
        # (glyph-space convention), hence the division by 1000.
        return str_widths / 1000

    def process_operation(self, operator: bytes, operands: list[Any]) -> None:
        """Dispatch one content-stream operator; unknown operators are ignored."""
        if operator in self.operation_handlers:
            handler = self.operation_handlers[operator]
            str_widths = handler(operands)

            # Post-process operations that affect text positioning
            if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
                self._post_process_text_operation(str_widths or 0.0)

    def _post_process_text_operation(self, str_widths: float) -> None:
        """Handle common post-processing for text positioning operations."""
        try:
            self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
                self.text,
                (self.cm_prev, self.tm_prev),
                (self.cm_matrix, self.tm_matrix),
                (self.memo_cm, self.memo_tm),
                self.font_resource,
                self.orientations,
                self.output,
                self.font_size,
                self.visitor_text,
                str_widths,
                self.compute_str_widths(self.font_size * self._space_width),
                self._actual_str_size["str_height"],
            )
            if self.text == "":
                # The fragment was flushed: remember the position where the
                # next fragment starts.
                self.memo_cm = self.cm_matrix.copy()
                self.memo_tm = self.tm_matrix.copy()
        except OrientationNotFoundError:
            # Text in an orientation not listed in self.orientations is
            # deliberately dropped.
            pass

    def _handle_tj(
        self,
        text: str,
        operands: list[Union[str, TextStringObject]],
        cm_matrix: list[float],
        tm_matrix: list[float],
        font_resource: Optional[DictionaryObject],
        font: Font,
        orientations: tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        actual_str_size: dict[str, float],
    ) -> tuple[str, bool, dict[str, float]]:
        """
        Decode the operands of a text-showing operator and append the result.

        Returns:
            Updated ``(text, rtl_dir, actual_str_size)``; ``actual_str_size``
            accumulates the shown string's width and records its height.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, font, orientations
        )
        if is_str_operands:
            text += text_operands
            # Sum per-character advances; spaces use the font's space width.
            font_widths = sum([font.space_width if x == " " else font.text_width(x) for x in text_operands])
        else:
            text, rtl_dir, font_widths = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                font_resource,
                font,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        actual_str_size["str_widths"] += font_widths * font_size
        actual_str_size["str_height"] = font_size
        return text, rtl_dir, actual_str_size

    def _flush_text(self) -> None:
        """Flush accumulated text to output and call visitor if present."""
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
        self.text = ""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    # Operation handlers

    def _handle_bt(self, operands: list[Any]) -> None:
        """Handle BT (Begin Text) operation - Table 5.4 page 405."""
        # BT resets the text matrix to identity.
        self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self._flush_text()

    def _handle_et(self, operands: list[Any]) -> None:
        """Handle ET (End Text) operation - Table 5.4 page 405."""
        self._flush_text()

    def _handle_save_graphics_state(self, operands: list[Any]) -> None:
        """Handle q (Save graphics state) operation - Table 4.7 page 219."""
        self.cm_stack.append(
            (
                self.cm_matrix,
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            )
        )

    def _handle_restore_graphics_state(self, operands: list[Any]) -> None:
        """Handle Q (Restore graphics state) operation - Table 4.7 page 219."""
        try:
            (
                self.cm_matrix,
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            ) = self.cm_stack.pop()
        except Exception:
            # NOTE(review): on stack underflow only cm_matrix is reset; the
            # font/spacing state keeps its previous values — confirm intended.
            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    def _handle_cm(self, operands: list[Any]) -> None:
        """Handle cm (Modify current matrix) operation - Table 4.7 page 219."""
        # A matrix change can move the drawing position, so flush first.
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
        self.text = ""
        try:
            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
        except Exception:
            # Malformed operands: fall back to the identity matrix.
            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _handle_tz(self, operands: list[Any]) -> None:
        """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
        # Operand is a percentage; stored as a factor.
        self.char_scale = float(operands[0]) / 100 if operands else 1.0

    def _handle_tw(self, operands: list[Any]) -> None:
        """Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
        self.space_scale = 1.0 + float(operands[0] if operands else 0.0)

    def _handle_tl(self, operands: list[Any]) -> None:
        """Handle TL (Set Text Leading) operation - Table 5.2 page 398."""
        # Scale the leading by the current text-matrix x scale so it is
        # comparable with positions derived from tm_matrix.
        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x

    def _handle_tf(self, operands: list[Any]) -> None:
        """Handle Tf (Set font size) operation - Table 5.2 page 398."""
        if self.text != "":
            # Font change ends the current fragment: flush it.
            self.output += self.text  # .translate(cmap)
            if self.visitor_text is not None:
                self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
            self.text = ""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()
        try:
            self.font_resource = self.font_resources[operands[0]]
            self.font = self.fonts[operands[0]]
        except KeyError:  # font not found
            self.font_resource = None
            font_descriptor = FontDescriptor()
            # Fallback font: every byte maps to U+FFFD (replacement char).
            self.font = Font(
                "Unknown",
                space_width=250,
                encoding=dict.fromkeys(range(256), "\ufffd"),
                font_descriptor=font_descriptor,
                character_map={},
                character_widths=font_descriptor.character_widths
            )

        self._space_width = self.font.space_width / 2  # Actually the width of _half_ a space...
        try:
            self.font_size = float(operands[1])
        except Exception:
            pass  # keep previous size

    def _handle_td(self, operands: list[Any]) -> float:
        """Handle Td (Move text position) operation - Table 5.5 page 406."""
        # A special case is a translating only tm:
        # tm = [1, 0, 0, 1, e, f]
        # i.e. tm[4] += tx, tm[5] += ty.
        tx, ty = float(operands[0]), float(operands[1])
        self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
        self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
        # Hand the width accumulated so far to the caller and reset it.
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    def _handle_tm(self, operands: list[Any]) -> float:
        """Handle Tm (Set text matrix) operation - Table 5.5 page 406."""
        self.tm_matrix = [float(operand) for operand in operands[:6]]
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    def _handle_t_star(self, operands: list[Any]) -> float:
        """Handle T* (Move to next line) operation - Table 5.5 page 406."""
        # Translate by -TL along the text-space y axis.
        self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
        self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    def _handle_tj_operation(self, operands: list[Any]) -> float:
        """Handle Tj (Show text) operation - Table 5.5 page 406."""
        self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
            self.text,
            operands,
            self.cm_matrix,
            self.tm_matrix,
            self.font_resource,
            self.font,
            self.orientations,
            self.font_size,
            self.rtl_dir,
            self.visitor_text,
            self._actual_str_size,
        )
        return 0.0  # str_widths will be handled in post-processing
|
||||
631
venv/lib/python3.12/site-packages/pypdf/_utils.py
Normal file
631
venv/lib/python3.12/site-packages/pypdf/_utils.py
Normal file
@@ -0,0 +1,631 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""Utility functions for PDF library."""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
import functools
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from io import DEFAULT_BUFFER_SIZE
|
||||
from os import SEEK_CUR
|
||||
from re import Pattern
|
||||
from typing import (
|
||||
IO,
|
||||
Any,
|
||||
Optional,
|
||||
Union,
|
||||
overload,
|
||||
)
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
# Python 3.10+: https://www.python.org/dev/peps/pep-0484/
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
from typing import Self
|
||||
else:
|
||||
from typing_extensions import Self
|
||||
|
||||
from .errors import (
|
||||
STREAM_TRUNCATED_PREMATURELY,
|
||||
DeprecationError,
|
||||
PdfStreamError,
|
||||
)
|
||||
|
||||
# A full 3x3 affine transformation matrix as nested tuples of floats.
TransformationMatrixType: TypeAlias = tuple[
    tuple[float, float, float], tuple[float, float, float], tuple[float, float, float]
]
# The six free coefficients (a, b, c, d, e, f) of a transformation matrix;
# the third column is implicitly (0, 0, 1).
CompressedTransformationMatrix: TypeAlias = tuple[
    float, float, float, float, float, float
]

# Any binary stream object (open file, BytesIO, ...).
StreamType = IO[Any]
# Either a string (e.g. a path) or an open stream.
StrByteType = Union[str, StreamType]
||||
|
||||
|
||||
def parse_iso8824_date(text: Optional[str]) -> Optional[datetime]:
    """
    Parse a PDF date string (D:YYYYMMDDHHmmSSOHH'mm') into a datetime.

    Accepts progressively truncated forms (year only, year+month, ...),
    a missing "D:" prefix, and "Z"/"z" as a UTC marker. Naive datetimes are
    returned when no offset is present; a "+0000" offset yields a
    timezone-aware UTC datetime.

    Args:
        text: The raw date string, or None/empty.

    Returns:
        The parsed datetime, or None for empty input.

    Raises:
        ValueError: If no supported format matches.
    """
    original = text
    if not text:
        return None
    # Normalize: ensure the "D:" prefix, expand a bare Z marker to "+0000",
    # and strip the apostrophes separating offset hours and minutes.
    if text[0].isdigit():
        text = "D:" + text
    if text.endswith(("Z", "z")):
        text += "0000"
    text = text.replace("z", "+").replace("Z", "+").replace("'", "")
    offset_pos = max(text.find("+"), text.find("-"))
    if offset_pos > 0 and offset_pos != len(text) - 5:
        # Offset present but minutes missing: pad with "00".
        text += "00"
    candidate_formats = (
        "D:%Y",
        "D:%Y%m",
        "D:%Y%m%d",
        "D:%Y%m%d%H",
        "D:%Y%m%d%H%M",
        "D:%Y%m%d%H%M%S",
        "D:%Y%m%d%H%M%S%z",
    )
    for fmt in candidate_formats:
        try:
            parsed = datetime.strptime(text, fmt)  # noqa: DTZ007
        except ValueError:
            continue
        if text.endswith("+0000"):
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed
    raise ValueError(f"Can not convert date: {original}")
|
||||
|
||||
|
||||
def format_iso8824_date(dt: datetime) -> str:
    """
    Convert a datetime object to PDF date string format.

    Produces D:YYYYMMDDHHmmSS, followed by a +HH'mm' / -HH'mm' offset
    when the datetime is timezone-aware, as specified in the PDF Reference.

    Args:
        dt: A datetime object to convert.

    Returns:
        A date string in PDF format.
    """
    result = dt.strftime("D:%Y%m%d%H%M%S")
    if dt.tzinfo is None:
        return result
    offset = dt.utcoffset()
    assert offset is not None
    total_seconds = int(offset.total_seconds())
    sign = "+" if total_seconds >= 0 else "-"
    hours = abs(total_seconds) // 3600
    minutes = (abs(total_seconds) % 3600) // 60
    return result + f"{sign}{hours:02d}'{minutes:02d}'"
|
||||
|
||||
|
||||
def _get_max_pdf_version_header(header1: str, header2: str) -> str:
|
||||
versions = (
|
||||
"%PDF-1.3",
|
||||
"%PDF-1.4",
|
||||
"%PDF-1.5",
|
||||
"%PDF-1.6",
|
||||
"%PDF-1.7",
|
||||
"%PDF-2.0",
|
||||
)
|
||||
pdf_header_indices = []
|
||||
if header1 in versions:
|
||||
pdf_header_indices.append(versions.index(header1))
|
||||
if header2 in versions:
|
||||
pdf_header_indices.append(versions.index(header2))
|
||||
if len(pdf_header_indices) == 0:
|
||||
raise ValueError(f"Neither {header1!r} nor {header2!r} are proper headers")
|
||||
return versions[max(pdf_header_indices)]
|
||||
|
||||
|
||||
# The whitespace characters recognized by the PDF tokenizer:
# NUL, tab, line feed, form feed, carriage return, and space.
WHITESPACES = (b"\x00", b"\t", b"\n", b"\f", b"\r", b" ")
# Same characters concatenated into one bytes object (for membership tests).
WHITESPACES_AS_BYTES = b"".join(WHITESPACES)
# Regex character class matching any single whitespace byte.
WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"
|
||||
|
||||
|
||||
def read_until_whitespace(stream: "StreamType", maxchars: Optional[int] = None) -> bytes:
    """
    Read non-whitespace characters and return them.

    Stops upon encountering whitespace (the whitespace byte is consumed but
    not returned), at end of stream, or when maxchars bytes were collected.

    Args:
        stream: The data stream from which was read.
        maxchars: The maximum number of bytes returned; by default unlimited.

    Returns:
        The data which was read.

    """
    collected = bytearray()
    while True:
        byte = stream.read(1)
        if not byte or byte.isspace():
            break
        collected += byte
        if len(collected) == maxchars:
            break
    return bytes(collected)
|
||||
|
||||
|
||||
def read_non_whitespace(stream: "StreamType") -> bytes:
    """
    Find and read the next non-whitespace character (ignores whitespace).

    Args:
        stream: The data stream from which was read.

    Returns:
        The first non-whitespace byte, or b"" at end of stream.

    """
    while True:
        byte = stream.read(1)
        # b"" (EOF) is never in WHITESPACES, so EOF terminates the loop too.
        if byte not in WHITESPACES:
            return byte
|
||||
|
||||
|
||||
def skip_over_whitespace(stream: "StreamType") -> bool:
    """
    Similar to read_non_whitespace, but return a boolean if at least one
    whitespace character was read.

    Note that the first non-whitespace byte is consumed as well.

    Args:
        stream: The data stream from which was read.

    Returns:
        True if one or more whitespace was skipped, otherwise return False.

    """
    skipped_any = False
    byte = stream.read(1)
    while byte in WHITESPACES:
        skipped_any = True
        byte = stream.read(1)
    return skipped_any
|
||||
|
||||
|
||||
def check_if_whitespace_only(value: bytes) -> bool:
    """
    Check if the given value consists of whitespace characters only.

    Args:
        value: The bytes to check.

    Returns:
        True if the value only has whitespace characters (this includes the
        empty bytes object), otherwise return False.

    """
    for byte in value:
        if byte not in WHITESPACES_AS_BYTES:
            return False
    return True
|
||||
|
||||
|
||||
def skip_over_comment(stream: "StreamType") -> None:
    """
    Skip a PDF comment if one starts at the current stream position.

    A comment runs from '%' to the end of the line; the terminating CR or LF
    is consumed. If the next byte is not '%', the position is left unchanged.

    Raises:
        PdfStreamError: If the stream ends inside a comment.
    """
    peeked = stream.read(1)
    stream.seek(-1, 1)
    if peeked != b"%":
        return
    byte = peeked
    while byte not in (b"\n", b"\r"):
        byte = stream.read(1)
        if byte == b"":
            raise PdfStreamError("File ended unexpectedly.")
|
||||
|
||||
|
||||
def read_until_regex(stream: "StreamType", regex: Pattern[bytes]) -> bytes:
    """
    Read until the regular expression pattern matched (ignore the match).
    Treats EOF on the underlying stream as the end of the token to be matched.

    The stream is left positioned at the start of the match (or at EOF).

    Args:
        regex: re.Pattern

    Returns:
        The read bytes.

    """
    buffered = b""
    while True:
        chunk = stream.read(16)
        if not chunk:
            # EOF: everything buffered is the token.
            return buffered
        combined = buffered + chunk
        match = regex.search(combined)
        if match is not None:
            # Rewind so the stream sits at the start of the match.
            stream.seek(match.start() - len(combined), 1)
            return combined[: match.start()]
        buffered = combined
|
||||
|
||||
|
||||
def read_block_backwards(stream: "StreamType", to_read: int) -> bytes:
    """
    Given a stream at position X, read a block of size to_read ending at position X.

    This changes the stream's position to the beginning of where the block was
    read.

    Args:
        stream: The stream to read from.
        to_read: Number of bytes to read backwards.

    Returns:
        The data which was read.

    Raises:
        PdfStreamError: If fewer than to_read bytes precede the position.

    """
    if stream.tell() < to_read:
        raise PdfStreamError("Could not read malformed PDF file")
    # Jump back, read forward, then jump back again so the final position
    # is the start of the block just read.
    stream.seek(-to_read, SEEK_CUR)
    block = stream.read(to_read)
    stream.seek(-to_read, SEEK_CUR)
    return block
|
||||
|
||||
|
||||
def read_previous_line(stream: StreamType) -> bytes:
    """
    Given a byte stream with current position X, return the previous line.

    All characters between the first CR/LF byte found before X
    (or, the start of the file, if no such byte is found) and position X
    are returned.

    After this call, the stream will be positioned one byte after the
    first non-CRLF character found beyond the first CR/LF byte before X,
    or, if no such byte is found, at the beginning of the stream.

    Args:
        stream: The stream to read from, positioned at X.

    Returns:
        The data which was read.

    Raises:
        PdfStreamError: If the stream is already at position 0.

    """
    line_content = []
    found_crlf = False
    if stream.tell() == 0:
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
    while True:
        # Scan backwards one buffer-sized block at a time.
        to_read = min(DEFAULT_BUFFER_SIZE, stream.tell())
        if to_read == 0:
            break
        # Read the block. After this, our stream will be one
        # beyond the initial position.
        block = read_block_backwards(stream, to_read)
        idx = len(block) - 1
        if not found_crlf:
            # We haven't found our first CR/LF yet.
            # Read off characters until we hit one.
            while idx >= 0 and block[idx] not in b"\r\n":
                idx -= 1
            if idx >= 0:
                found_crlf = True
        if found_crlf:
            # We found our first CR/LF already (on this block or
            # a previous one).
            # Our combined line is the remainder of the block
            # plus any previously read blocks.
            line_content.append(block[idx + 1 :])
            # Continue to read off any more CRLF characters.
            while idx >= 0 and block[idx] in b"\r\n":
                idx -= 1
        else:
            # Didn't find CR/LF yet - add this block to our
            # previously read blocks and continue.
            line_content.append(block)
        if idx >= 0:
            # We found the next non-CRLF character.
            # Set the stream position correctly, then break
            stream.seek(idx + 1, SEEK_CUR)
            break
    # Join all the blocks in the line (which are in reverse order)
    return b"".join(line_content[::-1])
|
||||
|
||||
|
||||
def matrix_multiply(
    a: "TransformationMatrixType", b: "TransformationMatrixType"
) -> "TransformationMatrixType":
    """
    Multiply two 3x3 matrices, returning a @ b as nested tuples of floats.

    Args:
        a: Left-hand matrix.
        b: Right-hand matrix.

    Returns:
        The matrix product.
    """
    columns_of_b = list(zip(*b))
    rows = []
    for row in a:
        rows.append(
            tuple(
                sum(float(x) * float(y) for x, y in zip(row, col))
                for col in columns_of_b
            )
        )
    return tuple(rows)  # type: ignore[return-value]
|
||||
|
||||
|
||||
def mark_location(stream: "StreamType") -> None:
    """
    Create a text file showing the current stream location in context.

    Writes 5000 bytes before and after the current position to
    "pypdf_pdfLocation.txt" with a "HERE" marker between them, then restores
    the position relative to the bytes read. Mainly for debugging.
    """
    context_radius = 5000
    stream.seek(-context_radius, 1)
    with open("pypdf_pdfLocation.txt", "wb") as output_fh:
        output_fh.write(stream.read(context_radius))
        output_fh.write(b"HERE")
        output_fh.write(stream.read(context_radius))
    stream.seek(-context_radius, 1)
|
||||
|
||||
|
||||
@overload
def ord_(b: str) -> int:
    ...


@overload
def ord_(b: bytes) -> bytes:
    ...


@overload
def ord_(b: int) -> int:
    ...


def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
    """Return the code point of a one-character str; pass ints/bytes through."""
    return ord(b) if isinstance(b, str) else b
|
||||
|
||||
|
||||
def deprecate(msg: str, stacklevel: int = 3) -> None:
    """Emit a DeprecationWarning with the given message.

    Args:
        msg: The warning text.
        stacklevel: How many frames up to attribute the warning to.
    """
    warnings.warn(message=msg, category=DeprecationWarning, stacklevel=stacklevel)
|
||||
|
||||
|
||||
def deprecation(msg: str) -> None:
    """Raise a DeprecationError — for features that were already removed."""
    raise DeprecationError(msg)
|
||||
|
||||
|
||||
def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed, but has a replacement."""
    message = (
        f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead."
    )
    deprecate(message, 4)
|
||||
|
||||
|
||||
def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed, but has a replacement."""
    message = (
        f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
    )
    deprecation(message)
|
||||
|
||||
|
||||
def deprecate_no_replacement(name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed without replacement."""
    message = f"{name} is deprecated and will be removed in pypdf {removed_in}."
    deprecate(message, 4)
|
||||
|
||||
|
||||
def deprecation_no_replacement(name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed without replacement."""
    message = f"{name} is deprecated and was removed in pypdf {removed_in}."
    deprecation(message)
|
||||
|
||||
|
||||
def logger_error(msg: str, src: str) -> None:
    """
    Use this instead of logger.error directly.

    That allows people to overwrite it more easily.

    See the docs on when to use which:
    https://pypdf.readthedocs.io/en/latest/user/suppress-warnings.html
    """
    source_logger = logging.getLogger(src)
    source_logger.error(msg)
|
||||
|
||||
|
||||
def logger_warning(msg: str, src: str) -> None:
    """
    Use this instead of logger.warning directly.

    That allows people to overwrite it more easily.

    ## Exception, warnings.warn, logger_warning
    - Exceptions should be used if the user should write code that deals with
      an error case, e.g. the PDF being completely broken.
    - warnings.warn should be used if the user needs to fix their code, e.g.
      DeprecationWarnings
    - logger_warning should be used if the user needs to know that an issue was
      handled by pypdf, e.g. a non-compliant PDF being read in a way that
      pypdf could apply a robustness fix to still read it. This applies mainly
      to strict=False mode.
    """
    source_logger = logging.getLogger(src)
    source_logger.warning(msg)
|
||||
|
||||
|
||||
def rename_kwargs(
    func_name: str, kwargs: dict[str, Any], aliases: dict[str, str], fail: bool = False
) -> None:
    """
    Helper function to deprecate arguments.

    Translates deprecated keyword-argument names to their new names, mutating
    ``kwargs`` in place and emitting a DeprecationWarning per translation.

    Args:
        func_name: Name of the function to be deprecated
        kwargs: The keyword arguments received by the caller (mutated in place).
        aliases: Mapping of deprecated name -> replacement name.
        fail: If True, raise DeprecationError instead of warning.

    Raises:
        TypeError: If both the deprecated and the new name were supplied.
    """
    for deprecated_name, replacement in aliases.items():
        if deprecated_name not in kwargs:
            continue
        if fail:
            raise DeprecationError(
                f"{deprecated_name} is deprecated as an argument. Use {replacement} instead"
            )
        if replacement in kwargs:
            raise TypeError(
                f"{func_name} received both {deprecated_name} and {replacement} as "
                f"an argument. {deprecated_name} is deprecated. "
                f"Use {replacement} instead."
            )
        kwargs[replacement] = kwargs.pop(deprecated_name)
        warnings.warn(
            message=(
                f"{deprecated_name} is deprecated as an argument. Use {replacement} instead"
            ),
            category=DeprecationWarning,
            stacklevel=3,
        )
|
||||
|
||||
|
||||
def _human_readable_bytes(bytes: int) -> str:
|
||||
if bytes < 10**3:
|
||||
return f"{bytes} Byte"
|
||||
if bytes < 10**6:
|
||||
return f"{bytes / 10**3:.1f} kB"
|
||||
if bytes < 10**9:
|
||||
return f"{bytes / 10**6:.1f} MB"
|
||||
return f"{bytes / 10**9:.1f} GB"
|
||||
|
||||
|
||||
# The following class has been copied from Django:
|
||||
# https://github.com/django/django/blob/adae619426b6f50046b3daaa744db52989c9d6db/django/utils/functional.py#L51-L65
|
||||
# It received some modifications to comply with our own coding standards.
|
||||
#
|
||||
# Original license:
|
||||
#
|
||||
# ---------------------------------------------------------------------------------
|
||||
# Copyright (c) Django Software Foundation and individual contributors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of Django nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# ---------------------------------------------------------------------------------
|
||||
class classproperty:  # noqa: N801
    """
    Decorator that converts a method with a single cls argument into a property
    that can be accessed directly from the class.
    """

    def __init__(self, method=None) -> None:  # type: ignore  # noqa: ANN001
        # The wrapped function; may also be attached later via getter().
        self.fget = method

    def __get__(self, instance, cls=None) -> Any:  # type: ignore  # noqa: ANN001
        # Always invoke with the class, regardless of instance access.
        return self.fget(cls)

    def getter(self, method) -> "Self":  # type: ignore  # noqa: ANN001
        """Set the wrapped function and return self (property-style chaining)."""
        self.fget = method
        return self
|
||||
|
||||
|
||||
@dataclass
class File:
    """A named binary payload extracted from a PDF, with an optional back-reference
    to the object that stores its stream."""

    # Imported in the class body so IndirectObject is available for the
    # annotation below without creating a module-level import cycle.
    from .generic import IndirectObject  # noqa: PLC0415

    name: str = ""
    """
    Filename as identified within the PDF file.
    """
    data: bytes = b""
    """
    Data as bytes.
    """
    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def __str__(self) -> str:
        # Human-readable summary; data is shown as a formatted size, not raw bytes.
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Same as __str__ but with a content hash appended before the closing paren.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
|
||||
|
||||
|
||||
@functools.total_ordering
class Version:
    """
    Compare dotted version strings component-wise, e.g. "1.10" > "1.9".

    Each dot-separated component is split into a leading integer and a
    textual suffix; integers compare numerically, suffixes lexicographically.
    """

    COMPONENT_PATTERN = re.compile(r"^(\d+)(.*)$")

    def __init__(self, version_str: str) -> None:
        self.version_str = version_str
        self.components = self._parse_version(version_str)

    def _parse_version(self, version_str: str) -> list[tuple[int, str]]:
        # Turn every dot-separated part into an (int prefix, str suffix) pair;
        # parts without a numeric prefix become (0, part).
        parsed: list[tuple[int, str]] = []
        for part in version_str.split("."):
            match = Version.COMPONENT_PATTERN.match(part)
            if match is None:
                parsed.append((0, part))
                continue
            prefix = match.group(1)
            if prefix is None:
                prefix = 0
            parsed.append((int(prefix), match.group(2)))
        return parsed

    def __eq__(self, other: object) -> bool:
        return isinstance(other, Version) and self.components == other.components

    def __hash__(self) -> int:
        # Convert to tuple as lists cannot be hashed.
        return hash((self.__class__, tuple(self.components)))

    def __lt__(self, other: Any) -> bool:
        if not isinstance(other, Version):
            raise ValueError(f"Version cannot be compared against {type(other)}")

        for (value_a, suffix_a), (value_b, suffix_b) in zip(
            self.components, other.components
        ):
            # Numeric prefix decides first, then the textual suffix.
            if value_a != value_b:
                return value_a < value_b
            if suffix_a != suffix_b:
                return suffix_a < suffix_b

        # All shared components equal: the shorter version is the smaller one.
        return len(self.components) < len(other.components)
1
venv/lib/python3.12/site-packages/pypdf/_version.py
Normal file
1
venv/lib/python3.12/site-packages/pypdf/_version.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "6.6.2"
|
||||
3307
venv/lib/python3.12/site-packages/pypdf/_writer.py
Normal file
3307
venv/lib/python3.12/site-packages/pypdf/_writer.py
Normal file
File diff suppressed because it is too large
Load Diff
577
venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
Normal file
577
venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
Normal file
@@ -0,0 +1,577 @@
|
||||
"""Functions to convert an image XObject to an image"""
|
||||
|
||||
import sys
|
||||
from io import BytesIO
|
||||
from typing import Any, Literal, Optional, Union, cast
|
||||
|
||||
from ._utils import check_if_whitespace_only, logger_warning
|
||||
from .constants import ColorSpaces, StreamAttributes
|
||||
from .constants import FilterTypes as FT
|
||||
from .constants import ImageAttributes as IA
|
||||
from .errors import EmptyImageDataError, PdfReadError
|
||||
from .generic import (
|
||||
ArrayObject,
|
||||
DecodedStreamObject,
|
||||
EncodedStreamObject,
|
||||
NullObject,
|
||||
TextStringObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
|
||||
try:
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pillow is required to do image extraction. "
|
||||
"It can be installed via 'pip install pypdf[image]'"
|
||||
)
|
||||
|
||||
mode_str_type: TypeAlias = Literal[
|
||||
"", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
|
||||
]
|
||||
|
||||
MAX_IMAGE_MODE_NESTING_DEPTH: int = 10
|
||||
|
||||
|
||||
def _get_image_mode(
    color_space: Union[str, list[Any], Any],
    color_components: int,
    prev_mode: mode_str_type,
    depth: int = 0,
) -> tuple[mode_str_type, bool]:
    """
    Resolve a PDF color space specification to a Pillow image mode.

    Recurses through composite color spaces (/ICCBased, /Indexed,
    /Separation, /DeviceN) up to MAX_IMAGE_MODE_NESTING_DEPTH levels.

    Args:
        color_space: /ColorSpace entry — a name string, an array, or null.
        color_components: Number of color components (may index a fallback mode).
        prev_mode: Mode to fall back to if nothing else matches.
        depth: Current recursion depth (internal).

    Returns:
        Image mode, not taking into account mask (transparency).
        ColorInversion is required (like for some DeviceCMYK).

    Raises:
        PdfReadError: On excessive nesting or an uninterpretable color space.

    """
    if depth > MAX_IMAGE_MODE_NESTING_DEPTH:
        raise PdfReadError(
            "Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH."
        )
    if is_null_or_none(color_space):
        return "", False
    color_space_str: str = ""
    if isinstance(color_space, str):
        color_space_str = color_space
    elif not isinstance(color_space, list):
        raise PdfReadError(
            "Cannot interpret color space", color_space
        )  # pragma: no cover
    elif not color_space:
        return "", False
    elif color_space[0].startswith("/Cal"):  # /CalRGB or /CalGray
        # Calibrated spaces are treated like their device counterparts.
        color_space_str = "/Device" + color_space[0][4:]
    elif color_space[0] == "/ICCBased":
        # The stream dictionary carries /N (component count) and an
        # optional /Alternate device space.
        icc_profile = color_space[1].get_object()
        color_components = cast(int, icc_profile["/N"])
        color_space_str = icc_profile.get("/Alternate", "")
    elif color_space[0] == "/Indexed":
        color_space_str = color_space[1].get_object()
        mode, invert_color = _get_image_mode(
            color_space_str, color_components, prev_mode, depth + 1
        )
        # Indexed RGB/CMYK data is palette data as far as Pillow is concerned.
        if mode in ("RGB", "CMYK"):
            mode = "P"
        return mode, invert_color
    elif color_space[0] == "/Separation":
        color_space_str = color_space[2].get_object()
        mode, invert_color = _get_image_mode(
            color_space_str, color_components, prev_mode, depth + 1
        )
        # Separation channels always require color inversion.
        return mode, True
    elif color_space[0] == "/DeviceN":
        original_color_space = color_space
        color_components = len(color_space[1])
        color_space_str = color_space[2].get_object()
        if color_space_str == "/DeviceCMYK" and color_components == 1:
            # Single-channel DeviceN over CMYK degrades to grayscale.
            if original_color_space[1][0] != "/Black":
                logger_warning(
                    f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
                    __name__,
                )
            return "L", True
        mode, invert_color = _get_image_mode(
            color_space_str, color_components, prev_mode, depth + 1
        )
        return mode, invert_color

    mode_map: dict[str, mode_str_type] = {
        "1bit": "1",  # must be zeroth position: color_components may index the values
        "/DeviceGray": "L",  # must be first position: color_components may index the values
        "palette": "P",  # must be second position: color_components may index the values
        "/DeviceRGB": "RGB",  # must be third position: color_components may index the values
        "/DeviceCMYK": "CMYK",  # must be fourth position: color_components may index the values
        "2bit": "2bits",
        "4bit": "4bits",
    }

    # Fall back in order: direct name lookup, then positional lookup by
    # component count (see the position comments above), then prev_mode.
    mode = (
        mode_map.get(color_space_str)
        or list(mode_map.values())[color_components]
        or prev_mode
    )

    return mode, mode == "CMYK"
def bits2byte(data: bytes, size: tuple[int, int], bits: int) -> bytes:
    """
    Unpack pixel values of ``bits`` bits each into one byte per pixel.

    Rows are padded to a byte boundary, so reading restarts on a fresh
    byte at the start of every scanline.

    Args:
        data: Packed pixel data.
        size: Image dimensions as (width, height).
        bits: Bits per pixel (e.g. 1, 2 or 4).

    Returns:
        A bytes object of length width * height, one pixel value per byte.

    """
    width, height = size
    mask = (1 << bits) - 1
    output = bytearray(width * height)
    position = 0
    shift = 8 - bits
    for row in range(height):
        # A new row always begins on a byte boundary: skip any partially
        # consumed byte left over from the previous row.
        if shift != 8 - bits:
            position += 1
            shift = 8 - bits
        for column in range(width):
            output[column + row * width] = (data[position] >> shift) & mask
            shift -= bits
            if shift < 0:
                position += 1
                shift = 8 - bits
    return bytes(output)
def _extended_image_from_bytes(
    mode: str, size: tuple[int, int], data: bytes
) -> Image.Image:
    """
    Build a Pillow image from raw bytes, compensating for PDFs whose
    stream holds fewer samples per pixel than the target mode requires.

    Raises:
        EmptyImageDataError: If ``data`` is empty.
        ValueError: If the byte count cannot be reconciled with ``size``.

    """
    try:
        return Image.frombytes(mode, size, data)
    except ValueError as exc:
        pixel_count = size[0] * size[1]
        byte_count = len(data)
        if byte_count == 0:
            raise EmptyImageDataError(
                "Data is 0 bytes, cannot process an image from empty data."
            ) from exc
        if byte_count % pixel_count != 0:
            # Byte count does not divide evenly into pixels: unrecoverable.
            raise exc
        # Replicate every input byte so each pixel receives len(mode) samples.
        repeat = pixel_count * len(mode) / byte_count
        expanded = b"".join(bytes((value,) * int(repeat)) for value in data)
        return Image.frombytes(mode, size, expanded)
def __handle_flate__indexed(color_space: ArrayObject) -> tuple[Any, Any, Any, Any]:
    """
    Split an /Indexed color space array into (color_space, base, hival, lookup).

    Supports the regular four-element form as well as a malformed
    three-element variant produced by some AutoDesk tools, where ``base``
    and ``hival`` are fused into one entry separated by a NUL byte
    (e.g. ``/DeviceRGB`` + NUL + ``255``).

    Raises:
        PdfReadError: If the array matches neither shape.

    """
    count = len(color_space)
    if count == 4:
        resolved = [entry.get_object() for entry in color_space]
        return resolved[0], resolved[1], resolved[2], resolved[3]

    second = color_space[1]
    if not isinstance(second, str):
        second = second.get_object()
    if count == 3 and "\x00" in second:
        # Strange AutoDesk fusion of base and hival in one string.
        base, hival = second.split("\x00")
        return (
            color_space[0].get_object(),
            base,
            int(hival),
            color_space[2].get_object(),
        )
    raise PdfReadError(f"Expected color space with 4 values, got {count}: {color_space}")
def _handle_flate(
    size: tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
    obj_as_text: str,
) -> tuple[Image.Image, str, str, bool]:
    """
    Process image encoded in flateEncode.

    Args:
        size: (width, height) in pixels.
        data: Decompressed stream bytes.
        mode: Pillow mode derived from the color space by the caller.
        color_space: The XObject's /ColorSpace entry.
        colors: The /Colors entry (samples per pixel).
        obj_as_text: Textual repr of the XObject, used in warnings.

    Returns:
        img, image_format, extension, color inversion (always False here).

    """
    extension = ".png"  # mime_type: "image/png"
    image_format = "PNG"
    lookup: Any
    base: Any
    hival: Any
    if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed":
        color_space, base, hival, lookup = __handle_flate__indexed(color_space)
    # Sub-byte pixel depths must be unpacked to one byte per pixel first.
    if mode == "2bits":
        mode = "P"
        data = bits2byte(data, size, 2)
    elif mode == "4bits":
        mode = "P"
        data = bits2byte(data, size, 4)
    img = _extended_image_from_bytes(mode, size, data)
    if color_space == "/Indexed":
        # Normalize the lookup table to raw bytes regardless of how it
        # was stored (stream object, text string, or str).
        if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)):
            lookup = lookup.get_data()
        if isinstance(lookup, TextStringObject):
            lookup = lookup.original_bytes
        if isinstance(lookup, str):
            lookup = lookup.encode()
        try:
            # nb: bytes per palette entry; conv: intermediate conversion
            # mode; mode: final target mode.
            nb, conv, mode = {  # type: ignore
                "1": (0, "", ""),
                "L": (1, "P", "L"),
                "P": (0, "", ""),
                "RGB": (3, "P", "RGB"),
                "CMYK": (4, "P", "CMYK"),
            }[_get_image_mode(base, 0, "")[0]]
        except KeyError:  # pragma: no cover
            logger_warning(
                f"Base {base} not coded please share the pdf file with pypdf dev team",
                __name__,
            )
            lookup = None
        else:
            if img.mode == "1":
                # Two values ("high" and "low").
                expected_count = 2 * nb
                actual_count = len(lookup)
                if actual_count != expected_count:
                    if actual_count < expected_count:
                        logger_warning(
                            f"Not enough lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                        lookup += bytes([0] * (expected_count - actual_count))
                    elif not check_if_whitespace_only(lookup[expected_count:]):
                        logger_warning(
                            f"Too many lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                    lookup = lookup[:expected_count]
                # Map each 1-bit pixel to the corresponding palette entry.
                colors_arr = [lookup[:nb], lookup[nb:]]
                arr = b"".join(
                    b"".join(
                        colors_arr[1 if img.getpixel((x, y)) > 127 else 0]  # type: ignore[operator,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
                        for x in range(img.size[0])
                    )
                    for y in range(img.size[1])
                )
                img = Image.frombytes(mode, img.size, arr)
            else:
                img = img.convert(conv)
                if len(lookup) != (hival + 1) * nb:
                    logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__)
                    lookup = None
                elif mode == "L":
                    # gray lookup does not work: it is converted to a similar RGB lookup
                    lookup = b"".join([bytes([b, b, b]) for b in lookup])
                    mode = "RGB"
                # TODO: https://github.com/py-pdf/pypdf/pull/2039
                # this is a work around until PIL is able to process CMYK images
                elif mode == "CMYK":
                    _rgb = []
                    for _c, _m, _y, _k in (
                        lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4)
                    ):
                        _r = int(255 * (1 - _c / 255) * (1 - _k / 255))
                        _g = int(255 * (1 - _m / 255) * (1 - _k / 255))
                        _b = int(255 * (1 - _y / 255) * (1 - _k / 255))
                        _rgb.append(bytes((_r, _g, _b)))
                    lookup = b"".join(_rgb)
                    mode = "RGB"
                if lookup is not None:
                    img.putpalette(lookup, rawmode=mode)
                img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
    elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
        # Table 65 - Additional Entries Specific to an ICC Profile Stream Dictionary
        mode2 = _get_image_mode(color_space, colors, mode)[0]
        if mode != mode2:
            img = Image.frombytes(mode2, size, data)  # reloaded as mode may have changed
    if mode == "CMYK":
        extension = ".tif"
        image_format = "TIFF"
    return img, image_format, extension, False
def _handle_jpx(
    size: tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
) -> tuple[Image.Image, str, str, bool]:
    """
    Process an image encoded with JPXDecode (JPEG 2000).

    Returns img, image_format, extension, inversion

    """
    extension = ".jp2"  # mime_type: "image/x-jp2"
    img1: Image.Image = Image.open(BytesIO(data), formats=("JPEG2000",))
    mode, invert_color = _get_image_mode(color_space, colors, mode)
    if mode == "":
        # No usable /ColorSpace information: trust the mode Pillow detected.
        mode = cast(mode_str_type, img1.mode)
        invert_color = mode in ("CMYK",)
    if img1.mode == "RGBA" and mode == "RGB":
        mode = "RGBA"
    # we need to convert to the good mode
    if img1.mode == mode or {img1.mode, mode} == {"L", "P"}:  # compare (unordered) sets
        # L and P are indexed modes which should not be changed.
        img = img1
    elif {img1.mode, mode} == {"RGBA", "CMYK"}:
        # RGBA / CMYK are 4bytes encoding where
        # the encoding should be corrected
        img = Image.frombytes(mode, img1.size, img1.tobytes())
    else:  # pragma: no cover
        img = img1.convert(mode)
    # CMYK conversion
    # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
    # not implemented for the moment as I need to get properly the ICC
    if img.mode == "CMYK":
        img = img.convert("RGB")
    image_format = "JPEG2000"
    return img, image_format, extension, invert_color
def _apply_decode(
    img: Image.Image,
    x_object_obj: dict[str, Any],
    lfilters: FT,
    color_space: Union[str, list[Any], Any],
    invert_color: bool,
) -> Image.Image:
    """
    Apply the XObject's /Decode array to ``img`` via a point lookup table.

    CMYK images and some other color spaces without an explicit decode
    require reverting the scale (cf p243,2§ last sentence), which is
    expressed here as a default decode of [1.0, 0.0] per band.
    """
    needs_default_inversion = (
        img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
    ) or (invert_color and img.mode == "L")
    default = [1.0, 0.0] * len(img.getbands()) if needs_default_inversion else None
    decode = x_object_obj.get(IA.DECODE, default)

    if isinstance(color_space, ArrayObject):
        family = color_space[0].get_object()
        if family == "/Indexed":
            decode = None  # decode is meaningless if Indexed
        if family == "/Separation":
            decode = [1.0, 0.0] * len(img.getbands())

    # The identity decode [0, 1, 0, 1, ...] needs no remapping at all.
    is_identity = decode is not None and all(
        value == index % 2 for index, value in enumerate(decode)
    )
    if decode is not None and not is_identity:
        lut: list[int] = []
        for i in range(0, len(decode), 2):
            dmin = decode[i]
            dmax = decode[i + 1]
            # Map every 8-bit sample linearly into [dmin, dmax].
            lut.extend(
                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
            )
        img = img.point(lut)
    return img
def _get_mode_and_invert_color(
    x_object_obj: dict[str, Any], colors: int, color_space: Union[str, list[Any], Any]
) -> tuple[mode_str_type, bool]:
    """
    Derive the Pillow mode and color-inversion flag for an image XObject.

    Sub-byte /BitsPerComponent values are mapped through the "Nbit"
    pseudo color spaces understood by _get_image_mode.
    """
    if (
        IA.COLOR_SPACE in x_object_obj
        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
    ):
        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
        # NOTE(review): this assignment appears to be overwritten by both
        # branches of the if/else below — confirm whether it is dead code.
        mode: mode_str_type = "RGB"
    if x_object_obj.get("/BitsPerComponent", 8) < 8:
        mode, invert_color = _get_image_mode(
            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
        )
    else:
        mode, invert_color = _get_image_mode(
            color_space,
            # A single-sample pixel in a non-gray color space is treated as
            # having 2 components for the positional mode lookup.
            2
            if (
                colors == 1
                and (
                    not is_null_or_none(color_space)
                    and "Gray" not in color_space
                )
            )
            else colors,
            "",
        )
    return mode, invert_color
def _xobj_to_image(
    x_object: dict[str, Any],
    pillow_parameters: Union[dict[str, Any], None] = None
) -> tuple[Optional[str], bytes, Any]:
    """
    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object:
        pillow_parameters: parameters provided to Pillow Image.save() method,
            cf. <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save>

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]

    """
    def _apply_alpha(
        img: Image.Image,
        x_object: dict[str, Any],
        obj_as_text: str,
        image_format: str,
        extension: str,
    ) -> tuple[Image.Image, str, str]:
        # Merge the soft mask (/SMask), if any, into the image as an alpha
        # channel and switch to a format that supports transparency.
        alpha = None
        if IA.S_MASK in x_object:  # add alpha channel
            # The mask is itself an image XObject; recurse to decode it.
            alpha = _xobj_to_image(x_object[IA.S_MASK])[2]
            if img.size != alpha.size:
                logger_warning(
                    f"image and mask size not matching: {obj_as_text}", __name__
                )
            else:
                # TODO: implement mask
                if alpha.mode != "L":
                    alpha = alpha.convert("L")
                if img.mode == "P":
                    img = img.convert("RGB")
                elif img.mode == "1":
                    img = img.convert("L")
                img.putalpha(alpha)
            if "JPEG" in image_format:
                image_format = "JPEG2000"
                extension = ".jp2"
            else:
                image_format = "PNG"
                extension = ".png"
        return img, extension, image_format

    # For error reporting
    # NOTE(review): when x_object is None the first branch would itself raise
    # AttributeError on .indirect_reference — confirm the condition's intent.
    obj_as_text = (
        x_object.indirect_reference.__repr__()
        if x_object is None  # pragma: no cover
        else x_object.__repr__()
    )

    # Get size and data
    size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT]))
    data = x_object.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    # Strip a single trailing newline that some producers append.
    if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # ie. '\n'
        data = data[:-1]

    # Get color properties
    colors = x_object.get("/Colors", 1)
    color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()

    mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space)

    # Get filters
    filters = x_object.get(StreamAttributes.FILTER, NullObject()).get_object()
    # Only the last filter of a chain determines the image encoding.
    lfilters = filters[-1] if isinstance(filters, list) else filters
    # NOTE(review): decode_parms is normalized below but never read afterwards
    # in this function — possibly leftover; verify before removing.
    decode_parms = x_object.get(StreamAttributes.DECODE_PARMS, None)
    if decode_parms and isinstance(decode_parms, (tuple, list)):
        decode_parms = decode_parms[0]
    else:
        decode_parms = {}
    if not isinstance(decode_parms, dict):
        decode_parms = {}

    extension = None
    # Dispatch on the (last) stream filter to decode the pixel data.
    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
        img, image_format, extension, _ = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if lfilters == FT.LZW_DECODE:
            image_format = "TIFF"
            extension = ".tiff"  # mime_type = "image/tiff"
        else:
            image_format = "PNG"
            extension = ".png"  # mime_type = "image/png"
        try:
            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        except UnidentifiedImageError:
            img = _extended_image_from_bytes(mode, size, data)
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    elif lfilters == FT.JBIG2_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("PNG", "PPM")),
            "PNG",
            ".png",
            False,
        )
    elif mode == "CMYK":
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "TIFF",
            ".tif",
            False,
        )
    elif mode == "":
        raise PdfReadError(f"ColorSpace field not found in {x_object}")
    else:
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    img = _apply_decode(img, x_object, lfilters, color_space, invert_color)
    img, extension, image_format = _apply_alpha(
        img, x_object, obj_as_text, image_format, extension
    )

    if pillow_parameters is None:
        pillow_parameters = {}
    # Preserve JPEG image quality - see issue #3515.
    if image_format == "JPEG":
        # This prevents: Cannot use 'keep' when original image is not a JPEG:
        # "JPEG" is the value of PIL.JpegImagePlugin.JpegImageFile.format
        img.format = "JPEG"
        if "quality" not in pillow_parameters:
            pillow_parameters["quality"] = "keep"

    # Save image to bytes
    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format, **pillow_parameters)
    except OSError:  # pragma: no cover # covered with pillow 10.3
        # in case of we convert to RGBA and then to PNG
        img1 = img.convert("RGBA")
        image_format = "PNG"
        extension = ".png"
        img_byte_arr = BytesIO()
        img1.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception as exception:
        logger_warning(f"Failed loading image: {exception}", __name__)
        img = None  # type: ignore[assignment,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
    return extension, data, img
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
PDF specifies several annotation types which pypdf makes available here.
|
||||
|
||||
The names of the annotations and their attributes do not reflect the names in
|
||||
the specification in all cases. For example, the PDF standard defines a
|
||||
'Square' annotation that does not actually need to be square. For this reason,
|
||||
pypdf calls it 'Rectangle'.
|
||||
|
||||
At their core, all annotation types are DictionaryObjects. That means if pypdf
|
||||
does not implement a feature, users can easily extend the given functionality.
|
||||
"""
|
||||
|
||||
|
||||
from ._base import NO_FLAGS, AnnotationDictionary
|
||||
from ._markup_annotations import (
|
||||
Ellipse,
|
||||
FreeText,
|
||||
Highlight,
|
||||
Line,
|
||||
MarkupAnnotation,
|
||||
Polygon,
|
||||
PolyLine,
|
||||
Rectangle,
|
||||
Text,
|
||||
)
|
||||
from ._non_markup_annotations import Link, Popup
|
||||
|
||||
__all__ = [
|
||||
"NO_FLAGS",
|
||||
"AnnotationDictionary",
|
||||
"Ellipse",
|
||||
"FreeText",
|
||||
"Highlight",
|
||||
"Line",
|
||||
"Link",
|
||||
"MarkupAnnotation",
|
||||
"PolyLine",
|
||||
"Polygon",
|
||||
"Popup",
|
||||
"Rectangle",
|
||||
"Text",
|
||||
]
|
||||
29
venv/lib/python3.12/site-packages/pypdf/annotations/_base.py
Normal file
29
venv/lib/python3.12/site-packages/pypdf/annotations/_base.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from abc import ABC
|
||||
|
||||
from ..constants import AnnotationFlag
|
||||
from ..generic import NameObject, NumberObject
|
||||
from ..generic._data_structures import DictionaryObject
|
||||
|
||||
|
||||
class AnnotationDictionary(DictionaryObject, ABC):
    """
    Common base for all annotation dictionaries.

    Sets ``/Type = /Annot``; concrete subclasses add ``/Subtype`` and
    their type-specific entries.
    """

    def __init__(self) -> None:
        super().__init__()

        # The redundant in-method `from ..generic._base import NameObject`
        # was removed: NameObject is already imported at module level.

        # /Rect should not be added here as Polygon and PolyLine can automatically set it
        self[NameObject("/Type")] = NameObject("/Annot")
        # The flags were NOT added to the constructor on purpose:
        # We expect that most users don't want to change the default.
        # If they do, they can use the property. The default is 0.

    @property
    def flags(self) -> AnnotationFlag:
        """Annotation flags (the /F entry); defaults to no flags set."""
        return self.get(NameObject("/F"), AnnotationFlag(0))

    @flags.setter
    def flags(self, value: AnnotationFlag) -> None:
        self[NameObject("/F")] = NumberObject(value)


NO_FLAGS = AnnotationFlag(0)
||||
@@ -0,0 +1,305 @@
|
||||
import sys
|
||||
from abc import ABC
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from ..constants import AnnotationFlag
|
||||
from ..generic import ArrayObject, DictionaryObject
|
||||
from ..generic._base import (
|
||||
BooleanObject,
|
||||
FloatObject,
|
||||
NameObject,
|
||||
NumberObject,
|
||||
TextStringObject,
|
||||
)
|
||||
from ..generic._rectangle import RectangleObject
|
||||
from ..generic._utils import hex_to_rgb
|
||||
from ._base import NO_FLAGS, AnnotationDictionary
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
# PEP 613 introduced typing.TypeAlias with Python 3.10
|
||||
# For older Python versions, the backport typing_extensions is necessary:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
|
||||
Vertex: TypeAlias = tuple[float, float]
|
||||
|
||||
|
||||
def _get_bounding_rectangle(vertices: list[Vertex]) -> RectangleObject:
    """Return the smallest axis-aligned rectangle enclosing all vertices."""
    x_min, y_min = vertices[0]
    x_max, y_max = x_min, y_min
    for x, y in vertices:
        x_min = min(x_min, x)
        y_min = min(y_min, y)
        x_max = max(x_max, x)
        y_max = max(y_max, y)
    return RectangleObject((x_min, y_min, x_max, y_max))
class MarkupAnnotation(AnnotationDictionary, ABC):
    """
    Common ancestor of all markup annotations.

    Args:
        title_bar: Text to be displayed in the title bar of the annotation;
            by convention this is the name of the author

    """

    def __init__(self, *, title_bar: Optional[str] = None) -> None:
        if title_bar is None:
            return
        self[NameObject("/T")] = TextStringObject(title_bar)
class Text(MarkupAnnotation):
    """
    A text annotation.

    Args:
        rect: array of four integers ``[xLL, yLL, xUR, yUR]``
            specifying the clickable rectangular area
        text: The text that is added to the document
        open: Whether the annotation popup is initially open
        flags: Annotation flags value

    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = NO_FLAGS,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        entries = (
            ("/Subtype", NameObject("/Text")),
            ("/Rect", RectangleObject(rect)),
            ("/Contents", TextStringObject(text)),
            ("/Open", BooleanObject(open)),
            ("/Flags", NumberObject(flags)),
        )
        for key, value in entries:
            self[NameObject(key)] = value
class FreeText(MarkupAnnotation):
    """
    A FreeText annotation.

    Args:
        text: Text displayed in the annotation.
        rect: Array of four numbers ``[xLL, yLL, xUR, yUR]`` giving the
            annotation rectangle.
        font: Font family name.
        bold: Whether to request a bold font style.
        italic: Whether to request an italic font style.
        font_size: CSS-style size string, e.g. ``"14pt"``.
        font_color: Hex RGB text color, e.g. ``"000000"``.
        border_color: Hex RGB border color, or None for no border.
        background_color: Hex RGB fill color, or None for no fill entry.

    """

    def __init__(
        self,
        *,
        text: str,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        font: str = "Helvetica",
        bold: bool = False,
        italic: bool = False,
        font_size: str = "14pt",
        font_color: str = "000000",
        border_color: Optional[str] = "000000",
        background_color: Optional[str] = "ffffff",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # Fix: /Subtype and /Rect were previously assigned twice (once
        # individually and again in the update() call below); the duplicate
        # standalone assignments have been removed.

        # Table 225 of the 1.7 reference ("CSS2 style attributes used in rich text strings")
        font_str = "font: "
        if italic:
            font_str = f"{font_str}italic "
        else:
            font_str = f"{font_str}normal "
        if bold:
            font_str = f"{font_str}bold "
        else:
            font_str = f"{font_str}normal "
        font_str = f"{font_str}{font_size} {font}"
        font_str = f"{font_str};text-align:left;color:#{font_color}"

        # /DA holds the default appearance: the border color as an "rg"
        # operator sequence, or empty when no border color is given.
        default_appearance_string = ""
        if border_color:
            for st in hex_to_rgb(border_color):
                default_appearance_string = f"{default_appearance_string}{st} "
            default_appearance_string = f"{default_appearance_string}rg"

        self.update(
            {
                NameObject("/Subtype"): NameObject("/FreeText"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Contents"): TextStringObject(text),
                # font size color
                NameObject("/DS"): TextStringObject(font_str),
                NameObject("/DA"): TextStringObject(default_appearance_string),
            }
        )
        if border_color is None:
            # Border Style
            self[NameObject("/BS")] = DictionaryObject(
                {
                    # width of 0 means no border
                    NameObject("/W"): NumberObject(0)
                }
            )
        if background_color is not None:
            self[NameObject("/C")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(background_color)]
            )
class Line(MarkupAnnotation):
    """
    A straight-line annotation between two points.

    Args:
        p1: Start point as (x, y).
        p2: End point as (x, y).
        rect: Bounding rectangle ``[xLL, yLL, xUR, yUR]``.
        text: Optional annotation contents.

    """

    def __init__(
        self,
        p1: Vertex,
        p2: Vertex,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        text: str = "",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self[NameObject("/Subtype")] = NameObject("/Line")
        self[NameObject("/Rect")] = RectangleObject(rect)
        # /L holds the line endpoints: x1 y1 x2 y2.
        self[NameObject("/L")] = ArrayObject(
            [
                FloatObject(p1[0]),
                FloatObject(p1[1]),
                FloatObject(p2[0]),
                FloatObject(p2[1]),
            ]
        )
        # No line-ending decorations on either end.
        self[NameObject("/LE")] = ArrayObject(
            [NameObject("/None"), NameObject("/None")]
        )
        # Mid-gray interior color.
        self[NameObject("/IC")] = ArrayObject(
            [FloatObject(0.5), FloatObject(0.5), FloatObject(0.5)]
        )
        self[NameObject("/Contents")] = TextStringObject(text)
class PolyLine(MarkupAnnotation):
    """
    A polyline annotation through the given vertices.

    Args:
        vertices: Non-empty list of (x, y) points.

    Raises:
        ValueError: If ``vertices`` is empty.

    """

    def __init__(
        self,
        vertices: list[Vertex],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if not vertices:
            raise ValueError("A polyline needs at least 1 vertex with two coordinates")
        # Flatten the (x, y) pairs into the x1 y1 x2 y2 ... form of /Vertices.
        flattened: list[NumberObject] = []
        for x, y in vertices:
            flattened += [NumberObject(x), NumberObject(y)]
        self.update(
            {
                NameObject("/Subtype"): NameObject("/PolyLine"),
                NameObject("/Vertices"): ArrayObject(flattened),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )
class Rectangle(MarkupAnnotation):
    """A ``/Square`` annotation covering ``rect``, optionally filled."""

    def __init__(
        self,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        *,
        interior_color: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Square"),
                NameObject("/Rect"): RectangleObject(rect),
            }
        )
        if interior_color:
            # /IC: fill colour as RGB components derived from the hex string.
            components = hex_to_rgb(interior_color)
            self[NameObject("/IC")] = ArrayObject(
                FloatObject(component) for component in components
            )
|
||||
|
||||
|
||||
class Highlight(MarkupAnnotation):
    """A ``/Highlight`` annotation over the regions given by ``quad_points``."""

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        quad_points: ArrayObject,
        highlight_color: str = "ff0000",
        printing: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # /C: highlight colour as RGB components from the hex string.
        color = ArrayObject(
            FloatObject(component) for component in hex_to_rgb(highlight_color)
        )
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Highlight"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/QuadPoints"): quad_points,
                NameObject("/C"): color,
            }
        )
        if printing:
            self.flags = AnnotationFlag.PRINT
|
||||
|
||||
|
||||
class Ellipse(MarkupAnnotation):
    """A ``/Circle`` annotation inscribed in ``rect``, optionally filled."""

    def __init__(
        self,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        *,
        interior_color: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Circle"),
                NameObject("/Rect"): RectangleObject(rect),
            }
        )
        if interior_color:
            # /IC: fill colour as RGB components derived from the hex string.
            self[NameObject("/IC")] = ArrayObject(
                FloatObject(component) for component in hex_to_rgb(interior_color)
            )
|
||||
|
||||
|
||||
class Polygon(MarkupAnnotation):
    """A closed ``/Polygon`` annotation through the given vertices."""

    def __init__(
        self,
        vertices: list[tuple[float, float]],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if not vertices:
            raise ValueError("A polygon needs at least 1 vertex with two coordinates")
        # Flatten the (x, y) pairs into the [x1, y1, x2, y2, ...] form
        # the /Vertices entry expects.
        flattened = [NumberObject(value) for x, y in vertices for value in (x, y)]
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Polygon"),
                NameObject("/Vertices"): ArrayObject(flattened),
                NameObject("/IT"): NameObject("/PolygonCloud"),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )
|
||||
@@ -0,0 +1,106 @@
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
from ..generic._base import (
|
||||
BooleanObject,
|
||||
NameObject,
|
||||
NumberObject,
|
||||
TextStringObject,
|
||||
)
|
||||
from ..generic._data_structures import ArrayObject, DictionaryObject
|
||||
from ..generic._fit import DEFAULT_FIT, Fit
|
||||
from ..generic._rectangle import RectangleObject
|
||||
from ._base import AnnotationDictionary
|
||||
|
||||
|
||||
class Link(AnnotationDictionary):
    """A ``/Link`` annotation targeting either an external URL or a page of
    this document.

    Exactly one of ``url`` (external URI action) or ``target_page_index``
    (internal destination) must be provided; supplying neither or both
    raises ``ValueError``.
    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if TYPE_CHECKING:
            from ..types import BorderArrayType  # noqa: PLC0415

        # The two targeting modes are mutually exclusive.
        is_external = url is not None
        is_internal = target_page_index is not None
        if not is_external and not is_internal:
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. Both were None."
            )
        if is_external and is_internal:
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. "
                f"{url=}, {target_page_index=}"
            )

        # /Border: the first three numbers of the caller's array, plus an
        # optional dash-pattern sub-array when a fourth element is present.
        border_arr: BorderArrayType
        if border is not None:
            border_arr = [NumberObject(n) for n in border[:3]]
            if len(border) == 4:
                dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
                border_arr.append(dash_pattern)
        else:
            # Default: all zeros, i.e. no visible border.
            border_arr = [NumberObject(0)] * 3

        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Link"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Border"): ArrayObject(border_arr),
            }
        )
        if is_external:
            # External links become a /URI action dictionary under /A.
            self[NameObject("/A")] = DictionaryObject(
                {
                    NameObject("/S"): NameObject("/URI"),
                    NameObject("/Type"): NameObject("/Action"),
                    NameObject("/URI"): TextStringObject(url),
                }
            )
        if is_internal:
            # This needs to be updated later!
            # NOTE(review): the plain-string keys below are not PDF names,
            # which suggests this /Dest dict is a placeholder that a writer
            # resolves into a real destination once the target page object
            # exists -- confirm against the writer's annotation handling.
            dest_deferred = DictionaryObject(
                {
                    "target_page_index": NumberObject(target_page_index),
                    "fit": NameObject(fit.fit_type),
                    "fit_args": fit.fit_args,
                }
            )
            self[NameObject("/Dest")] = dest_deferred
|
||||
|
||||
|
||||
class Popup(AnnotationDictionary):
    """A ``/Popup`` annotation, optionally tied to a parent annotation."""

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Popup"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Open"): BooleanObject(open),
            }
        )
        if parent:
            # This needs to be an indirect object
            try:
                self[NameObject("/Parent")] = parent.indirect_reference
            except AttributeError:
                # Best-effort: a parent lacking indirect_reference (not yet
                # registered with a writer) only triggers a warning; the
                # /Parent entry is simply omitted.
                from .._utils import logger_warning  # noqa: PLC0415

                logger_warning(
                    "Unregistered Parent object : No Parent field set",
                    __name__,
                )
|
||||
796
venv/lib/python3.12/site-packages/pypdf/constants.py
Normal file
796
venv/lib/python3.12/site-packages/pypdf/constants.py
Normal file
@@ -0,0 +1,796 @@
|
||||
"""Various constants, enums, and flags to aid readability."""
|
||||
|
||||
from enum import Enum, IntFlag, auto, unique
|
||||
|
||||
|
||||
class StrEnum(str, Enum):  # Once we are on Python 3.11+: enum.StrEnum
    """String-valued enum whose ``str()`` form is the raw member value."""

    def __str__(self) -> str:
        # Bypass Enum's default "Class.NAME" rendering.
        raw = self.value
        return str(raw)
|
||||
|
||||
|
||||
class Core:
    """Keywords that don't quite belong anywhere else."""

    OUTLINES = "/Outlines"
    THREADS = "/Threads"
    PAGE = "/Page"
    PAGES = "/Pages"
    CATALOG = "/Catalog"


class TrailerKeys:
    """Keys of the file trailer dictionary."""

    SIZE = "/Size"
    PREV = "/Prev"
    ROOT = "/Root"
    ENCRYPT = "/Encrypt"
    INFO = "/Info"
    ID = "/ID"


class CatalogAttributes:
    """Name/destination entries of the document catalog."""

    NAMES = "/Names"
    DESTS = "/Dests"


class EncryptionDictAttributes:
    """
    Additional encryption dictionary entries for the standard security handler.

    Table 3.19, Page 122.
    Table 21 of the 2.0 manual.
    """

    R = "/R"  # number, required; revision of the standard security handler
    O = "/O"  # 32-byte string, required # noqa: E741
    U = "/U"  # 32-byte string, required
    P = "/P"  # integer flag, required; permitted operations
    ENCRYPT_METADATA = "/EncryptMetadata"  # boolean flag, optional
|
||||
|
||||
|
||||
class UserAccessPermissions(IntFlag):
    """
    Table 3.20 User access permissions.
    Table 22 of the 2.0 manual.

    ``R*`` members are reserved bits with spec-mandated values; all of them
    except R1/R2 default to 1. Member definition order is significant: it
    drives the key order produced by :meth:`to_dict`.
    """

    R1 = 1
    R2 = 2
    PRINT = 4
    MODIFY = 8
    EXTRACT = 16
    ADD_OR_MODIFY = 32
    R7 = 64
    R8 = 128
    FILL_FORM_FIELDS = 256
    EXTRACT_TEXT_AND_GRAPHICS = 512
    ASSEMBLE_DOC = 1024
    PRINT_TO_REPRESENTATION = 2048
    R13 = 2**12
    R14 = 2**13
    R15 = 2**14
    R16 = 2**15
    R17 = 2**16
    R18 = 2**17
    R19 = 2**18
    R20 = 2**19
    R21 = 2**20
    R22 = 2**21
    R23 = 2**22
    R24 = 2**23
    R25 = 2**24
    R26 = 2**25
    R27 = 2**26
    R28 = 2**27
    R29 = 2**28
    R30 = 2**29
    R31 = 2**30
    R32 = 2**31

    @classmethod
    def _is_reserved(cls, name: str) -> bool:
        """Check if the given name corresponds to a reserved flag entry."""
        return name.startswith("R") and name[1:].isdigit()

    @classmethod
    def _is_active(cls, name: str) -> bool:
        """Check if the given reserved name defaults to 1 = active."""
        return name not in {"R1", "R2"}

    def to_dict(self) -> dict[str, bool]:
        """Convert the given flag value to a corresponding verbose name mapping."""
        cls = UserAccessPermissions
        return {
            name.lower(): (self & flag) == flag
            for name, flag in cls.__members__.items()
            if not cls._is_reserved(name)
        }

    @classmethod
    def from_dict(cls, value: dict[str, bool]) -> "UserAccessPermissions":
        """Convert the verbose name mapping to the corresponding flag value."""
        remaining = value.copy()
        result = cls(0)
        for name, flag in cls.__members__.items():
            if cls._is_reserved(name):
                # Reserved names have a required value. Use it.
                if cls._is_active(name):
                    result |= flag
            elif remaining.pop(name.lower(), False):
                result |= flag
        if remaining:
            raise ValueError(f"Unknown dictionary keys: {remaining!r}")
        return result

    @classmethod
    def all(cls) -> "UserAccessPermissions":
        """All 32 bits set except the always-zero reserved bits R1 and R2."""
        return cls((2**32 - 1) - cls.R1 - cls.R2)
|
||||
|
||||
|
||||
class Resources:
|
||||
"""
|
||||
Table 3.30 Entries in a resource dictionary.
|
||||
Table 34 in the 2.0 reference.
|
||||
"""
|
||||
|
||||
EXT_G_STATE = "/ExtGState" # dictionary, optional
|
||||
COLOR_SPACE = "/ColorSpace" # dictionary, optional
|
||||
PATTERN = "/Pattern" # dictionary, optional
|
||||
SHADING = "/Shading" # dictionary, optional
|
||||
XOBJECT = "/XObject" # dictionary, optional
|
||||
FONT = "/Font" # dictionary, optional
|
||||
PROC_SET = "/ProcSet" # array, optional
|
||||
PROPERTIES = "/Properties" # dictionary, optional
|
||||
|
||||
|
||||
class PagesAttributes:
|
||||
"""§7.7.3.2 of the 1.7 and 2.0 reference."""
|
||||
|
||||
TYPE = "/Type" # name, required; must be /Pages
|
||||
PARENT = "/Parent" # dictionary, required; indirect reference to pages object
|
||||
KIDS = "/Kids" # array, required; List of indirect references
|
||||
COUNT = "/Count"
|
||||
# integer, required; the number of leaf nodes (page objects)
|
||||
# that are descendants of this node within the page tree
|
||||
|
||||
|
||||
class PageAttributes:
|
||||
"""§7.7.3.3 of the 1.7 and 2.0 reference."""
|
||||
|
||||
TYPE = "/Type" # name, required; must be /Page
|
||||
PARENT = "/Parent" # dictionary, required; a pages object
|
||||
LAST_MODIFIED = (
|
||||
"/LastModified" # date, optional; date and time of last modification
|
||||
)
|
||||
RESOURCES = "/Resources" # dictionary, required if there are any
|
||||
MEDIABOX = "/MediaBox" # rectangle, required; rectangle specifying page size
|
||||
CROPBOX = "/CropBox" # rectangle, optional
|
||||
BLEEDBOX = "/BleedBox" # rectangle, optional
|
||||
TRIMBOX = "/TrimBox" # rectangle, optional
|
||||
ARTBOX = "/ArtBox" # rectangle, optional
|
||||
BOX_COLOR_INFO = "/BoxColorInfo" # dictionary, optional
|
||||
CONTENTS = "/Contents" # stream or array, optional
|
||||
ROTATE = "/Rotate" # integer, optional; page rotation in degrees
|
||||
GROUP = "/Group" # dictionary, optional; page group
|
||||
THUMB = "/Thumb" # stream, optional; indirect reference to image of the page
|
||||
B = "/B" # array, optional
|
||||
DUR = "/Dur" # number, optional
|
||||
TRANS = "/Trans" # dictionary, optional
|
||||
ANNOTS = "/Annots" # array, optional; an array of annotations
|
||||
AA = "/AA" # dictionary, optional
|
||||
METADATA = "/Metadata" # stream, optional
|
||||
PIECE_INFO = "/PieceInfo" # dictionary, optional
|
||||
STRUCT_PARENTS = "/StructParents" # integer, optional
|
||||
ID = "/ID" # byte string, optional
|
||||
PZ = "/PZ" # number, optional
|
||||
SEPARATION_INFO = "/SeparationInfo" # dictionary, optional
|
||||
TABS = "/Tabs" # name, optional
|
||||
TEMPLATE_INSTANTIATED = "/TemplateInstantiated" # name, optional
|
||||
PRES_STEPS = "/PresSteps" # dictionary, optional
|
||||
USER_UNIT = "/UserUnit" # number, optional
|
||||
VP = "/VP" # dictionary, optional
|
||||
AF = "/AF" # array of dictionaries, optional
|
||||
OUTPUT_INTENTS = "/OutputIntents" # array, optional
|
||||
D_PART = "/DPart" # dictionary, required, if this page is within the range of a DPart, not permitted otherwise
|
||||
|
||||
|
||||
class FileSpecificationDictionaryEntries:
|
||||
"""Table 3.41 Entries in a file specification dictionary."""
|
||||
|
||||
Type = "/Type"
|
||||
FS = "/FS" # The name of the file system to be used to interpret this file specification
|
||||
F = "/F" # A file specification string of the form described in §3.10.1
|
||||
UF = "/UF" # A Unicode string of the file as described in §3.10.1
|
||||
DOS = "/DOS"
|
||||
Mac = "/Mac"
|
||||
Unix = "/Unix"
|
||||
ID = "/ID"
|
||||
V = "/V"
|
||||
EF = "/EF" # dictionary, containing a subset of the keys F, UF, DOS, Mac, and Unix
|
||||
RF = "/RF" # dictionary, containing arrays of /EmbeddedFile
|
||||
DESC = "/Desc" # description of the file
|
||||
Cl = "/Cl"
|
||||
|
||||
|
||||
class StreamAttributes:
|
||||
"""
|
||||
Table 4.2.
|
||||
Table 5 in the 2.0 reference.
|
||||
"""
|
||||
|
||||
LENGTH = "/Length" # integer, required
|
||||
FILTER = "/Filter" # name or array of names, optional
|
||||
DECODE_PARMS = "/DecodeParms" # variable, optional -- 'decodeParams is wrong
|
||||
|
||||
|
||||
# @unique rejects accidental duplicate member values at class-creation time.
@unique
class FilterTypes(StrEnum):
    """§7.4 of the 1.7 and 2.0 references."""

    ASCII_HEX_DECODE = "/ASCIIHexDecode"  # abbreviation: AHx
    ASCII_85_DECODE = "/ASCII85Decode"  # abbreviation: A85
    LZW_DECODE = "/LZWDecode"  # abbreviation: LZW
    FLATE_DECODE = "/FlateDecode"  # abbreviation: Fl
    RUN_LENGTH_DECODE = "/RunLengthDecode"  # abbreviation: RL
    CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
    DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
    JPX_DECODE = "/JPXDecode"
    JBIG2_DECODE = "/JBIG2Decode"


class FilterTypeAbbreviations:
    """§8.9.7 of the 1.7 and 2.0 references (inline-image short filter names)."""

    AHx = "/AHx"
    A85 = "/A85"
    LZW = "/LZW"
    FL = "/Fl"
    RL = "/RL"
    CCF = "/CCF"
    DCT = "/DCT"
|
||||
|
||||
|
||||
class LzwFilterParameters:
|
||||
"""
|
||||
Table 4.4.
|
||||
Table 8 in the 2.0 reference.
|
||||
"""
|
||||
|
||||
PREDICTOR = "/Predictor" # integer
|
||||
COLORS = "/Colors" # integer
|
||||
BITS_PER_COMPONENT = "/BitsPerComponent" # integer
|
||||
COLUMNS = "/Columns" # integer
|
||||
EARLY_CHANGE = "/EarlyChange" # integer
|
||||
|
||||
|
||||
class CcittFaxDecodeParameters:
|
||||
"""
|
||||
Table 4.5.
|
||||
Table 11 in the 2.0 reference.
|
||||
"""
|
||||
|
||||
K = "/K" # integer
|
||||
END_OF_LINE = "/EndOfLine" # boolean
|
||||
ENCODED_BYTE_ALIGN = "/EncodedByteAlign" # boolean
|
||||
COLUMNS = "/Columns" # integer
|
||||
ROWS = "/Rows" # integer
|
||||
END_OF_BLOCK = "/EndOfBlock" # boolean
|
||||
BLACK_IS_1 = "/BlackIs1" # boolean
|
||||
DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError" # integer
|
||||
|
||||
|
||||
class ImageAttributes:
|
||||
"""§11.6.5 of the 1.7 and 2.0 references."""
|
||||
|
||||
TYPE = "/Type" # name, required; must be /XObject
|
||||
SUBTYPE = "/Subtype" # name, required; must be /Image
|
||||
NAME = "/Name" # name, required
|
||||
WIDTH = "/Width" # integer, required
|
||||
HEIGHT = "/Height" # integer, required
|
||||
BITS_PER_COMPONENT = "/BitsPerComponent" # integer, required
|
||||
COLOR_SPACE = "/ColorSpace" # name, required
|
||||
DECODE = "/Decode" # array, optional
|
||||
INTENT = "/Intent" # string, optional
|
||||
INTERPOLATE = "/Interpolate" # boolean, optional
|
||||
IMAGE_MASK = "/ImageMask" # boolean, optional
|
||||
MASK = "/Mask" # 1-bit image mask stream
|
||||
S_MASK = "/SMask" # dictionary or name, optional
|
||||
|
||||
|
||||
class ColorSpaces:
|
||||
DEVICE_RGB = "/DeviceRGB"
|
||||
DEVICE_CMYK = "/DeviceCMYK"
|
||||
DEVICE_GRAY = "/DeviceGray"
|
||||
|
||||
|
||||
class TypArguments:
|
||||
"""Table 8.2 of the PDF 1.7 reference."""
|
||||
|
||||
LEFT = "/Left"
|
||||
RIGHT = "/Right"
|
||||
BOTTOM = "/Bottom"
|
||||
TOP = "/Top"
|
||||
|
||||
|
||||
class TypFitArguments:
|
||||
"""Table 8.2 of the PDF 1.7 reference."""
|
||||
|
||||
XYZ = "/XYZ"
|
||||
FIT = "/Fit"
|
||||
FIT_H = "/FitH"
|
||||
FIT_V = "/FitV"
|
||||
FIT_R = "/FitR"
|
||||
FIT_B = "/FitB"
|
||||
FIT_BH = "/FitBH"
|
||||
FIT_BV = "/FitBV"
|
||||
|
||||
|
||||
class GoToActionArguments:
|
||||
S = "/S" # name, required: type of action
|
||||
D = "/D" # name, byte string, or array, required: destination to jump to
|
||||
SD = "/SD" # array, optional: structure destination to jump to
|
||||
|
||||
|
||||
class AnnotationDictionaryAttributes:
|
||||
"""Table 8.15 Entries common to all annotation dictionaries."""
|
||||
|
||||
Type = "/Type"
|
||||
Subtype = "/Subtype"
|
||||
Rect = "/Rect"
|
||||
Contents = "/Contents"
|
||||
P = "/P"
|
||||
NM = "/NM"
|
||||
M = "/M"
|
||||
F = "/F"
|
||||
AP = "/AP"
|
||||
AS = "/AS"
|
||||
DA = "/DA"
|
||||
Border = "/Border"
|
||||
C = "/C"
|
||||
StructParent = "/StructParent"
|
||||
OC = "/OC"
|
||||
|
||||
|
||||
class InteractiveFormDictEntries:
|
||||
Fields = "/Fields"
|
||||
NeedAppearances = "/NeedAppearances"
|
||||
SigFlags = "/SigFlags"
|
||||
CO = "/CO"
|
||||
DR = "/DR"
|
||||
DA = "/DA"
|
||||
Q = "/Q"
|
||||
XFA = "/XFA"
|
||||
|
||||
|
||||
class FieldDictionaryAttributes:
|
||||
"""
|
||||
Entries common to all field dictionaries (Table 8.69 PDF 1.7 reference)
|
||||
(*very partially documented here*).
|
||||
|
||||
FFBits provides the constants used for `/Ff` from Table 8.70/8.75/8.77/8.79
|
||||
"""
|
||||
|
||||
FT = "/FT" # name, required for terminal fields
|
||||
Parent = "/Parent" # dictionary, required for children
|
||||
Kids = "/Kids" # array, sometimes required
|
||||
T = "/T" # text string, optional
|
||||
TU = "/TU" # text string, optional
|
||||
TM = "/TM" # text string, optional
|
||||
Ff = "/Ff" # integer, optional
|
||||
V = "/V" # text string or array, optional
|
||||
DV = "/DV" # text string, optional
|
||||
AA = "/AA" # dictionary, optional
|
||||
Opt = "/Opt" # array, optional
|
||||
|
||||
class FfBits(IntFlag):
|
||||
"""
|
||||
Ease building /Ff flags
|
||||
Some entries may be specific to:
|
||||
|
||||
* Text (Tx) (Table 8.75 PDF 1.7 reference)
|
||||
* Buttons (Btn) (Table 8.77 PDF 1.7 reference)
|
||||
* Choice (Ch) (Table 8.79 PDF 1.7 reference)
|
||||
"""
|
||||
|
||||
ReadOnly = 1 << 0
|
||||
"""common to Tx/Btn/Ch in Table 8.70"""
|
||||
Required = 1 << 1
|
||||
"""common to Tx/Btn/Ch in Table 8.70"""
|
||||
NoExport = 1 << 2
|
||||
"""common to Tx/Btn/Ch in Table 8.70"""
|
||||
|
||||
Multiline = 1 << 12
|
||||
"""Tx"""
|
||||
Password = 1 << 13
|
||||
"""Tx"""
|
||||
|
||||
NoToggleToOff = 1 << 14
|
||||
"""Btn"""
|
||||
Radio = 1 << 15
|
||||
"""Btn"""
|
||||
Pushbutton = 1 << 16
|
||||
"""Btn"""
|
||||
|
||||
Combo = 1 << 17
|
||||
"""Ch"""
|
||||
Edit = 1 << 18
|
||||
"""Ch"""
|
||||
Sort = 1 << 19
|
||||
"""Ch"""
|
||||
|
||||
FileSelect = 1 << 20
|
||||
"""Tx"""
|
||||
|
||||
MultiSelect = 1 << 21
|
||||
"""Tx"""
|
||||
|
||||
DoNotSpellCheck = 1 << 22
|
||||
"""Tx/Ch"""
|
||||
DoNotScroll = 1 << 23
|
||||
"""Tx"""
|
||||
Comb = 1 << 24
|
||||
"""Tx"""
|
||||
|
||||
RadiosInUnison = 1 << 25
|
||||
"""Btn"""
|
||||
|
||||
RichText = 1 << 25
|
||||
"""Tx"""
|
||||
|
||||
CommitOnSelChange = 1 << 26
|
||||
"""Ch"""
|
||||
|
||||
@classmethod
|
||||
def attributes(cls) -> tuple[str, ...]:
|
||||
"""
|
||||
Get a tuple of all the attributes present in a Field Dictionary.
|
||||
|
||||
This method returns a tuple of all the attribute constants defined in
|
||||
the FieldDictionaryAttributes class. These attributes correspond to the
|
||||
entries that are common to all field dictionaries as specified in the
|
||||
PDF 1.7 reference.
|
||||
|
||||
Returns:
|
||||
A tuple containing all the attribute constants.
|
||||
|
||||
"""
|
||||
return (
|
||||
cls.TM,
|
||||
cls.T,
|
||||
cls.FT,
|
||||
cls.Parent,
|
||||
cls.TU,
|
||||
cls.Ff,
|
||||
cls.V,
|
||||
cls.DV,
|
||||
cls.Kids,
|
||||
cls.AA,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def attributes_dict(cls) -> dict[str, str]:
|
||||
"""
|
||||
Get a dictionary of attribute keys and their human-readable names.
|
||||
|
||||
This method returns a dictionary where the keys are the attribute
|
||||
constants defined in the FieldDictionaryAttributes class and the values
|
||||
are their corresponding human-readable names. These attributes
|
||||
correspond to the entries that are common to all field dictionaries as
|
||||
specified in the PDF 1.7 reference.
|
||||
|
||||
Returns:
|
||||
A dictionary containing attribute keys and their names.
|
||||
|
||||
"""
|
||||
return {
|
||||
cls.FT: "Field Type",
|
||||
cls.Parent: "Parent",
|
||||
cls.T: "Field Name",
|
||||
cls.TU: "Alternate Field Name",
|
||||
cls.TM: "Mapping Name",
|
||||
cls.Ff: "Field Flags",
|
||||
cls.V: "Value",
|
||||
cls.DV: "Default Value",
|
||||
}
|
||||
|
||||
|
||||
class CheckboxRadioButtonAttributes:
    """Table 8.76 Field flags common to all field types."""

    Opt = "/Opt"  # Options, Optional

    @classmethod
    def attributes(cls) -> tuple[str, ...]:
        """
        Return the attribute constants of this class as a tuple.

        These correspond to the checkbox/radio-button specific field
        dictionary entries of the PDF 1.7 reference.

        Returns:
            A tuple containing all the attribute constants.

        """
        names = (cls.Opt,)
        return names

    @classmethod
    def attributes_dict(cls) -> dict[str, str]:
        """
        Map each attribute constant of this class to a human-readable name.

        These correspond to the checkbox/radio-button specific field
        dictionary entries of the PDF 1.7 reference.

        Returns:
            A dictionary containing attribute keys and their names.

        """
        readable = {cls.Opt: "Options"}
        return readable
|
||||
|
||||
|
||||
class FieldFlag(IntFlag):
|
||||
"""Table 8.70 Field flags common to all field types."""
|
||||
|
||||
READ_ONLY = 1
|
||||
REQUIRED = 2
|
||||
NO_EXPORT = 4
|
||||
|
||||
|
||||
class DocumentInformationAttributes:
|
||||
"""Table 10.2 Entries in the document information dictionary."""
|
||||
|
||||
TITLE = "/Title" # text string, optional
|
||||
AUTHOR = "/Author" # text string, optional
|
||||
SUBJECT = "/Subject" # text string, optional
|
||||
KEYWORDS = "/Keywords" # text string, optional
|
||||
CREATOR = "/Creator" # text string, optional
|
||||
PRODUCER = "/Producer" # text string, optional
|
||||
CREATION_DATE = "/CreationDate" # date, optional
|
||||
MOD_DATE = "/ModDate" # date, optional
|
||||
TRAPPED = "/Trapped" # name, optional
|
||||
|
||||
|
||||
class PageLayouts:
|
||||
"""
|
||||
Page 84, PDF 1.4 reference.
|
||||
Page 115, PDF 2.0 reference.
|
||||
"""
|
||||
|
||||
SINGLE_PAGE = "/SinglePage"
|
||||
ONE_COLUMN = "/OneColumn"
|
||||
TWO_COLUMN_LEFT = "/TwoColumnLeft"
|
||||
TWO_COLUMN_RIGHT = "/TwoColumnRight"
|
||||
TWO_PAGE_LEFT = "/TwoPageLeft" # (PDF 1.5)
|
||||
TWO_PAGE_RIGHT = "/TwoPageRight" # (PDF 1.5)
|
||||
|
||||
|
||||
class GraphicsStateParameters:
|
||||
"""Table 58 – Entries in a Graphics State Parameter Dictionary"""
|
||||
|
||||
TYPE = "/Type" # name, optional
|
||||
LW = "/LW" # number, optional
|
||||
LC = "/LC" # integer, optional
|
||||
LJ = "/LJ" # integer, optional
|
||||
ML = "/ML" # number, optional
|
||||
D = "/D" # array, optional
|
||||
RI = "/RI" # name, optional
|
||||
OP = "/OP"
|
||||
op = "/op"
|
||||
OPM = "/OPM"
|
||||
FONT = "/Font" # array, optional
|
||||
BG = "/BG"
|
||||
BG2 = "/BG2"
|
||||
UCR = "/UCR"
|
||||
UCR2 = "/UCR2"
|
||||
TR = "/TR"
|
||||
TR2 = "/TR2"
|
||||
HT = "/HT"
|
||||
FL = "/FL"
|
||||
SM = "/SM"
|
||||
SA = "/SA"
|
||||
BM = "/BM"
|
||||
S_MASK = "/SMask" # dictionary or name, optional
|
||||
CA = "/CA"
|
||||
ca = "/ca"
|
||||
AIS = "/AIS"
|
||||
TK = "/TK"
|
||||
|
||||
|
||||
class CatalogDictionary:
|
||||
"""§7.7.2 of the 1.7 and 2.0 references."""
|
||||
|
||||
TYPE = "/Type" # name, required; must be /Catalog
|
||||
VERSION = "/Version" # name
|
||||
EXTENSIONS = "/Extensions" # dictionary, optional; ISO 32000-1
|
||||
PAGES = "/Pages" # dictionary, required
|
||||
PAGE_LABELS = "/PageLabels" # number tree, optional
|
||||
NAMES = "/Names" # dictionary, optional
|
||||
DESTS = "/Dests" # dictionary, optional
|
||||
VIEWER_PREFERENCES = "/ViewerPreferences" # dictionary, optional
|
||||
PAGE_LAYOUT = "/PageLayout" # name, optional
|
||||
PAGE_MODE = "/PageMode" # name, optional
|
||||
OUTLINES = "/Outlines" # dictionary, optional
|
||||
THREADS = "/Threads" # array, optional
|
||||
OPEN_ACTION = "/OpenAction" # array or dictionary or name, optional
|
||||
AA = "/AA" # dictionary, optional
|
||||
URI = "/URI" # dictionary, optional
|
||||
ACRO_FORM = "/AcroForm" # dictionary, optional
|
||||
METADATA = "/Metadata" # stream, optional
|
||||
STRUCT_TREE_ROOT = "/StructTreeRoot" # dictionary, optional
|
||||
MARK_INFO = "/MarkInfo" # dictionary, optional
|
||||
LANG = "/Lang" # text string, optional
|
||||
SPIDER_INFO = "/SpiderInfo" # dictionary, optional
|
||||
OUTPUT_INTENTS = "/OutputIntents" # array, optional
|
||||
PIECE_INFO = "/PieceInfo" # dictionary, optional
|
||||
OC_PROPERTIES = "/OCProperties" # dictionary, optional
|
||||
PERMS = "/Perms" # dictionary, optional
|
||||
LEGAL = "/Legal" # dictionary, optional
|
||||
REQUIREMENTS = "/Requirements" # array, optional
|
||||
COLLECTION = "/Collection" # dictionary, optional
|
||||
NEEDS_RENDERING = "/NeedsRendering" # boolean, optional
|
||||
DSS = "/DSS" # dictionary, optional
|
||||
AF = "/AF" # array of dictionaries, optional
|
||||
D_PART_ROOT = "/DPartRoot" # dictionary, optional
|
||||
|
||||
|
||||
class OutlineFontFlag(IntFlag):
|
||||
"""A class used as an enumerable flag for formatting an outline font."""
|
||||
|
||||
italic = 1
|
||||
bold = 2
|
||||
|
||||
|
||||
class PageLabelStyle:
|
||||
"""
|
||||
Table 8.10 in the 1.7 reference.
|
||||
Table 161 in the 2.0 reference.
|
||||
"""
|
||||
|
||||
DECIMAL = "/D" # Decimal Arabic numerals
|
||||
UPPERCASE_ROMAN = "/R" # Uppercase Roman numerals
|
||||
LOWERCASE_ROMAN = "/r" # Lowercase Roman numerals
|
||||
UPPERCASE_LETTER = "/A" # Uppercase letters
|
||||
LOWERCASE_LETTER = "/a" # Lowercase letters
|
||||
|
||||
|
||||
class AnnotationFlag(IntFlag):
    """See §12.5.3 "Annotation Flags"."""

    INVISIBLE = 1  # bit 1
    HIDDEN = 2  # bit 2
    PRINT = 4  # bit 3
    NO_ZOOM = 8  # bit 4
    NO_ROTATE = 16  # bit 5
    NO_VIEW = 32  # bit 6
    READ_ONLY = 64  # bit 7
    LOCKED = 128  # bit 8
    TOGGLE_NO_VIEW = 256  # bit 9
    LOCKED_CONTENTS = 512  # bit 10
|
||||
|
||||
|
||||
# The constant-holder classes defined above, gathered into one tuple
# (alphabetical order) so the set of known PDF key namespaces can be
# inspected in a single place.
PDF_KEYS = (
    AnnotationDictionaryAttributes,
    CatalogAttributes,
    CatalogDictionary,
    CcittFaxDecodeParameters,
    CheckboxRadioButtonAttributes,
    ColorSpaces,
    Core,
    DocumentInformationAttributes,
    EncryptionDictAttributes,
    FieldDictionaryAttributes,
    FileSpecificationDictionaryEntries,
    FilterTypeAbbreviations,
    FilterTypes,
    GoToActionArguments,
    GraphicsStateParameters,
    ImageAttributes,
    InteractiveFormDictEntries,
    LzwFilterParameters,
    PageAttributes,
    PageLayouts,
    PagesAttributes,
    Resources,
    StreamAttributes,
    TrailerKeys,
    TypArguments,
    TypFitArguments,
)
|
||||
|
||||
|
||||
class ImageType(IntFlag):
    """Bit flags naming image categories; ``ALL`` combines the three."""

    NONE = 0
    XOBJECT_IMAGES = auto()
    INLINE_IMAGES = auto()
    DRAWING_IMAGES = auto()
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    IMAGES = ALL  # for consistency with ObjectDeletionFlag
|
||||
|
||||
|
||||
_INLINE_IMAGE_VALUE_MAPPING = {
|
||||
"/G": "/DeviceGray",
|
||||
"/RGB": "/DeviceRGB",
|
||||
"/CMYK": "/DeviceCMYK",
|
||||
"/I": "/Indexed",
|
||||
"/AHx": "/ASCIIHexDecode",
|
||||
"/A85": "/ASCII85Decode",
|
||||
"/LZW": "/LZWDecode",
|
||||
"/Fl": "/FlateDecode",
|
||||
"/RL": "/RunLengthDecode",
|
||||
"/CCF": "/CCITTFaxDecode",
|
||||
"/DCT": "/DCTDecode",
|
||||
"/DeviceGray": "/DeviceGray",
|
||||
"/DeviceRGB": "/DeviceRGB",
|
||||
"/DeviceCMYK": "/DeviceCMYK",
|
||||
"/Indexed": "/Indexed",
|
||||
"/ASCIIHexDecode": "/ASCIIHexDecode",
|
||||
"/ASCII85Decode": "/ASCII85Decode",
|
||||
"/LZWDecode": "/LZWDecode",
|
||||
"/FlateDecode": "/FlateDecode",
|
||||
"/RunLengthDecode": "/RunLengthDecode",
|
||||
"/CCITTFaxDecode": "/CCITTFaxDecode",
|
||||
"/DCTDecode": "/DCTDecode",
|
||||
"/RelativeColorimetric": "/RelativeColorimetric",
|
||||
}
|
||||
|
||||
_INLINE_IMAGE_KEY_MAPPING = {
|
||||
"/BPC": "/BitsPerComponent",
|
||||
"/CS": "/ColorSpace",
|
||||
"/D": "/Decode",
|
||||
"/DP": "/DecodeParms",
|
||||
"/F": "/Filter",
|
||||
"/H": "/Height",
|
||||
"/W": "/Width",
|
||||
"/I": "/Interpolate",
|
||||
"/Intent": "/Intent",
|
||||
"/IM": "/ImageMask",
|
||||
"/BitsPerComponent": "/BitsPerComponent",
|
||||
"/ColorSpace": "/ColorSpace",
|
||||
"/Decode": "/Decode",
|
||||
"/DecodeParms": "/DecodeParms",
|
||||
"/Filter": "/Filter",
|
||||
"/Height": "/Height",
|
||||
"/Width": "/Width",
|
||||
"/Interpolate": "/Interpolate",
|
||||
"/ImageMask": "/ImageMask",
|
||||
}
|
||||
|
||||
|
||||
class AFRelationship:
|
||||
"""
|
||||
Associated file relationship types, defining the relationship between
|
||||
the PDF component and the associated file.
|
||||
|
||||
Defined in table 43 of the PDF 2.0 reference.
|
||||
"""
|
||||
|
||||
SOURCE = "/Source" # Original content source
|
||||
DATA = "/Data" # Base data for visual presentation
|
||||
ALTERNATIVE = "/Alternative" # Alternative content representation
|
||||
SUPPLEMENT = "/Supplement" # Supplemental representation of original source/data
|
||||
ENCRYPTED_PAYLOAD = "/EncryptedPayload" # Encrypted payload document
|
||||
FORM_DATA = "/FormData" # Data associated with AcroForm of this PDF
|
||||
SCHEMA = "/Schema" # Schema definition for associated object
|
||||
UNSPECIFIED = "/Unspecified" # Not known or cannot be described with values
|
||||
|
||||
|
||||
class BorderStyles:
|
||||
"""
|
||||
A class defining border styles used in PDF documents.
|
||||
|
||||
Defined in table 168 of the PDF 2.0 reference.
|
||||
"""
|
||||
|
||||
BEVELED = "/B"
|
||||
DASHED = "/D"
|
||||
INSET = "/I"
|
||||
SOLID = "/S"
|
||||
UNDERLINED = "/U"
|
||||
74
venv/lib/python3.12/site-packages/pypdf/errors.py
Normal file
74
venv/lib/python3.12/site-packages/pypdf/errors.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
All errors/exceptions pypdf raises and all of the warnings it uses.
|
||||
|
||||
Please note that broken PDF files might cause other Exceptions.
|
||||
"""
|
||||
|
||||
|
||||
class DeprecationError(Exception):
|
||||
"""Raised when a deprecated feature is used."""
|
||||
|
||||
|
||||
class DependencyError(Exception):
|
||||
"""
|
||||
Raised when a required dependency (a library or module that pypdf depends on)
|
||||
is not available or cannot be imported.
|
||||
"""
|
||||
|
||||
|
||||
class PyPdfError(Exception):
|
||||
"""Base class for all exceptions raised by pypdf."""
|
||||
|
||||
|
||||
class PdfReadError(PyPdfError):
|
||||
"""Raised when there is an issue reading a PDF file."""
|
||||
|
||||
|
||||
class PageSizeNotDefinedError(PyPdfError):
|
||||
"""Raised when the page size of a PDF document is not defined."""
|
||||
|
||||
|
||||
class PdfReadWarning(UserWarning):
|
||||
"""Issued when there is a potential issue reading a PDF file, but it can still be read."""
|
||||
|
||||
|
||||
class PdfStreamError(PdfReadError):
|
||||
"""Raised when there is an issue reading the stream of data in a PDF file."""
|
||||
|
||||
|
||||
class ParseError(PyPdfError):
|
||||
"""
|
||||
Raised when there is an issue parsing (analyzing and understanding the
|
||||
structure and meaning of) a PDF file.
|
||||
"""
|
||||
|
||||
|
||||
class FileNotDecryptedError(PdfReadError):
|
||||
"""
|
||||
Raised when a PDF file that has been encrypted
|
||||
(meaning it requires a password to be accessed) has not been successfully
|
||||
decrypted.
|
||||
"""
|
||||
|
||||
|
||||
class WrongPasswordError(FileNotDecryptedError):
|
||||
"""Raised when the wrong password is used to try to decrypt an encrypted PDF file."""
|
||||
|
||||
|
||||
class EmptyFileError(PdfReadError):
|
||||
"""Raised when a PDF file is empty or has no content."""
|
||||
|
||||
|
||||
class EmptyImageDataError(PyPdfError):
|
||||
"""Raised when trying to process an image that has no data."""
|
||||
|
||||
|
||||
STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
|
||||
|
||||
|
||||
class LimitReachedError(PyPdfError):
|
||||
"""Raised when a limit is reached."""
|
||||
|
||||
|
||||
class XmpDocumentError(PyPdfError, RuntimeError):
|
||||
"""Raised when the XMP XML document context is invalid or missing."""
|
||||
815
venv/lib/python3.12/site-packages/pypdf/filters.py
Normal file
815
venv/lib/python3.12/site-packages/pypdf/filters.py
Normal file
@@ -0,0 +1,815 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
"""
|
||||
Implementation of stream filters; §7.4 Filters of the PDF 2.0 specification.
|
||||
|
||||
§8.9.7 Inline images of the PDF 2.0 specification has abbreviations that can be
|
||||
used for the names of filters in an inline image object.
|
||||
"""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import struct
|
||||
import subprocess
|
||||
import zlib
|
||||
from base64 import a85decode
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Any, Optional, Union, cast
|
||||
|
||||
from ._codecs._codecs import LzwCodec as _LzwCodec
|
||||
from ._utils import (
|
||||
WHITESPACES_AS_BYTES,
|
||||
deprecation_with_replacement,
|
||||
logger_warning,
|
||||
)
|
||||
from .constants import CcittFaxDecodeParameters as CCITT
|
||||
from .constants import FilterTypeAbbreviations as FTA
|
||||
from .constants import FilterTypes as FT
|
||||
from .constants import ImageAttributes as IA
|
||||
from .constants import LzwFilterParameters as LZW
|
||||
from .constants import StreamAttributes as SA
|
||||
from .errors import DependencyError, LimitReachedError, PdfReadError, PdfStreamError
|
||||
from .generic import (
|
||||
ArrayObject,
|
||||
DictionaryObject,
|
||||
IndirectObject,
|
||||
NullObject,
|
||||
StreamObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
JBIG2_MAX_OUTPUT_LENGTH = 75_000_000
|
||||
LZW_MAX_OUTPUT_LENGTH = 75_000_000
|
||||
ZLIB_MAX_OUTPUT_LENGTH = 75_000_000
|
||||
|
||||
|
||||
|
||||
def _decompress_with_limit(data: bytes) -> bytes:
|
||||
decompressor = zlib.decompressobj()
|
||||
result = decompressor.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
|
||||
if decompressor.unconsumed_tail:
|
||||
raise LimitReachedError(
|
||||
f"Limit reached while decompressing. {len(decompressor.unconsumed_tail)} bytes remaining."
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def decompress(data: bytes) -> bytes:
|
||||
"""
|
||||
Decompress the given data using zlib.
|
||||
|
||||
Attempts to decompress the input data using zlib.
|
||||
If the decompression fails due to a zlib error, it falls back
|
||||
to using a decompression object with a larger window size.
|
||||
|
||||
Please note that the output length is limited to avoid memory
|
||||
issues. If you need to process larger content streams, consider
|
||||
adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. In case you
|
||||
are only dealing with trusted inputs and/or want to disable these
|
||||
limits, set the value to `0`.
|
||||
|
||||
Args:
|
||||
data: The input data to be decompressed.
|
||||
|
||||
Returns:
|
||||
The decompressed data.
|
||||
|
||||
"""
|
||||
try:
|
||||
return _decompress_with_limit(data)
|
||||
except zlib.error:
|
||||
# First quick approach: There are known issues with faulty added bytes to the
|
||||
# tail of the encoded stream from early Adobe Distiller or Pitstop versions
|
||||
# with CR char as the default line separator (assumed by reverse engineering)
|
||||
# that breaks the decoding process in the end.
|
||||
#
|
||||
# Try first to cut off some of the tail byte by byte, but limited to not
|
||||
# iterate through too many loops and kill the performance for large streams,
|
||||
# to then allow the final fallback to run. Added this intermediate attempt,
|
||||
# because starting from the head of the stream byte by byte kills completely
|
||||
# the performance for large streams (e.g., 6 MB) with the tail-byte-issue
|
||||
# and takes ages. This solution is really fast:
|
||||
max_tail_cut_off_bytes: int = 8
|
||||
for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
|
||||
try:
|
||||
return _decompress_with_limit(data[:-i])
|
||||
except zlib.error:
|
||||
pass
|
||||
|
||||
# If still failing, then try with increased window size.
|
||||
decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
|
||||
result_str = b""
|
||||
remaining_limit = ZLIB_MAX_OUTPUT_LENGTH
|
||||
data_single_bytes = [data[i : i + 1] for i in range(len(data))]
|
||||
known_errors = set()
|
||||
for index, b in enumerate(data_single_bytes):
|
||||
try:
|
||||
decompressed = decompressor.decompress(b, max_length=remaining_limit)
|
||||
result_str += decompressed
|
||||
remaining_limit -= len(decompressed)
|
||||
if remaining_limit <= 0:
|
||||
raise LimitReachedError(
|
||||
f"Limit reached while decompressing. {len(data_single_bytes) - index} bytes remaining."
|
||||
)
|
||||
except zlib.error as error:
|
||||
error_str = str(error)
|
||||
if error_str in known_errors:
|
||||
continue
|
||||
logger_warning(error_str, __name__)
|
||||
known_errors.add(error_str)
|
||||
return result_str
|
||||
|
||||
|
||||
class FlateDecode:
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: bytes,
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
"""
|
||||
Decode data which is flate-encoded.
|
||||
|
||||
Args:
|
||||
data: flate-encoded data.
|
||||
decode_parms: a dictionary of values, understanding the
|
||||
"/Predictor":<int> key only
|
||||
|
||||
Returns:
|
||||
The flate-decoded data.
|
||||
|
||||
Raises:
|
||||
PdfReadError:
|
||||
|
||||
"""
|
||||
str_data = decompress(data)
|
||||
predictor = 1
|
||||
|
||||
if decode_parms:
|
||||
try:
|
||||
predictor = decode_parms.get("/Predictor", 1)
|
||||
except (AttributeError, TypeError): # Type Error is NullObject
|
||||
pass # Usually an array with a null object was read
|
||||
# predictor 1 == no predictor
|
||||
if predictor != 1:
|
||||
# /Columns, the number of samples in each row, has a default value of 1;
|
||||
# §7.4.4.3, ISO 32000.
|
||||
DEFAULT_BITS_PER_COMPONENT = 8
|
||||
try:
|
||||
columns = cast(int, decode_parms[LZW.COLUMNS].get_object()) # type: ignore
|
||||
except (TypeError, KeyError):
|
||||
columns = 1
|
||||
try:
|
||||
colors = cast(int, decode_parms[LZW.COLORS].get_object()) # type: ignore
|
||||
except (TypeError, KeyError):
|
||||
colors = 1
|
||||
try:
|
||||
bits_per_component = cast(
|
||||
int,
|
||||
decode_parms[LZW.BITS_PER_COMPONENT].get_object(), # type: ignore
|
||||
)
|
||||
except (TypeError, KeyError):
|
||||
bits_per_component = DEFAULT_BITS_PER_COMPONENT
|
||||
|
||||
# PNG predictor can vary by row and so is the lead byte on each row
|
||||
rowlength = (
|
||||
math.ceil(columns * colors * bits_per_component / 8) + 1
|
||||
) # number of bytes
|
||||
|
||||
# TIFF prediction:
|
||||
if predictor == 2:
|
||||
rowlength -= 1 # remove the predictor byte
|
||||
bpp = rowlength // columns
|
||||
str_data = bytearray(str_data)
|
||||
for i in range(len(str_data)):
|
||||
if i % rowlength >= bpp:
|
||||
str_data[i] = (str_data[i] + str_data[i - bpp]) % 256
|
||||
str_data = bytes(str_data)
|
||||
# PNG prediction:
|
||||
elif 10 <= predictor <= 15:
|
||||
str_data = FlateDecode._decode_png_prediction(
|
||||
str_data, columns, rowlength
|
||||
)
|
||||
else:
|
||||
raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}")
|
||||
return str_data
|
||||
|
||||
@staticmethod
|
||||
def _decode_png_prediction(data: bytes, columns: int, rowlength: int) -> bytes:
|
||||
# PNG prediction can vary from row to row
|
||||
if (remainder := len(data) % rowlength) != 0:
|
||||
logger_warning("Image data is not rectangular. Adding padding.", __name__)
|
||||
data += b"\x00" * (rowlength - remainder)
|
||||
assert len(data) % rowlength == 0
|
||||
output = []
|
||||
prev_rowdata = (0,) * rowlength
|
||||
bpp = (rowlength - 1) // columns # recomputed locally to not change params
|
||||
for row in range(0, len(data), rowlength):
|
||||
rowdata: list[int] = list(data[row : row + rowlength])
|
||||
filter_byte = rowdata[0]
|
||||
|
||||
if filter_byte == 0:
|
||||
# PNG None Predictor
|
||||
pass
|
||||
elif filter_byte == 1:
|
||||
# PNG Sub Predictor
|
||||
for i in range(bpp + 1, rowlength):
|
||||
rowdata[i] = (rowdata[i] + rowdata[i - bpp]) % 256
|
||||
elif filter_byte == 2:
|
||||
# PNG Up Predictor
|
||||
for i in range(1, rowlength):
|
||||
rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
|
||||
elif filter_byte == 3:
|
||||
# PNG Average Predictor
|
||||
for i in range(1, bpp + 1):
|
||||
floor = prev_rowdata[i] // 2
|
||||
rowdata[i] = (rowdata[i] + floor) % 256
|
||||
for i in range(bpp + 1, rowlength):
|
||||
left = rowdata[i - bpp]
|
||||
floor = (left + prev_rowdata[i]) // 2
|
||||
rowdata[i] = (rowdata[i] + floor) % 256
|
||||
elif filter_byte == 4:
|
||||
# PNG Paeth Predictor
|
||||
for i in range(1, bpp + 1):
|
||||
rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
|
||||
for i in range(bpp + 1, rowlength):
|
||||
left = rowdata[i - bpp]
|
||||
up = prev_rowdata[i]
|
||||
up_left = prev_rowdata[i - bpp]
|
||||
|
||||
p = left + up - up_left
|
||||
dist_left = abs(p - left)
|
||||
dist_up = abs(p - up)
|
||||
dist_up_left = abs(p - up_left)
|
||||
|
||||
if dist_left <= dist_up and dist_left <= dist_up_left:
|
||||
paeth = left
|
||||
elif dist_up <= dist_up_left:
|
||||
paeth = up
|
||||
else:
|
||||
paeth = up_left
|
||||
|
||||
rowdata[i] = (rowdata[i] + paeth) % 256
|
||||
else:
|
||||
raise PdfReadError(
|
||||
f"Unsupported PNG filter {filter_byte!r}"
|
||||
) # pragma: no cover
|
||||
prev_rowdata = tuple(rowdata)
|
||||
output.extend(rowdata[1:])
|
||||
return bytes(output)
|
||||
|
||||
@staticmethod
|
||||
def encode(data: bytes, level: int = -1) -> bytes:
|
||||
"""
|
||||
Compress the input data using zlib.
|
||||
|
||||
Args:
|
||||
data: The data to be compressed.
|
||||
level: See https://docs.python.org/3/library/zlib.html#zlib.compress
|
||||
|
||||
Returns:
|
||||
The compressed data.
|
||||
|
||||
"""
|
||||
return zlib.compress(data, level)
|
||||
|
||||
|
||||
class ASCIIHexDecode:
|
||||
"""
|
||||
The ASCIIHexDecode filter decodes data that has been encoded in ASCII
|
||||
hexadecimal form into a base-7 ASCII format.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: Union[str, bytes],
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
"""
|
||||
Decode an ASCII-Hex encoded data stream.
|
||||
|
||||
Args:
|
||||
data: a str sequence of hexadecimal-encoded values to be
|
||||
converted into a base-7 ASCII string
|
||||
decode_parms: this filter does not use parameters.
|
||||
|
||||
Returns:
|
||||
A string conversion in base-7 ASCII, where each of its values
|
||||
v is such that 0 <= ord(v) <= 127.
|
||||
|
||||
Raises:
|
||||
PdfStreamError:
|
||||
|
||||
"""
|
||||
if isinstance(data, str):
|
||||
data = data.encode()
|
||||
retval = b""
|
||||
hex_pair = b""
|
||||
index = 0
|
||||
while True:
|
||||
if index >= len(data):
|
||||
logger_warning(
|
||||
"missing EOD in ASCIIHexDecode, check if output is OK", __name__
|
||||
)
|
||||
break # Reached end of string without an EOD
|
||||
char = data[index : index + 1]
|
||||
if char == b">":
|
||||
break
|
||||
if char.isspace():
|
||||
index += 1
|
||||
continue
|
||||
hex_pair += char
|
||||
if len(hex_pair) == 2:
|
||||
retval += bytes((int(hex_pair, base=16),))
|
||||
hex_pair = b""
|
||||
index += 1
|
||||
# If the filter encounters the EOD marker after reading
|
||||
# an odd number of hexadecimal digits,
|
||||
# it shall behave as if a 0 (zero) followed the last digit.
|
||||
# For every even number of hexadecimal digits, hex_pair is reset to b"".
|
||||
if hex_pair != b"":
|
||||
hex_pair += b"0"
|
||||
retval += bytes((int(hex_pair, base=16),))
|
||||
return retval
|
||||
|
||||
|
||||
class RunLengthDecode:
|
||||
"""
|
||||
The RunLengthDecode filter decodes data that has been encoded in a
|
||||
simple byte-oriented format based on run length.
|
||||
The encoded data is a sequence of runs, where each run consists of
|
||||
a length byte followed by 1 to 128 bytes of data. If the length byte is
|
||||
in the range 0 to 127,
|
||||
the following length + 1 (1 to 128) bytes are copied literally during
|
||||
decompression.
|
||||
If length is in the range 129 to 255, the following single byte is to be
|
||||
copied 257 − length (2 to 128) times during decompression. A length value
|
||||
of 128 denotes EOD.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: bytes,
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
"""
|
||||
Decode a run length encoded data stream.
|
||||
|
||||
Args:
|
||||
data: a bytes sequence of length/data
|
||||
decode_parms: this filter does not use parameters.
|
||||
|
||||
Returns:
|
||||
A bytes decompressed sequence.
|
||||
|
||||
Raises:
|
||||
PdfStreamError:
|
||||
|
||||
"""
|
||||
lst = []
|
||||
index = 0
|
||||
while True:
|
||||
if index >= len(data):
|
||||
logger_warning(
|
||||
"missing EOD in RunLengthDecode, check if output is OK", __name__
|
||||
)
|
||||
break # Reached end of string without an EOD
|
||||
length = data[index]
|
||||
index += 1
|
||||
if length == 128:
|
||||
data_length = len(data)
|
||||
if index < data_length:
|
||||
# We should first check, if we have an inner stream from a multi-encoded
|
||||
# stream with a faulty trailing newline that we can decode properly.
|
||||
# We will just ignore the last byte and raise a warning ...
|
||||
if (index == data_length - 1) and (data[index : index + 1] == b"\n"):
|
||||
logger_warning(
|
||||
"Found trailing newline in stream data, check if output is OK", __name__
|
||||
)
|
||||
break
|
||||
# Raising an exception here breaks all image extraction for this file, which might
|
||||
# not be desirable. For this reason, indicate that the output is most likely wrong,
|
||||
# as processing stopped after the first EOD marker. See issue #3517.
|
||||
logger_warning(
|
||||
"Early EOD in RunLengthDecode, check if output is OK", __name__
|
||||
)
|
||||
break
|
||||
if length < 128:
|
||||
length += 1
|
||||
lst.append(data[index : (index + length)])
|
||||
index += length
|
||||
else: # >128
|
||||
length = 257 - length
|
||||
lst.append(bytes((data[index],)) * length)
|
||||
index += 1
|
||||
return b"".join(lst)
|
||||
|
||||
|
||||
class LZWDecode:
|
||||
class Decoder:
|
||||
STOP = 257
|
||||
CLEARDICT = 256
|
||||
|
||||
def __init__(self, data: bytes) -> None:
|
||||
self.data = data
|
||||
|
||||
def decode(self) -> bytes:
|
||||
return _LzwCodec(max_output_length=LZW_MAX_OUTPUT_LENGTH).decode(self.data)
|
||||
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: bytes,
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
"""
|
||||
Decode an LZW encoded data stream.
|
||||
|
||||
Args:
|
||||
data: ``bytes`` or ``str`` text to decode.
|
||||
decode_parms: a dictionary of parameter values.
|
||||
|
||||
Returns:
|
||||
decoded data.
|
||||
|
||||
"""
|
||||
# decode_parms is unused here
|
||||
return LZWDecode.Decoder(data).decode()
|
||||
|
||||
|
||||
class ASCII85Decode:
|
||||
"""Decodes string ASCII85-encoded data into a byte format."""
|
||||
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: Union[str, bytes],
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
"""
|
||||
Decode an Ascii85 encoded data stream.
|
||||
|
||||
Args:
|
||||
data: ``bytes`` or ``str`` text to decode.
|
||||
decode_parms: this filter does not use parameters.
|
||||
|
||||
Returns:
|
||||
decoded data.
|
||||
|
||||
"""
|
||||
if isinstance(data, str):
|
||||
data = data.encode()
|
||||
data = data.strip(WHITESPACES_AS_BYTES)
|
||||
if len(data) > 2 and data.endswith(b">"):
|
||||
data = data[:-1].rstrip(WHITESPACES_AS_BYTES) + data[-1:]
|
||||
try:
|
||||
return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
|
||||
except ValueError as error:
|
||||
if error.args[0] == "Ascii85 encoded byte sequences must end with b'~>'":
|
||||
logger_warning("Ignoring missing Ascii85 end marker.", __name__)
|
||||
return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES)
|
||||
raise
|
||||
|
||||
|
||||
class DCTDecode:
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: bytes,
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
"""
|
||||
Decompresses data encoded using a DCT (discrete cosine transform)
|
||||
technique based on the JPEG standard (IS0/IEC 10918),
|
||||
reproducing image sample data that approximates the original data.
|
||||
|
||||
Args:
|
||||
data: text to decode.
|
||||
decode_parms: this filter does not use parameters.
|
||||
|
||||
Returns:
|
||||
decoded data.
|
||||
|
||||
"""
|
||||
return data
|
||||
|
||||
|
||||
class JPXDecode:
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: bytes,
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
"""
|
||||
Decompresses data encoded using the wavelet-based JPEG 2000 standard,
|
||||
reproducing the original image data.
|
||||
|
||||
Args:
|
||||
data: text to decode.
|
||||
decode_parms: this filter does not use parameters.
|
||||
|
||||
Returns:
|
||||
decoded data.
|
||||
|
||||
"""
|
||||
return data
|
||||
|
||||
|
||||
@dataclass
|
||||
class CCITTParameters:
|
||||
"""§7.4.6, optional parameters for the CCITTFaxDecode filter."""
|
||||
|
||||
K: int = 0
|
||||
columns: int = 1728
|
||||
rows: int = 0
|
||||
EndOfLine: Union[bool, None] = False
|
||||
EncodedByteAlign: Union[bool, None] = False
|
||||
EndOfBlock: Union[bool, None] = True
|
||||
BlackIs1: bool = False
|
||||
DamagedRowsBeforeError: Union[int, None] = 0
|
||||
|
||||
@property
|
||||
def group(self) -> int:
|
||||
if self.K < 0:
|
||||
# Pure two-dimensional encoding (Group 4)
|
||||
CCITTgroup = 4
|
||||
else:
|
||||
# K == 0: Pure one-dimensional encoding (Group 3, 1-D)
|
||||
# K > 0: Mixed one- and two-dimensional encoding (Group 3, 2-D)
|
||||
CCITTgroup = 3
|
||||
return CCITTgroup
|
||||
|
||||
|
||||
def __create_old_class_instance(
|
||||
K: int = 0,
|
||||
columns: int = 0,
|
||||
rows: int = 0
|
||||
) -> CCITTParameters:
|
||||
deprecation_with_replacement("CCITParameters", "CCITTParameters", "6.0.0")
|
||||
return CCITTParameters(K, columns, rows)
|
||||
|
||||
|
||||
# Create an alias for the old class name
|
||||
CCITParameters = __create_old_class_instance
|
||||
|
||||
|
||||
class CCITTFaxDecode:
|
||||
"""
|
||||
§7.4.6, CCITTFaxDecode filter (ISO 32000).
|
||||
|
||||
Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
|
||||
CCITT encoding is bit-oriented, not byte-oriented.
|
||||
|
||||
§7.4.6, optional parameters for the CCITTFaxDecode filter.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _get_parameters(
|
||||
parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
|
||||
rows: Union[int, IndirectObject],
|
||||
) -> CCITTParameters:
|
||||
ccitt_parameters = CCITTParameters(rows=int(rows))
|
||||
if parameters:
|
||||
parameters_unwrapped = cast(
|
||||
Union[ArrayObject, DictionaryObject], parameters.get_object()
|
||||
)
|
||||
if isinstance(parameters_unwrapped, ArrayObject):
|
||||
for decode_parm in parameters_unwrapped:
|
||||
if CCITT.K in decode_parm:
|
||||
ccitt_parameters.K = decode_parm[CCITT.K].get_object()
|
||||
if CCITT.COLUMNS in decode_parm:
|
||||
ccitt_parameters.columns = decode_parm[CCITT.COLUMNS].get_object()
|
||||
if CCITT.BLACK_IS_1 in decode_parm:
|
||||
ccitt_parameters.BlackIs1 = decode_parm[CCITT.BLACK_IS_1].get_object().value
|
||||
else:
|
||||
if CCITT.K in parameters_unwrapped:
|
||||
ccitt_parameters.K = parameters_unwrapped[CCITT.K].get_object() # type: ignore
|
||||
if CCITT.COLUMNS in parameters_unwrapped:
|
||||
ccitt_parameters.columns = parameters_unwrapped[CCITT.COLUMNS].get_object() # type: ignore
|
||||
if CCITT.BLACK_IS_1 in parameters_unwrapped:
|
||||
ccitt_parameters.BlackIs1 = parameters_unwrapped[CCITT.BLACK_IS_1].get_object().value # type: ignore
|
||||
return ccitt_parameters
|
||||
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: bytes,
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
height: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
params = CCITTFaxDecode._get_parameters(decode_parms, height)
|
||||
|
||||
img_size = len(data)
|
||||
tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
|
||||
tiff_header = struct.pack(
|
||||
tiff_header_struct,
|
||||
b"II", # Byte order indication: Little endian
|
||||
42, # Version number (always 42)
|
||||
8, # Offset to the first image file directory (IFD)
|
||||
8, # Number of tags in IFD
|
||||
256, # ImageWidth, LONG, 1, width
|
||||
4,
|
||||
1,
|
||||
params.columns,
|
||||
257, # ImageLength, LONG, 1, length
|
||||
4,
|
||||
1,
|
||||
params.rows,
|
||||
258, # BitsPerSample, SHORT, 1, 1
|
||||
3,
|
||||
1,
|
||||
1,
|
||||
259, # Compression, SHORT, 1, compression Type
|
||||
3,
|
||||
1,
|
||||
params.group,
|
||||
262, # Thresholding, SHORT, 1, 0 = BlackIs1
|
||||
3,
|
||||
1,
|
||||
int(params.BlackIs1),
|
||||
273, # StripOffsets, LONG, 1, length of header
|
||||
4,
|
||||
1,
|
||||
struct.calcsize(
|
||||
tiff_header_struct
|
||||
),
|
||||
278, # RowsPerStrip, LONG, 1, length
|
||||
4,
|
||||
1,
|
||||
params.rows,
|
||||
279, # StripByteCounts, LONG, 1, size of image
|
||||
4,
|
||||
1,
|
||||
img_size,
|
||||
0, # last IFD
|
||||
)
|
||||
|
||||
return tiff_header + data
|
||||
|
||||
|
||||
JBIG2DEC_BINARY = shutil.which("jbig2dec")
|
||||
|
||||
|
||||
class JBIG2Decode:
|
||||
@staticmethod
|
||||
def decode(
|
||||
data: bytes,
|
||||
decode_parms: Optional[DictionaryObject] = None,
|
||||
**kwargs: Any,
|
||||
) -> bytes:
|
||||
if JBIG2DEC_BINARY is None:
|
||||
raise DependencyError("jbig2dec binary is not available.")
|
||||
|
||||
with TemporaryDirectory() as tempdir:
|
||||
directory = Path(tempdir)
|
||||
paths: list[Path] = []
|
||||
|
||||
if decode_parms and "/JBIG2Globals" in decode_parms:
|
||||
jbig2_globals = decode_parms["/JBIG2Globals"]
|
||||
if not is_null_or_none(jbig2_globals) and not is_null_or_none(pointer := jbig2_globals.get_object()):
|
||||
assert pointer is not None, "mypy"
|
||||
if isinstance(pointer, StreamObject):
|
||||
path = directory.joinpath("globals.jbig2")
|
||||
path.write_bytes(pointer.get_data())
|
||||
paths.append(path)
|
||||
|
||||
path = directory.joinpath("image.jbig2")
|
||||
path.write_bytes(data)
|
||||
paths.append(path)
|
||||
|
||||
environment = os.environ.copy()
|
||||
environment["LC_ALL"] = "C"
|
||||
result = subprocess.run( # noqa: S603
|
||||
[
|
||||
JBIG2DEC_BINARY,
|
||||
"--embedded",
|
||||
"--format", "png",
|
||||
"--output", "-",
|
||||
"-M", str(JBIG2_MAX_OUTPUT_LENGTH),
|
||||
*paths
|
||||
],
|
||||
capture_output=True,
|
||||
env=environment,
|
||||
)
|
||||
if b"unrecognized option '--embedded'" in result.stderr or b"unrecognized option '-M'" in result.stderr:
|
||||
raise DependencyError("jbig2dec>=0.19 is required.")
|
||||
if b"FATAL ERROR failed to allocate image data buffer" in result.stderr:
|
||||
raise LimitReachedError(
|
||||
f"Memory limit reached while reading JBIG2 data:\n{result.stderr.decode('utf-8')}"
|
||||
)
|
||||
if result.stderr:
|
||||
for line in result.stderr.decode("utf-8").splitlines():
|
||||
logger_warning(line, __name__)
|
||||
if result.returncode != 0:
|
||||
raise PdfStreamError(f"Unable to decode JBIG2 data. Exit code: {result.returncode}")
|
||||
return result.stdout
|
||||
|
||||
@staticmethod
|
||||
def _is_binary_compatible() -> bool:
|
||||
if not JBIG2DEC_BINARY: # pragma: no cover
|
||||
return False
|
||||
result = subprocess.run( # noqa: S603
|
||||
[JBIG2DEC_BINARY, "--version"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
version = result.stdout.split(" ", maxsplit=1)[1]
|
||||
|
||||
from ._utils import Version # noqa: PLC0415
|
||||
return Version(version) >= Version("0.19")
|
||||
|
||||
|
||||
def decode_stream_data(stream: Any) -> bytes:
|
||||
"""
|
||||
Decode the stream data based on the specified filters.
|
||||
|
||||
This function decodes the stream data using the filters provided in the
|
||||
stream.
|
||||
|
||||
Args:
|
||||
stream: The input stream object containing the data and filters.
|
||||
|
||||
Returns:
|
||||
The decoded stream data.
|
||||
|
||||
Raises:
|
||||
NotImplementedError: If an unsupported filter type is encountered.
|
||||
|
||||
"""
|
||||
filters = stream.get(SA.FILTER, ())
|
||||
if isinstance(filters, IndirectObject):
|
||||
filters = cast(ArrayObject, filters.get_object())
|
||||
if not isinstance(filters, ArrayObject):
|
||||
# We have a single filter instance
|
||||
filters = (filters,)
|
||||
decode_parms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
|
||||
if not isinstance(decode_parms, (list, tuple)):
|
||||
decode_parms = (decode_parms,)
|
||||
data: bytes = stream._data
|
||||
# If there is no data to decode, we should not try to decode it.
|
||||
if not data:
|
||||
return data
|
||||
for filter_name, params in zip(filters, decode_parms):
|
||||
if isinstance(params, NullObject):
|
||||
params = {}
|
||||
if filter_name in (FT.ASCII_HEX_DECODE, FTA.AHx):
|
||||
data = ASCIIHexDecode.decode(data)
|
||||
elif filter_name in (FT.ASCII_85_DECODE, FTA.A85):
|
||||
data = ASCII85Decode.decode(data)
|
||||
elif filter_name in (FT.LZW_DECODE, FTA.LZW):
|
||||
data = LZWDecode.decode(data, params)
|
||||
elif filter_name in (FT.FLATE_DECODE, FTA.FL):
|
||||
data = FlateDecode.decode(data, params)
|
||||
elif filter_name in (FT.RUN_LENGTH_DECODE, FTA.RL):
|
||||
data = RunLengthDecode.decode(data)
|
||||
elif filter_name == FT.CCITT_FAX_DECODE:
|
||||
height = stream.get(IA.HEIGHT, ())
|
||||
data = CCITTFaxDecode.decode(data, params, height)
|
||||
elif filter_name == FT.DCT_DECODE:
|
||||
data = DCTDecode.decode(data)
|
||||
elif filter_name == FT.JPX_DECODE:
|
||||
data = JPXDecode.decode(data)
|
||||
elif filter_name == FT.JBIG2_DECODE:
|
||||
data = JBIG2Decode.decode(data, params)
|
||||
elif filter_name == "/Crypt":
|
||||
if "/Name" in params or "/Type" in params:
|
||||
raise NotImplementedError(
|
||||
"/Crypt filter with /Name or /Type not supported yet"
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported filter {filter_name}")
|
||||
return data
|
||||
115
venv/lib/python3.12/site-packages/pypdf/generic/__init__.py
Normal file
115
venv/lib/python3.12/site-packages/pypdf/generic/__init__.py
Normal file
@@ -0,0 +1,115 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""Implementation of generic PDF objects (dictionary, number, string, ...)."""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
from ..constants import OutlineFontFlag
|
||||
from ._base import (
|
||||
BooleanObject,
|
||||
ByteStringObject,
|
||||
FloatObject,
|
||||
IndirectObject,
|
||||
NameObject,
|
||||
NullObject,
|
||||
NumberObject,
|
||||
PdfObject,
|
||||
TextStringObject,
|
||||
encode_pdfdocencoding,
|
||||
is_null_or_none,
|
||||
)
|
||||
from ._data_structures import (
|
||||
ArrayObject,
|
||||
ContentStream,
|
||||
DecodedStreamObject,
|
||||
Destination,
|
||||
DictionaryObject,
|
||||
EncodedStreamObject,
|
||||
Field,
|
||||
StreamObject,
|
||||
TreeObject,
|
||||
read_object,
|
||||
)
|
||||
from ._files import EmbeddedFile
|
||||
from ._fit import Fit
|
||||
from ._link import DirectReferenceLink, NamedReferenceLink, ReferenceLink, extract_links
|
||||
from ._outline import OutlineItem
|
||||
from ._rectangle import RectangleObject
|
||||
from ._utils import (
|
||||
create_string_object,
|
||||
decode_pdfdocencoding,
|
||||
hex_to_rgb,
|
||||
read_hex_string_from_stream,
|
||||
read_string_from_stream,
|
||||
)
|
||||
from ._viewerpref import ViewerPreferences
|
||||
|
||||
PAGE_FIT = Fit.fit()
|
||||
|
||||
|
||||
__all__ = [
|
||||
"PAGE_FIT",
|
||||
"ArrayObject",
|
||||
"BooleanObject",
|
||||
"ByteStringObject",
|
||||
"ContentStream",
|
||||
"DecodedStreamObject",
|
||||
"Destination",
|
||||
"DictionaryObject",
|
||||
"DirectReferenceLink",
|
||||
"EmbeddedFile",
|
||||
"EncodedStreamObject",
|
||||
"Field",
|
||||
"Fit",
|
||||
"FloatObject",
|
||||
"IndirectObject",
|
||||
"NameObject",
|
||||
"NamedReferenceLink",
|
||||
"NullObject",
|
||||
"NumberObject",
|
||||
"OutlineFontFlag",
|
||||
"OutlineItem",
|
||||
"PdfObject",
|
||||
"RectangleObject",
|
||||
"ReferenceLink",
|
||||
"StreamObject",
|
||||
"TextStringObject",
|
||||
"TreeObject",
|
||||
"ViewerPreferences",
|
||||
# Utility functions
|
||||
"create_string_object",
|
||||
"decode_pdfdocencoding",
|
||||
"encode_pdfdocencoding",
|
||||
"extract_links",
|
||||
"hex_to_rgb",
|
||||
"is_null_or_none",
|
||||
"read_hex_string_from_stream",
|
||||
# Data structures core functions
|
||||
"read_object",
|
||||
"read_string_from_stream",
|
||||
]
|
||||
@@ -0,0 +1,547 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import IntEnum
|
||||
from typing import Any, Optional, Union, cast
|
||||
|
||||
from .._codecs import fill_from_encoding
|
||||
from .._codecs.core_fontmetrics import CORE_FONT_METRICS
|
||||
from .._font import Font
|
||||
from .._utils import logger_warning
|
||||
from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes
|
||||
from ..generic import (
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
NameObject,
|
||||
NumberObject,
|
||||
RectangleObject,
|
||||
)
|
||||
from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none
|
||||
|
||||
DEFAULT_FONT_SIZE_IN_MULTILINE = 12
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseStreamConfig:
|
||||
"""A container representing the basic layout of an appearance stream."""
|
||||
rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0)
|
||||
border_width: int = 1 # The width of the border in points
|
||||
border_style: str = BorderStyles.SOLID
|
||||
|
||||
|
||||
class BaseStreamAppearance(DecodedStreamObject):
|
||||
"""A class representing the very base of an appearance stream, that is, a rectangle and a border."""
|
||||
|
||||
def __init__(self, layout: Optional[BaseStreamConfig] = None) -> None:
|
||||
"""
|
||||
Takes the appearance stream layout as an argument.
|
||||
|
||||
Args:
|
||||
layout: The basic layout parameters.
|
||||
"""
|
||||
super().__init__()
|
||||
self._layout = layout or BaseStreamConfig()
|
||||
self[NameObject("/Type")] = NameObject("/XObject")
|
||||
self[NameObject("/Subtype")] = NameObject("/Form")
|
||||
self[NameObject("/BBox")] = RectangleObject(self._layout.rectangle)
|
||||
|
||||
|
||||
class TextAlignment(IntEnum):
|
||||
"""Defines the alignment options for text within a form field's appearance stream."""
|
||||
|
||||
LEFT = 0
|
||||
CENTER = 1
|
||||
RIGHT = 2
|
||||
|
||||
|
||||
class TextStreamAppearance(BaseStreamAppearance):
|
||||
"""
|
||||
A class representing the appearance stream for a text-based form field.
|
||||
|
||||
This class generates the content stream (the `ap_stream_data`) that dictates
|
||||
how text is rendered within a form field's bounding box. It handles properties
|
||||
like font, font size, color, multiline text, and text selection highlighting.
|
||||
"""
|
||||
|
||||
def _scale_text(
|
||||
self,
|
||||
font: Font,
|
||||
font_size: float,
|
||||
leading_factor: float,
|
||||
field_width: float,
|
||||
field_height: float,
|
||||
text: str,
|
||||
min_font_size: float,
|
||||
font_size_step: float = 0.2
|
||||
) -> tuple[list[tuple[float, str]], float]:
|
||||
"""
|
||||
Takes a piece of text and scales it to field_width or field_height, given font_name
|
||||
and font_size. Wraps text where necessary.
|
||||
|
||||
Args:
|
||||
font: The font to be used.
|
||||
font_size: The font size in points.
|
||||
leading_factor: The line distance.
|
||||
field_width: The width of the field in which to fit the text.
|
||||
field_height: The height of the field in which to fit the text.
|
||||
text: The text to fit with the field.
|
||||
min_font_size: The minimum font size at which to scale the text.
|
||||
font_size_step: The amount by which to decrement font size per step while scaling.
|
||||
|
||||
Returns:
|
||||
The text in the form of list of tuples, each tuple containing the length of a line
|
||||
and its contents, and the font_size for these lines and lengths.
|
||||
"""
|
||||
orig_text = text
|
||||
paragraphs = text.replace("\n", "\r").split("\r")
|
||||
wrapped_lines = []
|
||||
current_line_words: list[str] = []
|
||||
current_line_width: float = 0
|
||||
space_width = font.space_width * font_size / 1000
|
||||
for paragraph in paragraphs:
|
||||
if not paragraph.strip():
|
||||
wrapped_lines.append((0.0, ""))
|
||||
continue
|
||||
words = paragraph.split(" ")
|
||||
for i, word in enumerate(words):
|
||||
word_width = font.text_width(word) * font_size / 1000
|
||||
test_width = current_line_width + word_width + (space_width if i else 0)
|
||||
if test_width > field_width and current_line_words:
|
||||
wrapped_lines.append((current_line_width, " ".join(current_line_words)))
|
||||
current_line_words = [word]
|
||||
current_line_width = word_width
|
||||
elif not current_line_words and word_width > field_width:
|
||||
wrapped_lines.append((word_width, word))
|
||||
current_line_words = []
|
||||
current_line_width = 0
|
||||
else:
|
||||
if current_line_words:
|
||||
current_line_width += space_width
|
||||
current_line_words.append(word)
|
||||
current_line_width += word_width
|
||||
if current_line_words:
|
||||
wrapped_lines.append((current_line_width, " ".join(current_line_words)))
|
||||
current_line_words = []
|
||||
current_line_width = 0
|
||||
# Estimate total height.
|
||||
estimated_total_height = font_size + (len(wrapped_lines) - 1) * leading_factor * font_size
|
||||
if estimated_total_height > field_height:
|
||||
# Text overflows height; Retry with smaller font size.
|
||||
new_font_size = font_size - font_size_step
|
||||
if new_font_size >= min_font_size:
|
||||
return self._scale_text(
|
||||
font,
|
||||
new_font_size,
|
||||
leading_factor,
|
||||
field_width,
|
||||
field_height,
|
||||
orig_text,
|
||||
min_font_size,
|
||||
font_size_step
|
||||
)
|
||||
return wrapped_lines, round(font_size, 1)
|
||||
|
||||
def _generate_appearance_stream_data(
|
||||
self,
|
||||
text: str,
|
||||
selection: Union[list[str], None],
|
||||
font: Font,
|
||||
font_glyph_byte_map: Optional[dict[str, bytes]] = None,
|
||||
font_name: str = "/Helv",
|
||||
font_size: float = 0.0,
|
||||
font_color: str = "0 g",
|
||||
is_multiline: bool = False,
|
||||
alignment: TextAlignment = TextAlignment.LEFT,
|
||||
is_comb: bool = False,
|
||||
max_length: Optional[int] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Generates the raw bytes of the PDF appearance stream for a text field.
|
||||
|
||||
This private method assembles the PDF content stream operators to draw
|
||||
the provided text within the specified rectangle. It handles text positioning,
|
||||
font application, color, and special formatting like selected text.
|
||||
|
||||
Args:
|
||||
text: The text to be rendered in the form field.
|
||||
selection: An optional list of strings that should be highlighted as selected.
|
||||
font: The font to use.
|
||||
font_glyph_byte_map: An optional dictionary mapping characters to their
|
||||
byte representation for glyph encoding.
|
||||
font_name: The name of the font resource to use (e.g., "/Helv").
|
||||
font_size: The font size. If 0, it is automatically calculated
|
||||
based on whether the field is multiline or not.
|
||||
font_color: The color to apply to the font, represented as a PDF
|
||||
graphics state string (e.g., "0 g" for black).
|
||||
is_multiline: A boolean indicating if the text field is multiline.
|
||||
alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
|
||||
is_comb: Boolean that designates fixed-length fields, where every character
|
||||
fills one "cell", such as in a postcode.
|
||||
max_length: Used if is_comb is set. The maximum number of characters for a fixed-
|
||||
length field.
|
||||
|
||||
Returns:
|
||||
A byte string containing the PDF content stream data.
|
||||
|
||||
"""
|
||||
rectangle = self._layout.rectangle
|
||||
font_glyph_byte_map = font_glyph_byte_map or {}
|
||||
if isinstance(rectangle, tuple):
|
||||
rectangle = RectangleObject(rectangle)
|
||||
leading_factor = (font.font_descriptor.bbox[3] - font.font_descriptor.bbox[1]) / 1000.0
|
||||
|
||||
# Set margins based on border width and style, but never less than 1 point
|
||||
factor = 2 if self._layout.border_style in {"/B", "/I"} else 1
|
||||
margin = max(self._layout.border_width * factor, 1)
|
||||
field_height = rectangle.height - 2 * margin
|
||||
field_width = rectangle.width - 4 * margin
|
||||
|
||||
# If font_size is 0, apply the logic for multiline or large-as-possible font
|
||||
if font_size == 0:
|
||||
min_font_size = 4.0 # The mininum font size
|
||||
if selection: # Don't wrap text when dealing with a /Ch field, in order to prevent problems
|
||||
is_multiline = False # with matching "selection" with "line" later on.
|
||||
if is_multiline:
|
||||
font_size = DEFAULT_FONT_SIZE_IN_MULTILINE
|
||||
lines, font_size = self._scale_text(
|
||||
font,
|
||||
font_size,
|
||||
leading_factor,
|
||||
field_width,
|
||||
field_height,
|
||||
text,
|
||||
min_font_size
|
||||
)
|
||||
else:
|
||||
max_vertical_size = field_height / leading_factor
|
||||
text_width_unscaled = font.text_width(text) / 1000
|
||||
max_horizontal_size = field_width / (text_width_unscaled or 1)
|
||||
font_size = round(max(min(max_vertical_size, max_horizontal_size), min_font_size), 1)
|
||||
lines = [(text_width_unscaled * font_size, text)]
|
||||
elif is_comb:
|
||||
if max_length and len(text) > max_length:
|
||||
logger_warning (
|
||||
f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.",
|
||||
__name__
|
||||
)
|
||||
# We act as if each character is one line, because we draw it separately later on
|
||||
lines = [(
|
||||
font.text_width(char) * font_size / 1000,
|
||||
char
|
||||
) for index, char in enumerate(text) if index < (max_length or len(text))]
|
||||
else:
|
||||
lines = [(
|
||||
font.text_width(line) * font_size / 1000,
|
||||
line
|
||||
) for line in text.replace("\n", "\r").split("\r")]
|
||||
|
||||
# Set the vertical offset
|
||||
if is_multiline:
|
||||
y_offset = rectangle.height + margin - font.font_descriptor.bbox[3] * font_size / 1000.0
|
||||
else:
|
||||
y_offset = margin + ((field_height - font.font_descriptor.ascent * font_size / 1000) / 2)
|
||||
default_appearance = f"{font_name} {font_size} Tf {font_color}"
|
||||
|
||||
ap_stream = (
|
||||
f"q\n/Tx BMC \nq\n{2 * margin} {margin} {field_width} {field_height} "
|
||||
f"re\nW\nBT\n{default_appearance}\n"
|
||||
).encode()
|
||||
current_x_pos: float = 0 # Initial virtual position within the text object.
|
||||
|
||||
for line_number, (line_width, line) in enumerate(lines):
|
||||
if selection and line in selection:
|
||||
# Might be improved, but cannot find how to get fill working => replaced with lined box
|
||||
ap_stream += (
|
||||
f"1 {y_offset - (line_number * font_size * leading_factor) - 1} "
|
||||
f"{rectangle.width - 2} {font_size + 2} re\n"
|
||||
f"0.5 0.5 0.5 rg s\n{default_appearance}\n"
|
||||
).encode()
|
||||
|
||||
# Calculate the desired absolute starting X for the current line
|
||||
desired_abs_x_start: float = 0
|
||||
if is_comb and max_length:
|
||||
# Calculate the width of a cell for one character
|
||||
cell_width = rectangle.width / max_length
|
||||
# Space from the left edge of the cell to the character's baseline start
|
||||
# line_width here is the *actual* character width in points for the single character 'line'
|
||||
centering_offset_in_cell = (cell_width - line_width) / 2
|
||||
# Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset
|
||||
desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell
|
||||
elif alignment == TextAlignment.RIGHT:
|
||||
desired_abs_x_start = rectangle.width - margin * 2 - line_width
|
||||
elif alignment == TextAlignment.CENTER:
|
||||
desired_abs_x_start = (rectangle.width - line_width) / 2
|
||||
else: # Left aligned; default
|
||||
desired_abs_x_start = margin * 2
|
||||
# Calculate x_rel_offset: how much to move from the current_x_pos
|
||||
# to reach the desired_abs_x_start.
|
||||
x_rel_offset = desired_abs_x_start - current_x_pos
|
||||
|
||||
# Y-offset:
|
||||
y_rel_offset: float = 0
|
||||
if line_number == 0:
|
||||
y_rel_offset = y_offset # Initial vertical position
|
||||
elif is_comb:
|
||||
y_rel_offset = 0.0 # DO NOT move vertically for subsequent characters
|
||||
else:
|
||||
y_rel_offset = - font_size * leading_factor # Move down by line height
|
||||
|
||||
# Td is a relative translation (Tx and Ty).
|
||||
# It updates the current text position.
|
||||
ap_stream += f"{x_rel_offset} {y_rel_offset} Td\n".encode()
|
||||
# Update current_x_pos based on the Td operation for the next iteration.
|
||||
# This is the X position where the *current line* will start.
|
||||
current_x_pos = desired_abs_x_start
|
||||
|
||||
encoded_line: list[bytes] = [
|
||||
font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line
|
||||
]
|
||||
if any(len(c) >= 2 for c in encoded_line):
|
||||
ap_stream += b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n"
|
||||
else:
|
||||
ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n"
|
||||
ap_stream += b"ET\nQ\nEMC\nQ\n"
|
||||
return ap_stream
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
layout: Optional[BaseStreamConfig] = None,
|
||||
text: str = "",
|
||||
selection: Optional[list[str]] = None,
|
||||
font_resource: Optional[DictionaryObject] = None,
|
||||
font_name: str = "/Helv",
|
||||
font_size: float = 0.0,
|
||||
font_color: str = "0 g",
|
||||
is_multiline: bool = False,
|
||||
alignment: TextAlignment = TextAlignment.LEFT,
|
||||
is_comb: bool = False,
|
||||
max_length: Optional[int] = None
|
||||
) -> None:
|
||||
"""
|
||||
Initializes a TextStreamAppearance object.
|
||||
|
||||
This constructor creates a new PDF stream object configured as an XObject
|
||||
of subtype Form. It uses the `_appearance_stream_data` method to generate
|
||||
the content for the stream.
|
||||
|
||||
Args:
|
||||
layout: The basic layout parameters.
|
||||
text: The text to be rendered in the form field.
|
||||
selection: An optional list of strings that should be highlighted as selected.
|
||||
font_resource: An optional variable that represents a PDF font dictionary.
|
||||
font_name: The name of the font resource, e.g., "/Helv".
|
||||
font_size: The font size. If 0, it's auto-calculated.
|
||||
font_color: The font color string.
|
||||
is_multiline: A boolean indicating if the text field is multiline.
|
||||
alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
|
||||
is_comb: Boolean that designates fixed-length fields, where every character
|
||||
fills one "cell", such as in a postcode.
|
||||
max_length: Used if is_comb is set. The maximum number of characters for a fixed-
|
||||
length field.
|
||||
|
||||
"""
|
||||
super().__init__(layout)
|
||||
|
||||
# If a font resource was added, get the font character map
|
||||
if font_resource:
|
||||
font_resource = cast(DictionaryObject, font_resource.get_object())
|
||||
font = Font.from_font_resource(font_resource)
|
||||
else:
|
||||
logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
|
||||
font_name = "/Helv"
|
||||
font_resource = DictionaryObject({
|
||||
NameObject("/Subtype"): NameObject("/Type1"),
|
||||
NameObject("/Name"): NameObject("/Helv"),
|
||||
NameObject("/Type"): NameObject("/Font"),
|
||||
NameObject("/BaseFont"): NameObject("/Helvetica"),
|
||||
NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
|
||||
})
|
||||
font_descriptor = CORE_FONT_METRICS["Helvetica"]
|
||||
font_descriptor.character_widths["default"] = 2 * font_descriptor.character_widths[" "]
|
||||
font = Font(
|
||||
name="Helvetica",
|
||||
character_map={},
|
||||
encoding=dict(zip(range(256), fill_from_encoding("cp1252"))), # WinAnsiEncoding
|
||||
sub_type="Type1",
|
||||
font_descriptor = font_descriptor,
|
||||
character_widths = font_descriptor.character_widths
|
||||
)
|
||||
|
||||
font_glyph_byte_map: dict[str, bytes]
|
||||
if isinstance(font.encoding, str):
|
||||
font_glyph_byte_map = {
|
||||
v: k.encode(font.encoding) for k, v in font.character_map.items()
|
||||
}
|
||||
else:
|
||||
font_glyph_byte_map = {v: bytes((k,)) for k, v in font.encoding.items()}
|
||||
font_encoding_rev = {v: bytes((k,)) for k, v in font.encoding.items()}
|
||||
for key, value in font.character_map.items():
|
||||
font_glyph_byte_map[value] = font_encoding_rev.get(key, key)
|
||||
|
||||
ap_stream_data = self._generate_appearance_stream_data(
|
||||
text,
|
||||
selection,
|
||||
font,
|
||||
font_glyph_byte_map,
|
||||
font_name=font_name,
|
||||
font_size=font_size,
|
||||
font_color=font_color,
|
||||
is_multiline=is_multiline,
|
||||
alignment=alignment,
|
||||
is_comb=is_comb,
|
||||
max_length=max_length
|
||||
)
|
||||
|
||||
self.set_data(ByteStringObject(ap_stream_data))
|
||||
self[NameObject("/Length")] = NumberObject(len(ap_stream_data))
|
||||
# Update Resources with font information
|
||||
self[NameObject("/Resources")] = DictionaryObject({
|
||||
NameObject("/Font"): DictionaryObject({
|
||||
NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource)
|
||||
})
|
||||
})
|
||||
|
||||
@classmethod
|
||||
def from_text_annotation(
|
||||
cls,
|
||||
acro_form: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM])
|
||||
field: DictionaryObject,
|
||||
annotation: DictionaryObject,
|
||||
user_font_name: str = "",
|
||||
user_font_size: float = -1,
|
||||
) -> "TextStreamAppearance":
|
||||
"""
|
||||
Creates a TextStreamAppearance object from a text field annotation.
|
||||
|
||||
This class method is a factory for creating a `TextStreamAppearance`
|
||||
instance by extracting all necessary information (bounding box, font,
|
||||
text content, etc.) from the PDF field and annotation dictionaries.
|
||||
It respects inheritance for properties like default appearance (`/DA`).
|
||||
|
||||
Args:
|
||||
acro_form: The root AcroForm dictionary from the PDF catalog.
|
||||
field: The field dictionary object.
|
||||
annotation: The widget annotation dictionary object associated with the field.
|
||||
user_font_name: An optional user-provided font name to override the
|
||||
default. Defaults to an empty string.
|
||||
user_font_size: An optional user-provided font size to override the
|
||||
default. A value of -1 indicates no override.
|
||||
|
||||
Returns:
|
||||
A new `TextStreamAppearance` instance configured for the given field.
|
||||
|
||||
"""
|
||||
# Calculate rectangle dimensions
|
||||
_rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect])
|
||||
rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1])))
|
||||
|
||||
# Get default appearance dictionary from annotation
|
||||
default_appearance = annotation.get_inherited(
|
||||
AnnotationDictionaryAttributes.DA,
|
||||
acro_form.get(AnnotationDictionaryAttributes.DA, None),
|
||||
)
|
||||
if not default_appearance:
|
||||
# Create a default appearance if none was found in the annotation
|
||||
default_appearance = TextStringObject("/Helv 0 Tf 0 g")
|
||||
else:
|
||||
default_appearance = default_appearance.get_object()
|
||||
|
||||
# Derive font name, size and color from the default appearance. Also set
|
||||
# user-provided font name and font size in the default appearance, if given.
|
||||
# For a font name, this presumes that we can find an associated font resource
|
||||
# dictionary. Uses the variable font_properties as an intermediate.
|
||||
# As per the PDF spec:
|
||||
# "At a minimum, the string [that is, default_appearance] shall include a Tf (text
|
||||
# font) operator along with its two operands, font and size" (Section 12.7.4.3
|
||||
# "Variable text" of the PDF 2.0 specification).
|
||||
font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop]
|
||||
font_name = font_properties.pop(font_properties.index("Tf") - 2)
|
||||
font_size = float(font_properties.pop(font_properties.index("Tf") - 1))
|
||||
font_properties.remove("Tf")
|
||||
font_color = " ".join(font_properties)
|
||||
# Determine the font name to use, prioritizing the user's input
|
||||
if user_font_name:
|
||||
font_name = user_font_name
|
||||
# Determine the font size to use, prioritizing the user's input
|
||||
if user_font_size > 0:
|
||||
font_size = user_font_size
|
||||
|
||||
# Try to find a resource dictionary for the font
|
||||
document_resources: Any = cast(
|
||||
DictionaryObject,
|
||||
cast(
|
||||
DictionaryObject,
|
||||
annotation.get_inherited(
|
||||
"/DR",
|
||||
acro_form.get("/DR", DictionaryObject()),
|
||||
),
|
||||
).get_object(),
|
||||
)
|
||||
document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
|
||||
# CORE_FONT_METRICS is the dict with Standard font metrics
|
||||
if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
|
||||
# ...or AcroForm dictionary
|
||||
document_resources = cast(
|
||||
dict[Any, Any],
|
||||
acro_form.get("/DR", {}),
|
||||
)
|
||||
document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
|
||||
font_resource = document_font_resources.get(font_name, None)
|
||||
if not is_null_or_none(font_resource):
|
||||
font_resource = cast(DictionaryObject, font_resource.get_object())
|
||||
|
||||
# Retrieve field text and selected values
|
||||
field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
|
||||
if (
|
||||
field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
|
||||
field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
|
||||
):
|
||||
text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
|
||||
selection = field.get("/V", [])
|
||||
if not isinstance(selection, list):
|
||||
selection = [selection]
|
||||
else: # /Tx
|
||||
text = field.get("/V", "")
|
||||
selection = []
|
||||
|
||||
# Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
|
||||
text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
|
||||
|
||||
# Retrieve formatting information
|
||||
is_comb = False
|
||||
max_length = None
|
||||
if field_flags & FieldDictionaryAttributes.FfBits.Comb:
|
||||
is_comb = True
|
||||
max_length = annotation.get("/MaxLen")
|
||||
is_multiline = False
|
||||
if field_flags & FieldDictionaryAttributes.FfBits.Multiline:
|
||||
is_multiline = True
|
||||
alignment = field.get("/Q", TextAlignment.LEFT)
|
||||
border_width = 1
|
||||
border_style = BorderStyles.SOLID
|
||||
if "/BS" in field:
|
||||
border_width = cast(DictionaryObject, field["/BS"]).get("/W", border_width)
|
||||
border_style = cast(DictionaryObject, field["/BS"]).get("/S", border_style)
|
||||
|
||||
# Create the TextStreamAppearance instance
|
||||
layout = BaseStreamConfig(rectangle=rectangle, border_width=border_width, border_style=border_style)
|
||||
new_appearance_stream = cls(
|
||||
layout,
|
||||
text,
|
||||
selection,
|
||||
font_resource,
|
||||
font_name=font_name,
|
||||
font_size=font_size,
|
||||
font_color=font_color,
|
||||
is_multiline=is_multiline,
|
||||
alignment=alignment,
|
||||
is_comb=is_comb,
|
||||
max_length=max_length
|
||||
)
|
||||
if AnnotationDictionaryAttributes.AP in annotation:
|
||||
for key, value in (
|
||||
cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
|
||||
):
|
||||
if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
|
||||
new_appearance_stream[key] = value
|
||||
|
||||
return new_appearance_stream
|
||||
937
venv/lib/python3.12/site-packages/pypdf/generic/_base.py
Normal file
937
venv/lib/python3.12/site-packages/pypdf/generic/_base.py
Normal file
@@ -0,0 +1,937 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
import binascii
|
||||
import codecs
|
||||
import hashlib
|
||||
import re
|
||||
import sys
|
||||
from binascii import unhexlify
|
||||
from collections.abc import Sequence
|
||||
from math import log10
|
||||
from struct import iter_unpack
|
||||
from typing import Any, Callable, ClassVar, Optional, Union, cast
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
from typing import TypeGuard
|
||||
else:
|
||||
from typing_extensions import TypeGuard # PEP 647
|
||||
|
||||
from .._codecs import _pdfdoc_encoding_rev
|
||||
from .._protocols import PdfObjectProtocol, PdfWriterProtocol
|
||||
from .._utils import (
|
||||
StreamType,
|
||||
classproperty,
|
||||
deprecation_no_replacement,
|
||||
deprecation_with_replacement,
|
||||
logger_warning,
|
||||
read_non_whitespace,
|
||||
read_until_regex,
|
||||
)
|
||||
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
|
||||
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
|
||||
class PdfObject(PdfObjectProtocol):
|
||||
# function for calculating a hash value
|
||||
hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
|
||||
indirect_reference: Optional["IndirectObject"]
|
||||
|
||||
def hash_bin(self) -> int:
|
||||
"""
|
||||
Used to detect modified object.
|
||||
|
||||
Returns:
|
||||
Hash considering type and value.
|
||||
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
f"{self.__class__.__name__} does not implement .hash_bin() so far"
|
||||
)
|
||||
|
||||
def hash_value_data(self) -> bytes:
|
||||
return f"{self}".encode()
|
||||
|
||||
def hash_value(self) -> bytes:
|
||||
return (
|
||||
f"{self.__class__.__name__}:"
|
||||
f"{self.hash_func(self.hash_value_data()).hexdigest()}"
|
||||
).encode()
|
||||
|
||||
def replicate(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
) -> "PdfObject":
|
||||
"""
|
||||
Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter)
|
||||
without ensuring links. This is used in clone_document_from_root with incremental = True.
|
||||
|
||||
Args:
|
||||
pdf_dest: Target to clone to.
|
||||
|
||||
Returns:
|
||||
The cloned PdfObject
|
||||
|
||||
"""
|
||||
return self.clone(pdf_dest)
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: PdfWriterProtocol,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Optional[Sequence[Union[str, int]]] = (),
|
||||
) -> "PdfObject":
|
||||
"""
|
||||
Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).
|
||||
|
||||
By default, this method will call ``_reference_clone`` (see ``_reference``).
|
||||
|
||||
|
||||
Args:
|
||||
pdf_dest: Target to clone to.
|
||||
force_duplicate: By default, if the object has already been cloned and referenced,
|
||||
the copy will be returned; when ``True``, a new copy will be created.
|
||||
(Default value = ``False``)
|
||||
ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
|
||||
during cloning (applies to children duplication as well). If fields are to be
|
||||
considered for a limited number of levels, you have to add it as integer, for
|
||||
example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
|
||||
level only but ``"/TOTO"`` on all levels.
|
||||
|
||||
Returns:
|
||||
The cloned PdfObject
|
||||
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
f"{self.__class__.__name__} does not implement .clone so far"
|
||||
)
|
||||
|
||||
def _reference_clone(
|
||||
self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
|
||||
) -> PdfObjectProtocol:
|
||||
"""
|
||||
Reference the object within the _objects of pdf_dest only if
|
||||
indirect_reference attribute exists (which means the objects was
|
||||
already identified in xref/xobjstm) if object has been already
|
||||
referenced do nothing.
|
||||
|
||||
Args:
|
||||
clone:
|
||||
pdf_dest:
|
||||
|
||||
Returns:
|
||||
The clone
|
||||
|
||||
"""
|
||||
try:
|
||||
if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
|
||||
return clone
|
||||
except Exception:
|
||||
pass
|
||||
# if hasattr(clone, "indirect_reference"):
|
||||
try:
|
||||
ind = self.indirect_reference
|
||||
except AttributeError:
|
||||
return clone
|
||||
if (
|
||||
pdf_dest.incremental
|
||||
and ind is not None
|
||||
and ind.pdf == pdf_dest._reader
|
||||
and ind.idnum <= len(pdf_dest._objects)
|
||||
):
|
||||
i = ind.idnum
|
||||
else:
|
||||
i = len(pdf_dest._objects) + 1
|
||||
if ind is not None:
|
||||
if id(ind.pdf) not in pdf_dest._id_translated:
|
||||
pdf_dest._id_translated[id(ind.pdf)] = {}
|
||||
pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf # type: ignore[index]
|
||||
if (
|
||||
not force_duplicate
|
||||
and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
|
||||
):
|
||||
obj = pdf_dest.get_object(
|
||||
pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
|
||||
)
|
||||
assert obj is not None
|
||||
return obj
|
||||
pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
|
||||
try:
|
||||
pdf_dest._objects[i - 1] = clone
|
||||
except IndexError:
|
||||
pdf_dest._objects.append(clone)
|
||||
i = len(pdf_dest._objects)
|
||||
clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
|
||||
return clone
|
||||
|
||||
def get_object(self) -> Optional["PdfObject"]:
    """Resolve indirect references; a direct object resolves to itself."""
    return self
|
||||
|
||||
def write_to_stream(
    self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
) -> None:
    # Abstract: each concrete PdfObject subclass provides its own
    # serialization to `stream`.
    raise NotImplementedError
|
||||
|
||||
|
||||
class NullObject(PdfObject):
    """The PDF `null` object."""

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        duplicate = NullObject()
        return cast(
            "NullObject", self._reference_clone(duplicate, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # All NullObjects are interchangeable; only the type matters.
        return hash((self.__class__,))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        if stream.read(4) != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        # Every NullObject compares equal to every other NullObject.
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()
|
||||
|
||||
|
||||
class BooleanObject(PdfObject):
    """The PDF `true`/`false` object."""

    def __init__(self, value: Any) -> None:
        # Truthiness of `value` determines how the object serializes.
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        duplicate = BooleanObject(self.value)
        return cast(
            "BooleanObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        # Equal to another BooleanObject with the same value, or to a
        # plain bool with the same value; unequal to everything else.
        if isinstance(o, BooleanObject):
            return self.value == o.value
        return self.value == o if isinstance(o, bool) else False

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        return "True" if self.value else "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"true" if self.value else b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        if word == b"fals":
            stream.read(1)  # consume the trailing byte of "false"
            return BooleanObject(False)
        raise PdfReadError("Could not read Boolean object")
|
||||
|
||||
|
||||
class IndirectObject(PdfObject):
    """
    A reference to another PDF object, identified by (idnum, generation)
    within the owning reader/writer ``pdf``.

    Most dunder methods delegate to the pointed-to object so that an
    IndirectObject can be used transparently in many places.
    """

    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        # Re-bind the reference to another document without cloning the target.
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """Clone object into pdf_dest."""
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # case observed : a pointed object can not be found
            if obj is None:
                # substitute a NullObject so cloning can proceed
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        # The reference triple is immutable in practice; no deep copy of pdf.
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        o = self.get_object()
        # the check is done here to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in object: look in pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from pointed Object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # Fixed return annotation (was erroneously `-> str`).
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        return (
            other is not None
            and isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        # Parse "<idnum> <generation> R" byte by byte.
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
|
||||
|
||||
|
||||
FLOAT_WRITE_PRECISION = 8  # significant digits written for floats (keep >= 5); module-level so users can adjust
|
||||
|
||||
|
||||
class FloatObject(float, PdfObject):
    """A PDF real number; invalid input falls back to 0.0 with a warning."""

    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs)
            # fallback to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        # Fixed: call as_numeric() instead of hashing the bound method
        # object itself; now consistent with NumberObject.hash_bin.
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        # Render with FLOAT_WRITE_PRECISION significant digits, then strip
        # trailing zeros and a trailing decimal point.
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))
|
||||
|
||||
|
||||
class NumberObject(int, PdfObject):
    """A PDF integer; invalid input falls back to 0 with a warning."""

    # NOTE: inside the character class, "+-." is a range (0x2B-0x2E) and so
    # also accepts "," — kept as-is for backward compatibility.
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        # Simplified from int(repr(self).encode("utf8")): the repr/encode
        # round-trip produced the same integer value for every input.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)
|
||||
|
||||
|
||||
class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        duplicate = ByteStringObject(bytes(self))
        return cast(
            "ByteStringObject",
            self._reference_clone(duplicate, pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, bytes(self)))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        # Serialize as a hex string: <AABBCC...>
        stream.write(b"<")
        stream.write(binascii.hexlify(self))
        stream.write(b">")

    def __str__(self) -> str:
        # Try UTF-16 first (BOM-aware), then the configured fallbacks.
        for enc in ("utf-16", *list(NameObject.CHARSETS)):
            try:
                return self.decode(enc)
            except UnicodeDecodeError:
                pass
        raise PdfReadError("Cannot decode ByteStringObject.")
|
||||
|
||||
|
||||
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match the
    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding
    to occur.
    """

    # True when the string round-trips through PDFDocEncoding.
    autodetect_pdfdocencoding: bool
    # True when the string must be serialized as UTF-16.
    autodetect_utf16: bool
    # BOM to prepend on UTF-16 serialization (b"" when not applicable).
    utf16_bom: bytes
    # Raw bytes the object was constructed from, when input was bytes.
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            # "charmap" maps each byte 1:1 to a code point, so this never fails.
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for inputs being `bytes`.
            # If this is UTF-16 data according to the BOM (first two characters),
            # perform special handling. All other cases should not need any special conversion
            # due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                # Malformed UTF-16: keep the decodable prefix and warn.
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            # No BOM: prefer PDFDocEncoding when the text fits, else UTF-16BE.
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        # Copy the autodetection state so the clone serializes identically.
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                # Force the UTF-16 path below.
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        # Serialize as a literal string, octal-escaping every byte that is
        # not alphanumeric or a space.
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")
|
||||
|
||||
|
||||
class NameObject(str, PdfObject):  # noqa: SLOT000
    """
    A PDF name object (e.g. ``/Type``), stored as the decoded string
    including the leading slash.
    """

    # Characters that terminate a name token when reading from a stream.
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    # Maps characters that must be #XX-escaped when serializing:
    # the PDF delimiter/special characters plus all control chars (< 33).
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        """Serialize the name, #XX-escaping special and non-ASCII characters."""
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                # Non-ASCII: escape each UTF-8 byte individually.
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen). The _sanitize method replaces
        spaces and any non-alphanumeric/non-underscore/non-hyphen with
        underscores.

        Returns:
            NameObject with sanitized name.
        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        # Deprecated misspelled alias of `prefix`.
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        """Replace every valid #XX escape in `sin` with the raw byte."""
        i = sin.find(b"#", 0)
        while i >= 0:
            try:
                sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :]
                i = sin.find(b"#", i + 1)
            except ValueError:
                # if the 2 characters after # can not be converted to hex
                # we change nothing and carry on
                i = i + 1
        return sin

    # Fallback encodings tried (in order) when decoding a name's bytes.
    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            # Non-strict readers fall back to a lossless 1:1 byte decode.
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e
|
||||
|
||||
|
||||
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """Encode `unicode_string` with PDFDocEncoding; raise UnicodeEncodeError if any character has no mapping."""
    try:
        return bytes(_pdfdoc_encoding_rev[character] for character in unicode_string)
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )
|
||||
|
||||
|
||||
def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Returns:
        True if x is None or NullObject.

    """
    if x is None:
        return True
    if not isinstance(x, PdfObject):
        return False
    resolved = x.get_object()
    return resolved is None or isinstance(resolved, NullObject)
|
||||
1757
venv/lib/python3.12/site-packages/pypdf/generic/_data_structures.py
Normal file
1757
venv/lib/python3.12/site-packages/pypdf/generic/_data_structures.py
Normal file
File diff suppressed because it is too large
Load Diff
401
venv/lib/python3.12/site-packages/pypdf/generic/_files.py
Normal file
401
venv/lib/python3.12/site-packages/pypdf/generic/_files.py
Normal file
@@ -0,0 +1,401 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import bisect
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING, cast
|
||||
|
||||
from pypdf._utils import format_iso8824_date, parse_iso8824_date
|
||||
from pypdf.constants import CatalogAttributes as CA
|
||||
from pypdf.constants import FileSpecificationDictionaryEntries
|
||||
from pypdf.constants import PageAttributes as PG
|
||||
from pypdf.errors import PdfReadError, PyPdfError
|
||||
from pypdf.generic import (
|
||||
ArrayObject,
|
||||
ByteStringObject,
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
NameObject,
|
||||
NullObject,
|
||||
NumberObject,
|
||||
StreamObject,
|
||||
TextStringObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from collections.abc import Generator
|
||||
|
||||
from pypdf._writer import PdfWriter
|
||||
|
||||
|
||||
class EmbeddedFile:
|
||||
"""
|
||||
Container holding the information on an embedded file.
|
||||
|
||||
Attributes are evaluated lazily if possible.
|
||||
|
||||
Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
|
||||
"""
|
||||
def __init__(self, name: str, pdf_object: DictionaryObject, parent: ArrayObject | None = None) -> None:
    """
    Args:
        name: The (primary) name as provided in the name tree.
        pdf_object: The corresponding PDF object to allow retrieving further data.
        parent: The parent list.
    """
    # Primary name; exposed read-only through the `name` property.
    self._name = name
    # The /Filespec dictionary backing this embedded file.
    self.pdf_object = pdf_object
    # The names array this entry lives in (when known).
    self._parent = parent
|
||||
|
||||
@property
def name(self) -> str:
    """
    The (primary) name of the embedded file as provided in the name tree.

    Alternative names from the /UF or /F keys are available through
    ``alternative_name``.
    """
    return self._name
|
||||
|
||||
@classmethod
def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile:
    """
    Create a new embedded file and add it to the PdfWriter.

    Args:
        writer: The PdfWriter instance to add the embedded file to.
        name: The filename to display.
        content: The data in the file.

    Returns:
        EmbeddedFile instance for the newly created embedded file.
    """
    # Convert string content to bytes if needed.
    # NOTE(review): latin-1 restricts str content to code points <= 0xFF;
    # wider text raises UnicodeEncodeError here — presumably intentional.
    if isinstance(content, str):
        content = content.encode("latin-1")

    # Create the file entry (the actual embedded file stream)
    file_entry = DecodedStreamObject()
    file_entry.set_data(content)
    file_entry.update({NameObject(PG.TYPE): NameObject("/EmbeddedFile")})

    # Create the /EF entry
    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): writer._add_object(file_entry)})

    # Create the filespec dictionary
    from pypdf.generic import create_string_object  # noqa: PLC0415
    filespec = DictionaryObject()
    filespec_reference = writer._add_object(filespec)
    name_object = cast(TextStringObject, create_string_object(name))
    filespec.update(
        {
            NameObject(PG.TYPE): NameObject("/Filespec"),
            NameObject(FileSpecificationDictionaryEntries.F): name_object,
            NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
        }
    )

    # Add the name and filespec to the names array.
    # We use the inverse order for insertion, as this allows us to re-use the
    # same index.
    names_array = cls._get_names_array(writer)
    insertion_index = cls._get_insertion_index(names_array, name_object)
    names_array.insert(insertion_index, filespec_reference)
    names_array.insert(insertion_index, name_object)

    # Return an EmbeddedFile instance
    return cls(name=name, pdf_object=filespec, parent=names_array)
|
||||
|
||||
@classmethod
def _get_names_array(cls, writer: PdfWriter) -> ArrayObject:
    """Get the names array for embedded files, possibly creating and flattening it."""
    if CA.NAMES not in writer.root_object:
        # Add the /Names entry to the catalog.
        writer.root_object[NameObject(CA.NAMES)] = writer._add_object(DictionaryObject())

    names_dict = cast(DictionaryObject, writer.root_object[CA.NAMES])
    if "/EmbeddedFiles" not in names_dict:
        # We do not yet have an entry for embedded files. Create and return it.
        names = ArrayObject()
        embedded_files_names_dictionary = DictionaryObject(
            {NameObject(CA.NAMES): names}
        )
        names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
        return names

    # We have an existing embedded files entry.
    embedded_files_names_tree = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
    if "/Names" in embedded_files_names_tree:
        # Simple case: We already have a flat list.
        return cast(ArrayObject, embedded_files_names_tree[NameObject(CA.NAMES)])
    if "/Kids" not in embedded_files_names_tree:
        # Invalid case: This is no name tree.
        raise PdfReadError("Got neither Names nor Kids in embedded files tree.")

    # Complex case: Convert a /Kids-based name tree to a /Names-based one.
    # /Name-based ones are much easier to handle and allow us to simplify the
    # actual insertion logic by only having to consider one case.
    names = ArrayObject()
    kids = cast(ArrayObject, embedded_files_names_tree["/Kids"].get_object())
    embedded_files_names_dictionary = DictionaryObject(
        {NameObject(CA.NAMES): names}
    )
    names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
    for kid in kids:
        # Write the flattened file entries. As we do not change the actual files,
        # this should not have any impact on references to them.
        # There might be further (nested) kids here.
        # Wait for an example before evaluating an implementation.
        for name in kid.get_object().get("/Names", []):
            names.append(name)
    return names
|
||||
|
||||
@classmethod
def _get_insertion_index(cls, names_array: ArrayObject, name: str) -> int:
    """
    Return the index in the flat [name, filespec, ...] array at which a new
    entry for `name` should be inserted to keep the keys sorted (by their
    UTF-8 byte representation), placing duplicates after existing entries.

    Args:
        names_array: Flat name-tree array of alternating name/filespec entries.
        name: The new entry's name.

    Returns:
        Even index into `names_array`.
    """
    keys = [names_array[i].encode("utf-8") for i in range(0, len(names_array), 2)]
    name_bytes = name.encode("utf-8")
    # Simplified: the previous start/end branching always reduced to
    # bisect_right * 2 (start == 0 implies end == 0, and
    # start == len(keys) with start == end gives the same product).
    return bisect.bisect_right(keys, name_bytes) * 2
|
||||
|
||||
@property
def alternative_name(self) -> str | None:
    """The alternative file name from the file specification, or None.

    Per the PDF 2.0 reference, table 43, a reader shall prefer the /UF key
    over /F when it is present and non-null.
    """
    for candidate_key in (FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F):
        if candidate_key not in self.pdf_object:
            continue
        candidate = self.pdf_object[candidate_key].get_object()
        if is_null_or_none(candidate):
            continue
        return cast(str, candidate)
    return None
|
||||
|
||||
@alternative_name.setter
def alternative_name(self, value: TextStringObject | None) -> None:
    """Set the alternative name; None nulls out existing /UF and /F entries."""
    uf_key = NameObject(FileSpecificationDictionaryEntries.UF)
    f_key = NameObject(FileSpecificationDictionaryEntries.F)
    if value is not None:
        self.pdf_object[uf_key] = value
        self.pdf_object[f_key] = value
        return
    # Only null out entries that actually exist; do not create new keys.
    if FileSpecificationDictionaryEntries.UF in self.pdf_object:
        self.pdf_object[uf_key] = NullObject()
    if FileSpecificationDictionaryEntries.F in self.pdf_object:
        self.pdf_object[f_key] = NullObject()
|
||||
|
||||
@property
def description(self) -> str | None:
    """The file description, or None when absent or stored as PDF null."""
    value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC)
    return None if is_null_or_none(value) else value
|
||||
|
||||
@description.setter
def description(self, value: TextStringObject | None) -> None:
    """Set the description; None stores an explicit PDF null."""
    key = NameObject(FileSpecificationDictionaryEntries.DESC)
    self.pdf_object[key] = NullObject() if value is None else value
|
||||
|
||||
@property
def associated_file_relationship(self) -> str:
    """Relationship of the referring document to this embedded file.

    Defaults to "/Unspecified" when no /AFRelationship entry exists.
    """
    return self.pdf_object.get("/AFRelationship", "/Unspecified")
|
||||
|
||||
@associated_file_relationship.setter
def associated_file_relationship(self, value: NameObject) -> None:
    """Set the /AFRelationship of the referring document to this embedded file."""
    self.pdf_object[NameObject("/AFRelationship")] = value
|
||||
|
||||
@property
def _embedded_file(self) -> StreamObject:
    """The actual embedded file stream from the /EF dictionary.

    Raises:
        PdfReadError: When no /EF entry exists, or it holds neither a /UF
            nor an /F stream.
    """
    if "/EF" not in self.pdf_object:
        raise PdfReadError(f"/EF entry not found: {self.pdf_object}")
    ef = cast(DictionaryObject, self.pdf_object["/EF"])
    for key in (FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F):
        if key in ef:
            return cast(StreamObject, ef[key].get_object())
    raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")
|
||||
|
||||
@property
def _params(self) -> DictionaryObject:
    """The file-specific /Params dictionary (a fresh empty one if absent)."""
    return self._embedded_file.get("/Params", DictionaryObject()).get_object()
|
||||
|
||||
@cached_property
def _ensure_params(self) -> DictionaryObject:
    """Create the /Params dictionary on first access if missing, then return it."""
    stream = self._embedded_file
    if "/Params" not in stream:
        stream[NameObject("/Params")] = DictionaryObject()
    return cast(DictionaryObject, stream["/Params"])
|
||||
|
||||
@property
def subtype(self) -> str | None:
    """The subtype (a MIME media type, prefixed by a slash), or None."""
    value = self._embedded_file.get("/Subtype")
    return None if is_null_or_none(value) else value
|
||||
|
||||
@subtype.setter
def subtype(self, value: NameObject | None) -> None:
    """Set the subtype (slash-prefixed MIME type); None stores a PDF null."""
    self._embedded_file[NameObject("/Subtype")] = NullObject() if value is None else value
|
||||
|
||||
@property
def content(self) -> bytes:
    """The decoded bytes of the embedded file."""
    return self._embedded_file.get_data()
|
||||
|
||||
@content.setter
def content(self, value: str | bytes) -> None:
    """Set the file content; str input is encoded as Latin-1 before storing."""
    data = value.encode("latin-1") if isinstance(value, str) else value
    self._embedded_file.set_data(data)
|
||||
|
||||
@property
def size(self) -> int | None:
    """Size of the uncompressed file in bytes, or None when unrecorded."""
    value = self._params.get("/Size")
    return None if is_null_or_none(value) else value
|
||||
|
||||
@size.setter
def size(self, value: NumberObject | None) -> None:
    """Record the uncompressed file size; None stores a PDF null."""
    self._ensure_params[NameObject("/Size")] = NullObject() if value is None else value
|
||||
|
||||
@property
def creation_date(self) -> datetime.datetime | None:
    """The file creation datetime parsed from /CreationDate, or None."""
    return parse_iso8824_date(self._params.get("/CreationDate"))
|
||||
|
||||
@creation_date.setter
def creation_date(self, value: datetime.datetime | None) -> None:
    """Set /CreationDate from a datetime; None stores a PDF null."""
    params = self._ensure_params
    key = NameObject("/CreationDate")
    if value is None:
        params[key] = NullObject()
    else:
        params[key] = TextStringObject(format_iso8824_date(value))
|
||||
|
||||
@property
def modification_date(self) -> datetime.datetime | None:
    """The datetime of the last file modification from /ModDate, or None."""
    return parse_iso8824_date(self._params.get("/ModDate"))
|
||||
|
||||
@modification_date.setter
def modification_date(self, value: datetime.datetime | None) -> None:
    """Set /ModDate from a datetime; None stores a PDF null."""
    params = self._ensure_params
    key = NameObject("/ModDate")
    if value is None:
        params[key] = NullObject()
    else:
        params[key] = TextStringObject(format_iso8824_date(value))
|
||||
|
||||
@property
def checksum(self) -> bytes | None:
    """The recorded MD5 checksum of the (uncompressed) file, or None."""
    value = self._params.get("/CheckSum")
    return None if is_null_or_none(value) else value
|
||||
|
||||
@checksum.setter
def checksum(self, value: ByteStringObject | None) -> None:
    """Record the MD5 checksum of the (uncompressed) file; None stores a PDF null."""
    self._ensure_params[NameObject("/CheckSum")] = NullObject() if value is None else value
|
||||
|
||||
def delete(self) -> None:
    """Remove this file from the parent name tree and invalidate the instance.

    Raises:
        PyPdfError: When no parent array is known, or the file specification
            cannot be located inside it.
    """
    if not self._parent:
        raise PyPdfError("Parent required to delete file from document.")
    indirect_reference = getattr(self.pdf_object, "indirect_reference", None)
    if self.pdf_object in self._parent:
        index = self._parent.index(self.pdf_object)
    elif indirect_reference is not None and indirect_reference in self._parent:
        index = self._parent.index(indirect_reference)
    else:
        raise PyPdfError("File not found in parent object.")
    # The flat name tree stores [name, filespec] pairs: drop both entries.
    self._parent.pop(index)  # Reference.
    self._parent.pop(index - 1)  # Name.
    self.pdf_object = DictionaryObject()  # Invalidate.
|
||||
|
||||
def __repr__(self) -> str:
    """Debug representation carrying the embedded file's name."""
    return f"<{self.__class__.__name__} name={self.name!r}>"
|
||||
|
||||
@classmethod
def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
    """
    Yield an EmbeddedFile instance for every entry of a flat /Names array.

    The array alternates [name_1, reference_1, name_2, reference_2, ...]; we
    iterate over the reference slots and fetch each preceding name by index.

    Args:
        names: The name tree to load the data from.

    Returns:
        Iterable of class instances for the files found.

    """
    for index, entry in enumerate(names):
        if isinstance(entry, str):
            # Name slots are plain strings; they are consumed together with
            # the following reference slot instead of on their own.
            continue
        file_dictionary = entry.get_object()
        direct_name = names[index - 1].get_object()
        yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary, parent=names)
|
||||
|
||||
@classmethod
def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:
    """
    Load the embedded files for the given document catalog.

    This method and its signature are considered internal API and thus not exposed publicly for now.

    Args:
        catalog: The document catalog to load from.

    Returns:
        Iterable of class instances for the files found.

    """
    try:
        container = cast(
            DictionaryObject,
            cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
        )
    except KeyError:
        # No /Names or no /EmbeddedFiles entry: nothing to yield.
        return

    if "/Kids" in container:
        for kid in cast(ArrayObject, container["/Kids"].get_object()):
            # There might be further (nested) kids here.
            # Wait for an example before evaluating an implementation.
            node = kid.get_object()
            if "/Names" in node:
                yield from cls._load_from_names(cast(ArrayObject, node["/Names"]))
    if "/Names" in container:
        yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))
|
||||
174
venv/lib/python3.12/site-packages/pypdf/generic/_fit.py
Normal file
174
venv/lib/python3.12/site-packages/pypdf/generic/_fit.py
Normal file
@@ -0,0 +1,174 @@
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from ._base import is_null_or_none
|
||||
|
||||
|
||||
class Fit:
    """A destination "fit" mode: how a viewer should position and zoom the target page."""

    def __init__(
        self, fit_type: str, fit_args: tuple[Union[None, float, Any], ...] = ()
    ) -> None:
        from ._base import FloatObject, NameObject, NullObject, NumberObject  # noqa: PLC0415

        self.fit_type = NameObject(fit_type)
        # A null argument means "keep the viewer's current value".
        self.fit_args: list[Union[NullObject, FloatObject, NumberObject]] = [
            NullObject() if is_null_or_none(arg) else FloatObject(arg)
            for arg in fit_args
        ]

    @classmethod
    def xyz(
        cls,
        left: Optional[float] = None,
        top: Optional[float] = None,
        zoom: Optional[float] = None,
    ) -> "Fit":
        """
        Position the coordinates (left, top) at the upper-left corner of the
        window, with the page contents magnified by the factor zoom.

        A null value for any of left, top, or zoom keeps the current value of
        that parameter unchanged; a zoom of 0 means the same as null.

        Args:
            left:
            top:
            zoom:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/XYZ", fit_args=(left, top, zoom))

    @classmethod
    def fit(cls) -> "Fit":
        """
        Magnify the page contents just enough to fit the entire page within
        the window, both horizontally and vertically.

        If the two magnification factors differ, the smaller one is used and
        the page is centered within the window in the other dimension.
        """
        return Fit(fit_type="/Fit")

    @classmethod
    def fit_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Position the vertical coordinate top at the top edge of the window and
        magnify the contents just enough to fit the entire page width.

        A null value for ``top`` keeps the current value of that parameter
        unchanged.

        Args:
            top:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitH", fit_args=(top,))

    @classmethod
    def fit_vertically(cls, left: Optional[float] = None) -> "Fit":
        # Analogous to fit_horizontally: fit the full page height with `left`
        # positioned at the left edge of the window.
        return Fit(fit_type="/FitV", fit_args=(left,))

    @classmethod
    def fit_rectangle(
        cls,
        left: Optional[float] = None,
        bottom: Optional[float] = None,
        right: Optional[float] = None,
        top: Optional[float] = None,
    ) -> "Fit":
        """
        Magnify the page contents just enough to fit the rectangle given by
        (left, bottom, right, top) entirely within the window, both
        horizontally and vertically.

        If the two magnification factors differ, the smaller one is used and
        the rectangle is centered within the window in the other dimension.

        A null value for any of the parameters may result in unpredictable
        behavior.

        Args:
            left:
            bottom:
            right:
            top:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitR", fit_args=(left, bottom, right, top))

    @classmethod
    def fit_box(cls) -> "Fit":
        """
        Magnify the page contents just enough to fit the page's bounding box
        entirely within the window, both horizontally and vertically.

        If the two magnification factors differ, the smaller one is used and
        the bounding box is centered within the window in the other dimension.
        """
        return Fit(fit_type="/FitB")

    @classmethod
    def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Position the vertical coordinate top at the top edge of the window and
        magnify the contents just enough to fit the full width of the page's
        bounding box within the window.

        A null value for top keeps the current value of that parameter
        unchanged.

        Args:
            top:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitBH", fit_args=(top,))

    @classmethod
    def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit":
        """
        Position the horizontal coordinate left at the left edge of the window
        and magnify the contents just enough to fit the full height of the
        page's bounding box within the window.

        A null value for left keeps the current value of that parameter
        unchanged.

        Args:
            left:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitBV", fit_args=(left,))

    def __str__(self) -> str:
        if not self.fit_args:
            return f"Fit({self.fit_type})"
        return f"Fit({self.fit_type}, {self.fit_args})"
|
||||
|
||||
|
||||
# Module-level default destination fit: plain /Fit (whole page scaled to the window).
DEFAULT_FIT = Fit.fit()
|
||||
314
venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
Normal file
314
venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py
Normal file
@@ -0,0 +1,314 @@
|
||||
# Copyright (c) 2024, pypdf contributors
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from typing import IO
|
||||
|
||||
from .._utils import (
|
||||
WHITESPACES,
|
||||
WHITESPACES_AS_BYTES,
|
||||
StreamType,
|
||||
logger_warning,
|
||||
read_non_whitespace,
|
||||
)
|
||||
from ..errors import PdfReadError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# An inline image should be used only for small images (4096 bytes or less),
# but allow twice this for cases where this has been exceeded.
BUFFER_SIZE = 8192
|
||||
|
||||
|
||||
def _check_end_image_marker(stream: StreamType) -> bool:
    """Peek ahead (restoring the position) and report whether the next token
    is an `EI` end-of-image marker followed by whitespace or end of data."""
    marker = read_non_whitespace(stream)
    marker += stream.read(2)
    stream.seek(-3, 1)
    return marker[:2] == b"EI" and (marker[2:3] == b"" or marker[2:3] in WHITESPACES)
|
||||
|
||||
|
||||
def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
    """
    Extract HexEncoded stream from inline image.
    The stream will be moved onto the EI.
    """
    result: bytes = b""
    # Consume data until the `>` delimiter, with `EI` detection as a backup.
    while True:
        chunk = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        delimiter_pos = chunk.find(b">")
        if delimiter_pos >= 0:
            # Found the closing `>`: keep it and rewind past the remainder.
            result += chunk[: delimiter_pos + 1]
            stream.seek(-len(chunk) + delimiter_pos + 1, 1)
            break
        ei_pos = chunk.find(b"EI")
        if ei_pos >= 0:
            # Found a bare `EI` without `>`: strip trailing whitespace before it.
            stream.seek(-len(chunk) + ei_pos - 1, 1)
            char = stream.read(1)
            while char in WHITESPACES:
                stream.seek(-2, 1)
                char = stream.read(1)
                ei_pos -= 1
            result += chunk[:ei_pos]
            break
        if len(chunk) == 2:
            result += chunk
            raise PdfReadError("Unexpected end of stream")
        # Neither `>` nor `EI` found: keep all but the last two bytes in case a
        # marker straddles the buffer boundary, then rewind those two bytes.
        result += chunk[:-2]
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return result
|
||||
|
||||
|
||||
def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
    """
    Extract A85 stream from inline image.
    The stream will be moved onto the EI.
    """
    result: bytes = b""
    # Consume data until the `~>` end-of-data marker.
    while True:
        chunk = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        marker_pos = chunk.find(b"~>")
        if marker_pos >= 0:
            # Found the end-of-data marker: keep it, rewind past the rest.
            result += chunk[: marker_pos + 2]
            stream.seek(-len(chunk) + marker_pos + 2, 1)
            break
        if len(chunk) == 2:  # end of buffer
            result += chunk
            raise PdfReadError("Unexpected end of stream")
        # Keep all but the final two bytes in case `~>` straddles the buffer
        # boundary, then rewind those two bytes.
        result += chunk[:-2]
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return result
|
||||
|
||||
|
||||
def extract_inline__run_length_decode(stream: StreamType) -> bytes:
    """
    Extract RL (RunLengthDecode) stream from inline image.
    The stream will be moved onto the EI.
    """
    result: bytes = b""
    # Consume data until the EOD byte 128 (0x80).
    while True:
        chunk = stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        eod_pos = chunk.find(b"\x80")
        if eod_pos >= 0:
            # Ideally 0x80 alone would mark the EOD, but issue #3517 showed
            # inline images containing dozens of such bytes. Be resilient:
            # accept the EOD only if an `EI` marker follows (allowing at most
            # one whitespace character in between); otherwise fall back to
            # plain `EI` detection. The fallback omits special `EI` handling
            # within the stream, but having both corner cases at once is
            # assumed to be very unlikely (and such a stream is broken anyway).
            following = chunk[eod_pos + 1 : eod_pos + 4]
            if following.startswith(b"EI") or following.endswith(b"EI"):
                result += chunk[: eod_pos + 1]
                stream.seek(-len(chunk) + eod_pos + 1, 1)
            else:
                logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
                ei_pos = chunk.find(b"EI")
                if ei_pos > 0:
                    result += chunk[:ei_pos]
                    stream.seek(-len(chunk) + ei_pos - 1, 1)
            break
        result += chunk

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return result
|
||||
|
||||
|
||||
def extract_inline__dct_decode(stream: StreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.
    The stream will be moved onto the EI.
    """
    def read(length: int) -> bytes:
        # stream.read may return fewer bytes at end of file, or None when the
        # underlying object is in non-blocking mode.
        buffer = stream.read(length)
        if buffer is None or len(buffer) != length:
            raise PdfReadError("Unexpected end of stream")
        return buffer

    result: bytes = b""
    # Walk the JPEG marker segments (ID/size/payload) until ID = FF/D9 (EOI).
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    seen_first_marker = False
    while True:
        byte = read(1)
        if seen_first_marker or byte == b"\xff":
            result += byte
        if byte != b"\xff":
            continue
        seen_first_marker = True
        byte = read(1)
        result += byte
        if byte == b"\xff":
            stream.seek(-1, 1)  # pragma: no cover
        elif byte == b"\x00":  # stuffing
            pass
        elif byte == b"\xd9":  # end
            break
        elif byte in (
            b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
            b"\xda\xdb\xdc\xdd\xde\xdf"
            b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
        ):
            # Marker with a two-byte big-endian length field; copy the payload.
            byte = read(2)
            result += byte
            segment_size = byte[0] * 256 + byte[1]
            result += read(segment_size - 2)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return result
|
||||
|
||||
|
||||
def extract_inline_default(stream: StreamType) -> bytes:
    """Legacy method, used by default"""
    output = BytesIO()
    # Copy the inline image while scanning for the EI (End Image) operator.
    while True:
        chunk = stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        # Search for a bare "E": the full "EI" may straddle the buffer boundary.
        pos_e = chunk.find(b"E")

        if pos_e == -1:
            output.write(chunk)
        else:
            # Write out everything including E (the one from EI to be removed).
            output.write(chunk[0 : pos_e + 1])
            e_offset_in_output = output.tell() - 1
            # Seek back in the stream to read the E next.
            stream.seek(pos_e + 1 - len(chunk), 1)
            saved_pos = stream.tell()
            # Check for End Image.
            tok_i = stream.read(1)  # I of "EI"
            if tok_i != b"I":
                stream.seek(saved_pos, 0)
                continue
            tok_after = stream.read(1)  # possible space after "EI"
            if tok_after not in WHITESPACES:
                stream.seek(saved_pos, 0)
                continue
            while tok_after in WHITESPACES:
                tok_after = stream.read(1)
            if chunk[pos_e - 1 : pos_e] not in WHITESPACES and tok_after not in {
                b"Q",
                b"E",
            }:  # for Q or EMC
                stream.seek(saved_pos, 0)
                continue
            if is_followed_by_binary_data(stream):
                # Inline image contains `EI ` sequence usually marking the end of it, but
                # is followed by binary data which does not make sense for the actual end.
                stream.seek(saved_pos, 0)
                continue
            # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient.
            # Remove the E(I) wrongly written earlier.
            stream.seek(saved_pos - 1, 0)
            output.truncate(e_offset_in_output)
            break

    return output.getvalue()
|
||||
|
||||
|
||||
def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just some heuristics due to the PDF specification being too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java
    """
    position = stream.tell()
    peeked = stream.read(length)
    stream.seek(position)
    if not peeked:
        return False
    operator_start = None
    operator_end = None

    for index, byte in enumerate(peeked):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # Non-displayable and not whitespace: certainly binary data.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Any other non-whitespace byte starts a candidate operator.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # Whitespace terminates the operator. Assume an inline image with
            # tons of leading whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Only whitespace seen; inline images should not contain that much of it.
        return False
    if operator_end is None:
        # We probably are inside an operation.
        operator_end = length
    operator_length = operator_end - operator_start
    operator = peeked[operator_start:operator_end]
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    if operator.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    if operator_length > 3:  # noqa: SIM103
        # Content-stream operators rarely exceed three characters, especially
        # right after an inline image.
        return True
    return False
|
||||
118
venv/lib/python3.12/site-packages/pypdf/generic/_link.py
Normal file
118
venv/lib/python3.12/site-packages/pypdf/generic/_link.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
# This module contains code used by _writer.py to track links in pages
|
||||
# being added to the writer until the links can be resolved.
|
||||
|
||||
from typing import TYPE_CHECKING, Optional, Union, cast
|
||||
|
||||
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .._page import PageObject
|
||||
from .._reader import PdfReader
|
||||
from .._writer import PdfWriter
|
||||
|
||||
|
||||
class NamedReferenceLink:
    """Named reference link being preserved until we can resolve it correctly."""

    def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
        """reference: TextStringObject with named reference"""
        self._reference = reference
        self._source_pdf = source_pdf

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        # Resolve the named destination in the source document, if present.
        destination = self._source_pdf.named_destinations.get(str(self._reference))
        return destination.page if destination else None

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """target_pdf: PdfWriter which the new link went into"""
        # Point the named destination in the new PDF to the new page, unless
        # a destination of that name already exists there.
        if str(self._reference) not in target_pdf.named_destinations:
            target_pdf.add_named_destination(str(self._reference), new_page.page_number)
|
||||
|
||||
|
||||
class DirectReferenceLink:
    """Direct reference link being preserved until we can resolve it correctly."""

    def __init__(self, reference: ArrayObject) -> None:
        """reference: an ArrayObject whose first element is the Page indirect object"""
        self._reference = reference

    def find_referenced_page(self) -> IndirectObject:
        # The destination array's first element is the target page reference.
        return self._reference[0]

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """target_pdf: PdfWriter which the new link went into"""
        self._reference[0] = new_page
|
||||
|
||||
|
||||
# Either link flavour; both expose find_referenced_page() and patch_reference().
ReferenceLink = Union[NamedReferenceLink, DirectReferenceLink]
|
||||
|
||||
|
||||
def extract_links(new_page: "PageObject", old_page: "PageObject") -> list[tuple[ReferenceLink, ReferenceLink]]:
    """Extracts links from two pages on the assumption that the two pages are
    the same. Produces one list of (new link, old link) tuples.
    """
    pairs: list[tuple[ReferenceLink, ReferenceLink]] = []
    new_annots = new_page.get("/Annots", [])
    old_annots = old_page.get("/Annots", [])
    # zip truncates to the shorter annotation list, mirroring the "pages are
    # the same" assumption; annotations that are not GoTo links yield None
    # from _build_link and are skipped.
    for new_annot, old_annot in zip(new_annots, old_annots):
        new_link = _build_link(new_annot, new_page)
        old_link = _build_link(old_annot, old_page)
        if new_link and old_link:
            pairs.append((new_link, old_link))
    return pairs
||||
|
||||
|
||||
def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
    """Build a ReferenceLink for a /Link annotation, or None if the
    annotation is not an in-document GoTo link."""
    src = cast("PdfReader", page.pdf)
    link = cast(DictionaryObject, indirect_object.get_object())
    if not isinstance(link, DictionaryObject) or link.get("/Subtype") != "/Link":
        return None

    if "/A" in link:
        # Action-based link: only /GoTo actions with a destination count.
        action = cast(DictionaryObject, link["/A"])
        if action.get("/S") != "/GoTo" or "/D" not in action:
            return None
        return _create_link(action["/D"], src)

    if "/Dest" in link:
        # Direct-destination link.
        return _create_link(link["/Dest"], src)

    return None  # Nothing to do here
|
||||
def _create_link(reference: PdfObject, source_pdf: "PdfReader") -> Optional[ReferenceLink]:
    """Wrap a raw /D or /Dest value in the matching link-preservation helper."""
    if isinstance(reference, TextStringObject):
        # A name: must be looked up in the reader's named-destination tree.
        return NamedReferenceLink(reference, source_pdf)
    if isinstance(reference, ArrayObject):
        # An explicit destination array referencing the page directly.
        return DirectReferenceLink(reference)
    # Any other destination type cannot be preserved.
    return None
|
||||
33
venv/lib/python3.12/site-packages/pypdf/generic/_outline.py
Normal file
33
venv/lib/python3.12/site-packages/pypdf/generic/_outline.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from typing import Union
|
||||
|
||||
from .._utils import StreamType, deprecation_no_replacement
|
||||
from ._base import NameObject
|
||||
from ._data_structures import Destination
|
||||
|
||||
|
||||
class OutlineItem(Destination):
    """A Destination that serializes itself as an outline (bookmark) entry."""

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<<\n")
        # Write the optional linkage keys that are actually present.
        for name in ["/Title", "/Parent", "/First", "/Last", "/Next", "/Prev"]:
            if name not in self:
                continue
            key = NameObject(name)
            key.write_to_stream(stream)
            stream.write(b" ")
            self.raw_get(key).write_to_stream(stream)
            stream.write(b"\n")
        # /Dest is always emitted, derived from dest_array.
        NameObject("/Dest").write_to_stream(stream)
        stream.write(b" ")
        self.dest_array.write_to_stream(stream)
        stream.write(b"\n")
        stream.write(b">>")
|
||||
132
venv/lib/python3.12/site-packages/pypdf/generic/_rectangle.py
Normal file
132
venv/lib/python3.12/site-packages/pypdf/generic/_rectangle.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import FloatObject, NumberObject
|
||||
from ._data_structures import ArrayObject
|
||||
|
||||
|
||||
class RectangleObject(ArrayObject):
    """
    This class is used to represent *page boxes* in pypdf.

    These boxes include:

    * :attr:`artbox <pypdf._page.PageObject.artbox>`
    * :attr:`bleedbox <pypdf._page.PageObject.bleedbox>`
    * :attr:`cropbox <pypdf._page.PageObject.cropbox>`
    * :attr:`mediabox <pypdf._page.PageObject.mediabox>`
    * :attr:`trimbox <pypdf._page.PageObject.trimbox>`
    """

    def __init__(
        self, arr: Union["RectangleObject", tuple[float, float, float, float]]
    ) -> None:
        # A page box is always [left, bottom, right, top]: must have four points.
        assert len(arr) == 4
        # Coerce plain numbers into PDF number objects where needed.
        ArrayObject.__init__(self, [self._ensure_is_number(item) for item in arr])

    def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
        # Pass existing PDF number objects through; wrap anything else.
        if isinstance(value, (FloatObject, NumberObject)):
            return value
        return FloatObject(value)

    def scale(self, sx: float, sy: float) -> "RectangleObject":
        """Return a new box with x-coordinates scaled by sx, y-coordinates by sy."""
        return RectangleObject(
            (
                float(self.left) * sx,
                float(self.bottom) * sy,
                float(self.right) * sx,
                float(self.top) * sy,
            )
        )

    def __repr__(self) -> str:
        return f"RectangleObject({list(self)!r})"

    @property
    def left(self) -> FloatObject:
        """x-coordinate of the left edge."""
        return self[0]

    @left.setter
    def left(self, f: float) -> None:
        self[0] = FloatObject(f)

    @property
    def bottom(self) -> FloatObject:
        """y-coordinate of the bottom edge."""
        return self[1]

    @bottom.setter
    def bottom(self, f: float) -> None:
        self[1] = FloatObject(f)

    @property
    def right(self) -> FloatObject:
        """x-coordinate of the right edge."""
        return self[2]

    @right.setter
    def right(self, f: float) -> None:
        self[2] = FloatObject(f)

    @property
    def top(self) -> FloatObject:
        """y-coordinate of the top edge."""
        return self[3]

    @top.setter
    def top(self, f: float) -> None:
        self[3] = FloatObject(f)

    @property
    def lower_left(self) -> tuple[float, float]:
        """
        Property to read and modify the lower left coordinate of this box
        in (x,y) form.
        """
        return self.left, self.bottom

    @lower_left.setter
    def lower_left(self, value: tuple[float, float]) -> None:
        x, y = value
        self[0] = self._ensure_is_number(x)
        self[1] = self._ensure_is_number(y)

    @property
    def lower_right(self) -> tuple[float, float]:
        """
        Property to read and modify the lower right coordinate of this box
        in (x,y) form.
        """
        return self.right, self.bottom

    @lower_right.setter
    def lower_right(self, value: tuple[float, float]) -> None:
        x, y = value
        self[2] = self._ensure_is_number(x)
        self[1] = self._ensure_is_number(y)

    @property
    def upper_left(self) -> tuple[float, float]:
        """
        Property to read and modify the upper left coordinate of this box
        in (x,y) form.
        """
        return self.left, self.top

    @upper_left.setter
    def upper_left(self, value: tuple[float, float]) -> None:
        x, y = value
        self[0] = self._ensure_is_number(x)
        self[3] = self._ensure_is_number(y)

    @property
    def upper_right(self) -> tuple[float, float]:
        """
        Property to read and modify the upper right coordinate of this box
        in (x,y) form.
        """
        return self.right, self.top

    @upper_right.setter
    def upper_right(self, value: tuple[float, float]) -> None:
        x, y = value
        self[2] = self._ensure_is_number(x)
        self[3] = self._ensure_is_number(y)

    @property
    def width(self) -> float:
        """Horizontal extent (right - left)."""
        return self.right - self.left

    @property
    def height(self) -> float:
        """Vertical extent (top - bottom)."""
        return self.top - self.bottom
|
||||
208
venv/lib/python3.12/site-packages/pypdf/generic/_utils.py
Normal file
208
venv/lib/python3.12/site-packages/pypdf/generic/_utils.py
Normal file
@@ -0,0 +1,208 @@
|
||||
import codecs
|
||||
from typing import Union
|
||||
|
||||
from .._codecs import _pdfdoc_encoding
|
||||
from .._utils import StreamType, logger_warning, read_non_whitespace
|
||||
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
|
||||
from ._base import ByteStringObject, TextStringObject
|
||||
|
||||
|
||||
def hex_to_rgb(value: str) -> tuple[float, float, float]:
    """Convert a '#RRGGBB' (or 'RRGGBB') hex color into (r, g, b) floats in [0, 1]."""
    digits = value.lstrip("#")
    return tuple(int(digits[i : i + 2], 16) / 255.0 for i in (0, 2, 4))  # type: ignore
|
||||
|
||||
|
||||
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """Read a PDF hexadecimal string `<...>`; the stream is positioned at '<'."""
    stream.read(1)  # consume the opening "<"
    data = []
    pair = b""
    while True:
        tok = read_non_whitespace(stream)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b">":
            break
        pair += tok
        if len(pair) == 2:
            data.append(int(pair, base=16))
            pair = b""
    # A trailing odd digit is padded with "0" (PDF treats <A> as <A0>).
    if len(pair) == 1:
        pair += b"0"
    if pair != b"":
        data.append(int(pair, base=16))
    return create_string_object(bytes(data), forced_encoding)
|
||||
|
||||
|
||||
# Maps the byte following a backslash in a PDF literal string to the byte
# value it denotes. Includes the standard escapes plus several characters
# that some producers escape defensively.
__ESCAPE_DICT__ = {
    b"n": ord(b"\n"),
    b"r": ord(b"\r"),
    b"t": ord(b"\t"),
    b"b": ord(b"\b"),
    b"f": ord(b"\f"),
    b"(": ord(b"("),
    b")": ord(b")"),
    b"/": ord(b"/"),
    b"\\": ord(b"\\"),
    b" ": ord(b" "),
    b"%": ord(b"%"),
    b"<": ord(b"<"),
    b">": ord(b">"),
    b"[": ord(b"["),
    b"]": ord(b"]"),
    b"#": ord(b"#"),
    b"_": ord(b"_"),
    b"&": ord(b"&"),
    b"$": ord(b"$"),
}
# ASCII code of the backslash character, emitted for unrecognized escapes.
__BACKSLASH_CODE__ = 92
|
||||
|
||||
|
||||
def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    # Parse a PDF literal string "(...)"; the stream is positioned at the
    # opening parenthesis, which the first read consumes.
    tok = stream.read(1)
    parens = 1  # nesting depth of (balanced, unescaped) parentheses
    txt = []  # accumulated byte values of the decoded string
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                # Matching close of the outermost parenthesis: string ends.
                break
        elif tok == b"\\":
            tok = stream.read(1)
            try:
                # Single-character escape such as \n, \t, \( ...
                txt.append(__ESCAPE_DICT__[tok])
                continue
            except KeyError:
                if b"0" <= tok <= b"7":
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    sav = stream.tell() - 1
                    for _ in range(2):
                        ntok = stream.read(1)
                        if b"0" <= ntok <= b"7":
                            tok += ntok
                        else:
                            stream.seek(-1, 1)  # ntok has to be analyzed
                            break
                    i = int(tok, base=8)
                    if i > 255:
                        # Overflowing octal escape: emit the backslash
                        # literally and rewind to re-read the digits as
                        # ordinary characters.
                        txt.append(__BACKSLASH_CODE__)
                        stream.seek(sav)
                    else:
                        txt.append(i)
                    continue
                if tok in b"\n\r":
                    # This case is hit when a backslash followed by a line
                    # break occurs. If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in b"\n\r":
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since this
                    # line break was escaped:
                    continue
                msg = f"Unexpected escaped string: {tok.decode('utf-8', 'ignore')}"
                logger_warning(msg, __name__)
                # Unknown escape: keep the backslash, then fall through so the
                # escaped character itself is appended below.
                txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)
|
||||
|
||||
|
||||
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Create a ByteStringObject or a TextStringObject from a string to represent the string.

    Args:
        string: The data being used
        forced_encoding: Typically None, or an encoding string

    Returns:
        A ByteStringObject

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if isinstance(string, bytes):
        if isinstance(forced_encoding, (list, dict)):
            # Caller supplied an explicit byte -> character map.
            out = ""
            for x in string:
                try:
                    out += forced_encoding[x]
                except Exception:
                    # Bytes missing from the map fall back to charmap.
                    out += bytes((x,)).decode("charmap")
            obj = TextStringObject(out)
            obj._original_bytes = string
            return obj
        if isinstance(forced_encoding, str):
            if forced_encoding == "bytes":
                # Caller explicitly wants the raw bytes preserved.
                return ByteStringObject(string)
            obj = TextStringObject(string.decode(forced_encoding))
            obj._original_bytes = string
            return obj
        try:
            if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
                # Explicit BOM: let the utf-16 codec pick the byte order.
                retval = TextStringObject(string.decode("utf-16"))
                retval._original_bytes = string
                retval.autodetect_utf16 = True
                retval.utf16_bom = string[:2]
                return retval
            if string.startswith(b"\x00"):
                # Leading NUL byte suggests BOM-less big-endian UTF-16.
                retval = TextStringObject(string.decode("utf-16be"))
                retval._original_bytes = string
                retval.autodetect_utf16 = True
                retval.utf16_bom = codecs.BOM_UTF16_BE
                return retval
            if string[1:2] == b"\x00":
                # NUL in the second byte suggests BOM-less little-endian UTF-16.
                retval = TextStringObject(string.decode("utf-16le"))
                retval._original_bytes = string
                retval.autodetect_utf16 = True
                retval.utf16_bom = codecs.BOM_UTF16_LE
                return retval

            # This is probably a big performance hit here, but we need
            # to convert string objects into the text/unicode-aware
            # version if possible... and the only way to check if that's
            # possible is to try.
            # Some strings are strings, some are just byte arrays.
            retval = TextStringObject(decode_pdfdocencoding(string))
            retval._original_bytes = string
            retval.autodetect_pdfdocencoding = True
            return retval
        except UnicodeDecodeError:
            return ByteStringObject(string)
    else:
        raise TypeError("create_string_object should have str or unicode arg")
|
||||
|
||||
|
||||
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """Decode bytes in PDFDocEncoding to str.

    Args:
        byte_array: The raw bytes to decode.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If a byte has no mapping in the encoding table.

    """
    # Build the result with a list + join instead of repeated str concatenation
    # (the original `retval += c` loop is quadratic in the worst case).
    chars = []
    for index, b in enumerate(byte_array):
        c = _pdfdoc_encoding[b]
        if c == "\u0000":
            # Bug fix: the original passed ``bytearray(b)`` (b is an *int*,
            # producing b zero bytes) and -1/-1 positions, so the raised error
            # misreported the offending data. Report the real input and the
            # position of the unmappable byte instead; the exception type is
            # unchanged, so create_string_object still catches it.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(c)
    return "".join(chars)
|
||||
163
venv/lib/python3.12/site-packages/pypdf/generic/_viewerpref.py
Normal file
163
venv/lib/python3.12/site-packages/pypdf/generic/_viewerpref.py
Normal file
@@ -0,0 +1,163 @@
|
||||
# Copyright (c) 2023, Pubpub-ZZ
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Optional,
|
||||
)
|
||||
|
||||
from ._base import BooleanObject, NameObject, NumberObject, is_null_or_none
|
||||
from ._data_structures import ArrayObject, DictionaryObject
|
||||
|
||||
# Shared default for the boolean viewer preferences (the PDF default is false).
f_obj = BooleanObject(False)


class ViewerPreferences(DictionaryObject):
    # Dictionary of /ViewerPreferences entries. The public properties
    # (hide_toolbar, direction, ...) are attached dynamically in __new__ below.
    def __init__(self, obj: Optional[DictionaryObject] = None) -> None:
        # NOTE(review): passing ``self`` to DictionaryObject.__init__ appears
        # to seed the dict from the (empty) instance itself — presumably
        # intentional; verify against DictionaryObject's constructor.
        super().__init__(self)
        if not is_null_or_none(obj):
            # Copy the entries (and, where present, the indirect reference)
            # of an existing /ViewerPreferences dictionary.
            self.update(obj.items())  # type: ignore
            try:
                self.indirect_reference = obj.indirect_reference  # type: ignore
            except AttributeError:
                pass

    def _get_bool(self, key: str, default: Optional[BooleanObject]) -> Optional[BooleanObject]:
        # Generic getter backing the boolean properties.
        return self.get(key, default)

    def _set_bool(self, key: str, v: bool) -> None:
        # Normalize any truthy value to a strict True/False BooleanObject.
        self[NameObject(key)] = BooleanObject(v is True)

    def _get_name(self, key: str, default: Optional[NameObject]) -> Optional[NameObject]:
        # Generic getter backing the name-valued properties.
        return self.get(key, default)

    def _set_name(self, key: str, lst: list[str], v: NameObject) -> None:
        # Validate name values; an empty ``lst`` means any name is accepted.
        if v[0] != "/":
            raise ValueError(f"{v} does not start with '/'")
        if lst != [] and v not in lst:
            raise ValueError(f"{v} is an unacceptable value")
        self[NameObject(key)] = NameObject(v)

    def _get_arr(self, key: str, default: Optional[list[Any]]) -> Optional[ArrayObject]:
        # Generic getter backing the array-valued properties.
        return self.get(key, None if default is None else ArrayObject(default))

    def _set_arr(self, key: str, v: Optional[ArrayObject]) -> None:
        # Assigning None removes the entry entirely.
        if v is None:
            try:
                del self[NameObject(key)]
            except KeyError:
                pass
            return
        if not isinstance(v, ArrayObject):
            raise ValueError("ArrayObject is expected")
        self[NameObject(key)] = v

    def _get_int(self, key: str, default: Optional[NumberObject]) -> Optional[NumberObject]:
        # Generic getter backing the integer-valued properties.
        return self.get(key, default)

    def _set_int(self, key: str, v: int) -> None:
        self[NameObject(key)] = NumberObject(v)

    @property
    def PRINT_SCALING(self) -> NameObject:
        # Convenience constant for the /PrintScaling key name.
        return NameObject("/PrintScaling")

    def __new__(cls: Any, value: Any = None) -> "ViewerPreferences":
        # The class's public properties are (re)attached each time an instance
        # is created; each nested factory builds one getter/setter pair that
        # closes over the PDF key name (and, where relevant, allowed values).
        def _add_prop_bool(key: str, default: Optional[BooleanObject]) -> property:
            return property(
                lambda self: self._get_bool(key, default),
                lambda self, v: self._set_bool(key, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {default} if not defined
            """,
            )

        def _add_prop_name(
            key: str, lst: list[str], default: Optional[NameObject]
        ) -> property:
            return property(
                lambda self: self._get_name(key, default),
                lambda self, v: self._set_name(key, lst, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {default} if not defined.
            Acceptable values: {lst}
            """,
            )

        def _add_prop_arr(key: str, default: Optional[ArrayObject]) -> property:
            return property(
                lambda self: self._get_arr(key, default),
                lambda self, v: self._set_arr(key, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {default} if not defined
            """,
            )

        def _add_prop_int(key: str, default: Optional[int]) -> property:
            return property(
                lambda self: self._get_int(key, default),
                lambda self, v: self._set_int(key, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {default} if not defined
            """,
            )

        # Boolean preferences (default false per the PDF specification).
        cls.hide_toolbar = _add_prop_bool("/HideToolbar", f_obj)
        cls.hide_menubar = _add_prop_bool("/HideMenubar", f_obj)
        cls.hide_windowui = _add_prop_bool("/HideWindowUI", f_obj)
        cls.fit_window = _add_prop_bool("/FitWindow", f_obj)
        cls.center_window = _add_prop_bool("/CenterWindow", f_obj)
        cls.display_doctitle = _add_prop_bool("/DisplayDocTitle", f_obj)

        # Name-valued preferences with their accepted value sets.
        cls.non_fullscreen_pagemode = _add_prop_name(
            "/NonFullScreenPageMode",
            ["/UseNone", "/UseOutlines", "/UseThumbs", "/UseOC"],
            NameObject("/UseNone"),
        )
        cls.direction = _add_prop_name(
            "/Direction", ["/L2R", "/R2L"], NameObject("/L2R")
        )
        cls.view_area = _add_prop_name("/ViewArea", [], None)
        cls.view_clip = _add_prop_name("/ViewClip", [], None)
        cls.print_area = _add_prop_name("/PrintArea", [], None)
        cls.print_clip = _add_prop_name("/PrintClip", [], None)
        cls.print_scaling = _add_prop_name("/PrintScaling", [], None)
        cls.duplex = _add_prop_name(
            "/Duplex", ["/Simplex", "/DuplexFlipShortEdge", "/DuplexFlipLongEdge"], None
        )
        cls.pick_tray_by_pdfsize = _add_prop_bool("/PickTrayByPDFSize", None)
        cls.print_pagerange = _add_prop_arr("/PrintPageRange", None)
        cls.num_copies = _add_prop_int("/NumCopies", None)

        cls.enforce = _add_prop_arr("/Enforce", ArrayObject())

        return DictionaryObject.__new__(cls)
|
||||
200
venv/lib/python3.12/site-packages/pypdf/pagerange.py
Normal file
200
venv/lib/python3.12/site-packages/pypdf/pagerange.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""
|
||||
Representation and utils for ranges of PDF file pages.
|
||||
|
||||
Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
|
||||
All rights reserved. This software is available under a BSD license;
|
||||
see https://github.com/py-pdf/pypdf/blob/main/LICENSE
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Any, Union
|
||||
|
||||
from .errors import ParseError
|
||||
|
||||
_INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
PAGE_RANGE_RE = f"^({_INT_RE}|({_INT_RE}?(:{_INT_RE}?(:{_INT_RE}?)?)))$"
# groups:    12     34     5 6     7 8


class PageRange:
    """
    A slice-like representation of a range of page indices.

    For example, page numbers, only starting at zero.

    The syntax is like what you would put between brackets [ ].
    The slice is one of the few Python types that can't be subclassed,
    but this class converts to and from slices, and allows similar use.

    - PageRange(str) parses a string representing a page range.
    - PageRange(slice) directly "imports" a slice.
    - to_slice() gives the equivalent slice.
    - str() and repr() allow printing.
    - indices(n) is like slice.indices(n).
    """

    def __init__(self, arg: Union[slice, "PageRange", str]) -> None:
        """
        Initialize with either a slice -- giving the equivalent page range,
        or a PageRange object -- making a copy,
        or a string like
        "int", "[int]:[int]" or "[int]:[int]:[int]",
        where the brackets indicate optional ints.
        Remember, page indices start with zero.
        Page range expression examples:

            :     all pages.                   -1    last page.
            22    just the 23rd page.          :-1   all but the last page.
            0:3   the first three pages.       -2    second-to-last page.
            :3    the first three pages.       -2:   last two pages.
            5:    from the sixth page onward.  -3:-1 third & second to last.
        The third, "stride" or "step" number is also recognized.
            ::2       0 2 4 ... to the end.    3:0:-1    3 2 1 but not 0.
            1:10:2    1 3 5 7 9                2::-1     2 1 0.
            ::-1      all pages in reverse order.
        Note the difference between this notation and arguments to slice():
            slice(3) means the first three pages;
            PageRange("3") means the range of only the fourth page.
            However PageRange(slice(3)) means the first three pages.
        """
        if isinstance(arg, slice):
            self._slice = arg
            return

        if isinstance(arg, PageRange):
            self._slice = arg.to_slice()
            return

        if not isinstance(arg, str):
            raise ParseError(arg)
        match = re.match(PAGE_RANGE_RE, arg)
        if match is None:
            raise ParseError(arg)
        if match.group(2):
            # Special case: just an int means a range of one page.
            start = int(match.group(2))
            stop = start + 1 if start != -1 else None
            self._slice = slice(start, stop)
        else:
            # Colon form: the optional start/stop/step groups become the
            # corresponding slice fields.
            self._slice = slice(
                *[int(g) if g else None for g in match.group(4, 6, 8)]
            )

    @staticmethod
    def valid(input: Any) -> bool:
        """
        True if input is a valid initializer for a PageRange.

        Args:
            input: A possible PageRange string or a PageRange object.

        Returns:
            True, if the ``input`` is a valid PageRange.

        """
        if isinstance(input, (slice, PageRange)):
            return True
        return isinstance(input, str) and bool(re.match(PAGE_RANGE_RE, input))

    def to_slice(self) -> slice:
        """Return the slice equivalent of this page range."""
        return self._slice

    def __str__(self) -> str:
        """A string like "1:2:3"."""
        s = self._slice
        parts: Union[tuple[int, int], tuple[int, int, int]]
        if s.step is None:
            # A one-page range collapses back to a single int.
            if s.start is not None and s.stop == s.start + 1:
                return str(s.start)
            parts = s.start, s.stop
        else:
            parts = s.start, s.stop, s.step
        return ":".join("" if p is None else str(p) for p in parts)

    def __repr__(self) -> str:
        """A string like "PageRange('1:2:3')"."""
        return "PageRange(" + repr(str(self)) + ")"

    def indices(self, n: int) -> tuple[int, int, int]:
        """
        Assuming a sequence of length n, calculate the start and stop indices,
        and the stride length of the PageRange.

        See help(slice.indices).

        Args:
            n: the length of the list of pages to choose from.

        Returns:
            Arguments for range().

        """
        return self._slice.indices(n)

    def __eq__(self, other: object) -> bool:
        return isinstance(other, PageRange) and self._slice == other._slice

    def __hash__(self) -> int:
        return hash((self.__class__, (self._slice.start, self._slice.stop, self._slice.step)))

    def __add__(self, other: "PageRange") -> "PageRange":
        if not isinstance(other, PageRange):
            raise TypeError(f"Can't add PageRange and {type(other)}")
        if self._slice.step is not None or other._slice.step is not None:
            raise ValueError("Can't add PageRange with stride")
        first = self._slice.start, self._slice.stop
        second = other._slice.start, other._slice.stop

        # Order the two ranges so ``first`` starts no later than ``second``.
        if first[0] > second[0]:
            first, second = second, first

        if second[0] > first[1]:
            # There is a gap between the two ranges.
            raise ValueError("Can't add PageRanges with gap")
        return PageRange(slice(first[0], max(first[1], second[1])))
|
||||
|
||||
|
||||
PAGE_RANGE_ALL = PageRange(":") # The range of all pages.
|
||||
|
||||
|
||||
def parse_filename_page_ranges(
    args: list[Union[str, PageRange, None]]
) -> list[tuple[str, PageRange]]:
    """
    Given a list of filenames and page ranges, return a list of (filename, page_range) pairs.

    Args:
        args: A list where the first element is a filename. The other elements are
            filenames, page-range expressions, slice objects, or PageRange objects.
            A filename not followed by a page range indicates all pages of the file.

    Returns:
        A list of (filename, page_range) pairs.

    """
    pairs: list[tuple[str, PageRange]] = []
    current_file: Union[str, None] = None
    saw_range = False
    # A trailing None acts as an end marker so the final file gets flushed.
    for arg in [*args, None]:
        if PageRange.valid(arg):
            if not current_file:
                raise ValueError(
                    "The first argument must be a filename, not a page range."
                )

            assert arg is not None
            pairs.append((current_file, PageRange(arg)))
            saw_range = True
        else:
            # New filename (or end marker): flush the previous file with an
            # implicit "all pages" range if it never got an explicit one.
            if current_file and not saw_range:
                pairs.append((current_file, PAGE_RANGE_ALL))

            assert not isinstance(arg, PageRange), arg
            current_file = arg
            saw_range = False
    return pairs
|
||||
|
||||
|
||||
PageRangeSpec = Union[str, PageRange, tuple[int, int], tuple[int, int, int], list[int]]
|
||||
52
venv/lib/python3.12/site-packages/pypdf/papersizes.py
Normal file
52
venv/lib/python3.12/site-packages/pypdf/papersizes.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Helper to get paper sizes."""
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
class Dimensions(NamedTuple):
    # Width and height in pixels at 72 ppi (1 pixel == 1 PDF point).
    width: int
    height: int
|
||||
|
||||
|
||||
class PaperSize:
    """(width, height) of the paper in portrait mode in pixels at 72 ppi."""

    # Notes of how to calculate it:
    # 1. Get the size of the paper in millimeters
    # 2. Convert it to inches (25.4 millimeters is equal to 1 inch)
    # 3. Convert it to pixels at 72dpi (1 inch is equal to 72 pixels)

    # All Din-A paper sizes follow this pattern:
    # 2 x A(n - 1) = A(n)
    # So the height of the next bigger one is the width of the smaller one
    # The ratio is always approximately 1:2**0.5
    # Additionally, A0 is defined to have an area of 1 m**2
    # https://en.wikipedia.org/wiki/ISO_216
    # Be aware of rounding issues!
    A0 = Dimensions(2384, 3370)  # 841mm x 1189mm
    A1 = Dimensions(1684, 2384)
    A2 = Dimensions(1191, 1684)
    A3 = Dimensions(842, 1191)
    A4 = Dimensions(
        595, 842
    )  # Printer paper, documents - this is by far the most common
    A5 = Dimensions(420, 595)  # Paperback books
    A6 = Dimensions(298, 420)  # Postcards
    A7 = Dimensions(210, 298)
    A8 = Dimensions(147, 210)

    # Envelopes
    C4 = Dimensions(649, 918)
|
||||
|
||||
|
||||
# DIN A series from largest (A0) to smallest (A8); the tuple index equals
# the "n" in "An".
_din_a = (
    PaperSize.A0,
    PaperSize.A1,
    PaperSize.A2,
    PaperSize.A3,
    PaperSize.A4,
    PaperSize.A5,
    PaperSize.A6,
    PaperSize.A7,
    PaperSize.A8,
)
|
||||
0
venv/lib/python3.12/site-packages/pypdf/py.typed
Normal file
0
venv/lib/python3.12/site-packages/pypdf/py.typed
Normal file
80
venv/lib/python3.12/site-packages/pypdf/types.py
Normal file
80
venv/lib/python3.12/site-packages/pypdf/types.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Helpers for working with PDF types."""
|
||||
|
||||
import sys
|
||||
from typing import Literal, Union
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
# Python 3.10+: https://www.python.org/dev/peps/pep-0484
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from .generic._base import NameObject, NullObject, NumberObject
|
||||
from .generic._data_structures import ArrayObject, Destination
|
||||
from .generic._outline import OutlineItem
|
||||
|
||||
BorderArrayType: TypeAlias = list[Union[NameObject, NumberObject, ArrayObject]]
|
||||
|
||||
OutlineItemType: TypeAlias = Union[OutlineItem, Destination]
|
||||
|
||||
FitType: TypeAlias = Literal[
|
||||
"/XYZ", "/Fit", "/FitH", "/FitV", "/FitR", "/FitB", "/FitBH", "/FitBV"
|
||||
]
|
||||
# These go with the FitType, they specify values for the fit
|
||||
ZoomArgType: TypeAlias = Union[NumberObject, NullObject, float]
|
||||
ZoomArgsType: TypeAlias = list[ZoomArgType]
|
||||
|
||||
# Recursive types like the following are not yet supported by Sphinx:
|
||||
# OutlineType = List[Union[Destination, "OutlineType"]]
|
||||
# Hence use this for the moment:
|
||||
OutlineType = list[Union[Destination, list[Union[Destination, list[Destination]]]]]
|
||||
|
||||
LayoutType: TypeAlias = Literal[
|
||||
"/NoLayout",
|
||||
"/SinglePage",
|
||||
"/OneColumn",
|
||||
"/TwoColumnLeft",
|
||||
"/TwoColumnRight",
|
||||
"/TwoPageLeft",
|
||||
"/TwoPageRight",
|
||||
]
|
||||
|
||||
PagemodeType: TypeAlias = Literal[
|
||||
"/UseNone",
|
||||
"/UseOutlines",
|
||||
"/UseThumbs",
|
||||
"/FullScreen",
|
||||
"/UseOC",
|
||||
"/UseAttachments",
|
||||
]
|
||||
|
||||
AnnotationSubtype: TypeAlias = Literal[
|
||||
"/Text",
|
||||
"/Link",
|
||||
"/FreeText",
|
||||
"/Line",
|
||||
"/Square",
|
||||
"/Circle",
|
||||
"/Polygon",
|
||||
"/PolyLine",
|
||||
"/Highlight",
|
||||
"/Underline",
|
||||
"/Squiggly",
|
||||
"/StrikeOut",
|
||||
"/Caret",
|
||||
"/Stamp",
|
||||
"/Ink",
|
||||
"/Popup",
|
||||
"/FileAttachment",
|
||||
"/Sound",
|
||||
"/Movie",
|
||||
"/Screen",
|
||||
"/Widget",
|
||||
"/PrinterMark",
|
||||
"/TrapNet",
|
||||
"/Watermark",
|
||||
"/3D",
|
||||
"/Redact",
|
||||
"/Projection",
|
||||
"/RichMedia",
|
||||
]
|
||||
748
venv/lib/python3.12/site-packages/pypdf/xmp.py
Normal file
748
venv/lib/python3.12/site-packages/pypdf/xmp.py
Normal file
@@ -0,0 +1,748 @@
|
||||
"""
|
||||
Anything related to Extensible Metadata Platform (XMP) metadata.
|
||||
|
||||
https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import decimal
|
||||
import re
|
||||
from collections.abc import Iterator
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Optional,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
from xml.dom.minidom import Document, parseString
|
||||
from xml.dom.minidom import Element as XmlElement
|
||||
from xml.parsers.expat import ExpatError
|
||||
|
||||
from ._protocols import XmpInformationProtocol
|
||||
from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement
|
||||
from .errors import PdfReadError, XmpDocumentError
|
||||
from .generic import ContentStream, PdfObject
|
||||
|
||||
# XMP/RDF namespace URIs used throughout this module.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# This namespace is used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.
#
# Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND}
#
# The pdfx namespace should be avoided.
# A custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP under "Extensibility of
# Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Internal mapping of namespace URI → prefix
_NAMESPACE_PREFIX_MAP = {
    DC_NAMESPACE: "dc",
    XMP_NAMESPACE: "xmp",
    PDF_NAMESPACE: "pdf",
    XMPMM_NAMESPACE: "xmpMM",
    PDFAID_NAMESPACE: "pdfaid",
    PDFX_NAMESPACE: "pdfx",
}

# Permissive ISO 8601 matcher; re.VERBOSE makes the layout whitespace
# insignificant.  Everything after the year is optional.
iso8601 = re.compile(
    """
    (?P<year>[0-9]{4})
    (-
    (?P<month>[0-9]{2})
    (-
    (?P<day>[0-9]+)
    (T
    (?P<hour>[0-9]{2}):
    (?P<minute>[0-9]{2})
    (:(?P<second>[0-9]{2}(.[0-9]+)?))?
    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
    )?
    )?
    )?
    """,
    re.VERBOSE,
)


K = TypeVar("K")

# Minimal XMP template
_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
<rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
<rdf:Description rdf:about=""
xmlns:dc="{DC_NAMESPACE}"
xmlns:xmp="{XMP_NAMESPACE}"
xmlns:pdf="{PDF_NAMESPACE}"
xmlns:xmpMM="{XMPMM_NAMESPACE}"
xmlns:pdfaid="{PDFAID_NAMESPACE}"
xmlns:pdfx="{PDFX_NAMESPACE}">
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="r"?>"""
||||
|
||||
|
||||
def _identity(value: K) -> K:
    """Default converter: return *value* unchanged."""
    return value
|
||||
|
||||
|
||||
def _converter_date(value: str) -> datetime.datetime:
    """
    Parse an ISO 8601 date/time string into a naive datetime normalized to UTC.

    Args:
        value: ISO 8601 string, e.g. ``2024-01-02T03:04:05.5+02:00``.
            Missing components (month, day, time) default to January 1st,
            midnight.

    Returns:
        A timezone-naive ``datetime.datetime`` in UTC.

    Raises:
        ValueError: If the value does not match the ISO 8601 pattern.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")
    second = decimal.Decimal(matches.group("second") or "0")
    # Split the (possibly fractional) seconds into whole seconds and
    # microseconds — datetime's seventh positional argument is `microsecond`,
    # so the fraction is scaled by 1e6.  (The old local name "milliseconds"
    # was misleading; the value was always microseconds.)
    whole_seconds_dec = second.to_integral(decimal.ROUND_FLOOR)
    microseconds_dec = (second - whole_seconds_dec) * 1_000_000

    seconds = int(whole_seconds_dec)
    microseconds = int(microseconds_dec)

    tzd = matches.group("tzd") or "Z"
    dt = datetime.datetime(year, month, day, hour, minute, seconds, microseconds)
    if tzd != "Z":
        # Apply the inverse of the timezone offset to normalize to UTC:
        # "+05:30" subtracts 5h30m, "-05:30" adds 5h30m.
        tzd_hours, tzd_minutes = (int(x) for x in tzd.split(":"))
        tzd_hours *= -1
        if tzd_hours < 0:
            tzd_minutes *= -1
        dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
    return dt
|
||||
|
||||
|
||||
def _format_datetime_utc(value: datetime.datetime) -> str:
|
||||
"""Format a datetime as UTC with trailing 'Z'.
|
||||
|
||||
- If the input is timezone-aware, convert to UTC first.
|
||||
- If naive, assume UTC.
|
||||
"""
|
||||
if value.tzinfo is not None and value.utcoffset() is not None:
|
||||
value = value.astimezone(datetime.timezone.utc)
|
||||
|
||||
value = value.replace(tzinfo=None)
|
||||
return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
|
||||
|
||||
def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    """
    Collect the converted text of every rdf:li inside *list_type* containers.

    Returns None when *element* contains no container of the requested type
    (rdf:Bag / rdf:Seq / ...); otherwise returns the (possibly empty) list of
    converted item values.
    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not len(containers):
        return None
    values: list[Any] = []
    for container in containers:
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
            values.append(converter(self._get_text(item)))
    return values
|
||||
|
||||
|
||||
class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
        PdfReadError: if XML is invalid

    """

    def __init__(self, stream: ContentStream) -> None:
        # Raw XMP packet; the DOM below is parsed from it and written back
        # via _update_stream() whenever a setter mutates the metadata.
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}")
        self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
            RDF_NAMESPACE, "RDF"
        )[0]
        # Per-namespace cache of parsed values: {namespace: {name: value}}.
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.

        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        # Deprecated entry point; delegates to the underlying content stream.
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        # Yields both attribute nodes and child elements matching
        # namespace/name on the rdf:Description whose rdf:about matches.
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        # Yields every attribute and direct child element of the matching
        # rdf:Description that belongs to *namespace*.
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        # Concatenate all direct text-node children of *element*.
        text = ""
        for child in element.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text += child.data
        return text

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        # Fetch a single (first matching) value, with caching.
        # NOTE: the truthiness check means falsy cached values (e.g. "")
        # are recomputed on every access.
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        value = None
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break  # only the first match is used
        if value is not None:
            value = converter(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        # Fetch an unordered array (rdf:Bag) of values, with caching.
        # Elements without a Bag fall back to their plain text content.
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if (bags := _generic_get(element, self, list_type="Bag")) is not None:
                retval.extend(bags)
            else:
                value = self._get_text(element)
                retval.append(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        # Fetch an ordered array (rdf:Seq) of values, with caching and a
        # Bag fallback for non-conforming producers.
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                retval.extend(bags)
            else:
                value = converter(self._get_text(element))
                retval.append(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        # Fetch a language-alternatives mapping (rdf:Alt), with caching.
        # Elements without an Alt fall back to an "x-default" entry.
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if len(alts):
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        value = self._get_text(item)
                        retval[item.getAttribute("xml:lang")] = value
            else:
                retval["x-default"] = self._get_text(element)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        # Accepts both datetime objects (formatted as UTC) and raw strings.
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
        else:
            date_strings = []
            for value in values:
                if isinstance(value, datetime.datetime):
                    date_strings.append(_format_datetime_utc(value))
                else:
                    date_strings.append(str(value))
            self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        if value:
            date_str = _format_datetime_utc(value)
            self._set_single_value(XMP_NAMESPACE, "CreateDate", date_str)
        else:
            self._set_single_value(XMP_NAMESPACE, "CreateDate", None)

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        if value:
            date_str = _format_datetime_utc(value)
            self._set_single_value(XMP_NAMESPACE, "ModifyDate", date_str)
        else:
            self._set_single_value(XMP_NAMESPACE, "ModifyDate", None)

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        if value:
            date_str = _format_datetime_utc(value)
            self._set_single_value(XMP_NAMESPACE, "MetadataDate", date_str)
        else:
            self._set_single_value(XMP_NAMESPACE, "MetadataDate", None)

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        # Lazily built and memoized on first access.
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file:
                    # "\u2182XXXX" escapes are decoded back to the original
                    # character, four hex digits at a time.
                    idx = key.find("\u2182")
                    if idx == -1:
                        break
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                self._custom_properties[key] = value
        return self._custom_properties

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                return desc

        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        ns_cache = self.cache.get(namespace)
        if ns_cache and name in ns_cache:
            del ns_cache[name]

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        # Remove any existing element or attribute representation first.
        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if existing_attr := desc.getAttributeNodeNS(namespace, name):
            desc.removeAttributeNode(existing_attr)

        if value is not None:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            text_node = doc.createTextNode(str(value))
            elem.appendChild(text_node)
            desc.appendChild(elem)

        self._update_stream()

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if values:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            bag = doc.createElementNS(RDF_NAMESPACE, "rdf:Bag")

            for value in values:
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                text_node = doc.createTextNode(str(value))
                li.appendChild(text_node)
                bag.appendChild(li)

            elem.appendChild(bag)
            desc.appendChild(elem)

        self._update_stream()

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if values:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            seq = doc.createElementNS(RDF_NAMESPACE, "rdf:Seq")

            for value in values:
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                text_node = doc.createTextNode(str(value))
                li.appendChild(text_node)
                seq.appendChild(li)

            elem.appendChild(seq)
            desc.appendChild(elem)

        self._update_stream()

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if values:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            alt = doc.createElementNS(RDF_NAMESPACE, "rdf:Alt")

            for lang, value in values.items():
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                li.setAttribute("xml:lang", lang)
                text_node = doc.createTextNode(str(value))
                li.appendChild(text_node)
                alt.appendChild(li)

            elem.appendChild(alt)
            desc.appendChild(elem)

        self._update_stream()

    def _get_namespace_prefix(self, namespace: str) -> str:
        """Get the appropriate namespace prefix for a given namespace URI."""
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")

        xml_data = doc.toxml(encoding="utf-8")
        self.stream.set_data(xml_data)
|
||||
Reference in New Issue
Block a user