Update dashboard, memory +1 more (+2 ~3)
@@ -687,6 +687,11 @@
   border-color: var(--accent);
 }
 
+.issue-checkbox.in-progress {
+  background: rgba(59, 130, 246, 0.3);
+  border-color: #3b82f6;
+}
+
 .issue-checkbox svg {
   width: 12px;
   height: 12px;
@@ -698,6 +703,14 @@
   display: block;
 }
 
+.issue-checkbox.in-progress::after {
+  content: '';
+  width: 8px;
+  height: 8px;
+  border-radius: 50%;
+  background: #3b82f6;
+}
+
 .issue-content {
   flex: 1;
   min-width: 0;
@@ -738,6 +751,27 @@
 .issue-owner.marius { color: #22c55e; }
 .issue-owner.robert { color: #f59e0b; }
 
+.issue-status {
+  padding: 2px 8px;
+  border-radius: var(--radius-sm);
+  font-size: 11px;
+  font-weight: 600;
+  text-transform: uppercase;
+  letter-spacing: 0.3px;
+}
+.issue-status.todo {
+  background: rgba(156, 163, 175, 0.2);
+  color: #9ca3af;
+}
+.issue-status.in-progress {
+  background: rgba(59, 130, 246, 0.2);
+  color: #3b82f6;
+}
+.issue-status.done {
+  background: rgba(34, 197, 94, 0.2);
+  color: #22c55e;
+}
+
 /* Todo's Panel */
 .todos-panel { border-left: 3px solid #8b5cf6; }
 .todo-section { margin-bottom: 16px; }
@@ -1266,10 +1300,21 @@
           <option value="backlog">⚪ Backlog</option>
         </select>
       </div>
+      <div class="form-group">
+        <label class="form-label">Status</label>
+        <select class="input" id="issueStatus">
+          <option value="todo">Todo</option>
+          <option value="in-progress">In Progress</option>
+          <option value="done">Done</option>
+        </select>
+      </div>
+    </div>
+    <div class="form-row">
       <div class="form-group">
         <label class="form-label">Deadline</label>
         <input type="date" class="input" id="issueDeadline">
       </div>
+      <div class="form-group"></div>
     </div>
     <div class="modal-actions">
       <button class="btn btn-danger" id="issueDeleteBtn" onclick="deleteIssue()" style="margin-right: auto; display: none;">Șterge</button>
@@ -2129,7 +2174,6 @@
     <div class="priority-group">
       <div class="priority-header ${isCollapsed ? 'collapsed' : ''}" onclick="togglePriority('${priority}')">
         <i data-lucide="chevron-down"></i>
-        <span class="priority-dot ${priority}"></span>
         <span>${priorityLabels[priority]}</span>
         <span style="margin-left: auto; opacity: 0.7">${todoCount}/${issues.length}</span>
       </div>
@@ -2146,18 +2190,23 @@
 
 function renderIssueItem(issue) {
   const isDone = issue.status === 'done';
+  const isInProgress = issue.status === 'in-progress';
   const ownerIcons = { 'clawdbot': '🤖', 'robert': '👷', 'marius': '👤' };
   const ownerIcon = ownerIcons[issue.owner] || '👤';
   const dateStr = new Date(issue.created).toLocaleDateString('ro-RO', { day: 'numeric', month: 'short' });
+  const statusLabels = { 'todo': 'Todo', 'in-progress': 'In Progress', 'done': 'Done' };
+  const statusLabel = statusLabels[issue.status] || 'Todo';
+  const checkboxClass = isDone ? 'checked' : (isInProgress ? 'in-progress' : '');
+
   return `
     <div class="issue-item ${isDone ? 'done' : ''}" data-id="${issue.id}">
-      <div class="issue-checkbox ${isDone ? 'checked' : ''}" onclick="toggleIssue('${issue.id}')">
+      <div class="issue-checkbox ${checkboxClass}" onclick="toggleIssue('${issue.id}')" title="Click pentru a schimba statusul">
        <i data-lucide="check"></i>
      </div>
      <div class="issue-content" onclick="editIssue('${issue.id}')">
        <div class="issue-title">${issue.title}</div>
        <div class="issue-meta">
+          <span class="issue-status ${issue.status || 'todo'}">${statusLabel}</span>
          ${issue.program ? `<span class="issue-tag program">${issue.program}</span>` : ''}
          <span class="issue-owner ${issue.owner}">${ownerIcon} ${issue.owner === 'clawdbot' ? 'Clawdbot' : (issue.owner === 'robert' ? 'Robert' : 'Marius')}</span>
          <span class="issue-date">${dateStr}</span>
@@ -2180,17 +2229,27 @@
   const issue = issuesData.issues.find(i => i.id === id);
   if (!issue) return;
 
-  issue.status = issue.status === 'done' ? 'todo' : 'done';
+  // Cycle: todo → in-progress → done → todo
+  const statusCycle = { 'todo': 'in-progress', 'in-progress': 'done', 'done': 'todo' };
+  const currentStatus = issue.status || 'todo';
+  issue.status = statusCycle[currentStatus] || 'in-progress';
+
   if (issue.status === 'done') {
     issue.completed = new Date().toISOString();
   } else {
     delete issue.completed;
   }
 
+  const statusMessages = {
+    'in-progress': '🔄 In Progress',
+    'done': '✅ Done!',
+    'todo': '📋 Todo'
+  };
+
   renderIssues();
   updateIssuesCount();
   await saveIssues();
-  showToast(issue.status === 'done' ? 'Issue finalizat! ✓' : 'Issue redeschis');
+  showToast(statusMessages[issue.status]);
 }
 
 // Filters
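Reviewer note: a minimal standalone sketch of the three-state cycle the hunk above introduces. The `nextStatus` helper is hypothetical (for illustration only, not part of the dashboard code); it mirrors the hunk's `statusCycle[currentStatus] || 'in-progress'` guard, so unknown or missing statuses fall back the same way.

```js
// Illustrative sketch of the status cycle, assuming the same statusCycle map.
const statusCycle = { 'todo': 'in-progress', 'in-progress': 'done', 'done': 'todo' };

function nextStatus(current) {
  // Missing status defaults to 'todo'; unknown status falls back to 'in-progress'.
  return statusCycle[current || 'todo'] || 'in-progress';
}

console.log(nextStatus('todo'));        // 'in-progress'
console.log(nextStatus('in-progress')); // 'done'
console.log(nextStatus('done'));        // 'todo'
console.log(nextStatus('bogus'));       // 'in-progress' (fallback)
```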
@@ -2212,6 +2271,7 @@
   document.getElementById('issueProgram').value = '';
   document.getElementById('issueOwner').value = 'marius';
   document.getElementById('issuePriority').value = 'urgent-important';
+  document.getElementById('issueStatus').value = 'todo';
   document.getElementById('issueDeadline').value = '';
   document.getElementById('issueDeleteBtn').style.display = 'none';
   document.getElementById('issueSaveBtn').textContent = 'Adaugă';
@@ -2230,6 +2290,7 @@
   document.getElementById('issueProgram').value = issue.program || '';
   document.getElementById('issueOwner').value = issue.owner || 'marius';
   document.getElementById('issuePriority').value = issue.priority || 'backlog';
+  document.getElementById('issueStatus').value = issue.status || 'todo';
   document.getElementById('issueDeadline').value = issue.deadline || '';
   document.getElementById('issueDeleteBtn').style.display = 'block';
   document.getElementById('issueSaveBtn').textContent = 'Salvează';
@@ -2272,6 +2333,13 @@
   issue.program = document.getElementById('issueProgram').value;
   issue.owner = document.getElementById('issueOwner').value;
   issue.priority = document.getElementById('issuePriority').value;
+  const newStatus = document.getElementById('issueStatus').value;
+  if (newStatus === 'done' && issue.status !== 'done') {
+    issue.completed = new Date().toISOString();
+  } else if (newStatus !== 'done') {
+    delete issue.completed;
+  }
+  issue.status = newStatus;
   issue.deadline = document.getElementById('issueDeadline').value || null;
   issue.updated = new Date().toISOString();
 }
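Reviewer note: the `completed` bookkeeping in the hunk above, as a self-contained sketch (the `applyStatus` helper and the sample issue are hypothetical). The timestamp is set only on the transition into 'done', preserved when an already-done issue is re-saved, and cleared when the issue leaves 'done'.

```js
// Illustrative only; mirrors the guard in the hunk above.
function applyStatus(issue, newStatus) {
  if (newStatus === 'done' && issue.status !== 'done') {
    issue.completed = new Date().toISOString(); // stamp on entering 'done'
  } else if (newStatus !== 'done') {
    delete issue.completed;                     // clear when leaving 'done'
  }
  issue.status = newStatus;                     // re-saving 'done' keeps the stamp
  return issue;
}

const issue = { id: 'ROA-XXX', status: 'in-progress' }; // hypothetical issue
applyStatus(issue, 'done');  // sets issue.completed
const first = issue.completed;
applyStatus(issue, 'done');  // first === issue.completed (unchanged)
applyStatus(issue, 'todo');  // removes issue.completed
```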
@@ -2285,7 +2353,7 @@
     program: document.getElementById('issueProgram').value,
     owner: document.getElementById('issueOwner').value,
     priority: document.getElementById('issuePriority').value,
-    status: 'todo',
+    status: document.getElementById('issueStatus').value || 'todo',
     created: new Date().toISOString(),
     deadline: document.getElementById('issueDeadline').value || null
   };
@@ -1,5 +1,5 @@
 {
-  "lastUpdated": "2026-02-02T11:25:18.119Z",
+  "lastUpdated": "2026-02-02T22:27:06.452Z",
   "programs": [
     "ROACONT",
     "ROAGEST",
@@ -23,7 +23,8 @@
       "priority": "urgent-important",
       "status": "todo",
       "created": "2026-02-02T11:25:18.115Z",
-      "deadline": "2026-02-02"
+      "deadline": "2026-02-02",
+      "updated": "2026-02-02T22:27:06.428Z"
     },
     {
       "id": "ROA-001",
@@ -31,10 +32,11 @@
       "description": "RD 49 = în urma inspecției fiscale\nRD 50 = impozit precedent\nFormularul nu recalculează impozitul de 16%\nRD 40 se modifică și la 4.1",
       "program": "ROACONT",
       "owner": "marius",
-      "priority": "urgent-important",
+      "priority": "important",
       "status": "todo",
       "created": "2026-01-30T15:10:00Z",
-      "deadline": null
+      "deadline": "2026-02-06",
+      "updated": "2026-02-02T22:26:59.690Z"
     }
   ]
 }
@@ -1,52 +1,33 @@
-# 2026-02-02 - Session notes
+# 2 February 2026
 
 ## Decisions
+- Marius approves ALL the proposals from the evening report ("Yes")
+- A0 + A3 executed immediately
+- A1 + A2 (TU+EU sessions) to be scheduled Monday-Thursday 15:00-16:00
+
-### Reports via EMAIL (not Discord)
-- Morning-report and evening-report now go out via **email** (mmarius28@gmail.com)
-- New format with two sections:
-  - **📚 Synthesis** - models/concepts → separate file + link
-  - **⚡ Actionables** - tasks with clear WHO/WHAT/EFFORT/RESULT
-- 3 predefined replies (1/2/3) for 80/20
-- Jobs updated: `morning-report`, `evening-report`
+## Executed
+- **A0:** Git commit and push (2 commits: TOOLS.md, KB index, coaching, email tool)
+- **A3:** Integrated the question "What story about yourself should you let go of?" into the insights for morning coaching
 
-### Fix email_send.py
-- Problem: MailChannels + Gmail were rejecting the emails
-- Cause: emoji in FROM_NAME + non-RFC headers
-- Fix:
-  - `FROM_NAME = "Echo"` (no emoji)
-  - `Header(subject, 'utf-8')` for encoding
-  - `formataddr((FROM_NAME, SMTP_USER))` for RFC compliance
+## To schedule
+- **A1:** "Dissolving 'I Don't Deserve It'" session (30 min) - Monica Ion exercise
+- **A2:** The 5-step system for fears (15 min) - Zoltan Vereș
 
-### Sub-agent rules (AGENTS.md)
-- When I launch a sub-agent, I MUST give it the full context: AGENTS.md, SOUL.md, USER.md, relevant memory
-- The sub-agent runs isolated; it has no automatic access to my files
+## Feedback from Marius
+1. **Email replies:** He does not receive the confirmation emails - flow to be checked
+2. **Insights → Reports:** The evening report was too conservative - 22 insights extracted but only 4 proposals in the report. Adjust the evening-report job to propose more.
 
-## Files created/modified
-- `memory/kb/insights/2026-02-02.md` - 22 insights from 20 videos
-- `memory/kb/insights/sinteza-2026-02-02.md` - 16 models/concepts (synthesis)
-- `tools/email_send.py` - RFC compliance fix
-- `AGENTS.md` - sub-agent rules
-- `TOOLS.md` - updated job documentation
+## Stats today
+- 23 YouTube notes in the KB (20 processed today - Zoltan Vereș workshop)
+- 22 insights extracted into `memory/kb/insights/2026-02-02.md`
+- The insights-extract job works, but the reports do not use all of them
 
-## Approved and executed (email reply: YES)
+## To do
+- [x] Adjust evening-report and morning-report to propose with a concrete DAY and TIME
+- [x] Added listing of available insights in the reports
+- [ ] Schedule A1 and A2 with Marius
 
-**Executed:**
-- ✅ A0: Git commit + push (54 files)
-- ✅ A4: Evening "10 things" template → memory/kb/projects/templates/template-seara-merit.md
-
-**Scheduled tomorrow (grup-sprijin-pregatire job):**
-- A3: Support-group worksheet - the victim state (topic prepared from insights)
-
-**Scheduled Wednesday-Thursday 15-16:**
-- A1: List of efforts for new clients (template + we fill it in together)
-- A2: Template for value delivered to clients (template + we fill it in together)
-- A5: Inner-film session (30 min conversation)
-
-## Learned
-
-- Email deliverability: MailChannels can block emails from shared hosting
-- Gmail is strict about RFC 5322 - headers must be formatted correctly
-- Email reports > Discord for decisions that require thinking
-- The "synthesis + actionables + predefined replies" format = 80/20 friendly
+## Lessons learned
+- **Reports:** ALL TU+EU/FAC TU proposals must have a concrete day and time
+- **Email flow:** Reply #1 immediately (receipt confirmation), Reply #2 after execution (what was done)
+- **Insights:** List ALL available insights, not just a few
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/INSTALLER (new file; path inferred from the RECORD below)
@@ -0,0 +1 @@
+pip
170
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/METADATA
Normal file
170
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/METADATA
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
Metadata-Version: 2.4
|
||||||
|
Name: pypdf
|
||||||
|
Version: 6.6.2
|
||||||
|
Summary: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
|
||||||
|
Author-email: Mathieu Fenniak <biziqe@mathieu.fenniak.net>
|
||||||
|
Maintainer: stefan6419846
|
||||||
|
Maintainer-email: Martin Thoma <info@martin-thoma.de>
|
||||||
|
Requires-Python: >=3.9
|
||||||
|
Description-Content-Type: text/markdown
|
||||||
|
License-Expression: BSD-3-Clause
|
||||||
|
Classifier: Development Status :: 5 - Production/Stable
|
||||||
|
Classifier: Intended Audience :: Developers
|
||||||
|
Classifier: Programming Language :: Python :: 3
|
||||||
|
Classifier: Programming Language :: Python :: 3 :: Only
|
||||||
|
Classifier: Programming Language :: Python :: 3.9
|
||||||
|
Classifier: Programming Language :: Python :: 3.10
|
||||||
|
Classifier: Programming Language :: Python :: 3.11
|
||||||
|
Classifier: Programming Language :: Python :: 3.12
|
||||||
|
Classifier: Programming Language :: Python :: 3.13
|
||||||
|
Classifier: Programming Language :: Python :: 3.14
|
||||||
|
Classifier: Operating System :: OS Independent
|
||||||
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||||
|
Classifier: Typing :: Typed
|
||||||
|
License-File: LICENSE
|
||||||
|
Requires-Dist: typing_extensions >= 4.0; python_version < '3.11'
|
||||||
|
Requires-Dist: cryptography ; extra == "crypto"
|
||||||
|
Requires-Dist: PyCryptodome ; extra == "cryptodome"
|
||||||
|
Requires-Dist: black ; extra == "dev"
|
||||||
|
Requires-Dist: flit ; extra == "dev"
|
||||||
|
Requires-Dist: pip-tools ; extra == "dev"
|
||||||
|
Requires-Dist: pre-commit ; extra == "dev"
|
||||||
|
Requires-Dist: pytest-cov ; extra == "dev"
|
||||||
|
Requires-Dist: pytest-socket ; extra == "dev"
|
||||||
|
Requires-Dist: pytest-timeout ; extra == "dev"
|
||||||
|
Requires-Dist: pytest-xdist ; extra == "dev"
|
||||||
|
Requires-Dist: wheel ; extra == "dev"
|
||||||
|
Requires-Dist: myst_parser ; extra == "docs"
|
||||||
|
Requires-Dist: sphinx ; extra == "docs"
|
||||||
|
Requires-Dist: sphinx_rtd_theme ; extra == "docs"
|
||||||
|
Requires-Dist: cryptography ; extra == "full"
|
||||||
|
Requires-Dist: Pillow>=8.0.0 ; extra == "full"
|
||||||
|
Requires-Dist: Pillow>=8.0.0 ; extra == "image"
|
||||||
|
Project-URL: Bug Reports, https://github.com/py-pdf/pypdf/issues
|
||||||
|
Project-URL: Changelog, https://pypdf.readthedocs.io/en/latest/meta/CHANGELOG.html
|
||||||
|
Project-URL: Documentation, https://pypdf.readthedocs.io/en/latest/
|
||||||
|
Project-URL: Source, https://github.com/py-pdf/pypdf
|
||||||
|
Provides-Extra: crypto
|
||||||
|
Provides-Extra: cryptodome
|
||||||
|
Provides-Extra: dev
|
||||||
|
Provides-Extra: docs
|
||||||
|
Provides-Extra: full
|
||||||
|
Provides-Extra: image
|
||||||
|
|
||||||
|
[](https://badge.fury.io/py/pypdf)
|
||||||
|
[](https://pypi.org/project/pypdf/)
|
||||||
|
[](https://pypdf.readthedocs.io/en/stable/)
|
||||||
|
[](https://github.com/py-pdf/pypdf)
|
||||||
|
[](https://codecov.io/gh/py-pdf/pypdf)
|
||||||
|
|
||||||
|
# pypdf
|
||||||
|
|
||||||
|
pypdf is a free and open-source pure-python PDF library capable of splitting,
|
||||||
|
[merging](https://pypdf.readthedocs.io/en/stable/user/merging-pdfs.html),
|
||||||
|
[cropping, and transforming](https://pypdf.readthedocs.io/en/stable/user/cropping-and-transforming.html)
|
||||||
|
the pages of PDF files. It can also add
|
||||||
|
custom data, viewing options, and
|
||||||
|
[passwords](https://pypdf.readthedocs.io/en/stable/user/encryption-decryption.html)
|
||||||
|
to PDF files. pypdf can
|
||||||
|
[retrieve text](https://pypdf.readthedocs.io/en/stable/user/extract-text.html)
|
||||||
|
and
|
||||||
|
[metadata](https://pypdf.readthedocs.io/en/stable/user/metadata.html)
|
||||||
|
from PDFs as well.
|
||||||
|
|
||||||
|
See [pdfly](https://github.com/py-pdf/pdfly) for a CLI application that uses pypdf to interact with PDFs.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Install pypdf using pip:
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install pypdf
|
||||||
|
```
|
||||||
|
|
||||||
|
For using pypdf with AES encryption or decryption, install extra dependencies:
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install pypdf[crypto]
|
||||||
|
```
|
||||||
|
|
||||||
|
> **NOTE**: `pypdf` 3.1.0 and above include significant improvements compared to
|
||||||
|
> previous versions. Please refer to [the migration
|
||||||
|
> guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html) for
|
||||||
|
> more information.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
reader = PdfReader("example.pdf")
|
||||||
|
number_of_pages = len(reader.pages)
|
||||||
|
page = reader.pages[0]
|
||||||
|
text = page.extract_text()
|
||||||
|
```
|
||||||
|
|
||||||
|
pypdf can do a lot more, e.g. splitting, merging, reading and creating annotations, decrypting and encrypting. Check out the
|
||||||
|
[documentation](https://pypdf.readthedocs.io/en/stable/) for additional usage
|
||||||
|
examples!
|
||||||
|
|
||||||
|
For questions and answers, visit
|
||||||
|
[StackOverflow](https://stackoverflow.com/questions/tagged/pypdf)
|
||||||
|
(tagged with [pypdf](https://stackoverflow.com/questions/tagged/pypdf)).
|
||||||
|
|
||||||
|
## Contributions
|
||||||
|
|
||||||
|
Maintaining pypdf is a collaborative effort. You can support the project by
|
||||||
|
writing documentation, helping to narrow down issues, and submitting code.
|
||||||
|
See the [CONTRIBUTING.md](https://github.com/py-pdf/pypdf/blob/main/CONTRIBUTING.md) file for more information.
|
||||||
|
|
||||||
|
### Q&A
|
||||||
|
|
||||||
|
The experience pypdf users have covers the whole range from beginner to expert. You can contribute to the pypdf community by answering questions
|
||||||
|
on [StackOverflow](https://stackoverflow.com/questions/tagged/pypdf),
|
||||||
|
helping in [discussions](https://github.com/py-pdf/pypdf/discussions),
|
||||||
|
and asking users who report issues for [MCVE](https://stackoverflow.com/help/minimal-reproducible-example)'s (Code + example PDF!).
|
||||||
|
|
||||||
|
|
||||||
|
### Issues
|
||||||
|
|
||||||
|
A good bug ticket includes a MCVE - a minimal complete verifiable example.
|
||||||
|
For pypdf, this means that you must upload a PDF that causes the bug to occur
|
||||||
|
as well as the code you're executing with all of the output. Use
|
||||||
|
`print(pypdf.__version__)` to tell us which version you're using.
|
||||||
|
|
||||||
|
### Code
|
||||||
|
|
||||||
|
All code contributions are welcome, but smaller ones have a better chance to
|
||||||
|
get included in a timely manner. Adding unit tests for new features or test
|
||||||
|
cases for bugs you've fixed help us to ensure that the Pull Request (PR) is fine.
|
||||||
|
|
||||||
|
pypdf includes a test suite which can be executed with `pytest`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ pytest
|
||||||
|
===================== test session starts =====================
|
||||||
|
platform linux -- Python 3.6.15, pytest-7.0.1, pluggy-1.0.0
|
||||||
|
rootdir: /home/moose/GitHub/Martin/pypdf
|
||||||
|
plugins: cov-3.0.0
|
||||||
|
collected 233 items
|
||||||
|
|
||||||
|
tests/test_basic_features.py .. [ 0%]
|
||||||
|
tests/test_constants.py . [ 1%]
|
||||||
|
tests/test_filters.py .................x..... [ 11%]
|
||||||
|
tests/test_generic.py ................................. [ 25%]
|
||||||
|
............. [ 30%]
|
||||||
|
tests/test_javascript.py .. [ 31%]
|
||||||
|
tests/test_merger.py . [ 32%]
|
||||||
|
tests/test_page.py ......................... [ 42%]
|
||||||
|
tests/test_pagerange.py ................ [ 49%]
|
||||||
|
tests/test_papersizes.py .................. [ 57%]
|
||||||
|
tests/test_reader.py .................................. [ 72%]
|
||||||
|
............... [ 78%]
|
||||||
|
tests/test_utils.py .................... [ 87%]
|
||||||
|
tests/test_workflows.py .......... [ 91%]
|
||||||
|
tests/test_writer.py ................. [ 98%]
|
||||||
|
tests/test_xmp.py ... [100%]
|
||||||
|
|
||||||
|
========== 232 passed, 1 xfailed, 1 warning in 4.52s ==========
|
||||||
|
```
|
||||||
|
|
||||||
117
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/RECORD
Normal file
117
venv/lib/python3.12/site-packages/pypdf-6.6.2.dist-info/RECORD
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
pypdf-6.6.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||||
|
pypdf-6.6.2.dist-info/METADATA,sha256=1Vu0OgjW3amj2S_YMUmD0Lj_7_GEw-f5VaIM-_9niK8,7149
|
||||||
|
pypdf-6.6.2.dist-info/RECORD,,
|
||||||
|
pypdf-6.6.2.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||||
|
pypdf-6.6.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
||||||
|
pypdf-6.6.2.dist-info/licenses/LICENSE,sha256=qXrCMOXzPvEKU2eoUOsB-R8aCwZONHQsd5TSKUVX9SQ,1605
|
||||||
|
pypdf/__init__.py,sha256=YS_1ZrQ3jBPHsRgMstqJrAts3lUApj_lMOMK5qiLG5w,1283
|
||||||
|
pypdf/__pycache__/__init__.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_cmap.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_doc_common.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_encryption.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_font.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_page.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_page_labels.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_protocols.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_reader.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_utils.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_version.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_writer.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/_xobj_image_helpers.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/constants.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/errors.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/filters.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/pagerange.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/papersizes.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/types.cpython-312.pyc,,
|
||||||
|
pypdf/__pycache__/xmp.cpython-312.pyc,,
|
||||||
|
pypdf/_cmap.py,sha256=iaAvJQQKBxkqMj5-WdD4vZV-Zdz-Sba5j6q3oPQyLT0,11713
|
||||||
|
pypdf/_codecs/__init__.py,sha256=PF1KlsLWCOF0cgdqns7G4X-l3zq5_OnZePw7RFIn1bE,1645
|
||||||
|
pypdf/_codecs/__pycache__/__init__.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/__pycache__/_codecs.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/__pycache__/adobe_glyphs.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/__pycache__/core_fontmetrics.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/__pycache__/pdfdoc.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/__pycache__/std.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/__pycache__/symbol.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/__pycache__/zapfding.cpython-312.pyc,,
|
||||||
|
pypdf/_codecs/_codecs.py,sha256=46oRZJySwGxCJp1kjIer7js_TYSjj4Gs2i2Uce3v-eE,10555
|
||||||
|
pypdf/_codecs/adobe_glyphs.py,sha256=t3cDFPDqwIz1w9B0gdVzjdc8eEK9AuRjk5f7laEw_fY,447213
|
||||||
|
pypdf/_codecs/core_fontmetrics.py,sha256=qQvNRQi8V8FOBmSwGcsak4qyl9cQ80cDjbpD5TvhuBg,113269
|
||||||
|
pypdf/_codecs/pdfdoc.py,sha256=xfSvMFYsvxuaSQ0Uu9vZDKaB0Wu85h1uCiB1i9rAcUU,4269
|
||||||
|
pypdf/_codecs/std.py,sha256=DyQMuEpAGEpS9uy1jWf4cnj-kqShPOAij5sI7Q1YD8E,2630
|
||||||
|
pypdf/_codecs/symbol.py,sha256=nIaGQIlhWCJiPMHrwUlmGHH-_fOXyEKvguRmuKXcGAk,3734
|
||||||
|
pypdf/_codecs/zapfding.py,sha256=PQxjxRC616d41xF3exVxP1W8nM4QrZfjO3lmtLxpE_s,3742
|
||||||
|
pypdf/_crypt_providers/__init__.py,sha256=K3Z6AuXhXVeXgLet-Tukq2gt9H66OgdupsvxIS1CmkI,3054
|
||||||
|
pypdf/_crypt_providers/__pycache__/__init__.cpython-312.pyc,,
|
||||||
|
pypdf/_crypt_providers/__pycache__/_base.cpython-312.pyc,,
|
||||||
|
pypdf/_crypt_providers/__pycache__/_cryptography.cpython-312.pyc,,
|
||||||
|
pypdf/_crypt_providers/__pycache__/_fallback.cpython-312.pyc,,
|
||||||
|
pypdf/_crypt_providers/__pycache__/_pycryptodome.cpython-312.pyc,,
|
||||||
|
pypdf/_crypt_providers/_base.py,sha256=_f53Mj6vivhEZMQ4vNxN5G0IOgFY-n5_leke0c_qiNU,1711
|
||||||
|
pypdf/_crypt_providers/_cryptography.py,sha256=zT3WmbPzesvgHRkGcKAldqJ24MY3BwZViVbSc55Zxhw,4557
|
||||||
|
pypdf/_crypt_providers/_fallback.py,sha256=vsYoowR1YCAV_q-HrdIZhkUcrCb6HvRBNMYm03QtCU8,3334
|
||||||
|
pypdf/_crypt_providers/_pycryptodome.py,sha256=U1aQZ9iYBrZo-hKCjJUhGOPhwEFToiitowQ316TNrrA,3381
|
||||||
|
pypdf/_doc_common.py,sha256=Cbsc2uczFhAi2JRioaICx0ISC4lCBkRdo_tKRGw3bpc,53243
|
||||||
|
pypdf/_encryption.py,sha256=-LwFEKfhL3B10afkco6fXx-EqtjoXf67pAUgH2VBfDw,48762
|
||||||
|
pypdf/_font.py,sha256=R5jQsBYa_eMrK7VezyoWCmbBARZyS5xp8jzD2XRvKeE,14146
|
||||||
|
pypdf/_page.py,sha256=Tp2GyjjOHLFwQ1tw8bO-poyZA65PJn3k94BymXMmurw,89909
|
||||||
|
pypdf/_page_labels.py,sha256=_HXqgEhSLTH_mMhy8m4QAOzIOHRQLV6_lYvg81-l9hI,8546
|
||||||
|
pypdf/_protocols.py,sha256=7qz92LVdPrYkSpdUPpAp9U4GW5jxNBTfVcpUWwUhEOo,2123
|
||||||
|
pypdf/_reader.py,sha256=KyeDHVEI5n4cZBHGVzbGIfhaPC1nZMiIU0W_ZNb0w_Y,55079
|
||||||
|
pypdf/_text_extraction/__init__.py,sha256=a3Z33rQVTiMKGtwt7_bfXlPosbST8rzELoNnt053_Qw,8515
|
||||||
|
pypdf/_text_extraction/__pycache__/__init__.cpython-312.pyc,,
|
||||||
|
pypdf/_text_extraction/__pycache__/_text_extractor.cpython-312.pyc,,
|
||||||
|
pypdf/_text_extraction/_layout_mode/__init__.py,sha256=RUQIwiUwzneNtcljnVM6jkRaem6pgP7mOD2-MBmtpvw,340
|
||||||
|
pypdf/_text_extraction/_layout_mode/__pycache__/__init__.cpython-312.pyc,,
|
||||||
|
pypdf/_text_extraction/_layout_mode/__pycache__/_fixed_width_page.cpython-312.pyc,,
|
||||||
|
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_manager.cpython-312.pyc,,
|
||||||
|
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_params.cpython-312.pyc,,
|
||||||
|
pypdf/_text_extraction/_layout_mode/_fixed_width_page.py,sha256=eJveDbyMooG970qJOhM5Rwb9ZoyyJDynzWpV9a7IS20,15370
|
||||||
|
pypdf/_text_extraction/_layout_mode/_text_state_manager.py,sha256=XVrIjeTd5jSdMexBQxs0tL5I5RUOitRmN1mELOcKYm4,8221
|
||||||
|
pypdf/_text_extraction/_layout_mode/_text_state_params.py,sha256=hyw6pnC8upBkoFVUJ3LH8hBIIHrNwiqaqcYyzIIyr6Y,5481
|
||||||
|
pypdf/_text_extraction/_text_extractor.py,sha256=wRmFtgMYTbJFbZRJVG3j1-lQWhb6mUC5uiE73DLRhIo,14454
|
||||||
|
pypdf/_utils.py,sha256=v579jJEHn-JophTC4Ej2MBFTEoQGitPWs_d507pyS6g,20194
|
||||||
|
pypdf/_version.py,sha256=S2Qku7VqFDmWPW_O3fID47IPC76TVFqesX1qVVa575w,22
|
||||||
|
pypdf/_writer.py,sha256=K7ANMEgNz-tPngYVMW9j07SEcksk5tFf1_tgi0JDRIg,129793
|
||||||
|
pypdf/_xobj_image_helpers.py,sha256=y7EMrXlYqwbIeUtdQS2XH9nO_2R73DOLf9-T1IyHMIA,21450
|
||||||
|
pypdf/annotations/__init__.py,sha256=f2k_-jAn39CCB27KxQ_e93GinnzkAHbUnnSeGJl1jyE,990
|
||||||
|
pypdf/annotations/__pycache__/__init__.cpython-312.pyc,,
|
||||||
|
pypdf/annotations/__pycache__/_base.cpython-312.pyc,,
|
||||||
|
pypdf/annotations/__pycache__/_markup_annotations.cpython-312.pyc,,
|
||||||
|
pypdf/annotations/__pycache__/_non_markup_annotations.cpython-312.pyc,,
|
||||||
|
pypdf/annotations/_base.py,sha256=eeoc9v2w15jAUhKXj48l1bB66YgBgV-2v5IIUJH-vws,961
|
||||||
|
pypdf/annotations/_markup_annotations.py,sha256=PLDCbsEWSgOmk6HTxepolEzj-Q3EE5J4hXMgnTDFaqc,9590
|
||||||
|
pypdf/annotations/_non_markup_annotations.py,sha256=Z2IUvcCOcTcpJhSXrex_9riYM2D64XxFQ_vac10BNRU,3649
|
||||||
|
pypdf/constants.py,sha256=_U_xkH1REx2rsgtx3jCOaKivhmyqPA25PLL7Z4A1_ZI,23260
|
||||||
|
pypdf/errors.py,sha256=Bw1W9hxOsDgwqwU6YoQ2l0-JiUyTq6l5QjVCr-W4GFA,1947
|
||||||
|
pypdf/filters.py,sha256=FzfrqdZK9bs3MjU75KJ2uIMPpx6VcxYQ4oV9wLh3j-w,29210
|
||||||
|
pypdf/generic/__init__.py,sha256=VrqdYftQECePDU2rXVMgEqRaYFR8zOV_fvJgo19x_uw,3468
|
||||||
|
pypdf/generic/__pycache__/__init__.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_appearance_stream.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_base.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_data_structures.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_files.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_fit.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_image_inline.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_link.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_outline.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_rectangle.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_utils.cpython-312.pyc,,
|
||||||
|
pypdf/generic/__pycache__/_viewerpref.cpython-312.pyc,,
|
||||||
|
pypdf/generic/_appearance_stream.py,sha256=ofXHlJC4-jSBCLOhkKztoeFiYlD-zi8QMdvRrMm3rdE,24867
|
||||||
|
pypdf/generic/_base.py,sha256=N8O_NcqK5y5O70OF8-p6vsac9R1ykTDcBIksBY_9rnA,32531
|
||||||
|
pypdf/generic/_data_structures.py,sha256=g1Jy5tpPSTHIhOme6HFXdMvxV2HuxbZx-HOsF2Awdc0,63602
|
||||||
|
pypdf/generic/_files.py,sha256=NtSkRo6JBgisi4QOyrVneO891boVsuY25hRwij6X9RA,16238
|
||||||
|
pypdf/generic/_fit.py,sha256=X_iADJj1YY4PUStS7rFWC2xR2LUVSvKtUAky0AFAIDM,5515
|
||||||
|
pypdf/generic/_image_inline.py,sha256=4cADiCeaCYq2kgJu0wOYXRn5YZ27cCHb3hGFqFFT5D4,12787
|
||||||
|
pypdf/generic/_link.py,sha256=ibdLhdU0mP_phneaJs-CzUDErkJuqnMT6TsQoHNOYiE,4951
|
||||||
|
pypdf/generic/_outline.py,sha256=qKbMX42OWfqnopIiE6BUy6EvdTLGe3ZtjaiWN85JpaY,1094
|
||||||
|
pypdf/generic/_rectangle.py,sha256=lOqSfFivQxgBN9LU9aqHoxPH8aCPTDUNgRZsNEUd6fc,3785
|
||||||
|
pypdf/generic/_utils.py,sha256=vTDAesfG-cJNDKilz_kbgFodAITzd5ejppWHGjvConk,7258
|
||||||
|
pypdf/generic/_viewerpref.py,sha256=6a_s0Avm9-XvV0wqxiW23cE92qK98ry3y6EPjfsFSdo,6758
|
||||||
|
pypdf/pagerange.py,sha256=2bt21jQZm-9aq2bVf3TXuH8_wGVx7b9T6UrMFXCEJhQ,7108
|
||||||
|
pypdf/papersizes.py,sha256=6Tz5sfNN_3JOUapY83U-lakohnpXYA0hSEQNmOVLFL8,1413
|
||||||
|
pypdf/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||||
|
pypdf/types.py,sha256=sJ7wHzk7ER_CJ7kP-s8u9axFnkCXnFpr8nzcj1AxTas,1915
|
||||||
|
pypdf/xmp.py,sha256=gqh3IlgTNP7ZuyhvE59p2tsMvu4adGkq0G8RDg0OtQw,29238
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
Wheel-Version: 1.0
|
||||||
|
Generator: flit 3.12.0
|
||||||
|
Root-Is-Purelib: true
|
||||||
|
Tag: py3-none-any
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
Copyright (c) 2006-2008, Mathieu Fenniak
|
||||||
|
Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
|
||||||
|
Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
* The name of the author may not be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
POSSIBILITY OF SUCH DAMAGE.
|
||||||
48
venv/lib/python3.12/site-packages/pypdf/__init__.py
Normal file
48
venv/lib/python3.12/site-packages/pypdf/__init__.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
"""
|
||||||
|
pypdf is a free and open-source pure-python PDF library capable of splitting,
|
||||||
|
merging, cropping, and transforming the pages of PDF files. It can also add
|
||||||
|
custom data, viewing options, and passwords to PDF files. pypdf can retrieve
|
||||||
|
text and metadata from PDFs as well.
|
||||||
|
|
||||||
|
You can read the full docs at https://pypdf.readthedocs.io/.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from ._crypt_providers import crypt_provider
|
||||||
|
from ._doc_common import DocumentInformation
|
||||||
|
from ._encryption import PasswordType
|
||||||
|
from ._page import PageObject, Transformation
|
||||||
|
from ._reader import PdfReader
|
||||||
|
from ._text_extraction import mult
|
||||||
|
from ._version import __version__
|
||||||
|
from ._writer import ObjectDeletionFlag, PdfWriter
|
||||||
|
from .constants import ImageType
|
||||||
|
from .pagerange import PageRange, parse_filename_page_ranges
|
||||||
|
from .papersizes import PaperSize
|
||||||
|
|
||||||
|
try:
|
||||||
|
import PIL
|
||||||
|
|
||||||
|
pil_version = PIL.__version__
|
||||||
|
except ImportError:
|
||||||
|
pil_version = "none"
|
||||||
|
|
||||||
|
_debug_versions = (
|
||||||
|
f"pypdf=={__version__}, {crypt_provider=}, PIL={pil_version}"
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"DocumentInformation",
|
||||||
|
"ImageType",
|
||||||
|
"ObjectDeletionFlag",
|
||||||
|
"PageObject",
|
||||||
|
"PageRange",
|
||||||
|
"PaperSize",
|
||||||
|
"PasswordType",
|
||||||
|
"PdfReader",
|
||||||
|
"PdfWriter",
|
||||||
|
"Transformation",
|
||||||
|
"__version__",
|
||||||
|
"_debug_versions",
|
||||||
|
"mult",
|
||||||
|
"parse_filename_page_ranges",
|
||||||
|
]
|
||||||
338
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
338
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
@@ -0,0 +1,338 @@
|
|||||||
|
import binascii
|
||||||
|
from binascii import Error as BinasciiError
|
||||||
|
from binascii import unhexlify
|
||||||
|
from math import ceil
|
||||||
|
from typing import Any, Union, cast
|
||||||
|
|
||||||
|
from ._codecs import adobe_glyphs, charset_encoding
|
||||||
|
from ._utils import logger_error, logger_warning
|
||||||
|
from .generic import (
|
||||||
|
DecodedStreamObject,
|
||||||
|
DictionaryObject,
|
||||||
|
NullObject,
|
||||||
|
StreamObject,
|
||||||
|
is_null_or_none,
|
||||||
|
)
|
||||||
|
|
||||||
|
_predefined_cmap: dict[str, str] = {
|
||||||
|
"/Identity-H": "utf-16-be",
|
||||||
|
"/Identity-V": "utf-16-be",
|
||||||
|
"/GB-EUC-H": "gbk",
|
||||||
|
"/GB-EUC-V": "gbk",
|
||||||
|
"/GBpc-EUC-H": "gb2312",
|
||||||
|
"/GBpc-EUC-V": "gb2312",
|
||||||
|
"/GBK-EUC-H": "gbk",
|
||||||
|
"/GBK-EUC-V": "gbk",
|
||||||
|
"/GBK2K-H": "gb18030",
|
||||||
|
"/GBK2K-V": "gb18030",
|
||||||
|
"/ETen-B5-H": "cp950",
|
||||||
|
"/ETen-B5-V": "cp950",
|
||||||
|
"/ETenms-B5-H": "cp950",
|
||||||
|
"/ETenms-B5-V": "cp950",
|
||||||
|
"/UniCNS-UTF16-H": "utf-16-be",
|
||||||
|
"/UniCNS-UTF16-V": "utf-16-be",
|
||||||
|
"/UniGB-UTF16-H": "gb18030",
|
||||||
|
"/UniGB-UTF16-V": "gb18030",
|
||||||
|
# UCS2 in code
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_encoding(
|
||||||
|
ft: DictionaryObject
|
||||||
|
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
|
||||||
|
encoding = _parse_encoding(ft)
|
||||||
|
map_dict, int_entry = _parse_to_unicode(ft)
|
||||||
|
|
||||||
|
# Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
|
||||||
|
# if cmap not empty encoding should be discarded
|
||||||
|
# (here transformed into identity for those characters)
|
||||||
|
# If encoding is a string, it is expected to be an identity translation.
|
||||||
|
if isinstance(encoding, dict):
|
||||||
|
for x in int_entry:
|
||||||
|
if x <= 255:
|
||||||
|
encoding[x] = chr(x)
|
||||||
|
|
||||||
|
return encoding, map_dict
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_encoding(
|
||||||
|
ft: DictionaryObject
|
||||||
|
) -> Union[str, dict[int, str]]:
|
||||||
|
encoding: Union[str, list[str], dict[int, str]] = []
|
||||||
|
if "/Encoding" not in ft:
|
||||||
|
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
|
||||||
|
encoding = dict(
|
||||||
|
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
encoding = "charmap"
|
||||||
|
return encoding
|
||||||
|
enc: Union[str, DictionaryObject, NullObject] = cast(
|
||||||
|
Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
|
||||||
|
)
|
||||||
|
if isinstance(enc, str):
|
||||||
|
try:
|
||||||
|
# already done : enc = NameObject.unnumber(enc.encode()).decode()
|
||||||
|
# for #xx decoding
|
||||||
|
if enc in charset_encoding:
|
||||||
|
encoding = charset_encoding[enc].copy()
|
||||||
|
elif enc in _predefined_cmap:
|
||||||
|
encoding = _predefined_cmap[enc]
|
||||||
|
elif "-UCS2-" in enc:
|
||||||
|
encoding = "utf-16-be"
|
||||||
|
else:
|
||||||
|
raise Exception("not found")
|
||||||
|
except Exception:
|
||||||
|
logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
|
||||||
|
encoding = enc
|
||||||
|
elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
|
||||||
|
try:
|
||||||
|
encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
|
||||||
|
except Exception:
|
||||||
|
logger_error(
|
||||||
|
f"Advanced encoding {encoding} not implemented yet",
|
||||||
|
__name__,
|
||||||
|
)
|
||||||
|
encoding = charset_encoding["/StandardEncoding"].copy()
|
||||||
|
else:
|
||||||
|
encoding = charset_encoding["/StandardEncoding"].copy()
|
||||||
|
if isinstance(enc, DictionaryObject) and "/Differences" in enc:
|
||||||
|
x: int = 0
|
||||||
|
o: Union[int, str]
|
||||||
|
for o in cast(DictionaryObject, enc["/Differences"]):
|
||||||
|
if isinstance(o, int):
|
||||||
|
x = o
|
||||||
|
else: # isinstance(o, str):
|
||||||
|
try:
|
||||||
|
if x < len(encoding):
|
||||||
|
encoding[x] = adobe_glyphs[o] # type: ignore
|
||||||
|
except Exception:
|
||||||
|
encoding[x] = o # type: ignore
|
||||||
|
x += 1
|
||||||
|
if isinstance(encoding, list):
|
||||||
|
encoding = dict(zip(range(256), encoding))
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_to_unicode(
|
||||||
|
ft: DictionaryObject
|
||||||
|
) -> tuple[dict[Any, Any], list[int]]:
|
||||||
|
# will store all translation code
|
||||||
|
# and map_dict[-1] we will have the number of bytes to convert
|
||||||
|
map_dict: dict[Any, Any] = {}
|
||||||
|
|
||||||
|
# will provide the list of cmap keys as int to correct encoding
|
||||||
|
int_entry: list[int] = []
|
||||||
|
|
||||||
|
if "/ToUnicode" not in ft:
|
||||||
|
if ft.get("/Subtype", "") == "/Type1":
|
||||||
|
return _type1_alternative(ft, map_dict, int_entry)
|
||||||
|
return {}, []
|
||||||
|
process_rg: bool = False
|
||||||
|
process_char: bool = False
|
||||||
|
multiline_rg: Union[
|
||||||
|
None, tuple[int, int]
|
||||||
|
] = None # tuple = (current_char, remaining size) ; cf #1285 for example of file
|
||||||
|
cm = prepare_cm(ft)
|
||||||
|
for line in cm.split(b"\n"):
|
||||||
|
process_rg, process_char, multiline_rg = process_cm_line(
|
||||||
|
line.strip(b" \t"),
|
||||||
|
process_rg,
|
||||||
|
process_char,
|
||||||
|
multiline_rg,
|
||||||
|
map_dict,
|
||||||
|
int_entry,
|
||||||
|
)
|
||||||
|
|
||||||
|
return map_dict, int_entry
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_cm(ft: DictionaryObject) -> bytes:
|
||||||
|
tu = ft["/ToUnicode"]
|
||||||
|
cm: bytes
|
||||||
|
if isinstance(tu, StreamObject):
|
||||||
|
cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
|
||||||
|
else: # if (tu is None) or cast(str, tu).startswith("/Identity"):
|
||||||
|
# the full range 0000-FFFF will be processed
|
||||||
|
cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
|
||||||
|
if isinstance(cm, str):
|
||||||
|
cm = cm.encode()
|
||||||
|
# we need to prepare cm before due to missing return line in pdf printed
|
||||||
|
# to pdf from word
|
||||||
|
cm = (
|
||||||
|
cm.strip()
|
||||||
|
.replace(b"beginbfchar", b"\nbeginbfchar\n")
|
||||||
|
.replace(b"endbfchar", b"\nendbfchar\n")
|
||||||
|
.replace(b"beginbfrange", b"\nbeginbfrange\n")
|
||||||
|
.replace(b"endbfrange", b"\nendbfrange\n")
|
||||||
|
.replace(b"<<", b"\n{\n") # text between << and >> not used but
|
||||||
|
.replace(b">>", b"\n}\n") # some solution to find it back
|
||||||
|
)
|
||||||
|
ll = cm.split(b"<")
|
||||||
|
for i in range(len(ll)):
|
||||||
|
j = ll[i].find(b">")
|
||||||
|
if j >= 0:
|
||||||
|
if j == 0:
|
||||||
|
# string is empty: stash a placeholder here (see below)
|
||||||
|
# see https://github.com/py-pdf/pypdf/issues/1111
|
||||||
|
content = b"."
|
||||||
|
else:
|
||||||
|
content = ll[i][:j].replace(b" ", b"")
|
||||||
|
ll[i] = content + b" " + ll[i][j + 1 :]
|
||||||
|
cm = (
|
||||||
|
(b" ".join(ll))
|
||||||
|
.replace(b"[", b" [ ")
|
||||||
|
.replace(b"]", b" ]\n ")
|
||||||
|
.replace(b"\r", b"\n")
|
||||||
|
)
|
||||||
|
return cm
|
||||||
|
|
||||||
|
|
||||||
|
def process_cm_line(
|
||||||
|
line: bytes,
|
||||||
|
process_rg: bool,
|
||||||
|
process_char: bool,
|
||||||
|
multiline_rg: Union[None, tuple[int, int]],
|
||||||
|
map_dict: dict[Any, Any],
|
||||||
|
int_entry: list[int],
|
||||||
|
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
|
||||||
|
if line == b"" or line[0] == 37: # 37 = %
|
||||||
|
return process_rg, process_char, multiline_rg
|
||||||
|
line = line.replace(b"\t", b" ")
|
||||||
|
if b"beginbfrange" in line:
|
||||||
|
process_rg = True
|
||||||
|
elif b"endbfrange" in line:
|
||||||
|
process_rg = False
|
||||||
|
elif b"beginbfchar" in line:
|
||||||
|
process_char = True
|
||||||
|
elif b"endbfchar" in line:
|
||||||
|
process_char = False
|
||||||
|
elif process_rg:
|
||||||
|
try:
|
||||||
|
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
|
||||||
|
except binascii.Error as error:
|
||||||
|
logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
|
||||||
|
elif process_char:
|
||||||
|
parse_bfchar(line, map_dict, int_entry)
|
||||||
|
return process_rg, process_char, multiline_rg
|
||||||
|
|
||||||
|
|
||||||
|
def parse_bfrange(
|
||||||
|
line: bytes,
|
||||||
|
map_dict: dict[Any, Any],
|
||||||
|
int_entry: list[int],
|
||||||
|
multiline_rg: Union[None, tuple[int, int]],
|
||||||
|
) -> Union[None, tuple[int, int]]:
|
||||||
|
lst = [x for x in line.split(b" ") if x]
|
||||||
|
closure_found = False
|
||||||
|
if multiline_rg is not None:
|
||||||
|
fmt = b"%%0%dX" % (map_dict[-1] * 2)
|
||||||
|
a = multiline_rg[0] # a, b not in the current line
|
||||||
|
b = multiline_rg[1]
|
||||||
|
for sq in lst:
|
||||||
|
if sq == b"]":
|
||||||
|
closure_found = True
|
||||||
|
break
|
||||||
|
map_dict[
|
||||||
|
unhexlify(fmt % a).decode(
|
||||||
|
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||||
|
"surrogatepass",
|
||||||
|
)
|
||||||
|
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
|
||||||
|
int_entry.append(a)
|
||||||
|
a += 1
|
||||||
|
else:
|
||||||
|
a = int(lst[0], 16)
|
||||||
|
b = int(lst[1], 16)
|
||||||
|
nbi = max(len(lst[0]), len(lst[1]))
|
||||||
|
map_dict[-1] = ceil(nbi / 2)
|
||||||
|
fmt = b"%%0%dX" % (map_dict[-1] * 2)
|
||||||
|
if lst[2] == b"[":
|
||||||
|
for sq in lst[3:]:
|
||||||
|
if sq == b"]":
|
||||||
|
closure_found = True
|
||||||
|
break
|
||||||
|
map_dict[
|
||||||
|
unhexlify(fmt % a).decode(
|
||||||
|
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||||
|
"surrogatepass",
|
||||||
|
)
|
||||||
|
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
|
||||||
|
int_entry.append(a)
|
||||||
|
a += 1
|
||||||
|
else: # case without list
|
||||||
|
c = int(lst[2], 16)
|
||||||
|
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
|
||||||
|
closure_found = True
|
||||||
|
while a <= b:
|
||||||
|
map_dict[
|
||||||
|
unhexlify(fmt % a).decode(
|
||||||
|
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||||
|
"surrogatepass",
|
||||||
|
)
|
||||||
|
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
|
||||||
|
int_entry.append(a)
|
||||||
|
a += 1
|
||||||
|
c += 1
|
||||||
|
return None if closure_found else (a, b)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
|
||||||
|
lst = [x for x in line.split(b" ") if x]
|
||||||
|
map_dict[-1] = len(lst[0]) // 2
|
||||||
|
while len(lst) > 1:
|
||||||
|
map_to = ""
|
||||||
|
# placeholder (see above) means empty string
|
||||||
|
if lst[1] != b".":
|
||||||
|
try:
|
||||||
|
map_to = unhexlify(lst[1]).decode(
|
||||||
|
"charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
|
||||||
|
) # join is here as some cases where the code was split
|
||||||
|
except BinasciiError as exception:
|
||||||
|
logger_warning(f"Got invalid hex string: {exception!s} ({lst[1]!r})", __name__)
|
||||||
|
map_dict[
|
||||||
|
unhexlify(lst[0]).decode(
|
||||||
|
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
|
||||||
|
)
|
||||||
|
] = map_to
|
||||||
|
int_entry.append(int(lst[0], 16))
|
||||||
|
lst = lst[2:]
|
||||||
|
|
||||||
|
|
||||||
|
def _type1_alternative(
|
||||||
|
ft: DictionaryObject,
|
||||||
|
map_dict: dict[Any, Any],
|
||||||
|
int_entry: list[int],
|
||||||
|
) -> tuple[dict[Any, Any], list[int]]:
|
||||||
|
if "/FontDescriptor" not in ft:
|
||||||
|
return map_dict, int_entry
|
||||||
|
ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
|
||||||
|
if is_null_or_none(ft_desc):
|
||||||
|
return map_dict, int_entry
|
||||||
|
assert ft_desc is not None, "mypy"
|
||||||
|
txt = ft_desc.get_object().get_data()
|
||||||
|
txt = txt.split(b"eexec\n")[0] # only clear part
|
||||||
|
txt = txt.split(b"/Encoding")[1] # to get the encoding part
|
||||||
|
lines = txt.replace(b"\r", b"\n").split(b"\n")
|
||||||
|
for li in lines:
|
||||||
|
if li.startswith(b"dup"):
|
||||||
|
words = [_w for _w in li.split(b" ") if _w != b""]
|
||||||
|
if len(words) > 3 and words[3] != b"put":
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
i = int(words[1])
|
||||||
|
except ValueError: # pragma: no cover
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
v = adobe_glyphs[words[2].decode()]
|
||||||
|
except KeyError:
|
||||||
|
if words[2].startswith(b"/uni"):
|
||||||
|
try:
|
||||||
|
v = chr(int(words[2][4:], 16))
|
||||||
|
except ValueError: # pragma: no cover
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
map_dict[chr(i)] = v
|
||||||
|
int_entry.append(i)
|
||||||
|
return map_dict, int_entry
|
||||||
59
venv/lib/python3.12/site-packages/pypdf/_codecs/__init__.py
Normal file
59
venv/lib/python3.12/site-packages/pypdf/_codecs/__init__.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
from .adobe_glyphs import adobe_glyphs
|
||||||
|
from .pdfdoc import _pdfdoc_encoding
|
||||||
|
from .std import _std_encoding
|
||||||
|
from .symbol import _symbol_encoding
|
||||||
|
from .zapfding import _zapfding_encoding
|
||||||
|
|
||||||
|
|
||||||
|
def fill_from_encoding(enc: str) -> list[str]:
|
||||||
|
lst: list[str] = []
|
||||||
|
for x in range(256):
|
||||||
|
try:
|
||||||
|
lst += (bytes((x,)).decode(enc),)
|
||||||
|
except Exception:
|
||||||
|
lst += (chr(x),)
|
||||||
|
return lst
|
||||||
|
|
||||||
|
|
||||||
|
def rev_encoding(enc: list[str]) -> dict[str, int]:
|
||||||
|
rev: dict[str, int] = {}
|
||||||
|
for i in range(256):
|
||||||
|
char = enc[i]
|
||||||
|
if char == "\u0000":
|
||||||
|
continue
|
||||||
|
assert char not in rev, f"{char} at {i} already at {rev[char]}"
|
||||||
|
rev[char] = i
|
||||||
|
return rev
|
||||||
|
|
||||||
|
|
||||||
|
_win_encoding = fill_from_encoding("cp1252")
|
||||||
|
_mac_encoding = fill_from_encoding("mac_roman")
|
||||||
|
|
||||||
|
|
||||||
|
_win_encoding_rev: dict[str, int] = rev_encoding(_win_encoding)
|
||||||
|
_mac_encoding_rev: dict[str, int] = rev_encoding(_mac_encoding)
|
||||||
|
_symbol_encoding_rev: dict[str, int] = rev_encoding(_symbol_encoding)
|
||||||
|
_zapfding_encoding_rev: dict[str, int] = rev_encoding(_zapfding_encoding)
|
||||||
|
_pdfdoc_encoding_rev: dict[str, int] = rev_encoding(_pdfdoc_encoding)
|
||||||
|
|
||||||
|
|
||||||
|
charset_encoding: dict[str, list[str]] = {
|
||||||
|
"/StandardEncoding": _std_encoding,
|
||||||
|
"/WinAnsiEncoding": _win_encoding,
|
||||||
|
"/MacRomanEncoding": _mac_encoding,
|
||||||
|
"/PDFDocEncoding": _pdfdoc_encoding,
|
||||||
|
"/Symbol": _symbol_encoding,
|
||||||
|
"/ZapfDingbats": _zapfding_encoding,
|
||||||
|
}
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"_mac_encoding",
|
||||||
|
"_pdfdoc_encoding",
|
||||||
|
"_pdfdoc_encoding_rev",
|
||||||
|
"_std_encoding",
|
||||||
|
"_symbol_encoding",
|
||||||
|
"_win_encoding",
|
||||||
|
"_zapfding_encoding",
|
||||||
|
"adobe_glyphs",
|
||||||
|
"charset_encoding",
|
||||||
|
]
|
||||||
281
venv/lib/python3.12/site-packages/pypdf/_codecs/_codecs.py
Normal file
281
venv/lib/python3.12/site-packages/pypdf/_codecs/_codecs.py
Normal file
@@ -0,0 +1,281 @@
"""
This module is for codecs only.

While the codec implementation can contain details of the PDF specification,
the module should not do any PDF parsing.
"""

import io
from abc import ABC, abstractmethod

from pypdf._utils import logger_warning
from pypdf.errors import LimitReachedError


class Codec(ABC):
    """Abstract base class for all codecs."""

    @abstractmethod
    def encode(self, data: bytes) -> bytes:
        """
        Encode the input data.

        Args:
            data: Data to encode.

        Returns:
            Encoded data.

        """

    @abstractmethod
    def decode(self, data: bytes) -> bytes:
        """
        Decode the input data.

        Args:
            data: Data to decode.

        Returns:
            Decoded data.

        """


class LzwCodec(Codec):
    """Lempel-Ziv-Welch (LZW) adaptive compression codec."""

    CLEAR_TABLE_MARKER = 256  # Special code to indicate table reset
    EOD_MARKER = 257  # End-of-data marker
    INITIAL_BITS_PER_CODE = 9  # Initial code bit width
    MAX_BITS_PER_CODE = 12  # Maximum code bit width

    def __init__(self, max_output_length: int = 75_000_000) -> None:
        self.max_output_length = max_output_length

    def _initialize_encoding_table(self) -> None:
        """Initialize the encoding table and state to initial conditions."""
        self.encoding_table: dict[bytes, int] = {bytes([i]): i for i in range(256)}
        self.next_code = self.EOD_MARKER + 1
        self.bits_per_code = self.INITIAL_BITS_PER_CODE
        self.max_code_value = (1 << self.bits_per_code) - 1

    def _increase_next_code(self) -> None:
        """Update bits_per_code and max_code_value if necessary."""
        self.next_code += 1
        if (
            self.next_code > self.max_code_value
            and self.bits_per_code < self.MAX_BITS_PER_CODE
        ):
            self.bits_per_code += 1
            self.max_code_value = (1 << self.bits_per_code) - 1

    def encode(self, data: bytes) -> bytes:
        """
        Encode data using the LZW compression algorithm.

        Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
        """
        result_codes: list[int] = []

        # The encoder shall begin by issuing a clear-table code
        result_codes.append(self.CLEAR_TABLE_MARKER)
        self._initialize_encoding_table()

        current_sequence = b""
        for byte in data:
            next_sequence = current_sequence + bytes([byte])

            if next_sequence in self.encoding_table:
                # Extend current sequence if already in the table
                current_sequence = next_sequence
            else:
                # Output code for the current sequence
                result_codes.append(self.encoding_table[current_sequence])

                # Add the new sequence to the table if there's room
                if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
                    self.encoding_table[next_sequence] = self.next_code
                    self._increase_next_code()
                else:
                    # If the table is full, emit a clear-table command
                    result_codes.append(self.CLEAR_TABLE_MARKER)
                    self._initialize_encoding_table()

                # Start new sequence
                current_sequence = bytes([byte])

        # Ensure everything actually is encoded
        if current_sequence:
            result_codes.append(self.encoding_table[current_sequence])
        result_codes.append(self.EOD_MARKER)

        return self._pack_codes_into_bytes(result_codes)

    def _pack_codes_into_bytes(self, codes: list[int]) -> bytes:
        """
        Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.

        The bit-width starts at 9 bits and expands as needed.
        """
        self._initialize_encoding_table()
        buffer = 0
        bits_in_buffer = 0
        output = bytearray()

        for code in codes:
            buffer = (buffer << self.bits_per_code) | code
            bits_in_buffer += self.bits_per_code

            # Codes shall be packed into a continuous bit stream, high-order bit
            # first. This stream shall then be divided into bytes, high-order bit
            # first.
            while bits_in_buffer >= 8:
                bits_in_buffer -= 8
                output.append((buffer >> bits_in_buffer) & 0xFF)

            if code == self.CLEAR_TABLE_MARKER:
                self._initialize_encoding_table()
            elif code == self.EOD_MARKER:
                continue
            else:
                self._increase_next_code()

        # Flush any remaining bits in the buffer
        if bits_in_buffer > 0:
            output.append((buffer << (8 - bits_in_buffer)) & 0xFF)

        return bytes(output)

    def _initialize_decoding_table(self) -> None:
        self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1
        self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
            b""
        ] * (self.max_code_value - self.CLEAR_TABLE_MARKER + 1)
        self._table_index = self.EOD_MARKER + 1
        self._bits_to_get = 9

    def _next_code_decode(self, data: bytes) -> int:
        self._next_data: int
        try:
            while self._next_bits < self._bits_to_get:
                self._next_data = (self._next_data << 8) | (
                    data[self._byte_pointer]
                )
                self._byte_pointer += 1
                self._next_bits += 8

            code = (
                self._next_data >> (self._next_bits - self._bits_to_get)
            ) & self._and_table[self._bits_to_get - 9]
            self._next_bits -= self._bits_to_get

            # Reduce data to get rid of the overhead,
            # which increases performance on large streams significantly.
            self._next_data = self._next_data & 0xFFFFF

            return code
        except IndexError:
            return self.EOD_MARKER

    # The following method has been converted to Python from PDFsharp:
    # https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
    #
    # Original license:
    #
    # -------------------------------------------------------------------------
    # Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
    # Germany
    #
    # http://docs.pdfsharp.net
    #
    # MIT License
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included
    # in all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.
    # --------------------------------------------------------------------------
    def decode(self, data: bytes) -> bytes:
        """
        The following code was converted to Python from the following code:
        https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
        """
        self._and_table = [511, 1023, 2047, 4095]
        self._table_index = 0
        self._bits_to_get = 9
        self._byte_pointer = 0
        self._next_data = 0
        self._next_bits = 0

        output_stream = io.BytesIO()
        output_length = 0

        self._initialize_decoding_table()
        self._byte_pointer = 0
        self._next_data = 0
        self._next_bits = 0
        old_code = self.CLEAR_TABLE_MARKER

        while True:
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break

            if code == self.CLEAR_TABLE_MARKER:
                self._initialize_decoding_table()
                code = self._next_code_decode(data)
                if code == self.EOD_MARKER:
                    break
                output_stream.write(decoded := self.decoding_table[code])
                old_code = code
            elif code < self._table_index:
                decoded = self.decoding_table[code]
                output_stream.write(decoded)
                if old_code != self.CLEAR_TABLE_MARKER:
                    self._add_entry_decode(self.decoding_table[old_code], decoded[0])
                old_code = code
            else:
                # The code is not in the table and not one of the special codes
                decoded = (
                    self.decoding_table[old_code] + self.decoding_table[old_code][:1]
                )
                output_stream.write(decoded)
                self._add_entry_decode(self.decoding_table[old_code], decoded[0])
                old_code = code

            output_length += len(decoded)
            if output_length > self.max_output_length:
                raise LimitReachedError(
                    f"Limit reached while decompressing: {output_length} > {self.max_output_length}"
                )

        return output_stream.getvalue()

    def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
        new_string = old_string + bytes([new_char])
        if self._table_index > self.max_code_value:
            logger_warning("Ignoring too large LZW table index.", __name__)
            return
        self.decoding_table[self._table_index] = new_string
        self._table_index += 1

        # Update the number of bits to get based on the table index
        if self._table_index == 511:
            self._bits_to_get = 10
        elif self._table_index == 1023:
            self._bits_to_get = 11
        elif self._table_index == 2047:
            self._bits_to_get = 12
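A round-trip sketch of the codec above (assuming only the vendored module path; the encoder emits a clear-table code first and an EOD marker last, which the decoder consumes):

from pypdf._codecs._codecs import LzwCodec

codec = LzwCodec()
payload = b"ababababab" * 200  # repetitive input, the best case for LZW
packed = codec.encode(payload)
assert codec.decode(packed) == payload
print(f"{len(payload)} -> {len(packed)} bytes")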
13969  venv/lib/python3.12/site-packages/pypdf/_codecs/adobe_glyphs.py  Normal file
File diff suppressed because it is too large
4441  venv/lib/python3.12/site-packages/pypdf/_codecs/core_fontmetrics.py  Normal file
File diff suppressed because it is too large
264  venv/lib/python3.12/site-packages/pypdf/_codecs/pdfdoc.py  Normal file
@@ -0,0 +1,264 @@
# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
# C.1 Predefined encodings sorted by character name of another PDF reference
# Some indices have '\u0000' although they should have something else:
# 22: should be '\u0017'
_pdfdoc_encoding = [
    "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007",  # 0 - 7
    "\u0008", "\u0009", "\u000a", "\u000b", "\u000c", "\u000d", "\u000e", "\u000f",  # 8 - 15
    "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0000", "\u0017",  # 16 - 23
    "\u02d8", "\u02c7", "\u02c6", "\u02d9", "\u02dd", "\u02db", "\u02da", "\u02dc",  # 24 - 31
    "\u0020", "\u0021", "\u0022", "\u0023", "\u0024", "\u0025", "\u0026", "\u0027",  # 32 - 39
    "\u0028", "\u0029", "\u002a", "\u002b", "\u002c", "\u002d", "\u002e", "\u002f",  # 40 - 47
    "\u0030", "\u0031", "\u0032", "\u0033", "\u0034", "\u0035", "\u0036", "\u0037",  # 48 - 55
    "\u0038", "\u0039", "\u003a", "\u003b", "\u003c", "\u003d", "\u003e", "\u003f",  # 56 - 63
    "\u0040", "\u0041", "\u0042", "\u0043", "\u0044", "\u0045", "\u0046", "\u0047",  # 64 - 71
    "\u0048", "\u0049", "\u004a", "\u004b", "\u004c", "\u004d", "\u004e", "\u004f",  # 72 - 79
    "\u0050", "\u0051", "\u0052", "\u0053", "\u0054", "\u0055", "\u0056", "\u0057",  # 80 - 87
    "\u0058", "\u0059", "\u005a", "\u005b", "\u005c", "\u005d", "\u005e", "\u005f",  # 88 - 95
    "\u0060", "\u0061", "\u0062", "\u0063", "\u0064", "\u0065", "\u0066", "\u0067",  # 96 - 103
    "\u0068", "\u0069", "\u006a", "\u006b", "\u006c", "\u006d", "\u006e", "\u006f",  # 104 - 111
    "\u0070", "\u0071", "\u0072", "\u0073", "\u0074", "\u0075", "\u0076", "\u0077",  # 112 - 119
    "\u0078", "\u0079", "\u007a", "\u007b", "\u007c", "\u007d", "\u007e", "\u0000",  # 120 - 127
    "\u2022", "\u2020", "\u2021", "\u2026", "\u2014", "\u2013", "\u0192", "\u2044",  # 128 - 135
    "\u2039", "\u203a", "\u2212", "\u2030", "\u201e", "\u201c", "\u201d", "\u2018",  # 136 - 143
    "\u2019", "\u201a", "\u2122", "\ufb01", "\ufb02", "\u0141", "\u0152", "\u0160",  # 144 - 151
    "\u0178", "\u017d", "\u0131", "\u0142", "\u0153", "\u0161", "\u017e", "\u0000",  # 152 - 159
    "\u20ac", "\u00a1", "\u00a2", "\u00a3", "\u00a4", "\u00a5", "\u00a6", "\u00a7",  # 160 - 167
    "\u00a8", "\u00a9", "\u00aa", "\u00ab", "\u00ac", "\u0000", "\u00ae", "\u00af",  # 168 - 175
    "\u00b0", "\u00b1", "\u00b2", "\u00b3", "\u00b4", "\u00b5", "\u00b6", "\u00b7",  # 176 - 183
    "\u00b8", "\u00b9", "\u00ba", "\u00bb", "\u00bc", "\u00bd", "\u00be", "\u00bf",  # 184 - 191
    "\u00c0", "\u00c1", "\u00c2", "\u00c3", "\u00c4", "\u00c5", "\u00c6", "\u00c7",  # 192 - 199
    "\u00c8", "\u00c9", "\u00ca", "\u00cb", "\u00cc", "\u00cd", "\u00ce", "\u00cf",  # 200 - 207
    "\u00d0", "\u00d1", "\u00d2", "\u00d3", "\u00d4", "\u00d5", "\u00d6", "\u00d7",  # 208 - 215
    "\u00d8", "\u00d9", "\u00da", "\u00db", "\u00dc", "\u00dd", "\u00de", "\u00df",  # 216 - 223
    "\u00e0", "\u00e1", "\u00e2", "\u00e3", "\u00e4", "\u00e5", "\u00e6", "\u00e7",  # 224 - 231
    "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ec", "\u00ed", "\u00ee", "\u00ef",  # 232 - 239
    "\u00f0", "\u00f1", "\u00f2", "\u00f3", "\u00f4", "\u00f5", "\u00f6", "\u00f7",  # 240 - 247
    "\u00f8", "\u00f9", "\u00fa", "\u00fb", "\u00fc", "\u00fd", "\u00fe", "\u00ff",  # 248 - 255
]

assert len(_pdfdoc_encoding) == 256
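A few spot checks of the table layout (a sketch; indices are raw PDFDocEncoding byte values):

from pypdf._codecs.pdfdoc import _pdfdoc_encoding

assert _pdfdoc_encoding[0x18] == "\u02d8"  # BREVE
assert _pdfdoc_encoding[0x8b] == "\u2030"  # PER MILLE SIGN
assert _pdfdoc_encoding[22] == "\u0000"    # the known gap called out in the header comment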
258  venv/lib/python3.12/site-packages/pypdf/_codecs/std.py  Normal file
@@ -0,0 +1,258 @@
_std_encoding = [
    "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
    "\x08", "\t", "\n", "\x0b", "\x0c", "\r", "\x0e", "\x0f",
    "\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x17",
    "\x18", "\x19", "\x1a", "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
    " ", "!", '"', "#", "$", "%", "&", "’",
    "(", ")", "*", "+", ",", "-", ".", "/",
    "0", "1", "2", "3", "4", "5", "6", "7",
    "8", "9", ":", ";", "<", "=", ">", "?",
    "@", "A", "B", "C", "D", "E", "F", "G",
    "H", "I", "J", "K", "L", "M", "N", "O",
    "P", "Q", "R", "S", "T", "U", "V", "W",
    "X", "Y", "Z", "[", "\\", "]", "^", "_",
    "‘", "a", "b", "c", "d", "e", "f", "g",
    "h", "i", "j", "k", "l", "m", "n", "o",
    "p", "q", "r", "s", "t", "u", "v", "w",
    "x", "y", "z", "{", "|", "}", "~", "\x7f",
    "\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87",
    "\x88", "\x89", "\x8a", "\x8b", "\x8c", "\x8d", "\x8e", "\x8f",
    "\x90", "\x91", "\x92", "\x93", "\x94", "\x95", "\x96", "\x97",
    "\x98", "\x99", "\x9a", "\x9b", "\x9c", "\x9d", "\x9e", "\x9f",
    "\xa0", "¡", "¢", "£", "⁄", "¥", "ƒ", "§",
    "¤", "'", "“", "«", "‹", "›", "fi", "fl",
    "°", "–", "†", "‡", "·", "µ", "¶", "•",
    "‚", "„", "”", "»", "…", "‰", "¾", "¿",
    "À", "`", "´", "ˆ", "˜", "¯", "˘", "˙",
    "¨", "É", "˚", "¸", "Ì", "˝", "˛", "ˇ",
    "—", "Ñ", "Ò", "Ó", "Ô", "Õ", "Ö", "×",
    "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "Þ", "ß",
    "à", "Æ", "â", "ª", "ä", "å", "æ", "ç",
    "Ł", "Ø", "Œ", "º", "ì", "í", "î", "ï",
    "ð", "æ", "ò", "ó", "ô", "ı", "ö", "÷",
    "ł", "ø", "œ", "ß", "ü", "ý", "þ", "ÿ",
]
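One quirk visible in the table above: Adobe StandardEncoding puts typographic quotes where ASCII has the apostrophe and grave accent, and the plain apostrophe (quotesingle) lives at 0xA9. A quick sketch:

from pypdf._codecs.std import _std_encoding

assert _std_encoding[0x27] == "\u2019"  # quoteright, not the ASCII apostrophe
assert _std_encoding[0x60] == "\u2018"  # quoteleft, not the ASCII grave accent
assert _std_encoding[0xa9] == "'"       # quotesingle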
260  venv/lib/python3.12/site-packages/pypdf/_codecs/symbol.py  Normal file
@@ -0,0 +1,260 @@
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt
_symbol_encoding = [
    "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007",
    "\u0008", "\u0009", "\u000A", "\u000B", "\u000C", "\u000D", "\u000E", "\u000F",
    "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0016", "\u0017",
    "\u0018", "\u0019", "\u001A", "\u001B", "\u001C", "\u001D", "\u001E", "\u001F",
    "\u0020", "\u0021", "\u2200", "\u0023", "\u2203", "\u0025", "\u0026", "\u220B",
    "\u0028", "\u0029", "\u2217", "\u002B", "\u002C", "\u2212", "\u002E", "\u002F",
    "\u0030", "\u0031", "\u0032", "\u0033", "\u0034", "\u0035", "\u0036", "\u0037",
    "\u0038", "\u0039", "\u003A", "\u003B", "\u003C", "\u003D", "\u003E", "\u003F",
    "\u2245", "\u0391", "\u0392", "\u03A7", "\u0394", "\u0395", "\u03A6", "\u0393",
    "\u0397", "\u0399", "\u03D1", "\u039A", "\u039B", "\u039C", "\u039D", "\u039F",
    "\u03A0", "\u0398", "\u03A1", "\u03A3", "\u03A4", "\u03A5", "\u03C2", "\u03A9",
    "\u039E", "\u03A8", "\u0396", "\u005B", "\u2234", "\u005D", "\u22A5", "\u005F",
    "\uF8E5", "\u03B1", "\u03B2", "\u03C7", "\u03B4", "\u03B5", "\u03C6", "\u03B3",
    "\u03B7", "\u03B9", "\u03D5", "\u03BA", "\u03BB", "\u00B5", "\u03BD", "\u03BF",
    "\u03C0", "\u03B8", "\u03C1", "\u03C3", "\u03C4", "\u03C5", "\u03D6", "\u03C9",
    "\u03BE", "\u03C8", "\u03B6", "\u007B", "\u007C", "\u007D", "\u223C", "\u007F",
    "\u0080", "\u0081", "\u0082", "\u0083", "\u0084", "\u0085", "\u0086", "\u0087",
    "\u0088", "\u0089", "\u008A", "\u008B", "\u008C", "\u008D", "\u008E", "\u008F",
    "\u0090", "\u0091", "\u0092", "\u0093", "\u0094", "\u0095", "\u0096", "\u0097",
    "\u0098", "\u0099", "\u009A", "\u009B", "\u009C", "\u009D", "\u009E", "\u009F",
    "\u20AC", "\u03D2", "\u2032", "\u2264", "\u2044", "\u221E", "\u0192", "\u2663",
    "\u2666", "\u2665", "\u2660", "\u2194", "\u2190", "\u2191", "\u2192", "\u2193",
    "\u00B0", "\u00B1", "\u2033", "\u2265", "\u00D7", "\u221D", "\u2202", "\u2022",
    "\u00F7", "\u2260", "\u2261", "\u2248", "\u2026", "\uF8E6", "\uF8E7", "\u21B5",
    "\u2135", "\u2111", "\u211C", "\u2118", "\u2297", "\u2295", "\u2205", "\u2229",
    "\u222A", "\u2283", "\u2287", "\u2284", "\u2282", "\u2286", "\u2208", "\u2209",
    "\u2220", "\u2207", "\uF6DA", "\uF6D9", "\uF6DB", "\u220F", "\u221A", "\u22C5",
    "\u00AC", "\u2227", "\u2228", "\u21D4", "\u21D0", "\u21D1", "\u21D2", "\u21D3",
    "\u25CA", "\u2329", "\uF8E8", "\uF8E9", "\uF8EA", "\u2211", "\uF8EB", "\uF8EC",
    "\uF8ED", "\uF8EE", "\uF8EF", "\uF8F0", "\uF8F1", "\uF8F2", "\uF8F3", "\uF8F4",
    "\u00F0", "\u232A", "\u222B", "\u2320", "\uF8F5", "\u2321", "\uF8F6", "\uF8F7",
    "\uF8F8", "\uF8F9", "\uF8FA", "\uF8FB", "\uF8FC", "\uF8FD", "\uF8FE", "\u00FF",
]

assert len(_symbol_encoding) == 256
261  venv/lib/python3.12/site-packages/pypdf/_codecs/zapfding.py  Normal file
@@ -0,0 +1,261 @@
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt

_zapfding_encoding = [
    "\u0000", "\u0001", "\u0002", "\u0003", "\u0004", "\u0005", "\u0006", "\u0007",
    "\u0008", "\u0009", "\u000A", "\u000B", "\u000C", "\u000D", "\u000E", "\u000F",
    "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0016", "\u0017",
    "\u0018", "\u0019", "\u001A", "\u001B", "\u001C", "\u001D", "\u001E", "\u001F",
    "\u0020", "\u2701", "\u2702", "\u2703", "\u2704", "\u260E", "\u2706", "\u2707",
    "\u2708", "\u2709", "\u261B", "\u261E", "\u270C", "\u270D", "\u270E", "\u270F",
    "\u2710", "\u2711", "\u2712", "\u2713", "\u2714", "\u2715", "\u2716", "\u2717",
    "\u2718", "\u2719", "\u271A", "\u271B", "\u271C", "\u271D", "\u271E", "\u271F",
    "\u2720", "\u2721", "\u2722", "\u2723", "\u2724", "\u2725", "\u2726", "\u2727",
    "\u2605", "\u2729", "\u272A", "\u272B", "\u272C", "\u272D", "\u272E", "\u272F",
    "\u2730", "\u2731", "\u2732", "\u2733", "\u2734", "\u2735", "\u2736", "\u2737",
    "\u2738", "\u2739", "\u273A", "\u273B", "\u273C", "\u273D", "\u273E", "\u273F",
    "\u2740", "\u2741", "\u2742", "\u2743", "\u2744", "\u2745", "\u2746", "\u2747",
    "\u2748", "\u2749", "\u274A", "\u274B", "\u25CF", "\u274D", "\u25A0", "\u274F",
    "\u2750", "\u2751", "\u2752", "\u25B2", "\u25BC", "\u25C6", "\u2756", "\u25D7",
    "\u2758", "\u2759", "\u275A", "\u275B", "\u275C", "\u275D", "\u275E", "\u007F",
    "\uF8D7", "\uF8D8", "\uF8D9", "\uF8DA", "\uF8DB", "\uF8DC", "\uF8DD", "\uF8DE",
    "\uF8DF", "\uF8E0", "\uF8E1", "\uF8E2", "\uF8E3", "\uF8E4", "\u008E", "\u008F",
    "\u0090", "\u0091", "\u0092", "\u0093", "\u0094", "\u0095", "\u0096", "\u0097",
    "\u0098", "\u0099", "\u009A", "\u009B", "\u009C", "\u009D", "\u009E", "\u009F",
    "\u00A0", "\u2761", "\u2762", "\u2763", "\u2764", "\u2765", "\u2766", "\u2767",
    "\u2663", "\u2666", "\u2665", "\u2660", "\u2460", "\u2461", "\u2462", "\u2463",
    "\u2464", "\u2465", "\u2466", "\u2467", "\u2468", "\u2469", "\u2776", "\u2777",
    "\u2778", "\u2779", "\u277A", "\u277B", "\u277C", "\u277D", "\u277E", "\u277F",
    "\u2780", "\u2781", "\u2782", "\u2783", "\u2784", "\u2785", "\u2786", "\u2787",
    "\u2788", "\u2789", "\u278A", "\u278B", "\u278C", "\u278D", "\u278E", "\u278F",
    "\u2790", "\u2791", "\u2792", "\u2793", "\u2794", "\u2192", "\u2194", "\u2195",
    "\u2798", "\u2799", "\u279A", "\u279B", "\u279C", "\u279D", "\u279E", "\u279F",
    "\u27A0", "\u27A1", "\u27A2", "\u27A3", "\u27A4", "\u27A5", "\u27A6", "\u27A7",
    "\u27A8", "\u27A9", "\u27AA", "\u27AB", "\u27AC", "\u27AD", "\u27AE", "\u27AF",
    "\u00F0", "\u27B1", "\u27B2", "\u27B3", "\u27B4", "\u27B5", "\u27B6", "\u27B7",
    "\u27B8", "\u27B9", "\u27BA", "\u27BB", "\u27BC", "\u27BD", "\u27BE", "\u00FF",
]

assert len(_zapfding_encoding) == 256
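As with the Symbol table, ordinary byte positions carry dingbats here. A couple of spot checks (a sketch):

from pypdf._codecs.zapfding import _zapfding_encoding

assert _zapfding_encoding[0x33] == "\u2713"  # CHECK MARK
assert _zapfding_encoding[0xac] == "\u2460"  # CIRCLED DIGIT ONE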
@@ -0,0 +1,86 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from pypdf._crypt_providers._base import CryptBase, CryptIdentity

try:
    from pypdf._crypt_providers._cryptography import (
        CryptAES,
        CryptRC4,
        aes_cbc_decrypt,
        aes_cbc_encrypt,
        aes_ecb_decrypt,
        aes_ecb_encrypt,
        crypt_provider,
        rc4_decrypt,
        rc4_encrypt,
    )
    from pypdf._utils import Version

    if Version(crypt_provider[1]) <= Version("3.0"):
        # This is due to the backend parameter being required back then:
        # https://cryptography.io/en/latest/changelog/#v3-1
        raise ImportError("cryptography<=3.0 is not supported")  # pragma: no cover
except ImportError:
    try:
        from pypdf._crypt_providers._pycryptodome import (  # type: ignore
            CryptAES,
            CryptRC4,
            aes_cbc_decrypt,
            aes_cbc_encrypt,
            aes_ecb_decrypt,
            aes_ecb_encrypt,
            crypt_provider,
            rc4_decrypt,
            rc4_encrypt,
        )
    except ImportError:
        from pypdf._crypt_providers._fallback import (  # type: ignore
            CryptAES,
            CryptRC4,
            aes_cbc_decrypt,
            aes_cbc_encrypt,
            aes_ecb_decrypt,
            aes_ecb_encrypt,
            crypt_provider,
            rc4_decrypt,
            rc4_encrypt,
        )

__all__ = [
    "CryptAES",
    "CryptBase",
    "CryptIdentity",
    "CryptRC4",
    "aes_cbc_decrypt",
    "aes_cbc_encrypt",
    "aes_ecb_decrypt",
    "aes_ecb_encrypt",
    "crypt_provider",
    "rc4_decrypt",
    "rc4_encrypt",
]
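The try/except chain above (which appears to be the package's __init__; the capture omits its file path) selects the strongest available backend at import time: cryptography (>3.0), then pycryptodome, then a pure-Python fallback. A sketch to see which one won and that the exported helpers round-trip:

from pypdf._crypt_providers import crypt_provider, rc4_decrypt, rc4_encrypt

name, version = crypt_provider
print(f"active provider: {name} {version}")

key, msg = b"secret-key", b"attack at dawn"
assert rc4_decrypt(key, rc4_encrypt(key, msg)) == msg  # RC4 is symmetric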
@@ -0,0 +1,38 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


class CryptBase:
    def encrypt(self, data: bytes) -> bytes:  # pragma: no cover
        return data

    def decrypt(self, data: bytes) -> bytes:  # pragma: no cover
        return data


class CryptIdentity(CryptBase):
    pass
@@ -0,0 +1,118 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import secrets

from cryptography import __version__
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers.algorithms import AES

try:
    # 43.0.0 - https://cryptography.io/en/latest/changelog/#v43-0-0
    from cryptography.hazmat.decrepit.ciphers.algorithms import ARC4
except ImportError:
    from cryptography.hazmat.primitives.ciphers.algorithms import ARC4
from cryptography.hazmat.primitives.ciphers.base import Cipher
from cryptography.hazmat.primitives.ciphers.modes import CBC, ECB

from pypdf._crypt_providers._base import CryptBase

crypt_provider = ("cryptography", __version__)


class CryptRC4(CryptBase):
    def __init__(self, key: bytes) -> None:
        self.cipher = Cipher(ARC4(key), mode=None)

    def encrypt(self, data: bytes) -> bytes:
        encryptor = self.cipher.encryptor()
        return encryptor.update(data) + encryptor.finalize()

    def decrypt(self, data: bytes) -> bytes:
        decryptor = self.cipher.decryptor()
        return decryptor.update(data) + decryptor.finalize()


class CryptAES(CryptBase):
    def __init__(self, key: bytes) -> None:
        self.alg = AES(key)

    def encrypt(self, data: bytes) -> bytes:
        iv = secrets.token_bytes(16)
        pad = padding.PKCS7(128).padder()
        data = pad.update(data) + pad.finalize()

        cipher = Cipher(self.alg, CBC(iv))
        encryptor = cipher.encryptor()
        return iv + encryptor.update(data) + encryptor.finalize()

    def decrypt(self, data: bytes) -> bytes:
        iv = data[:16]
        data = data[16:]
        # for empty encrypted data
        if not data:
            return data

        # just for robustness, it does not happen under normal circumstances
        if len(data) % 16 != 0:
            pad = padding.PKCS7(128).padder()
            data = pad.update(data) + pad.finalize()

        cipher = Cipher(self.alg, CBC(iv))
        decryptor = cipher.decryptor()
        d = decryptor.update(data) + decryptor.finalize()
        return d[: -d[-1]]


def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    encryptor = Cipher(ARC4(key), mode=None).encryptor()
    return encryptor.update(data) + encryptor.finalize()


def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    decryptor = Cipher(ARC4(key), mode=None).decryptor()
    return decryptor.update(data) + decryptor.finalize()


def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    encryptor = Cipher(AES(key), mode=ECB()).encryptor()
    return encryptor.update(data) + encryptor.finalize()


def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    decryptor = Cipher(AES(key), mode=ECB()).decryptor()
    return decryptor.update(data) + decryptor.finalize()


def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    encryptor = Cipher(AES(key), mode=CBC(iv)).encryptor()
    return encryptor.update(data) + encryptor.finalize()


def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    decryptor = Cipher(AES(key), mode=CBC(iv)).decryptor()
    return decryptor.update(data) + decryptor.finalize()
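In the `_cryptography` provider above, CryptAES.encrypt prepends a fresh random IV and applies PKCS7 padding, and decrypt strips both again, so ciphertexts differ between calls even for identical plaintexts. A round-trip sketch (requires the cryptography package):

from pypdf._crypt_providers._cryptography import CryptAES

aes = CryptAES(b"0" * 32)  # 256-bit key
c1, c2 = aes.encrypt(b"hello pdf"), aes.encrypt(b"hello pdf")
assert c1 != c2  # random IVs make repeated encryptions differ
assert aes.decrypt(c1) == b"hello pdf"
assert aes.decrypt(c2) == b"hello pdf"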
@@ -0,0 +1,93 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from pypdf._crypt_providers._base import CryptBase
from pypdf.errors import DependencyError

_DEPENDENCY_ERROR_STR = "cryptography>=3.1 is required for AES algorithm"


crypt_provider = ("local_crypt_fallback", "0.0.0")


class CryptRC4(CryptBase):
    def __init__(self, key: bytes) -> None:
        self.s = bytearray(range(256))
        j = 0
        for i in range(256):
            j = (j + self.s[i] + key[i % len(key)]) % 256
            self.s[i], self.s[j] = self.s[j], self.s[i]

    def encrypt(self, data: bytes) -> bytes:
        s = bytearray(self.s)
        out = [0 for _ in range(len(data))]
        i, j = 0, 0
        for k in range(len(data)):
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            s[i], s[j] = s[j], s[i]
            x = s[(s[i] + s[j]) % 256]
            out[k] = data[k] ^ x
        return bytes(out)

    def decrypt(self, data: bytes) -> bytes:
        return self.encrypt(data)


class CryptAES(CryptBase):
    def __init__(self, key: bytes) -> None:
        pass

    def encrypt(self, data: bytes) -> bytes:
        raise DependencyError(_DEPENDENCY_ERROR_STR)

    def decrypt(self, data: bytes) -> bytes:
        raise DependencyError(_DEPENDENCY_ERROR_STR)


def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    return CryptRC4(key).encrypt(data)


def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    return CryptRC4(key).decrypt(data)


def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    raise DependencyError(_DEPENDENCY_ERROR_STR)


def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    raise DependencyError(_DEPENDENCY_ERROR_STR)


def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    raise DependencyError(_DEPENDENCY_ERROR_STR)


def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    raise DependencyError(_DEPENDENCY_ERROR_STR)
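The fallback RC4 above is the textbook KSA/PRGA construction, which is why decrypt simply calls encrypt. As a sketch, it reproduces the classic published test vector:

from pypdf._crypt_providers._fallback import CryptRC4

ct = CryptRC4(b"Key").encrypt(b"Plaintext")
assert ct.hex().upper() == "BBF316E8D940AF0AD3"  # well-known RC4 test vector
assert CryptRC4(b"Key").decrypt(ct) == b"Plaintext"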
@@ -0,0 +1,97 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import secrets

from Crypto import __version__
from Crypto.Cipher import AES, ARC4
from Crypto.Util.Padding import pad

from pypdf._crypt_providers._base import CryptBase

crypt_provider = ("pycryptodome", __version__)


class CryptRC4(CryptBase):
    def __init__(self, key: bytes) -> None:
        self.key = key

    def encrypt(self, data: bytes) -> bytes:
        return ARC4.ARC4Cipher(self.key).encrypt(data)

    def decrypt(self, data: bytes) -> bytes:
        return ARC4.ARC4Cipher(self.key).decrypt(data)


class CryptAES(CryptBase):
    def __init__(self, key: bytes) -> None:
        self.key = key

    def encrypt(self, data: bytes) -> bytes:
        iv = secrets.token_bytes(16)
        data = pad(data, 16)
        aes = AES.new(self.key, AES.MODE_CBC, iv)
        return iv + aes.encrypt(data)

    def decrypt(self, data: bytes) -> bytes:
        iv = data[:16]
        data = data[16:]
        # for empty encrypted data
        if not data:
            return data

        # just for robustness, it does not happen under normal circumstances
        if len(data) % 16 != 0:
            data = pad(data, 16)

        aes = AES.new(self.key, AES.MODE_CBC, iv)
        d = aes.decrypt(data)
        return d[: -d[-1]]


def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    return ARC4.ARC4Cipher(key).encrypt(data)


def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    return ARC4.ARC4Cipher(key).decrypt(data)


def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    return AES.new(key, AES.MODE_ECB).encrypt(data)


def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    return AES.new(key, AES.MODE_ECB).decrypt(data)


def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    return AES.new(key, AES.MODE_CBC, iv).encrypt(data)


def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    return AES.new(key, AES.MODE_CBC, iv).decrypt(data)
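All three provider modules export the same surface, so callers stay backend-agnostic. A sketch against this one (assuming pycryptodome is installed; note the raw aes_ecb_* helpers do no padding, so inputs must be whole 16-byte blocks):

from pypdf._crypt_providers._pycryptodome import aes_ecb_decrypt, aes_ecb_encrypt

key = b"0123456789abcdef"    # 16-byte AES-128 key
block = b"sixteen byte blk"  # exactly one AES block
assert aes_ecb_decrypt(key, aes_ecb_encrypt(key, block)) == block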
1461  venv/lib/python3.12/site-packages/pypdf/_doc_common.py  Normal file
File diff suppressed because it is too large
1178  venv/lib/python3.12/site-packages/pypdf/_encryption.py  Normal file
File diff suppressed because it is too large
327  venv/lib/python3.12/site-packages/pypdf/_font.py  Normal file
@@ -0,0 +1,327 @@
|
|||||||
|
from collections.abc import Sequence
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Optional, Union, cast
|
||||||
|
|
||||||
|
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
|
||||||
|
|
||||||
|
from ._cmap import get_encoding
|
||||||
|
from ._codecs.adobe_glyphs import adobe_glyphs
|
||||||
|
from ._utils import logger_warning
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class FontDescriptor:
|
||||||
|
"""
|
||||||
|
Represents the FontDescriptor dictionary as defined in the PDF specification.
|
||||||
|
This contains both descriptive and metric information.
|
||||||
|
|
||||||
|
The defaults are derived from the mean values of the 14 core fonts, rounded
|
||||||
|
to 100.
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: str = "Unknown"
|
||||||
|
family: str = "Unknown"
|
||||||
|
weight: str = "Unknown"
|
||||||
|
|
||||||
|
ascent: float = 700.0
|
||||||
|
descent: float = -200.0
|
||||||
|
cap_height: float = 600.0
|
||||||
|
x_height: float = 500.0
|
||||||
|
italic_angle: float = 0.0 # Non-italic
|
||||||
|
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]
        # Handle a missing bbox gracefully: PDFs may have fonts without valid bounding boxes.
        if "bbox" in font_kwargs:
            bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_kwargs["bbox"] = bbox_tuple
        return font_kwargs

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """Parses a TrueType or Type1 font's /Widths array from a font dictionary and updates character widths."""
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """Parses the /W array from a DescendantFont dictionary and updates character widths."""
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        # (1) A character start index followed by a list of widths, e.g.
        #     `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        # (2) A character start index, a character stop index, and a width, e.g.
        #     `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue
            # check for format (1): `int [int int int int ...]`
            w_next_entry = _w[idx + 1].get_object()
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                _w[idx + 2].get_object(), (int, float)
            ):
                start_idx, stop_idx, const_width = (
                    w_entry,
                    w_next_entry,
                    _w[idx + 2].get_object(),
                )
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements).
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )
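
    # Editor's note: an illustrative walk-through (hedged, not part of the
    # vendored code) of the two /W formats handled above, in PDF syntax:
    #
    #     /W [ 45 [500 600 700]   % format (1): widths 500/600/700 for CIDs 45-47
    #          48 50 250 ]        % format (2): width 250 for each of CIDs 48-50
    #
    # Given a char_map whose keys have code points 45-50 and whose values are
    # "a".."f", current_widths would end up as
    # {"a": 500, "b": 600, "c": 700, "d": 250, "e": 250, "f": 250}.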
    @staticmethod
    def _add_default_width(current_widths: dict[str, int]) -> None:
        if not current_widths:
            current_widths["default"] = 500
            return

        if "default" in current_widths:
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to twice the space width
            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415
        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/Widths" in pdf_font_dict:
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                cls._collect_tt_t1_character_widths(
                    pdf_font_dict, char_map, encoding, font_kwargs["character_widths"]
                )
            elif font_name in CORE_FONT_METRICS:
                font_descriptor = CORE_FONT_METRICS[font_name]
                cls._add_default_width(font_descriptor.character_widths)

                return font_descriptor

            if "/FontDescriptor" in pdf_font_dict:  # TODO: This does not account for some Type3 fonts;
                # see tests/test_cmap.py::test_ascii_charset
                font_descriptor_resource = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor_obj = cast(DictionaryObject, font_descriptor_resource)
                if "/MissingWidth" in font_descriptor_obj:
                    font_kwargs["character_widths"]["default"] = font_descriptor_obj["/MissingWidth"].get_object()
            font_kwargs = cls._parse_font_descriptor(
                font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())
            )
            if "default" not in font_kwargs["character_widths"]:
                cls._add_default_width(font_kwargs["character_widths"])

            return cls(**font_kwargs)

        # Composite font or CID font - CID fonts have a /W array mapping character codes
        # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
        # because all other fonts have already been dealt with.
        if not (encoding and char_map):
            encoding, char_map = get_encoding(pdf_font_dict)
        d_font: DictionaryObject
        for d_font_idx, d_font in enumerate(
            cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
        ):
            d_font = cast(DictionaryObject, d_font.get_object())
            cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
            cls._collect_cid_character_widths(
                d_font, char_map, font_kwargs["character_widths"]
            )
            if "/DW" in d_font:
                font_kwargs["character_widths"]["default"] = d_font["/DW"].get_object()
            else:
                cls._add_default_width(font_kwargs["character_widths"])
            font_kwargs = cls._parse_font_descriptor(
                font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
            )

        return cls(**font_kwargs)


@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Font metrics, including a mapping of characters to widths
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=dict)
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)

        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        interpretable = True
        if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
            interpretable = all(
                cname in adobe_glyphs
                for cname in pdf_font_dict.get("/CharProcs") or []
            )

        if interpretable:
            font_descriptor = FontDescriptor.from_font_resource(pdf_font_dict, encoding, character_map)
        else:
            font_descriptor = FontDescriptor()  # Save some overhead if font is not interpretable
        character_widths = font_descriptor.character_widths

        space_width = font_descriptor.character_widths.get(" ")
        if not space_width or space_width == 0:
            space_width = font_descriptor.character_widths["default"] // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def text_width(self, text: str = "") -> float:
        """Sum of character widths specified in PDF font for the supplied text."""
        return sum(
            [self.character_widths.get(char, self.character_widths["default"]) for char in text], 0.0
        )
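
# Editor's note: a hedged usage sketch, not part of the vendored code. The
# values in character_widths are glyph-space units (1/1000 of text space for
# non-Type3 fonts), so a plausible conversion to a rendered width at a given
# font size is:
#
#     def rendered_width(font: Font, text: str, font_size: float) -> float:
#         # hypothetical helper, not defined in this module
#         return font.text_width(text) * font_size / 1000.0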
2353  venv/lib/python3.12/site-packages/pypdf/_page.py  Normal file (diff suppressed because it is too large)
289   venv/lib/python3.12/site-packages/pypdf/_page_labels.py  Normal file
@@ -0,0 +1,289 @@
"""
Page labels are shown by PDF viewers as "the page number".

A page has a numeric index, starting at 0. Additionally, the page
has a label. In the most simple case:

    label = index + 1

However, the title page and the table of contents might have Roman numerals as
page labels. This makes things more complicated.

Example 1
---------

>>> reader.root_object["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
 8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
{'/S': '/D'}

Example 2
---------
The following is a document with pages labeled
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...

1 0 obj
<< /Type /Catalog
   /PageLabels << /Nums [
                    0 << /S /r >>
                    4 << /S /D >>
                    7 << /S /D
                         /P ( A- )
                         /St 8
                    >>
                    % A number tree containing
                    % three page label dictionaries
                  ]
               >>
   ...
>>
endobj


§12.4.2 PDF Specification 1.7 and 2.0
=====================================

Entries in a page label dictionary
----------------------------------
The /S key:
D   Decimal Arabic numerals
R   Uppercase Roman numerals
r   Lowercase Roman numerals
A   Uppercase letters (A to Z for the first 26 pages,
    AA to ZZ for the next 26, and so on)
a   Lowercase letters (a to z for the first 26 pages,
    aa to zz for the next 26, and so on)
"""

from collections.abc import Iterator
from typing import Optional, cast

from ._protocols import PdfCommonDocProtocol
from ._utils import logger_warning
from .generic import (
    ArrayObject,
    DictionaryObject,
    NullObject,
    NumberObject,
    is_null_or_none,
)


def number2uppercase_roman_numeral(num: int) -> str:
    roman = [
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    ]

    def roman_num(num: int) -> Iterator[str]:
        for decimal, roman_repr in roman:
            x, _ = divmod(num, decimal)
            yield roman_repr * x
            num -= decimal * x
            if num <= 0:
                break

    return "".join(list(roman_num(num)))


def number2lowercase_roman_numeral(number: int) -> str:
    return number2uppercase_roman_numeral(number).lower()


def number2uppercase_letter(number: int) -> str:
    if number <= 0:
        raise ValueError("Expecting a positive number")
    alphabet = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
    rep = ""
    while number > 0:
        remainder = number % 26
        if remainder == 0:
            remainder = 26
        rep = alphabet[remainder - 1] + rep
        # Move on to the next base-26 "digit": subtract the consumed remainder, then divide.
        number -= remainder
        number = number // 26
    return rep


def number2lowercase_letter(number: int) -> str:
    return number2uppercase_letter(number).lower()
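
# Editor's note: illustrative sanity checks of the label-style helpers above
# (not part of the vendored code). The letter styles are bijective base-26,
# so 26 -> "Z" rolls over to 27 -> "AA" rather than a two-symbol zero form.
assert number2uppercase_roman_numeral(4) == "IV"
assert number2uppercase_roman_numeral(1994) == "MCMXCIV"
assert number2lowercase_roman_numeral(9) == "ix"
assert number2uppercase_letter(26) == "Z"
assert number2uppercase_letter(27) == "AA"
assert number2lowercase_letter(28) == "ab"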
def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    # [Nums] shall be an array of the form
    #   [ key_1 value_1 key_2 value_2 ... key_n value_n ]
    # where each key_i is an integer and the corresponding
    # value_i shall be the object associated with that key.
    # The keys shall be sorted in numerical order,
    # analogously to the arrangement of keys in a name tree
    # as described in 7.9.6, "Name Trees."
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    i = 0
    value = None
    start_index = 0
    while i < len(nums):
        start_index = nums[i]
        value = nums[i + 1].get_object()
        if i + 2 == len(nums):
            break
        if nums[i + 2] > index:
            break
        i += 2
    m = {
        None: lambda _: "",
        "/D": lambda n: str(n),
        "/R": number2uppercase_roman_numeral,
        "/r": number2lowercase_roman_numeral,
        "/A": number2uppercase_letter,
        "/a": number2lowercase_letter,
    }
    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return str(index + 1)  # Fallback
    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    return prefix + m[value.get("/S")](index - start_index + start)


def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
    """
    See 7.9.7 "Number Trees".

    Args:
        reader: The PdfReader
        index: The index of the page

    Returns:
        The label of the page, e.g. "iv" or "4".

    """
    root = cast(DictionaryObject, reader.root_object)
    if "/PageLabels" not in root:
        return str(index + 1)  # Fallback
    number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
    if "/Nums" in number_tree:
        return get_label_from_nums(number_tree, index)
    if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
        # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
        # Limit maximum depth.
        level = 0
        while level < 100:
            kids = cast(list[DictionaryObject], number_tree["/Kids"])
            for kid in kids:
                # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
                limits = cast(list[int], kid["/Limits"])
                if limits[0] <= index <= limits[1]:
                    if not is_null_or_none(kid.get("/Kids", None)):
                        # Recursive definition.
                        level += 1
                        if level == 100:  # pragma: no cover
                            raise NotImplementedError(
                                "Too deep nesting is not supported."
                            )
                        number_tree = kid
                        # Exit the inner `for` loop and continue at the next level with the
                        # next iteration of the `while` loop.
                        break
                    return get_label_from_nums(kid, index)
            else:
                # When there are no kids, make sure to exit the `while` loop directly
                # and continue with the fallback.
                break

    logger_warning(f"Could not reliably determine page label for {index}.", __name__)
    return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree


def nums_insert(
    key: NumberObject,
    value: DictionaryObject,
    nums: ArrayObject,
) -> None:
    """
    Insert a key, value pair in a Nums array.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        value: value of the entry
        nums: Nums array to modify

    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")

    i = len(nums)
    while i != 0 and key <= nums[i - 2]:
        i = i - 2

    if i < len(nums) and key == nums[i]:
        nums[i + 1] = value
    else:
        nums.insert(i, key)
        nums.insert(i + 1, value)


def nums_clear_range(
    key: NumberObject,
    page_index_to: int,
    nums: ArrayObject,
) -> None:
    """
    Remove all entries in a number tree in a range after an entry.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry before the range
        page_index_to: The page index of the upper limit of the range
        nums: Nums array to modify

    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")
    if page_index_to < key:
        raise ValueError("page_index_to must be greater than or equal to key")

    i = nums.index(key) + 2
    while i < len(nums) and nums[i] <= page_index_to:
        nums.pop(i)
        nums.pop(i)


def nums_next(
    key: NumberObject,
    nums: ArrayObject,
) -> tuple[Optional[NumberObject], Optional[DictionaryObject]]:
    """
    Return the (key, value) pair of the entry after the given one.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        nums: Nums array

    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")

    i = nums.index(key) + 2
    if i < len(nums):
        return (nums[i], nums[i + 1])
    return (None, None)
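
# Editor's note: a hedged usage sketch, not part of the vendored code; the
# reader and file name below are hypothetical.
#
#     from pypdf import PdfReader
#
#     reader = PdfReader("example.pdf")
#     labels = [index2label(reader, i) for i in range(len(reader.pages))]
#     # e.g. ["i", "ii", "iii", "iv", "1", "2", ...]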
86  venv/lib/python3.12/site-packages/pypdf/_protocols.py  Normal file
@@ -0,0 +1,86 @@
"""Helpers for working with PDF types."""

from abc import abstractmethod
from pathlib import Path
from typing import IO, Any, Optional, Protocol, Union

from ._utils import StrByteType, StreamType


class PdfObjectProtocol(Protocol):
    indirect_reference: Any

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[tuple[str, ...], list[str], None] = (),
    ) -> Any:
        ...  # pragma: no cover

    def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any:
        ...  # pragma: no cover

    def get_object(self) -> Optional["PdfObjectProtocol"]:
        ...  # pragma: no cover

    def hash_value(self) -> bytes:
        ...  # pragma: no cover

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        ...  # pragma: no cover


class XmpInformationProtocol(PdfObjectProtocol):
    pass


class PdfCommonDocProtocol(Protocol):
    @property
    def pdf_header(self) -> str:
        ...  # pragma: no cover

    @property
    def pages(self) -> list[Any]:
        ...  # pragma: no cover

    @property
    def root_object(self) -> PdfObjectProtocol:
        ...  # pragma: no cover

    def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
        ...  # pragma: no cover

    @property
    def strict(self) -> bool:
        ...  # pragma: no cover


class PdfReaderProtocol(PdfCommonDocProtocol, Protocol):
    @property
    @abstractmethod
    def xref(self) -> dict[int, dict[int, Any]]:
        ...  # pragma: no cover

    @property
    @abstractmethod
    def trailer(self) -> dict[str, Any]:
        ...  # pragma: no cover


class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
    _objects: list[Any]
    _id_translated: dict[int, dict[int, int]]

    incremental: bool
    _reader: Any  # PdfReader

    @abstractmethod
    def write(self, stream: Union[Path, StrByteType]) -> tuple[bool, IO[Any]]:
        ...  # pragma: no cover

    @abstractmethod
    def _add_object(self, obj: Any) -> Any:
        ...  # pragma: no cover
1352  venv/lib/python3.12/site-packages/pypdf/_reader.py  Normal file (diff suppressed because it is too large)
@@ -0,0 +1,245 @@
"""
Code related to text extraction.

Some parts are still in _page.py. When in doubt, they stay there.
"""

import math
from typing import Any, Callable, Optional, Union

from .._font import Font
from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding

CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5


class OrientationNotFoundError(Exception):
    pass


def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, list[int], None] = None,
) -> tuple[int, int, list[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to a string, it will be converted to its character code;
            an integer is used directly.
            The default value is -1, which sets no additional range to be converted.
        _max: The new maximum value for the range of custom characters that will
            be written right to left.
            If set to ``None``, the value will not be changed.
            If set to a string, it will be converted to its character code;
            an integer is used directly.
            The default value is -1, which sets no additional range to be converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of character codes.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
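
# Editor's note: a hedged usage sketch, not part of the vendored code. For
# instance, forcing the digit range to render right-to-left while keeping
# parentheses in their current insertion order:
#
#     set_custom_rtl(_min=ord("0"), _max=ord("9"), specials="()")
#     # -> (48, 57, [40, 41])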

def mult(m: list[float], n: list[float]) -> list[float]:
    return [
        m[0] * n[0] + m[1] * n[2],
        m[0] * n[1] + m[1] * n[3],
        m[2] * n[0] + m[3] * n[2],
        m[2] * n[1] + m[3] * n[3],
        m[4] * n[0] + m[5] * n[2] + n[4],
        m[4] * n[1] + m[5] * n[3] + n[5],
    ]


def orient(m: list[float]) -> int:
    if m[3] > 1e-6:
        return 0
    if m[3] < -1e-6:
        return 180
    if m[1] > 0:
        return 90
    return 270
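
# Editor's note: an illustrative check of the matrix convention used above
# (not part of the vendored code). PDF matrices are the six values
# [a, b, c, d, e, f] of a 3x3 matrix with an implied last column, and
# mult(m, n) composes them in row-vector order ("apply m, then n").
translate = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0]  # shift by (10, 20)
flip_y = [1.0, 0.0, 0.0, -1.0, 0.0, 0.0]      # invert the y axis
assert mult(translate, flip_y) == [1.0, 0.0, 0.0, -1.0, 10.0, -20.0]
assert orient(flip_y) == 180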
def crlf_space_check(
    text: str,
    cmtm_prev: tuple[list[float], list[float]],
    cmtm_matrix: tuple[list[float], list[float]],
    memo_cmtm: tuple[list[float], list[float]],
    font_resource: Optional[DictionaryObject],
    orientations: tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> tuple[str, str, list[float], list[float]]:
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)
    cm_prev = m

    if orientation not in orientations:
        raise OrientationNotFoundError
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    elif orientation in (90, 270):
        moved_height = delta_x
        moved_width = delta_y
    try:
        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
            if (output + text)[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        font_resource,
                        font_size,
                    )
                text = ""
        elif (
            (moved_width >= (spacewidth + str_widths) * scale_prev_x)
            and (output + text)[-1] != " "
        ):
            text += " "
    except Exception:
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev


def get_text_operands(
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font: Font,
    orientations: tuple[int, ...]
) -> tuple[str, bool]:
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        if isinstance(operands[0], str):
            t = operands[0]
            is_str_operands = True
        else:
            t = ""
            tt: bytes = (
                encode_pdfdocencoding(operands[0])
                if isinstance(operands[0], str)
                else operands[0]
            )
            if isinstance(font.encoding, str):
                try:
                    t = tt.decode(font.encoding, "surrogatepass")  # apply str encoding
                except Exception:
                    # The data does not match the expectation; fall back to the
                    # alternative encoding. Text extraction quality may suffer.
                    t = tt.decode(
                        "utf-16-be" if font.encoding == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                t = "".join(
                    [font.encoding[x] if x in font.encoding else bytes((x,)).decode() for x in tt]
                )
    return (t, is_str_operands)


def get_display_str(
    text: str,
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> tuple[str, bool, float]:
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    widths: float = 0.0
    for x in [font.character_map.get(x, x) for x in text_operands]:
        # x can be a sequence of bytes; e.g. habibi.pdf
        if len(x) == 1:
            xx = ord(x)
        else:
            xx = 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)                        # punctuations but...
            or 0x3A <= xx <= 0x40               # numbers (x30-39)
            or 0x2000 <= xx <= 0x206F           # upper punctuations..
            or 0x20A0 <= xx <= 0x21FF           # but (numbers) indices/exponents
            or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized....
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                text = ""
            text = text + x
        widths += font.space_width if x == " " else font.text_width(x)
        # fmt: on
    return text, rtl_dir, widths
@@ -0,0 +1,16 @@
"""Layout mode text extraction extension for pypdf"""
from ..._font import Font
from ._fixed_width_page import (
    fixed_char_width,
    fixed_width_page,
    text_show_operations,
    y_coordinate_groups,
)

__all__ = [
    "Font",
    "fixed_char_width",
    "fixed_width_page",
    "text_show_operations",
    "y_coordinate_groups",
]
@@ -0,0 +1,400 @@
"""Extract PDF text preserving the layout of the source PDF"""

from collections.abc import Iterator
from itertools import groupby
from math import ceil
from pathlib import Path
from typing import Any, Literal, Optional, TypedDict

from ..._font import Font
from ..._utils import logger_warning
from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
from ._text_state_manager import TextStateManager
from ._text_state_params import TextStateParams


class BTGroup(TypedDict):
    """
    Dict describing a line of text rendered within a BT/ET operator pair.
    If multiple text show operations render text on the same line, the text
    will be combined into a single BTGroup dict.

    Keys:
        tx: x coordinate of first character in BTGroup
        ty: y coordinate of first character in BTGroup
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of last character in BTGroup
        flip_sort: -1 if page is upside down, else 1
    """

    tx: float
    ty: float
    font_size: float
    font_height: float
    text: str
    displaced_tx: float
    flip_sort: Literal[-1, 1]


def bt_group(tj_op: TextStateParams, rendered_text: str, displaced_tx: float) -> BTGroup:
    """
    BTGroup constructed from a TextStateParams instance, rendered text, and
    displaced tx value.

    Args:
        tj_op (TextStateParams): TextStateParams instance
        rendered_text (str): rendered text
        displaced_tx (float): x coordinate of last character in BTGroup

    """
    return BTGroup(
        tx=tj_op.tx,
        ty=tj_op.ty,
        font_size=tj_op.font_size,
        font_height=tj_op.font_height,
        text=rendered_text,
        displaced_tx=displaced_tx,
        flip_sort=-1 if tj_op.flip_vertical else 1,
    )


def recurs_to_target_op(
    ops: Iterator[tuple[list[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
) -> tuple[list[BTGroup], list[TextStateParams]]:
    """
    Recurse operators between BT/ET and/or q/Q operators managing the transform
    stack and capturing text positioning and rendering data.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.

    """
    # 1 entry per line of text rendered within each BT/ET operation.
    bt_groups: list[BTGroup] = []

    # 1 entry per text show operator (Tj/TJ/'/")
    tj_ops: list[TextStateParams] = []

    if end_target == b"Q":
        # add new q level. cm's added at this level will be popped at next b'Q'
        text_state_mgr.add_q()

    for operands, op in ops:
        # The loop is broken by the end target, or exits normally when there are no more ops.
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                if not tj_ops:
                    return bt_groups, tj_ops
                _text = ""
                bt_idx = 0  # idx of first tj in this bt group
                last_displaced_tx = tj_ops[bt_idx].displaced_tx
                last_ty = tj_ops[bt_idx].ty
                for _idx, _tj in enumerate(
                    tj_ops
                ):  # ... build text from new Tj operators
                    if strip_rotated and _tj.rotated:
                        continue
                    if not _tj.font.interpretable:  # generates warning
                        continue
                    # if the y position of the text is greater than the font height, assume
                    # the text is on a new line and start a new group
                    if abs(_tj.ty - last_ty) > _tj.font_height:
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        _text = ""

                    # if the x position of the text is less than the last x position by
                    # more than 5 space widths, assume the text order should be flipped
                    # and start a new group
                    if (
                        last_displaced_tx - _tj.tx
                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
                    ):
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        last_displaced_tx = _tj.displaced_tx
                        _text = ""

                    # calculate excess x translation based on ending tx of previous Tj.
                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
                    # applied to the first tj of a BTGroup in fixed_width_page().
                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
                    new_text = f'{" " * spaces}{_tj.txt}'

                    last_ty = _tj.ty
                    _text = f"{_text}{new_text}"
                    last_displaced_tx = _tj.displaced_tx
                if _text:
                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
                text_state_mgr.reset_tm()
            break
        if op == b"q":
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"BT":
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"ET", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            _tj = text_state_mgr.text_state_params()
            for tj_op in operands[0]:
                if isinstance(tj_op, bytes):
                    _tj = text_state_mgr.text_state_params(tj_op)
                    tj_ops.append(_tj)
                else:
                    text_state_mgr.add_trm(_tj.displacement_matrix(td_offset=tj_op))
        elif op == b"'":
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            text_state_mgr.reset_trm()
            if op == b"Tm":
                text_state_mgr.reset_tm()
            elif op == b"TD":
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
    else:
        logger_warning(
            f"Unbalanced target operations, expected {end_target!r}.",
            __name__,
        )
    return bt_groups, tj_ops


def y_coordinate_groups(
    bt_groups: list[BTGroup], debug_path: Optional[Path] = None
) -> dict[int, list[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Args:
        bt_groups: list of dicts as returned by text_show_operations()
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
            keyed by y coordinate

    """
    ty_groups = {
        ty: sorted(grp, key=lambda x: x["tx"])
        for ty, grp in groupby(
            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
        )
    }
    # combine groups whose y coordinates differ by less than the effective font height
    # (accounts for mixed fonts and other minor oddities)
    last_ty = next(iter(ty_groups))
    last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
    for ty in list(ty_groups)[1:]:
        fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
        txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
        # prevent merge if both groups are rendering in the same x position.
        no_text_overlap = not (txs & last_txs)
        offset_less_than_font_height = abs(ty - last_ty) < fsz
        if no_text_overlap and offset_less_than_font_height:
            ty_groups[last_ty] = sorted(
                ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
            )
            last_txs |= txs
        else:
            last_ty = ty
            last_txs = txs
    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
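
# Editor's note: an illustrative check, not part of the vendored code. Plain
# dicts stand in for BTGroup entries (TypedDicts are ordinary dicts at
# runtime). Both rows share int(ty) == 100, so they merge onto one output
# line, ordered left to right by tx.
_rows = [
    {"tx": 10.0, "ty": 100.4, "font_size": 11.0, "font_height": 11.0,
     "text": "world", "displaced_tx": 40.0, "flip_sort": 1},
    {"tx": 0.0, "ty": 100.2, "font_size": 11.0, "font_height": 11.0,
     "text": "hello", "displaced_tx": 9.0, "flip_sort": 1},
]
assert [g["text"] for g in y_coordinate_groups(_rows)[100]] == ["hello", "world"]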

def text_show_operations(
    ops: Iterator[tuple[list[Any], bytes]],
    fonts: dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> list[BTGroup]:
    """
    Extract text from BT/ET operator pairs.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated with respect to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator

    """
    state_mgr = TextStateManager()  # transformation stack manager
    bt_groups: list[BTGroup] = []  # BT operator dict
    tj_ops: list[TextStateParams] = []  # Tj/TJ operator data
    for operands, op in ops:
        if op in (b"BT", b"q"):
            bts, tjs = recurs_to_target_op(
                ops, state_mgr, b"ET" if op == b"BT" else b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tf":
            state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
            state_mgr.set_state_param(op, operands)

    if any(tj.rotated for tj in tj_ops):
        if strip_rotated:
            logger_warning(
                "Rotated text discovered. Output will be incomplete.", __name__
            )
        else:
            logger_warning(
                "Rotated text discovered. Layout will be degraded.", __name__
            )
    if not all(tj.font.interpretable for tj in tj_ops):
        logger_warning(
            "PDF contains an uninterpretable font. Output will be incomplete.", __name__
        )

    # left align the data, i.e. decrement all tx values by min(tx)
    min_x = min((x["tx"] for x in bt_groups), default=0.0)
    bt_groups = [
        dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x)  # type: ignore[misc]
        for ogrp in sorted(
            bt_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True
        )
    ]

    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bts.json").write_text(
            json.dumps(bt_groups, indent=2, default=str), "utf-8"
        )
        debug_path.joinpath("tjs.json").write_text(
            json.dumps(
                tj_ops, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )
    return bt_groups


def fixed_char_width(bt_groups: list[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate average character width weighted by the length of the rendered
    text in each sample for conversion to fixed-width layout.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator

    Returns:
        float: fixed character width

    """
    char_widths = []
    for _bt in bt_groups:
        _len = len(_bt["text"]) * scale_weight
        char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len))
    return sum(_w * _l for _w, _l in char_widths) / sum(_l for _, _l in char_widths)
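
# Editor's note: illustrative arithmetic, not part of the vendored code. With
# the default scale_weight of 1.25 the formula reduces to
# sum(displaced_tx - tx) / (1.25 * total characters): the average advance per
# character, deliberately shrunk so columns do not collide on the fixed grid.
# Plain dicts stand in for BTGroup entries (TypedDicts are dicts at runtime).
_samples = [
    {"text": "Hello", "tx": 0.0, "displaced_tx": 25.0},  # 5 chars over 25 units
    {"text": "Hi", "tx": 0.0, "displaced_tx": 10.0},     # 2 chars over 10 units
]
# raw average advance: (25 + 10) / 7 = 5.0; weighted result: 5.0 / 1.25 = 4.0
assert fixed_char_width(_samples) == 4.0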
def fixed_width_page(
    ty_groups: dict[int, list[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
    """
    Generate page text from text operations grouped by rendered y coordinate.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width
        space_vertically: include blank lines inferred from y distance + font height.
        font_height_weight: multiplier for font height when calculating blank lines.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
            layout in the source pdf.

    """
    lines: list[str] = []
    last_y_coord = 0
    table = str.maketrans(dict.fromkeys(range(14, 32), " "))
    for y_coord, line_data in ty_groups.items():
        if space_vertically and lines:
            fh = line_data[0]["font_height"]
            blank_lines = 0 if fh == 0 else (
                int(abs(y_coord - last_y_coord) / (fh * font_height_weight)) - 1
            )
            lines.extend([""] * blank_lines)

        line_parts = []  # Build the line in a list to avoid repeated string concatenation.
        current_len = 0  # Track the current length as an int rather than recomputing len().
        last_disp = 0.0
        for bt_op in line_data:
            tx = bt_op["tx"]
            offset = int(tx // char_width)
            needed_spaces = offset - current_len
            if needed_spaces > 0 and ceil(last_disp) < int(tx):
                padding = " " * needed_spaces
                line_parts.append(padding)
                current_len += needed_spaces

            raw_text = bt_op["text"]
            text = raw_text.translate(table)
            line_parts.append(text)
            current_len += len(text)
            last_disp = bt_op["displaced_tx"]

        full_line = "".join(line_parts).rstrip()
        if full_line.strip() or (space_vertically and lines):
            lines.append(full_line)

        last_y_coord = y_coord

    return "\n".join(lines)
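
# Editor's note: a hedged usage sketch, not part of the vendored code. In
# recent pypdf releases this module backs the "layout" extraction mode; the
# exact call below is an assumption and may differ between versions.
#
#     from pypdf import PdfReader
#
#     reader = PdfReader("example.pdf")  # hypothetical input file
#     print(reader.pages[0].extract_text(extraction_mode="layout"))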
@@ -0,0 +1,221 @@
"""Manage the PDF transform stack during "layout" mode text extraction"""

from collections import ChainMap, Counter
from collections import ChainMap as ChainMapType
from collections import Counter as CounterType
from collections.abc import MutableMapping
from typing import Any, Union

from ..._font import Font
from ...errors import PdfReadError
from .. import mult
from ._text_state_params import TextStateParams

TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]


class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices
        q_queue (Counter[int]): Counter of q operators
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (int): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font (Font): font object
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.

        """
        if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]:
            return
        self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                txt = value.decode("utf-8", "replace")
            txt = "".join(
                self.font.character_map.get(x, x) for x in txt
            )
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> dict[int, float]:
        """Only a/b/c/d/e/f matrix params"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """Rewind the stack to its prior state after closing a 'q' that contained 'cm' ops"""
        self.font, self.font_size = self.font_stack.pop(-1)
        self.transform_stack = self.reset_tm()
        self.transform_stack.maps = self.transform_stack.maps[
            self.q_queue.pop(self.q_depth.pop(), 0) :
        ]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue"""
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: list[float]) -> list[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g. Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> list[float]:
        """Current effective transform accounting for cm, tm, and trm transforms"""
        eff_transform = [*self.transform_stack.maps[0].values()]
        for transform in self.transform_stack.maps[1:]:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform
@@ -0,0 +1,135 @@
"""A dataclass that captures the CTM and Text State for a tj operation"""

import math
from dataclasses import dataclass, field
from typing import Any, Union

from ..._font import Font
from .. import mult, orient


@dataclass
class TextStateParams:
    """
    Text state parameters and operator values for a single text value in a
    TJ or Tj PDF operation.

    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.

    """

    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: list[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)

    def __post_init__(self) -> None:
        if orient(self.transform) in (90, 270):
            self.transform = mult(
                [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0],
                self.transform,
            )
            self.rotated = True
        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
            self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform)
            self.rotated = True
        self.displaced_tx = self.displaced_transform()[4]
        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]
        self.space_tx = round(self.word_tx(" "), 3)
        if self.space_tx < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a td_offset of -1 * font.space_width where font.space_width is
            # the space_width calculated in _font.py.
            self.space_tx = round(self.word_tx("", -self.font.space_width), 3)
        self.font_height = self.font_size * math.sqrt(
            self.transform[1] ** 2 + self.transform[3] ** 2
        )
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis

    def font_size_matrix(self) -> list[float]:
        """Font size matrix"""
        return [
            self.font_size * (self.Tz / 100.0),
            0.0,
            0.0,
            self.font_size,
            0.0,
            self.Ts,
        ]

    def displaced_transform(self) -> list[float]:
        """Effective transform matrix after text has been rendered."""
        return mult(self.displacement_matrix(), self.transform)

    def render_transform(self) -> list[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        return mult(self.font_size_matrix(), self.transform)

    def displacement_matrix(
        self, word: Union[str, None] = None, td_offset: float = 0.0
    ) -> list[float]:
        """
        Text displacement matrix

        Args:
            word (str, optional): Defaults to None, in which case self.txt displacement
                is returned.
            td_offset (float, optional): translation applied by TD operator. Defaults to 0.0.

        """
        word = word if word is not None else self.txt
        return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, td_offset), 0.0]

    def word_tx(self, word: str, td_offset: float = 0.0) -> float:
        """Horizontal text displacement for any word according to this text state"""
        width: float = 0.0
        for char in word:
            if char == " ":
                width += self.font.space_width
            else:
                width += self.font.text_width(char)
        return (
            (self.font_size * ((width - td_offset) / 1000.0))
            + self.Tc
            + word.count(" ") * self.Tw
        ) * (self.Tz / 100.0)

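    # Worked example (added for clarity; not part of the upstream file),
    # assuming a font where every glyph is 500/1000 units wide:
    #   word_tx("ab") with font_size=12, Tc=0, Tw=0, Tz=100
    #     width = 1000 -> (12 * (1000 / 1000.0) + 0 + 0) * 1.0 == 12.0
    # i.e. the PDF displacement formula ((w - td_offset)/1000 * Tfs + Tc
    # + n_spaces * Tw) * Tz/100, with glyph widths in 1/1000 text-space units.
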
    @staticmethod
    def to_dict(inst: "TextStateParams") -> dict[str, Any]:
        """Dataclass to dict for json.dumps serialization"""
        return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"}
@@ -0,0 +1,351 @@
# Copyright (c) 2006, Mathieu Fenniak
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import math
from typing import Any, Callable, Optional, Union

from .._font import Font, FontDescriptor
from ..generic import DictionaryObject, TextStringObject
from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult


class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.
    """

    def __init__(self) -> None:
        self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {}

        # Text extraction state variables
        self.cm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.tm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.cm_stack: list[
            tuple[
                list[float],
                Optional[DictionaryObject],
                Font,
                float,
                float,
                float,
                float,
            ]
        ] = []

        # Store the last modified matrices; can be an intermediate position
        self.cm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.tm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

        # Store the position at the beginning of building the text
        self.memo_cm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.memo_tm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

        self.char_scale = 1.0
        self.space_scale = 1.0
        self._space_width: float = 500.0  # will be set correctly at first Tf
        self._actual_str_size: dict[str, float] = {
            "str_widths": 0.0,
            "str_height": 0.0,
        }  # will be set to string length calculation result
        self.TL = 0.0
        self.font_size = 12.0  # init just in case

        # Text extraction variables
        self.text: str = ""
        self.output: str = ""
        self.rtl_dir: bool = False  # right-to-left
        self.font_resource: Optional[DictionaryObject] = None
        self.font = Font(
            name="NotInitialized",
            sub_type="Unknown",
            encoding="charmap",
            font_descriptor=FontDescriptor(),
        )
        self.orientations: tuple[int, ...] = (0, 90, 180, 270)
        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
        self.font_resources: dict[str, DictionaryObject] = {}
        self.fonts: dict[str, Font] = {}

        self.operation_handlers = {
            b"BT": self._handle_bt,
            b"ET": self._handle_et,
            b"q": self._handle_save_graphics_state,
            b"Q": self._handle_restore_graphics_state,
            b"cm": self._handle_cm,
            b"Tz": self._handle_tz,
            b"Tw": self._handle_tw,
            b"TL": self._handle_tl,
            b"Tf": self._handle_tf,
            b"Td": self._handle_td,
            b"Tm": self._handle_tm,
            b"T*": self._handle_t_star,
            b"Tj": self._handle_tj_operation,
        }

    def initialize_extraction(
        self,
        orientations: tuple[int, ...] = (0, 90, 180, 270),
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
        font_resources: Optional[dict[str, DictionaryObject]] = None,
        fonts: Optional[dict[str, Font]] = None,
    ) -> None:
        """Initialize the extractor with extraction parameters."""
        self.orientations = orientations
        self.visitor_text = visitor_text
        self.font_resources = font_resources or {}
        self.fonts = fonts or {}

        # Reset state
        self.text = ""
        self.output = ""
        self.rtl_dir = False

    def compute_str_widths(self, str_widths: float) -> float:
        return str_widths / 1000

    def process_operation(self, operator: bytes, operands: list[Any]) -> None:
        if operator in self.operation_handlers:
            handler = self.operation_handlers[operator]
            str_widths = handler(operands)

            # Post-process operations that affect text positioning
            if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
                self._post_process_text_operation(str_widths or 0.0)

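    # Illustrative sketch (added for clarity; not part of the upstream file):
    # the operator table above turns the content-stream loop into a dispatch:
    #   extractor = TextExtraction()
    #   extractor.initialize_extraction()
    #   extractor.process_operation(b"BT", [])           # -> _handle_bt
    #   extractor.process_operation(b"Td", [10.0, 0.0])  # -> _handle_td + post-processing
    # Operators absent from operation_handlers are simply ignored.
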
    def _post_process_text_operation(self, str_widths: float) -> None:
        """Handle common post-processing for text positioning operations."""
        try:
            self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
                self.text,
                (self.cm_prev, self.tm_prev),
                (self.cm_matrix, self.tm_matrix),
                (self.memo_cm, self.memo_tm),
                self.font_resource,
                self.orientations,
                self.output,
                self.font_size,
                self.visitor_text,
                str_widths,
                self.compute_str_widths(self.font_size * self._space_width),
                self._actual_str_size["str_height"],
            )
            if self.text == "":
                self.memo_cm = self.cm_matrix.copy()
                self.memo_tm = self.tm_matrix.copy()
        except OrientationNotFoundError:
            pass

    def _handle_tj(
        self,
        text: str,
        operands: list[Union[str, TextStringObject]],
        cm_matrix: list[float],
        tm_matrix: list[float],
        font_resource: Optional[DictionaryObject],
        font: Font,
        orientations: tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        actual_str_size: dict[str, float],
    ) -> tuple[str, bool, dict[str, float]]:
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, font, orientations
        )
        if is_str_operands:
            text += text_operands
            font_widths = sum(
                font.space_width if x == " " else font.text_width(x)
                for x in text_operands
            )
        else:
            text, rtl_dir, font_widths = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                font_resource,
                font,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        actual_str_size["str_widths"] += font_widths * font_size
        actual_str_size["str_height"] = font_size
        return text, rtl_dir, actual_str_size

    def _flush_text(self) -> None:
        """Flush accumulated text to output and call visitor if present."""
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
        self.text = ""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    # Operation handlers

    def _handle_bt(self, operands: list[Any]) -> None:
        """Handle BT (Begin Text) operation - Table 5.4 page 405."""
        self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self._flush_text()

    def _handle_et(self, operands: list[Any]) -> None:
        """Handle ET (End Text) operation - Table 5.4 page 405."""
        self._flush_text()

    def _handle_save_graphics_state(self, operands: list[Any]) -> None:
        """Handle q (Save graphics state) operation - Table 4.7 page 219."""
        self.cm_stack.append(
            (
                self.cm_matrix,
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            )
        )

    def _handle_restore_graphics_state(self, operands: list[Any]) -> None:
        """Handle Q (Restore graphics state) operation - Table 4.7 page 219."""
        try:
            (
                self.cm_matrix,
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            ) = self.cm_stack.pop()
        except Exception:
            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    def _handle_cm(self, operands: list[Any]) -> None:
        """Handle cm (Modify current matrix) operation - Table 4.7 page 219."""
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
        self.text = ""
        try:
            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
        except Exception:
            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _handle_tz(self, operands: list[Any]) -> None:
        """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
        self.char_scale = float(operands[0]) / 100 if operands else 1.0

    def _handle_tw(self, operands: list[Any]) -> None:
        """Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
        self.space_scale = 1.0 + float(operands[0] if operands else 0.0)

    def _handle_tl(self, operands: list[Any]) -> None:
        """Handle TL (Set Text Leading) operation - Table 5.2 page 398."""
        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x

    def _handle_tf(self, operands: list[Any]) -> None:
        """Handle Tf (Set font and size) operation - Table 5.2 page 398."""
        if self.text != "":
            self.output += self.text  # .translate(cmap)
            if self.visitor_text is not None:
                self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
            self.text = ""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()
        try:
            self.font_resource = self.font_resources[operands[0]]
            self.font = self.fonts[operands[0]]
        except KeyError:  # font not found
            self.font_resource = None
            font_descriptor = FontDescriptor()
            self.font = Font(
                "Unknown",
                space_width=250,
                encoding=dict.fromkeys(range(256), "\ufffd"),
                font_descriptor=font_descriptor,
                character_map={},
                character_widths=font_descriptor.character_widths,
            )

        self._space_width = self.font.space_width / 2  # Actually the width of _half_ a space...
        try:
            self.font_size = float(operands[1])
        except Exception:
            pass  # keep previous size

    def _handle_td(self, operands: list[Any]) -> float:
        """Handle Td (Move text position) operation - Table 5.5 page 406."""
        # A special case is a translating-only tm:
        #   tm = [1, 0, 0, 1, e, f]
        # i.e. tm[4] += tx, tm[5] += ty.
        tx, ty = float(operands[0]), float(operands[1])
        self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
        self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    def _handle_tm(self, operands: list[Any]) -> float:
        """Handle Tm (Set text matrix) operation - Table 5.5 page 406."""
        self.tm_matrix = [float(operand) for operand in operands[:6]]
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    def _handle_t_star(self, operands: list[Any]) -> float:
        """Handle T* (Move to next line) operation - Table 5.5 page 406."""
        self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
        self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    def _handle_tj_operation(self, operands: list[Any]) -> float:
        """Handle Tj (Show text) operation - Table 5.5 page 406."""
        self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
            self.text,
            operands,
            self.cm_matrix,
            self.tm_matrix,
            self.font_resource,
            self.font,
            self.orientations,
            self.font_size,
            self.rtl_dir,
            self.visitor_text,
            self._actual_str_size,
        )
        return 0.0  # str_widths will be handled in post-processing
631
venv/lib/python3.12/site-packages/pypdf/_utils.py
Normal file
@@ -0,0 +1,631 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""Utility functions for PDF library."""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

import functools
import logging
import re
import sys
import warnings
from dataclasses import dataclass
from datetime import datetime, timezone
from io import DEFAULT_BUFFER_SIZE
from os import SEEK_CUR
from re import Pattern
from typing import (
    IO,
    Any,
    Optional,
    Union,
    overload,
)

if sys.version_info[:2] >= (3, 10):
    # Python 3.10+: https://www.python.org/dev/peps/pep-0484/
    from typing import TypeAlias
else:
    from typing_extensions import TypeAlias

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

from .errors import (
    STREAM_TRUNCATED_PREMATURELY,
    DeprecationError,
    PdfStreamError,
)

TransformationMatrixType: TypeAlias = tuple[
    tuple[float, float, float], tuple[float, float, float], tuple[float, float, float]
]
CompressedTransformationMatrix: TypeAlias = tuple[
    float, float, float, float, float, float
]

StreamType = IO[Any]
StrByteType = Union[str, StreamType]


def parse_iso8824_date(text: Optional[str]) -> Optional[datetime]:
    orgtext = text
    if not text:
        return None
    if text[0].isdigit():
        text = "D:" + text
    if text.endswith(("Z", "z")):
        text += "0000"
    text = text.replace("z", "+").replace("Z", "+").replace("'", "")
    i = max(text.find("+"), text.find("-"))
    if i > 0 and i != len(text) - 5:
        text += "00"
    for f in (
        "D:%Y",
        "D:%Y%m",
        "D:%Y%m%d",
        "D:%Y%m%d%H",
        "D:%Y%m%d%H%M",
        "D:%Y%m%d%H%M%S",
        "D:%Y%m%d%H%M%S%z",
    ):
        try:
            d = datetime.strptime(text, f)  # noqa: DTZ007
        except ValueError:
            continue
        else:
            if text.endswith("+0000"):
                d = d.replace(tzinfo=timezone.utc)
            return d
    raise ValueError(f"Can not convert date: {orgtext}")

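# Example for parse_iso8824_date (illustrative, added for clarity; not part
# of the upstream file):
#   parse_iso8824_date("D:20230915123456Z")
#       -> datetime(2023, 9, 15, 12, 34, 56, tzinfo=timezone.utc)
#   parse_iso8824_date("D:20230915123456+02'00'")
#       -> datetime(2023, 9, 15, 12, 34, 56, tzinfo=timezone(timedelta(hours=2)))
# A bare "20231231" is also accepted: the "D:" prefix is added automatically.
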
def format_iso8824_date(dt: datetime) -> str:
    """
    Convert a datetime object to PDF date string format.

    Converts datetime to the PDF date format D:YYYYMMDDHHmmSSOHH'mm
    as specified in the PDF Reference.

    Args:
        dt: A datetime object to convert.

    Returns:
        A date string in PDF format.

    """
    date_str = dt.strftime("D:%Y%m%d%H%M%S")
    if dt.tzinfo is not None:
        offset = dt.utcoffset()
        assert offset is not None
        total_seconds = int(offset.total_seconds())
        hours, remainder = divmod(abs(total_seconds), 3600)
        minutes = remainder // 60
        sign = "+" if total_seconds >= 0 else "-"
        date_str += f"{sign}{hours:02d}'{minutes:02d}'"
    return date_str

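# Example for format_iso8824_date (illustrative, added for clarity; not part
# of the upstream file):
#   format_iso8824_date(datetime(2023, 9, 15, 12, 34, 56))
#       -> "D:20230915123456"
#   format_iso8824_date(datetime(2023, 9, 15, 12, 34, 56,
#                                tzinfo=timezone(timedelta(hours=2))))
#       -> "D:20230915123456+02'00'"
# This round-trips with parse_iso8824_date above.
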
def _get_max_pdf_version_header(header1: str, header2: str) -> str:
    versions = (
        "%PDF-1.3",
        "%PDF-1.4",
        "%PDF-1.5",
        "%PDF-1.6",
        "%PDF-1.7",
        "%PDF-2.0",
    )
    pdf_header_indices = []
    if header1 in versions:
        pdf_header_indices.append(versions.index(header1))
    if header2 in versions:
        pdf_header_indices.append(versions.index(header2))
    if len(pdf_header_indices) == 0:
        raise ValueError(f"Neither {header1!r} nor {header2!r} are proper headers")
    return versions[max(pdf_header_indices)]


WHITESPACES = (b"\x00", b"\t", b"\n", b"\f", b"\r", b" ")
WHITESPACES_AS_BYTES = b"".join(WHITESPACES)
WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"


def read_until_whitespace(stream: StreamType, maxchars: Optional[int] = None) -> bytes:
    """
    Read non-whitespace characters and return them.

    Stops upon encountering whitespace or when maxchars is reached.

    Args:
        stream: The data stream from which to read.
        maxchars: The maximum number of bytes returned; by default unlimited.

    Returns:
        The data which was read.

    """
    txt = b""
    while True:
        tok = stream.read(1)
        if tok.isspace() or not tok:
            break
        txt += tok
        if len(txt) == maxchars:
            break
    return txt


def read_non_whitespace(stream: StreamType) -> bytes:
    """
    Find and read the next non-whitespace character (ignores whitespace).

    Args:
        stream: The data stream from which to read.

    Returns:
        The data which was read.

    """
    tok = stream.read(1)
    while tok in WHITESPACES:
        tok = stream.read(1)
    return tok


def skip_over_whitespace(stream: StreamType) -> bool:
    """
    Similar to read_non_whitespace, but returns a boolean indicating whether
    at least one whitespace character was read.

    Args:
        stream: The data stream from which to read.

    Returns:
        True if one or more whitespace characters were skipped, otherwise False.

    """
    tok = stream.read(1)
    cnt = 0
    while tok in WHITESPACES:
        cnt += 1
        tok = stream.read(1)
    return cnt > 0


def check_if_whitespace_only(value: bytes) -> bool:
    """
    Check if the given value consists of whitespace characters only.

    Args:
        value: The bytes to check.

    Returns:
        True if the value only has whitespace characters, otherwise False.

    """
    return all(b in WHITESPACES_AS_BYTES for b in value)

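# Illustrative usage (added for clarity; not part of the upstream file):
#   check_if_whitespace_only(b" \r\n\t")  -> True
#   check_if_whitespace_only(b" a ")      -> False
# The membership test is per byte value, so b"\x00" also counts as
# whitespace, matching the PDF definition in WHITESPACES above.
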
def skip_over_comment(stream: StreamType) -> None:
    tok = stream.read(1)
    stream.seek(-1, 1)
    if tok == b"%":
        while tok not in (b"\n", b"\r"):
            tok = stream.read(1)
            if tok == b"":
                raise PdfStreamError("File ended unexpectedly.")


def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
    """
    Read until the regular expression pattern matched (ignore the match).

    Treats EOF on the underlying stream as the end of the token to be matched.

    Args:
        stream: The data stream from which to read.
        regex: The compiled pattern to stop at.

    Returns:
        The read bytes.

    """
    name = b""
    while True:
        tok = stream.read(16)
        if not tok:
            return name
        m = regex.search(name + tok)
        if m is not None:
            stream.seek(m.start() - (len(name) + len(tok)), 1)
            name = (name + tok)[: m.start()]
            break
        name += tok
    return name

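# Illustrative usage for read_until_regex (added for clarity; not part of the
# upstream file; BytesIO from io is assumed for the sketch):
#   stream = BytesIO(b"Name other")
#   read_until_regex(stream, re.compile(rb"\s"))  -> b"Name"
# The stream is left positioned at the first matching byte (the space), so
# the delimiter itself is not consumed.
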
def read_block_backwards(stream: StreamType, to_read: int) -> bytes:
    """
    Given a stream at position X, read a block of size to_read ending at position X.

    This changes the stream's position to the beginning of where the block was
    read.

    Args:
        stream: The data stream from which to read.
        to_read: The number of bytes to read.

    Returns:
        The data which was read.

    """
    if stream.tell() < to_read:
        raise PdfStreamError("Could not read malformed PDF file")
    # Seek to the start of the block we want to read.
    stream.seek(-to_read, SEEK_CUR)
    read = stream.read(to_read)
    # Seek to the start of the block we read after reading it.
    stream.seek(-to_read, SEEK_CUR)
    return read


def read_previous_line(stream: StreamType) -> bytes:
    """
    Given a byte stream with current position X, return the previous line.

    All characters between the first CR/LF byte found before X
    (or, the start of the file, if no such byte is found) and position X
    are returned.

    After this call, the stream will be positioned one byte after the
    first non-CRLF character found beyond the first CR/LF byte before X,
    or, if no such byte is found, at the beginning of the stream.

    Args:
        stream: The data stream from which to read.

    Returns:
        The data which was read.

    """
    line_content = []
    found_crlf = False
    if stream.tell() == 0:
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
    while True:
        to_read = min(DEFAULT_BUFFER_SIZE, stream.tell())
        if to_read == 0:
            break
        # Read the block. After this, our stream will be one
        # beyond the initial position.
        block = read_block_backwards(stream, to_read)
        idx = len(block) - 1
        if not found_crlf:
            # We haven't found our first CR/LF yet.
            # Read off characters until we hit one.
            while idx >= 0 and block[idx] not in b"\r\n":
                idx -= 1
            if idx >= 0:
                found_crlf = True
        if found_crlf:
            # We found our first CR/LF already (on this block or
            # a previous one).
            # Our combined line is the remainder of the block
            # plus any previously read blocks.
            line_content.append(block[idx + 1 :])
            # Continue to read off any more CRLF characters.
            while idx >= 0 and block[idx] in b"\r\n":
                idx -= 1
        else:
            # Didn't find CR/LF yet - add this block to our
            # previously read blocks and continue.
            line_content.append(block)
        if idx >= 0:
            # We found the next non-CRLF character.
            # Set the stream position correctly, then break.
            stream.seek(idx + 1, SEEK_CUR)
            break
    # Join all the blocks in the line (which are in reverse order)
    return b"".join(line_content[::-1])


def matrix_multiply(
    a: TransformationMatrixType, b: TransformationMatrixType
) -> TransformationMatrixType:
    return tuple(  # type: ignore[return-value]
        tuple(sum(float(i) * float(j) for i, j in zip(row, col)) for col in zip(*b))
        for row in a
    )

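# Worked example for matrix_multiply (illustrative, added for clarity; not
# part of the upstream file). Composing two translations in the row-vector
# convention used here (translation components live in the bottom row):
#   a = ((1, 0, 0), (0, 1, 0), (2, 3, 1))
#   b = ((1, 0, 0), (0, 1, 0), (4, 5, 1))
#   matrix_multiply(a, b) -> ((1.0, 0.0, 0.0), (0.0, 1.0, 0.0), (6.0, 8.0, 1.0))
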
def mark_location(stream: StreamType) -> None:
    """Create text file showing current location in context."""
    # Mainly for debugging
    radius = 5000
    stream.seek(-radius, 1)
    with open("pypdf_pdfLocation.txt", "wb") as output_fh:
        output_fh.write(stream.read(radius))
        output_fh.write(b"HERE")
        output_fh.write(stream.read(radius))
    stream.seek(-radius, 1)


@overload
def ord_(b: str) -> int:
    ...


@overload
def ord_(b: bytes) -> bytes:
    ...


@overload
def ord_(b: int) -> int:
    ...


def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
    if isinstance(b, str):
        return ord(b)
    return b


def deprecate(msg: str, stacklevel: int = 3) -> None:
    warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel)


def deprecation(msg: str) -> None:
    raise DeprecationError(msg)


def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed, but has a replacement."""
    deprecate(
        f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.",
        4,
    )


def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed, but has a replacement."""
    deprecation(
        f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
    )


def deprecate_no_replacement(name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed without replacement."""
    deprecate(f"{name} is deprecated and will be removed in pypdf {removed_in}.", 4)


def deprecation_no_replacement(name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed without replacement."""
    deprecation(f"{name} is deprecated and was removed in pypdf {removed_in}.")


def logger_error(msg: str, src: str) -> None:
    """
    Use this instead of logger.error directly.

    That allows people to overwrite it more easily.

    See the docs on when to use which:
    https://pypdf.readthedocs.io/en/latest/user/suppress-warnings.html
    """
    logging.getLogger(src).error(msg)


def logger_warning(msg: str, src: str) -> None:
    """
    Use this instead of logger.warning directly.

    That allows people to overwrite it more easily.

    ## Exception, warnings.warn, logger_warning
    - Exceptions should be used if the user should write code that deals with
      an error case, e.g. the PDF being completely broken.
    - warnings.warn should be used if the user needs to fix their code, e.g.
      DeprecationWarnings
    - logger_warning should be used if the user needs to know that an issue was
      handled by pypdf, e.g. a non-compliant PDF being read in a way that
      pypdf could apply a robustness fix to still read it. This applies mainly
      to strict=False mode.
    """
    logging.getLogger(src).warning(msg)


def rename_kwargs(
    func_name: str, kwargs: dict[str, Any], aliases: dict[str, str], fail: bool = False
) -> None:
    """
    Helper function to deprecate keyword arguments.

    Args:
        func_name: Name of the function whose arguments are being renamed.
        kwargs: The keyword arguments received by the function.
        aliases: Mapping of deprecated argument names to their replacements.
        fail: If True, raise immediately instead of warning.

    """
    for old_term, new_term in aliases.items():
        if old_term in kwargs:
            if fail:
                raise DeprecationError(
                    f"{old_term} is deprecated as an argument. Use {new_term} instead"
                )
            if new_term in kwargs:
                raise TypeError(
                    f"{func_name} received both {old_term} and {new_term} as "
                    f"an argument. {old_term} is deprecated. "
                    f"Use {new_term} instead."
                )
            kwargs[new_term] = kwargs.pop(old_term)
            warnings.warn(
                message=(
                    f"{old_term} is deprecated as an argument. Use {new_term} instead"
                ),
                category=DeprecationWarning,
                stacklevel=3,
            )

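# Illustrative usage for rename_kwargs (added for clarity; not part of the
# upstream file; `export` and `fileobj` are hypothetical names):
#   def export(**kwargs):
#       rename_kwargs("export", kwargs, aliases={"fileobj": "stream"})
#       ...
# Calling export(fileobj=f) emits a DeprecationWarning and forwards the value
# as kwargs["stream"]; passing both fileobj= and stream= raises TypeError.
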
def _human_readable_bytes(bytes: int) -> str:
    if bytes < 10**3:
        return f"{bytes} Byte"
    if bytes < 10**6:
        return f"{bytes / 10**3:.1f} kB"
    if bytes < 10**9:
        return f"{bytes / 10**6:.1f} MB"
    return f"{bytes / 10**9:.1f} GB"


# The following class has been copied from Django:
# https://github.com/django/django/blob/adae619426b6f50046b3daaa744db52989c9d6db/django/utils/functional.py#L51-L65
# It received some modifications to comply with our own coding standards.
#
# Original license:
#
# ---------------------------------------------------------------------------------
# Copyright (c) Django Software Foundation and individual contributors.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#     1. Redistributions of source code must retain the above copyright notice,
#        this list of conditions and the following disclaimer.
#
#     2. Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#
#     3. Neither the name of Django nor the names of its contributors may be used
#        to endorse or promote products derived from this software without
#        specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---------------------------------------------------------------------------------
class classproperty:  # noqa: N801
    """
    Decorator that converts a method with a single cls argument into a property
    that can be accessed directly from the class.
    """

    def __init__(self, method=None) -> None:  # type: ignore  # noqa: ANN001
        self.fget = method

    def __get__(self, instance, cls=None) -> Any:  # type: ignore  # noqa: ANN001
        return self.fget(cls)

    def getter(self, method) -> Self:  # type: ignore  # noqa: ANN001
        self.fget = method
        return self

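# Illustrative usage for classproperty (added for clarity; not part of the
# upstream file; `Demo` is a hypothetical class):
#   class Demo:
#       @classproperty
#       def label(cls):
#           return cls.__name__.lower()
#   Demo.label  -> "demo", with no instance required.
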
@dataclass
class File:
    from .generic import IndirectObject  # noqa: PLC0415

    name: str = ""
    """
    Filename as identified within the PDF file.
    """
    data: bytes = b""
    """
    Data as bytes.
    """
    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"


@functools.total_ordering
class Version:
    COMPONENT_PATTERN = re.compile(r"^(\d+)(.*)$")

    def __init__(self, version_str: str) -> None:
        self.version_str = version_str
        self.components = self._parse_version(version_str)

    def _parse_version(self, version_str: str) -> list[tuple[int, str]]:
        components = version_str.split(".")
        parsed_components = []
        for component in components:
            match = Version.COMPONENT_PATTERN.match(component)
            if not match:
                parsed_components.append((0, component))
                continue
            integer_prefix = match.group(1)
            suffix = match.group(2)
            if integer_prefix is None:
                integer_prefix = 0
            parsed_components.append((int(integer_prefix), suffix))
        return parsed_components

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Version):
            return False
        return self.components == other.components

    def __hash__(self) -> int:
        # Convert to tuple as lists cannot be hashed.
        return hash((self.__class__, tuple(self.components)))

    def __lt__(self, other: Any) -> bool:
        if not isinstance(other, Version):
            raise ValueError(f"Version cannot be compared against {type(other)}")

        for self_component, other_component in zip(self.components, other.components):
            self_value, self_suffix = self_component
            other_value, other_suffix = other_component

            if self_value < other_value:
                return True
            if self_value > other_value:
                return False

            if self_suffix < other_suffix:
                return True
            if self_suffix > other_suffix:
                return False

        return len(self.components) < len(other.components)

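# Illustrative usage for Version (added for clarity; not part of the upstream
# file): components compare numerically first, then by suffix, e.g.
#   Version("2.10.1") > Version("2.9.9")  -> True
#   Version("1.0") < Version("1.0b")      -> True  (suffix "b" sorts after "")
#   Version("1.0") < Version("1.0.1")     -> True  (shorter tuple sorts first)
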
1
venv/lib/python3.12/site-packages/pypdf/_version.py
Normal file
@@ -0,0 +1 @@
__version__ = "6.6.2"
3307
venv/lib/python3.12/site-packages/pypdf/_writer.py
Normal file
File diff suppressed because it is too large
577
venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
Normal file
@@ -0,0 +1,577 @@
"""Functions to convert an image XObject to an image"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Any, Literal, Optional, Union, cast
|
||||||
|
|
||||||
|
from ._utils import check_if_whitespace_only, logger_warning
|
||||||
|
from .constants import ColorSpaces, StreamAttributes
|
||||||
|
from .constants import FilterTypes as FT
|
||||||
|
from .constants import ImageAttributes as IA
|
||||||
|
from .errors import EmptyImageDataError, PdfReadError
|
||||||
|
from .generic import (
|
||||||
|
ArrayObject,
|
||||||
|
DecodedStreamObject,
|
||||||
|
EncodedStreamObject,
|
||||||
|
NullObject,
|
||||||
|
TextStringObject,
|
||||||
|
is_null_or_none,
|
||||||
|
)
|
||||||
|
|
||||||
|
if sys.version_info[:2] >= (3, 10):
|
||||||
|
from typing import TypeAlias
|
||||||
|
else:
|
||||||
|
from typing_extensions import TypeAlias
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
from PIL import Image, UnidentifiedImageError
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"pillow is required to do image extraction. "
|
||||||
|
"It can be installed via 'pip install pypdf[image]'"
|
||||||
|
)
|
||||||
|
|
||||||
|
mode_str_type: TypeAlias = Literal[
|
||||||
|
"", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
|
||||||
|
]
|
||||||
|
|
||||||
|
MAX_IMAGE_MODE_NESTING_DEPTH: int = 10
|
||||||
|
|
||||||
|
|
||||||
|
def _get_image_mode(
|
||||||
|
color_space: Union[str, list[Any], Any],
|
||||||
|
color_components: int,
|
||||||
|
prev_mode: mode_str_type,
|
||||||
|
depth: int = 0,
|
||||||
|
) -> tuple[mode_str_type, bool]:
|
||||||
|
"""
|
||||||
|
Returns:
|
||||||
|
Image mode, not taking into account mask (transparency).
|
||||||
|
ColorInversion is required (like for some DeviceCMYK).
|
||||||
|
|
||||||
|
"""
|
||||||
|
if depth > MAX_IMAGE_MODE_NESTING_DEPTH:
|
||||||
|
raise PdfReadError(
|
||||||
|
"Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH."
|
||||||
|
)
|
||||||
|
if is_null_or_none(color_space):
|
||||||
|
return "", False
|
||||||
|
color_space_str: str = ""
|
||||||
|
if isinstance(color_space, str):
|
||||||
|
color_space_str = color_space
|
||||||
|
elif not isinstance(color_space, list):
|
||||||
|
raise PdfReadError(
|
||||||
|
"Cannot interpret color space", color_space
|
||||||
|
) # pragma: no cover
|
||||||
|
elif not color_space:
|
||||||
|
return "", False
|
||||||
|
elif color_space[0].startswith("/Cal"): # /CalRGB or /CalGray
|
||||||
|
color_space_str = "/Device" + color_space[0][4:]
|
||||||
|
elif color_space[0] == "/ICCBased":
|
||||||
|
icc_profile = color_space[1].get_object()
|
||||||
|
color_components = cast(int, icc_profile["/N"])
|
||||||
|
color_space_str = icc_profile.get("/Alternate", "")
|
||||||
|
elif color_space[0] == "/Indexed":
|
||||||
|
color_space_str = color_space[1].get_object()
|
||||||
|
mode, invert_color = _get_image_mode(
|
||||||
|
color_space_str, color_components, prev_mode, depth + 1
|
||||||
|
)
|
||||||
|
if mode in ("RGB", "CMYK"):
|
||||||
|
mode = "P"
|
||||||
|
return mode, invert_color
|
||||||
|
elif color_space[0] == "/Separation":
|
||||||
|
color_space_str = color_space[2].get_object()
|
||||||
|
mode, invert_color = _get_image_mode(
|
||||||
|
color_space_str, color_components, prev_mode, depth + 1
|
||||||
|
)
|
||||||
|
return mode, True
|
||||||
|
elif color_space[0] == "/DeviceN":
|
||||||
|
original_color_space = color_space
|
||||||
|
color_components = len(color_space[1])
|
||||||
|
color_space_str = color_space[2].get_object()
|
||||||
|
if color_space_str == "/DeviceCMYK" and color_components == 1:
|
||||||
|
if original_color_space[1][0] != "/Black":
|
||||||
|
logger_warning(
|
||||||
|
f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
|
||||||
|
__name__,
|
||||||
|
)
|
||||||
|
return "L", True
|
||||||
|
mode, invert_color = _get_image_mode(
|
||||||
|
color_space_str, color_components, prev_mode, depth + 1
|
||||||
|
)
|
||||||
|
return mode, invert_color
|
||||||
|
|
||||||
|
mode_map: dict[str, mode_str_type] = {
|
||||||
|
"1bit": "1", # must be zeroth position: color_components may index the values
|
||||||
|
"/DeviceGray": "L", # must be first position: color_components may index the values
|
||||||
|
"palette": "P", # must be second position: color_components may index the values
|
||||||
|
"/DeviceRGB": "RGB", # must be third position: color_components may index the values
|
||||||
|
"/DeviceCMYK": "CMYK", # must be fourth position: color_components may index the values
|
||||||
|
"2bit": "2bits",
|
||||||
|
"4bit": "4bits",
|
||||||
|
}
|
||||||
|
|
||||||
|
mode = (
|
||||||
|
mode_map.get(color_space_str)
|
||||||
|
or list(mode_map.values())[color_components]
|
||||||
|
or prev_mode
|
||||||
|
)
|
||||||
|
|
||||||
|
return mode, mode == "CMYK"
|
||||||
|
|
||||||
|
|
||||||
|
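# Illustrative behaviour (added for clarity; not part of the upstream file):
#   _get_image_mode("/DeviceRGB", 3, "")   -> ("RGB", False)
#   _get_image_mode("/DeviceCMYK", 4, "")  -> ("CMYK", True)  # CMYK needs inversion
# An ["/Indexed", base, hival, lookup] array resolves `base` recursively and
# maps RGB/CMYK results to "P" (palette).
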
def bits2byte(data: bytes, size: tuple[int, int], bits: int) -> bytes:
    mask = (1 << bits) - 1
    byte_buffer = bytearray(size[0] * size[1])
    data_index = 0
    bit = 8 - bits
    for y in range(size[1]):
        if bit != 8 - bits:
            data_index += 1
            bit = 8 - bits
        for x in range(size[0]):
            byte_buffer[x + y * size[0]] = (data[data_index] >> bit) & mask
            bit -= bits
            if bit < 0:
                data_index += 1
                bit = 8 - bits
    return bytes(byte_buffer)

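# Worked example for bits2byte (illustrative, added for clarity; not part of
# the upstream file). Unpacking one row of four 2-bit pixels packed into the
# single byte 0b00011011:
#   bits2byte(b"\x1b", (4, 1), 2) -> b"\x00\x01\x02\x03"
# Rows are byte-aligned: any partially used byte is skipped at each new row.
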
def _extended_image_from_bytes(
|
||||||
|
mode: str, size: tuple[int, int], data: bytes
|
||||||
|
) -> Image.Image:
|
||||||
|
try:
|
||||||
|
img = Image.frombytes(mode, size, data)
|
||||||
|
except ValueError as exc:
|
||||||
|
nb_pix = size[0] * size[1]
|
||||||
|
data_length = len(data)
|
||||||
|
if data_length == 0:
|
||||||
|
raise EmptyImageDataError(
|
||||||
|
"Data is 0 bytes, cannot process an image from empty data."
|
||||||
|
) from exc
|
||||||
|
if data_length % nb_pix != 0:
|
||||||
|
raise exc
|
||||||
|
k = nb_pix * len(mode) / data_length
|
||||||
|
data = b"".join(bytes((x,) * int(k)) for x in data)
|
||||||
|
img = Image.frombytes(mode, size, data)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def __handle_flate__indexed(color_space: ArrayObject) -> tuple[Any, Any, Any, Any]:
    count = len(color_space)
    if count == 4:
        color_space, base, hival, lookup = (value.get_object() for value in color_space)
        return color_space, base, hival, lookup

    # Deal with strange AutoDesk files where `base` and `hival` look like this:
    # /DeviceRGB\x00255
    element1 = color_space[1]
    element1 = element1 if isinstance(element1, str) else element1.get_object()
    if count == 3 and "\x00" in element1:
        color_space, lookup = color_space[0].get_object(), color_space[2].get_object()
        base, hival = element1.split("\x00")
        hival = int(hival)
        return color_space, base, hival, lookup
    raise PdfReadError(f"Expected color space with 4 values, got {count}: {color_space}")


def _handle_flate(
    size: tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
    obj_as_text: str,
) -> tuple[Image.Image, str, str, bool]:
    """
    Process an image encoded with FlateDecode.

    Returns img, image_format, extension, color inversion.
    """
    extension = ".png"  # mime_type: "image/png"
    image_format = "PNG"
    lookup: Any
    base: Any
    hival: Any
    if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed":
        color_space, base, hival, lookup = __handle_flate__indexed(color_space)
    if mode == "2bits":
        mode = "P"
        data = bits2byte(data, size, 2)
    elif mode == "4bits":
        mode = "P"
        data = bits2byte(data, size, 4)
    img = _extended_image_from_bytes(mode, size, data)
    if color_space == "/Indexed":
        if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)):
            lookup = lookup.get_data()
        if isinstance(lookup, TextStringObject):
            lookup = lookup.original_bytes
        if isinstance(lookup, str):
            lookup = lookup.encode()
        try:
            nb, conv, mode = {  # type: ignore
                "1": (0, "", ""),
                "L": (1, "P", "L"),
                "P": (0, "", ""),
                "RGB": (3, "P", "RGB"),
                "CMYK": (4, "P", "CMYK"),
            }[_get_image_mode(base, 0, "")[0]]
        except KeyError:  # pragma: no cover
            logger_warning(
                f"Base {base} not coded, please share the PDF file with the pypdf dev team",
                __name__,
            )
            lookup = None
        else:
            if img.mode == "1":
                # Two values ("high" and "low").
                expected_count = 2 * nb
                actual_count = len(lookup)
                if actual_count != expected_count:
                    if actual_count < expected_count:
                        logger_warning(
                            f"Not enough lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__,
                        )
                        lookup += bytes([0] * (expected_count - actual_count))
                    elif not check_if_whitespace_only(lookup[expected_count:]):
                        logger_warning(
                            f"Too many lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__,
                        )
                    lookup = lookup[:expected_count]
                colors_arr = [lookup[:nb], lookup[nb:]]
                arr = b"".join(
                    b"".join(
                        colors_arr[1 if img.getpixel((x, y)) > 127 else 0]  # type: ignore[operator,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
                        for x in range(img.size[0])
                    )
                    for y in range(img.size[1])
                )
                img = Image.frombytes(mode, img.size, arr)
            else:
                img = img.convert(conv)
                if len(lookup) != (hival + 1) * nb:
                    logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__)
                    lookup = None
                elif mode == "L":
                    # A gray lookup does not work: convert it to an equivalent RGB lookup.
                    lookup = b"".join([bytes([b, b, b]) for b in lookup])
                    mode = "RGB"
                # TODO: https://github.com/py-pdf/pypdf/pull/2039
                # This is a workaround until PIL is able to process CMYK images.
                elif mode == "CMYK":
                    _rgb = []
                    for _c, _m, _y, _k in (
                        lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4)
                    ):
                        _r = int(255 * (1 - _c / 255) * (1 - _k / 255))
                        _g = int(255 * (1 - _m / 255) * (1 - _k / 255))
                        _b = int(255 * (1 - _y / 255) * (1 - _k / 255))
                        _rgb.append(bytes((_r, _g, _b)))
                    lookup = b"".join(_rgb)
                    mode = "RGB"
                if lookup is not None:
                    img.putpalette(lookup, rawmode=mode)
                img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
    elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
        # Table 65 - Additional Entries Specific to an ICC Profile Stream Dictionary
        mode2 = _get_image_mode(color_space, colors, mode)[0]
        if mode != mode2:
            img = Image.frombytes(mode2, size, data)  # reloaded as mode may have changed
    if mode == "CMYK":
        extension = ".tif"
        image_format = "TIFF"
    return img, image_format, extension, False

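For reference, the CMYK-to-RGB palette conversion inside `_handle_flate` above reduces to one formula per channel; a worked instance with a single illustrative lookup entry:

# (c, m, y, k) = (0, 255, 255, 0) is pure red in byte-scaled CMYK.
_c, _m, _y, _k = 0, 255, 255, 0
_r = int(255 * (1 - _c / 255) * (1 - _k / 255))  # 255
_g = int(255 * (1 - _m / 255) * (1 - _k / 255))  # 0
_b = int(255 * (1 - _y / 255) * (1 - _k / 255))  # 0
assert (_r, _g, _b) == (255, 0, 0)
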
def _handle_jpx(
    size: tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
) -> tuple[Image.Image, str, str, bool]:
    """
    Process an image encoded with JPXDecode.

    Returns img, image_format, extension, inversion.
    """
    extension = ".jp2"  # mime_type: "image/x-jp2"
    img1: Image.Image = Image.open(BytesIO(data), formats=("JPEG2000",))
    mode, invert_color = _get_image_mode(color_space, colors, mode)
    if mode == "":
        mode = cast(mode_str_type, img1.mode)
        invert_color = mode in ("CMYK",)
    if img1.mode == "RGBA" and mode == "RGB":
        mode = "RGBA"
    # we need to convert to the correct mode
    if img1.mode == mode or {img1.mode, mode} == {"L", "P"}:  # compare (unordered) sets
        # L and P are indexed modes which should not be changed.
        img = img1
    elif {img1.mode, mode} == {"RGBA", "CMYK"}:
        # RGBA / CMYK are 4-byte encodings whose channel
        # interpretation needs to be corrected here.
        img = Image.frombytes(mode, img1.size, img1.tobytes())
    else:  # pragma: no cover
        img = img1.convert(mode)
    # CMYK conversion
    # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
    # not implemented for the moment, as the ICC profile would need to be handled properly
    if img.mode == "CMYK":
        img = img.convert("RGB")
    image_format = "JPEG2000"
    return img, image_format, extension, invert_color


def _apply_decode(
    img: Image.Image,
    x_object_obj: dict[str, Any],
    lfilters: FT,
    color_space: Union[str, list[Any], Any],
    invert_color: bool,
) -> Image.Image:
    # CMYK images and other color spaces without a /Decode entry
    # require reverting the scale (cf. p. 243 §2, last sentence).
    decode = x_object_obj.get(
        IA.DECODE,
        ([1.0, 0.0] * len(img.getbands()))
        if (
            (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE))
            or (invert_color and img.mode == "L")
        )
        else None,
    )
    if (
        isinstance(color_space, ArrayObject)
        and color_space[0].get_object() == "/Indexed"
    ):
        decode = None  # decode is meaningless if Indexed
    if (
        isinstance(color_space, ArrayObject)
        and color_space[0].get_object() == "/Separation"
    ):
        decode = [1.0, 0.0] * len(img.getbands())
    if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
        lut: list[int] = []
        for i in range(0, len(decode), 2):
            dmin = decode[i]
            dmax = decode[i + 1]
            lut.extend(
                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
            )
        img = img.point(lut)
    return img

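The LUT built in `_apply_decode` above maps each 8-bit sample linearly from `dmin` to `dmax`; for the common decode pair `[1.0, 0.0]` this is a plain inversion, as a small self-check shows:

dmin, dmax = 1.0, 0.0  # decode pair [1.0, 0.0]
lut = [round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)]
assert lut[0] == 255 and lut[100] == 155 and lut[255] == 0  # i.e. 255 - j
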
def _get_mode_and_invert_color(
    x_object_obj: dict[str, Any], colors: int, color_space: Union[str, list[Any], Any]
) -> tuple[mode_str_type, bool]:
    if (
        IA.COLOR_SPACE in x_object_obj
        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
    ):
        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
        mode: mode_str_type = "RGB"
    if x_object_obj.get("/BitsPerComponent", 8) < 8:
        mode, invert_color = _get_image_mode(
            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
        )
    else:
        mode, invert_color = _get_image_mode(
            color_space,
            2
            if (
                colors == 1
                and (
                    not is_null_or_none(color_space)
                    and "Gray" not in color_space
                )
            )
            else colors,
            "",
        )
    return mode, invert_color


def _xobj_to_image(
    x_object: dict[str, Any],
    pillow_parameters: Union[dict[str, Any], None] = None
) -> tuple[Optional[str], bytes, Any]:
    """
    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object: The image XObject stream to convert.
        pillow_parameters: parameters provided to Pillow Image.save() method,
            cf. <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save>

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]

    """
    def _apply_alpha(
        img: Image.Image,
        x_object: dict[str, Any],
        obj_as_text: str,
        image_format: str,
        extension: str,
    ) -> tuple[Image.Image, str, str]:
        alpha = None
        if IA.S_MASK in x_object:  # add alpha channel
            alpha = _xobj_to_image(x_object[IA.S_MASK])[2]
            if img.size != alpha.size:
                logger_warning(
                    f"image and mask size not matching: {obj_as_text}", __name__
                )
            else:
                # TODO: implement mask
                if alpha.mode != "L":
                    alpha = alpha.convert("L")
                if img.mode == "P":
                    img = img.convert("RGB")
                elif img.mode == "1":
                    img = img.convert("L")
                img.putalpha(alpha)
            if "JPEG" in image_format:
                image_format = "JPEG2000"
                extension = ".jp2"
            else:
                image_format = "PNG"
                extension = ".png"
        return img, extension, image_format

    # For error reporting
    obj_as_text = (
        x_object.indirect_reference.__repr__()
        if x_object is None  # pragma: no cover
        else x_object.__repr__()
    )

    # Get size and data
    size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT]))
    data = x_object.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # i.e. '\n'
        data = data[:-1]

    # Get color properties
    colors = x_object.get("/Colors", 1)
    color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()

    mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space)

    # Get filters
    filters = x_object.get(StreamAttributes.FILTER, NullObject()).get_object()
    lfilters = filters[-1] if isinstance(filters, list) else filters
    decode_parms = x_object.get(StreamAttributes.DECODE_PARMS, None)
    if decode_parms and isinstance(decode_parms, (tuple, list)):
        decode_parms = decode_parms[0]
    else:
        decode_parms = {}
    if not isinstance(decode_parms, dict):
        decode_parms = {}

    extension = None
    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
        img, image_format, extension, _ = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the extension.
        if lfilters == FT.LZW_DECODE:
            image_format = "TIFF"
            extension = ".tiff"  # mime_type = "image/tiff"
        else:
            image_format = "PNG"
            extension = ".png"  # mime_type = "image/png"
        try:
            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        except UnidentifiedImageError:
            img = _extended_image_from_bytes(mode, size, data)
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    elif lfilters == FT.JBIG2_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("PNG", "PPM")),
            "PNG",
            ".png",
            False,
        )
    elif mode == "CMYK":
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "TIFF",
            ".tif",
            False,
        )
    elif mode == "":
        raise PdfReadError(f"ColorSpace field not found in {x_object}")
    else:
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    img = _apply_decode(img, x_object, lfilters, color_space, invert_color)
    img, extension, image_format = _apply_alpha(
        img, x_object, obj_as_text, image_format, extension
    )

    if pillow_parameters is None:
        pillow_parameters = {}
    # Preserve JPEG image quality - see issue #3515.
    if image_format == "JPEG":
        # This prevents: Cannot use 'keep' when original image is not a JPEG:
        # "JPEG" is the value of PIL.JpegImagePlugin.JpegImageFile.format
        img.format = "JPEG"
        if "quality" not in pillow_parameters:
            pillow_parameters["quality"] = "keep"

    # Save image to bytes
    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format, **pillow_parameters)
    except OSError:  # pragma: no cover # covered with pillow 10.3
        # in that case, convert to RGBA and save as PNG
        img1 = img.convert("RGBA")
        image_format = "PNG"
        extension = ".png"
        img_byte_arr = BytesIO()
        img1.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception as exception:
        logger_warning(f"Failed loading image: {exception}", __name__)
        img = None  # type: ignore[assignment,unused-ignore]  # TODO: Remove unused-ignore on Python 3.10
    return extension, data, img
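
Although `_xobj_to_image` is private and may be removed, the (extension, bytes, image) triple it returns is what pypdf's public image accessor exposes; a hedged usage sketch (the file name is a placeholder):

from pypdf import PdfReader

reader = PdfReader("example.pdf")  # placeholder path
for image_file in reader.pages[0].images:  # wraps the triple returned above
    with open(image_file.name, "wb") as fh:
        fh.write(image_file.data)
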
@@ -0,0 +1,42 @@
"""
PDF specifies several annotation types which pypdf makes available here.

The names of the annotations and their attributes do not reflect the names in
the specification in all cases. For example, the PDF standard defines a
'Square' annotation that does not actually need to be square. For this reason,
pypdf calls it 'Rectangle'.

At their core, all annotation types are DictionaryObjects. That means if pypdf
does not implement a feature, users can easily extend the given functionality.
"""


from ._base import NO_FLAGS, AnnotationDictionary
from ._markup_annotations import (
    Ellipse,
    FreeText,
    Highlight,
    Line,
    MarkupAnnotation,
    Polygon,
    PolyLine,
    Rectangle,
    Text,
)
from ._non_markup_annotations import Link, Popup

__all__ = [
    "NO_FLAGS",
    "AnnotationDictionary",
    "Ellipse",
    "FreeText",
    "Highlight",
    "Line",
    "Link",
    "MarkupAnnotation",
    "PolyLine",
    "Polygon",
    "Popup",
    "Rectangle",
    "Text",
]
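
A hedged sketch of the package in use; `in.pdf`/`out.pdf` are placeholders, and `PdfWriter.add_annotation` is the public entry point that attaches these dictionaries to a page:

from pypdf import PdfReader, PdfWriter
from pypdf.annotations import FreeText

writer = PdfWriter()
writer.append(PdfReader("in.pdf"))  # placeholder input
annotation = FreeText(text="Reviewed", rect=(50, 550, 200, 650))
writer.add_annotation(page_number=0, annotation=annotation)
with open("out.pdf", "wb") as fp:
    writer.write(fp)
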
29
venv/lib/python3.12/site-packages/pypdf/annotations/_base.py
Normal file
@@ -0,0 +1,29 @@
from abc import ABC

from ..constants import AnnotationFlag
from ..generic import NameObject, NumberObject
from ..generic._data_structures import DictionaryObject


class AnnotationDictionary(DictionaryObject, ABC):
    def __init__(self) -> None:
        super().__init__()

        from ..generic._base import NameObject  # noqa: PLC0415

        # /Rect should not be added here as Polygon and PolyLine can automatically set it
        self[NameObject("/Type")] = NameObject("/Annot")
        # The flags were NOT added to the constructor on purpose:
        # We expect that most users don't want to change the default.
        # If they do, they can use the property. The default is 0.

    @property
    def flags(self) -> AnnotationFlag:
        return self.get(NameObject("/F"), AnnotationFlag(0))

    @flags.setter
    def flags(self, value: AnnotationFlag) -> None:
        self[NameObject("/F")] = NumberObject(value)


NO_FLAGS = AnnotationFlag(0)
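
Every annotation inherits the `flags` property above, so `AnnotationFlag` combinations can be assigned directly; a short sketch using the `Text` class from this diff (rect values are placeholders):

from pypdf.annotations import Text
from pypdf.constants import AnnotationFlag

note = Text(rect=(10, 700, 30, 720), text="Check totals", open=True)
note.flags = AnnotationFlag.PRINT | AnnotationFlag.NO_ZOOM  # stored under /F
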
@@ -0,0 +1,305 @@
import sys
from abc import ABC
from typing import Any, Optional, Union

from ..constants import AnnotationFlag
from ..generic import ArrayObject, DictionaryObject
from ..generic._base import (
    BooleanObject,
    FloatObject,
    NameObject,
    NumberObject,
    TextStringObject,
)
from ..generic._rectangle import RectangleObject
from ..generic._utils import hex_to_rgb
from ._base import NO_FLAGS, AnnotationDictionary

if sys.version_info[:2] >= (3, 10):
    from typing import TypeAlias
else:
    # PEP 613 introduced typing.TypeAlias with Python 3.10
    # For older Python versions, the backport typing_extensions is necessary:
    from typing_extensions import TypeAlias


Vertex: TypeAlias = tuple[float, float]


def _get_bounding_rectangle(vertices: list[Vertex]) -> RectangleObject:
    x_min, y_min = vertices[0][0], vertices[0][1]
    x_max, y_max = vertices[0][0], vertices[0][1]
    for x, y in vertices:
        x_min = min(x_min, x)
        y_min = min(y_min, y)
        x_max = max(x_max, x)
        y_max = max(y_max, y)
    return RectangleObject((x_min, y_min, x_max, y_max))


class MarkupAnnotation(AnnotationDictionary, ABC):
    """
    Base class for all markup annotations.

    Args:
        title_bar: Text to be displayed in the title bar of the annotation;
            by convention this is the name of the author

    """

    def __init__(self, *, title_bar: Optional[str] = None) -> None:
        if title_bar is not None:
            self[NameObject("/T")] = TextStringObject(title_bar)


class Text(MarkupAnnotation):
    """
    A text annotation.

    Args:
        rect: array of four integers ``[xLL, yLL, xUR, yUR]``
            specifying the clickable rectangular area
        text: The text that is added to the document
        open: Whether the annotation should initially be displayed open
        flags: Annotation flags (see AnnotationFlag)

    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = NO_FLAGS,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self[NameObject("/Subtype")] = NameObject("/Text")
        self[NameObject("/Rect")] = RectangleObject(rect)
        self[NameObject("/Contents")] = TextStringObject(text)
        self[NameObject("/Open")] = BooleanObject(open)
        self[NameObject("/Flags")] = NumberObject(flags)


class FreeText(MarkupAnnotation):
    """A FreeText annotation"""

    def __init__(
        self,
        *,
        text: str,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        font: str = "Helvetica",
        bold: bool = False,
        italic: bool = False,
        font_size: str = "14pt",
        font_color: str = "000000",
        border_color: Optional[str] = "000000",
        background_color: Optional[str] = "ffffff",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self[NameObject("/Subtype")] = NameObject("/FreeText")
        self[NameObject("/Rect")] = RectangleObject(rect)

        # Table 225 of the 1.7 reference ("CSS2 style attributes used in rich text strings")
        font_str = "font: "
        if italic:
            font_str = f"{font_str}italic "
        else:
            font_str = f"{font_str}normal "
        if bold:
            font_str = f"{font_str}bold "
        else:
            font_str = f"{font_str}normal "
        font_str = f"{font_str}{font_size} {font}"
        font_str = f"{font_str};text-align:left;color:#{font_color}"

        default_appearance_string = ""
        if border_color:
            for st in hex_to_rgb(border_color):
                default_appearance_string = f"{default_appearance_string}{st} "
            default_appearance_string = f"{default_appearance_string}rg"

        self.update(
            {
                NameObject("/Subtype"): NameObject("/FreeText"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Contents"): TextStringObject(text),
                # font size color
                NameObject("/DS"): TextStringObject(font_str),
                NameObject("/DA"): TextStringObject(default_appearance_string),
            }
        )
        if border_color is None:
            # Border Style
            self[NameObject("/BS")] = DictionaryObject(
                {
                    # width of 0 means no border
                    NameObject("/W"): NumberObject(0)
                }
            )
        if background_color is not None:
            self[NameObject("/C")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(background_color)]
            )


class Line(MarkupAnnotation):
    def __init__(
        self,
        p1: Vertex,
        p2: Vertex,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        text: str = "",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Line"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/L"): ArrayObject(
                    [
                        FloatObject(p1[0]),
                        FloatObject(p1[1]),
                        FloatObject(p2[0]),
                        FloatObject(p2[1]),
                    ]
                ),
                NameObject("/LE"): ArrayObject(
                    [
                        NameObject("/None"),
                        NameObject("/None"),
                    ]
                ),
                NameObject("/IC"): ArrayObject(
                    [
                        FloatObject(0.5),
                        FloatObject(0.5),
                        FloatObject(0.5),
                    ]
                ),
                NameObject("/Contents"): TextStringObject(text),
            }
        )


class PolyLine(MarkupAnnotation):
    def __init__(
        self,
        vertices: list[Vertex],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if len(vertices) == 0:
            raise ValueError("A polyline needs at least 1 vertex with two coordinates")
        coord_list = []
        for x, y in vertices:
            coord_list.append(NumberObject(x))
            coord_list.append(NumberObject(y))
        self.update(
            {
                NameObject("/Subtype"): NameObject("/PolyLine"),
                NameObject("/Vertices"): ArrayObject(coord_list),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )


class Rectangle(MarkupAnnotation):
    def __init__(
        self,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        *,
        interior_color: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Square"),
                NameObject("/Rect"): RectangleObject(rect),
            }
        )

        if interior_color:
            self[NameObject("/IC")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(interior_color)]
            )


class Highlight(MarkupAnnotation):
    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        quad_points: ArrayObject,
        highlight_color: str = "ff0000",
        printing: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Highlight"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/QuadPoints"): quad_points,
                NameObject("/C"): ArrayObject(
                    [FloatObject(n) for n in hex_to_rgb(highlight_color)]
                ),
            }
        )
        if printing:
            self.flags = AnnotationFlag.PRINT


class Ellipse(MarkupAnnotation):
    def __init__(
        self,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        *,
        interior_color: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Circle"),
                NameObject("/Rect"): RectangleObject(rect),
            }
        )

        if interior_color:
            self[NameObject("/IC")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(interior_color)]
            )


class Polygon(MarkupAnnotation):
    def __init__(
        self,
        vertices: list[tuple[float, float]],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if len(vertices) == 0:
            raise ValueError("A polygon needs at least 1 vertex with two coordinates")

        coord_list = []
        for x, y in vertices:
            coord_list.append(NumberObject(x))
            coord_list.append(NumberObject(y))
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Polygon"),
                NameObject("/Vertices"): ArrayObject(coord_list),
                NameObject("/IT"): NameObject("/PolygonCloud"),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )
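
A hedged construction sketch for one of the classes above; `/QuadPoints` lists the four corners of each highlighted span as x1 y1 x2 y2 x3 y3 x4 y4, and all coordinates here are placeholders:

from pypdf.annotations import Highlight
from pypdf.generic import ArrayObject, FloatObject

quad = ArrayObject([FloatObject(v) for v in (50, 560, 150, 560, 50, 540, 150, 540)])
mark = Highlight(
    rect=(50, 540, 150, 560),
    quad_points=quad,
    highlight_color="ffff00",
    printing=True,  # sets AnnotationFlag.PRINT via the flags property
)
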
@@ -0,0 +1,106 @@
from typing import TYPE_CHECKING, Any, Optional, Union

from ..generic._base import (
    BooleanObject,
    NameObject,
    NumberObject,
    TextStringObject,
)
from ..generic._data_structures import ArrayObject, DictionaryObject
from ..generic._fit import DEFAULT_FIT, Fit
from ..generic._rectangle import RectangleObject
from ._base import AnnotationDictionary


class Link(AnnotationDictionary):
    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if TYPE_CHECKING:
            from ..types import BorderArrayType  # noqa: PLC0415

        is_external = url is not None
        is_internal = target_page_index is not None
        if not is_external and not is_internal:
            raise ValueError(
                "Either 'url' or 'target_page_index' has to be provided. Both were None."
            )
        if is_external and is_internal:
            raise ValueError(
                "Only one of 'url' and 'target_page_index' may be provided. "
                f"{url=}, {target_page_index=}"
            )

        border_arr: BorderArrayType
        if border is not None:
            border_arr = [NumberObject(n) for n in border[:3]]
            if len(border) == 4:
                dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
                border_arr.append(dash_pattern)
        else:
            border_arr = [NumberObject(0)] * 3

        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Link"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Border"): ArrayObject(border_arr),
            }
        )
        if is_external:
            self[NameObject("/A")] = DictionaryObject(
                {
                    NameObject("/S"): NameObject("/URI"),
                    NameObject("/Type"): NameObject("/Action"),
                    NameObject("/URI"): TextStringObject(url),
                }
            )
        if is_internal:
            # This needs to be updated later!
            dest_deferred = DictionaryObject(
                {
                    "target_page_index": NumberObject(target_page_index),
                    "fit": NameObject(fit.fit_type),
                    "fit_args": fit.fit_args,
                }
            )
            self[NameObject("/Dest")] = dest_deferred


class Popup(AnnotationDictionary):
    def __init__(
        self,
        *,
        rect: Union[RectangleObject, tuple[float, float, float, float]],
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Popup"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Open"): BooleanObject(open),
            }
        )
        if parent:
            # This needs to be an indirect object
            try:
                self[NameObject("/Parent")] = parent.indirect_reference
            except AttributeError:
                from .._utils import logger_warning  # noqa: PLC0415

                logger_warning(
                    "Unregistered Parent object: no /Parent field set",
                    __name__,
                )
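
A sketch of the two mutually exclusive `Link` modes defined above (rectangles and targets are placeholders):

from pypdf.annotations import Link

external = Link(rect=(50, 50, 200, 80), url="https://example.com")
internal = Link(rect=(50, 100, 200, 130), target_page_index=2)
# The internal variant stores the deferred /Dest dictionary built above;
# pypdf replaces it with a real destination when the annotation is
# attached to a writer.
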
796
venv/lib/python3.12/site-packages/pypdf/constants.py
Normal file
@@ -0,0 +1,796 @@
"""Various constants, enums, and flags to aid readability."""

from enum import Enum, IntFlag, auto, unique


class StrEnum(str, Enum):  # Once we are on Python 3.11+: enum.StrEnum
    def __str__(self) -> str:
        return str(self.value)


class Core:
    """Keywords that don't quite belong anywhere else."""

    OUTLINES = "/Outlines"
    THREADS = "/Threads"
    PAGE = "/Page"
    PAGES = "/Pages"
    CATALOG = "/Catalog"


class TrailerKeys:
    SIZE = "/Size"
    PREV = "/Prev"
    ROOT = "/Root"
    ENCRYPT = "/Encrypt"
    INFO = "/Info"
    ID = "/ID"


class CatalogAttributes:
    NAMES = "/Names"
    DESTS = "/Dests"


class EncryptionDictAttributes:
    """
    Additional encryption dictionary entries for the standard security handler.

    Table 3.19, Page 122.
    Table 21 of the 2.0 manual.
    """

    R = "/R"  # number, required; revision of the standard security handler
    O = "/O"  # 32-byte string, required  # noqa: E741
    U = "/U"  # 32-byte string, required
    P = "/P"  # integer flag, required; permitted operations
    ENCRYPT_METADATA = "/EncryptMetadata"  # boolean flag, optional


class UserAccessPermissions(IntFlag):
    """
    Table 3.20 User access permissions.
    Table 22 of the 2.0 manual.
    """

    R1 = 1
    R2 = 2
    PRINT = 4
    MODIFY = 8
    EXTRACT = 16
    ADD_OR_MODIFY = 32
    R7 = 64
    R8 = 128
    FILL_FORM_FIELDS = 256
    EXTRACT_TEXT_AND_GRAPHICS = 512
    ASSEMBLE_DOC = 1024
    PRINT_TO_REPRESENTATION = 2048
    R13 = 2**12
    R14 = 2**13
    R15 = 2**14
    R16 = 2**15
    R17 = 2**16
    R18 = 2**17
    R19 = 2**18
    R20 = 2**19
    R21 = 2**20
    R22 = 2**21
    R23 = 2**22
    R24 = 2**23
    R25 = 2**24
    R26 = 2**25
    R27 = 2**26
    R28 = 2**27
    R29 = 2**28
    R30 = 2**29
    R31 = 2**30
    R32 = 2**31

    @classmethod
    def _is_reserved(cls, name: str) -> bool:
        """Check if the given name corresponds to a reserved flag entry."""
        return name.startswith("R") and name[1:].isdigit()

    @classmethod
    def _is_active(cls, name: str) -> bool:
        """Check if the given reserved name defaults to 1 = active."""
        return name not in {"R1", "R2"}

    def to_dict(self) -> dict[str, bool]:
        """Convert the given flag value to a corresponding verbose name mapping."""
        result: dict[str, bool] = {}
        for name, flag in UserAccessPermissions.__members__.items():
            if UserAccessPermissions._is_reserved(name):
                continue
            result[name.lower()] = (self & flag) == flag
        return result

    @classmethod
    def from_dict(cls, value: dict[str, bool]) -> "UserAccessPermissions":
        """Convert the verbose name mapping to the corresponding flag value."""
        value_copy = value.copy()
        result = cls(0)
        for name, flag in cls.__members__.items():
            if cls._is_reserved(name):
                # Reserved names have a required value. Use it.
                if cls._is_active(name):
                    result |= flag
                continue
            is_active = value_copy.pop(name.lower(), False)
            if is_active:
                result |= flag
        if value_copy:
            raise ValueError(f"Unknown dictionary keys: {value_copy!r}")
        return result

    @classmethod
    def all(cls) -> "UserAccessPermissions":
        return cls((2**32 - 1) - cls.R1 - cls.R2)

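The converters above round-trip through plain dictionaries, with the caveat that `from_dict` re-applies the reserved bits; a short sketch:

from pypdf.constants import UserAccessPermissions

perms = UserAccessPermissions.PRINT | UserAccessPermissions.EXTRACT
flags = perms.to_dict()  # {"print": True, "modify": False, "extract": True, ...}
restored = UserAccessPermissions.from_dict(flags)
# from_dict() also sets the reserved bits (R7, R8, R13..R32), which the
# handler requires to be 1, so `restored` is a superset of `perms`.
assert perms & restored == perms
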
|
||||||
|
class Resources:
|
||||||
|
"""
|
||||||
|
Table 3.30 Entries in a resource dictionary.
|
||||||
|
Table 34 in the 2.0 reference.
|
||||||
|
"""
|
||||||
|
|
||||||
|
EXT_G_STATE = "/ExtGState" # dictionary, optional
|
||||||
|
COLOR_SPACE = "/ColorSpace" # dictionary, optional
|
||||||
|
PATTERN = "/Pattern" # dictionary, optional
|
||||||
|
SHADING = "/Shading" # dictionary, optional
|
||||||
|
XOBJECT = "/XObject" # dictionary, optional
|
||||||
|
FONT = "/Font" # dictionary, optional
|
||||||
|
PROC_SET = "/ProcSet" # array, optional
|
||||||
|
PROPERTIES = "/Properties" # dictionary, optional
|
||||||
|
|
||||||
|
|
||||||
|
class PagesAttributes:
|
||||||
|
"""§7.7.3.2 of the 1.7 and 2.0 reference."""
|
||||||
|
|
||||||
|
TYPE = "/Type" # name, required; must be /Pages
|
||||||
|
PARENT = "/Parent" # dictionary, required; indirect reference to pages object
|
||||||
|
KIDS = "/Kids" # array, required; List of indirect references
|
||||||
|
COUNT = "/Count"
|
||||||
|
# integer, required; the number of leaf nodes (page objects)
|
||||||
|
# that are descendants of this node within the page tree
|
||||||
|
|
||||||
|
|
||||||
|
class PageAttributes:
|
||||||
|
"""§7.7.3.3 of the 1.7 and 2.0 reference."""
|
||||||
|
|
||||||
|
TYPE = "/Type" # name, required; must be /Page
|
||||||
|
PARENT = "/Parent" # dictionary, required; a pages object
|
||||||
|
LAST_MODIFIED = (
|
||||||
|
"/LastModified" # date, optional; date and time of last modification
|
||||||
|
)
|
||||||
|
RESOURCES = "/Resources" # dictionary, required if there are any
|
||||||
|
MEDIABOX = "/MediaBox" # rectangle, required; rectangle specifying page size
|
||||||
|
CROPBOX = "/CropBox" # rectangle, optional
|
||||||
|
BLEEDBOX = "/BleedBox" # rectangle, optional
|
||||||
|
TRIMBOX = "/TrimBox" # rectangle, optional
|
||||||
|
ARTBOX = "/ArtBox" # rectangle, optional
|
||||||
|
BOX_COLOR_INFO = "/BoxColorInfo" # dictionary, optional
|
||||||
|
CONTENTS = "/Contents" # stream or array, optional
|
||||||
|
ROTATE = "/Rotate" # integer, optional; page rotation in degrees
|
||||||
|
GROUP = "/Group" # dictionary, optional; page group
|
||||||
|
THUMB = "/Thumb" # stream, optional; indirect reference to image of the page
|
||||||
|
B = "/B" # array, optional
|
||||||
|
DUR = "/Dur" # number, optional
|
||||||
|
TRANS = "/Trans" # dictionary, optional
|
||||||
|
ANNOTS = "/Annots" # array, optional; an array of annotations
|
||||||
|
AA = "/AA" # dictionary, optional
|
||||||
|
METADATA = "/Metadata" # stream, optional
|
||||||
|
PIECE_INFO = "/PieceInfo" # dictionary, optional
|
||||||
|
STRUCT_PARENTS = "/StructParents" # integer, optional
|
||||||
|
ID = "/ID" # byte string, optional
|
||||||
|
PZ = "/PZ" # number, optional
|
||||||
|
SEPARATION_INFO = "/SeparationInfo" # dictionary, optional
|
||||||
|
TABS = "/Tabs" # name, optional
|
||||||
|
TEMPLATE_INSTANTIATED = "/TemplateInstantiated" # name, optional
|
||||||
|
PRES_STEPS = "/PresSteps" # dictionary, optional
|
||||||
|
USER_UNIT = "/UserUnit" # number, optional
|
||||||
|
VP = "/VP" # dictionary, optional
|
||||||
|
AF = "/AF" # array of dictionaries, optional
|
||||||
|
OUTPUT_INTENTS = "/OutputIntents" # array, optional
|
||||||
|
D_PART = "/DPart" # dictionary, required, if this page is within the range of a DPart, not permitted otherwise
|
||||||
|
|
||||||
|
|
||||||
|
class FileSpecificationDictionaryEntries:
|
||||||
|
"""Table 3.41 Entries in a file specification dictionary."""
|
||||||
|
|
||||||
|
Type = "/Type"
|
||||||
|
FS = "/FS" # The name of the file system to be used to interpret this file specification
|
||||||
|
F = "/F" # A file specification string of the form described in §3.10.1
|
||||||
|
UF = "/UF" # A Unicode string of the file as described in §3.10.1
|
||||||
|
DOS = "/DOS"
|
||||||
|
Mac = "/Mac"
|
||||||
|
Unix = "/Unix"
|
||||||
|
ID = "/ID"
|
||||||
|
V = "/V"
|
||||||
|
EF = "/EF" # dictionary, containing a subset of the keys F, UF, DOS, Mac, and Unix
|
||||||
|
RF = "/RF" # dictionary, containing arrays of /EmbeddedFile
|
||||||
|
DESC = "/Desc" # description of the file
|
||||||
|
Cl = "/Cl"
|
||||||
|
|
||||||
|
|
||||||
|
class StreamAttributes:
|
||||||
|
"""
|
||||||
|
Table 4.2.
|
||||||
|
Table 5 in the 2.0 reference.
|
||||||
|
"""
|
||||||
|
|
||||||
|
LENGTH = "/Length" # integer, required
|
||||||
|
FILTER = "/Filter" # name or array of names, optional
|
||||||
|
DECODE_PARMS = "/DecodeParms" # variable, optional -- 'decodeParams is wrong
|
||||||
|
|
||||||
|
|
||||||
|
@unique
|
||||||
|
class FilterTypes(StrEnum):
|
||||||
|
"""§7.4 of the 1.7 and 2.0 references."""
|
||||||
|
|
||||||
|
ASCII_HEX_DECODE = "/ASCIIHexDecode" # abbreviation: AHx
|
||||||
|
ASCII_85_DECODE = "/ASCII85Decode" # abbreviation: A85
|
||||||
|
LZW_DECODE = "/LZWDecode" # abbreviation: LZW
|
||||||
|
FLATE_DECODE = "/FlateDecode" # abbreviation: Fl
|
||||||
|
RUN_LENGTH_DECODE = "/RunLengthDecode" # abbreviation: RL
|
||||||
|
CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF
|
||||||
|
DCT_DECODE = "/DCTDecode" # abbreviation: DCT
|
||||||
|
JPX_DECODE = "/JPXDecode"
|
||||||
|
JBIG2_DECODE = "/JBIG2Decode"
|
||||||
|
|
||||||
|
|
||||||
|
class FilterTypeAbbreviations:
|
||||||
|
"""§8.9.7 of the 1.7 and 2.0 references."""
|
||||||
|
|
||||||
|
AHx = "/AHx"
|
||||||
|
A85 = "/A85"
|
||||||
|
LZW = "/LZW"
|
||||||
|
FL = "/Fl"
|
||||||
|
RL = "/RL"
|
||||||
|
CCF = "/CCF"
|
||||||
|
DCT = "/DCT"
|
||||||
|
|
||||||
|
|
||||||
|
class LzwFilterParameters:
|
||||||
|
"""
|
||||||
|
Table 4.4.
|
||||||
|
Table 8 in the 2.0 reference.
|
||||||
|
"""
|
||||||
|
|
||||||
|
PREDICTOR = "/Predictor" # integer
|
||||||
|
COLORS = "/Colors" # integer
|
||||||
|
BITS_PER_COMPONENT = "/BitsPerComponent" # integer
|
||||||
|
COLUMNS = "/Columns" # integer
|
||||||
|
EARLY_CHANGE = "/EarlyChange" # integer
|
||||||
|
|
||||||
|
|
||||||
|
class CcittFaxDecodeParameters:
|
||||||
|
"""
|
||||||
|
Table 4.5.
|
||||||
|
Table 11 in the 2.0 reference.
|
||||||
|
"""
|
||||||
|
|
||||||
|
K = "/K" # integer
|
||||||
|
END_OF_LINE = "/EndOfLine" # boolean
|
||||||
|
ENCODED_BYTE_ALIGN = "/EncodedByteAlign" # boolean
|
||||||
|
COLUMNS = "/Columns" # integer
|
||||||
|
ROWS = "/Rows" # integer
|
||||||
|
END_OF_BLOCK = "/EndOfBlock" # boolean
|
||||||
|
BLACK_IS_1 = "/BlackIs1" # boolean
|
||||||
|
DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError" # integer
|
||||||
|
|
||||||
|
|
||||||
|
class ImageAttributes:
|
||||||
|
"""§11.6.5 of the 1.7 and 2.0 references."""
|
||||||
|
|
||||||
|
TYPE = "/Type" # name, required; must be /XObject
|
||||||
|
SUBTYPE = "/Subtype" # name, required; must be /Image
|
||||||
|
NAME = "/Name" # name, required
|
||||||
|
WIDTH = "/Width" # integer, required
|
||||||
|
HEIGHT = "/Height" # integer, required
|
||||||
|
BITS_PER_COMPONENT = "/BitsPerComponent" # integer, required
|
||||||
|
COLOR_SPACE = "/ColorSpace" # name, required
|
||||||
|
DECODE = "/Decode" # array, optional
|
||||||
|
INTENT = "/Intent" # string, optional
|
||||||
|
INTERPOLATE = "/Interpolate" # boolean, optional
|
||||||
|
IMAGE_MASK = "/ImageMask" # boolean, optional
|
||||||
|
MASK = "/Mask" # 1-bit image mask stream
|
||||||
|
S_MASK = "/SMask" # dictionary or name, optional
|
||||||
|
|
||||||
|
|
||||||
|
class ColorSpaces:
|
||||||
|
DEVICE_RGB = "/DeviceRGB"
|
||||||
|
DEVICE_CMYK = "/DeviceCMYK"
|
||||||
|
DEVICE_GRAY = "/DeviceGray"
|
||||||
|
|
||||||
|
|
||||||
|
class TypArguments:
|
||||||
|
"""Table 8.2 of the PDF 1.7 reference."""
|
||||||
|
|
||||||
|
LEFT = "/Left"
|
||||||
|
RIGHT = "/Right"
|
||||||
|
BOTTOM = "/Bottom"
|
||||||
|
TOP = "/Top"
|
||||||
|
|
||||||
|
|
||||||
|
class TypFitArguments:
|
||||||
|
"""Table 8.2 of the PDF 1.7 reference."""
|
||||||
|
|
||||||
|
XYZ = "/XYZ"
|
||||||
|
FIT = "/Fit"
|
||||||
|
FIT_H = "/FitH"
|
||||||
|
FIT_V = "/FitV"
|
||||||
|
FIT_R = "/FitR"
|
||||||
|
FIT_B = "/FitB"
|
||||||
|
FIT_BH = "/FitBH"
|
||||||
|
FIT_BV = "/FitBV"
|
||||||
|
|
||||||
|
|
||||||
|
class GoToActionArguments:
|
||||||
|
S = "/S" # name, required: type of action
|
||||||
|
D = "/D" # name, byte string, or array, required: destination to jump to
|
||||||
|
SD = "/SD" # array, optional: structure destination to jump to
|
||||||
|
|
||||||
|
|
||||||
|
class AnnotationDictionaryAttributes:
|
||||||
|
"""Table 8.15 Entries common to all annotation dictionaries."""
|
||||||
|
|
||||||
|
Type = "/Type"
|
||||||
|
Subtype = "/Subtype"
|
||||||
|
Rect = "/Rect"
|
||||||
|
Contents = "/Contents"
|
||||||
|
P = "/P"
|
||||||
|
NM = "/NM"
|
||||||
|
M = "/M"
|
||||||
|
F = "/F"
|
||||||
|
AP = "/AP"
|
||||||
|
AS = "/AS"
|
||||||
|
DA = "/DA"
|
||||||
|
Border = "/Border"
|
||||||
|
C = "/C"
|
||||||
|
StructParent = "/StructParent"
|
||||||
|
OC = "/OC"
|
||||||
|
|
||||||
|
|
||||||
|
class InteractiveFormDictEntries:
|
||||||
|
Fields = "/Fields"
|
||||||
|
NeedAppearances = "/NeedAppearances"
|
||||||
|
SigFlags = "/SigFlags"
|
||||||
|
CO = "/CO"
|
||||||
|
DR = "/DR"
|
||||||
|
DA = "/DA"
|
||||||
|
Q = "/Q"
|
||||||
|
XFA = "/XFA"
|
||||||
|
|
||||||
|
|
||||||
|
class FieldDictionaryAttributes:
|
||||||
|
"""
|
||||||
|
Entries common to all field dictionaries (Table 8.69 PDF 1.7 reference)
|
||||||
|
(*very partially documented here*).
|
||||||
|
|
||||||
|
FFBits provides the constants used for `/Ff` from Table 8.70/8.75/8.77/8.79
|
||||||
|
"""
|
||||||
|
|
||||||
|
FT = "/FT" # name, required for terminal fields
|
||||||
|
Parent = "/Parent" # dictionary, required for children
|
||||||
|
Kids = "/Kids" # array, sometimes required
|
||||||
|
T = "/T" # text string, optional
|
||||||
|
TU = "/TU" # text string, optional
|
||||||
|
TM = "/TM" # text string, optional
|
||||||
|
Ff = "/Ff" # integer, optional
|
||||||
|
V = "/V" # text string or array, optional
|
||||||
|
DV = "/DV" # text string, optional
|
||||||
|
AA = "/AA" # dictionary, optional
|
||||||
|
Opt = "/Opt" # array, optional
|
||||||
|
|
||||||
|
class FfBits(IntFlag):
|
||||||
|
"""
|
||||||
|
Ease building /Ff flags
|
||||||
|
Some entries may be specific to:
|
||||||
|
|
||||||
|
* Text (Tx) (Table 8.75 PDF 1.7 reference)
|
||||||
|
* Buttons (Btn) (Table 8.77 PDF 1.7 reference)
|
||||||
|
* Choice (Ch) (Table 8.79 PDF 1.7 reference)
|
||||||
|
"""
|
||||||
|
|
||||||
|
ReadOnly = 1 << 0
|
||||||
|
"""common to Tx/Btn/Ch in Table 8.70"""
|
||||||
|
Required = 1 << 1
|
||||||
|
"""common to Tx/Btn/Ch in Table 8.70"""
|
||||||
|
NoExport = 1 << 2
|
||||||
|
"""common to Tx/Btn/Ch in Table 8.70"""
|
||||||
|
|
||||||
|
Multiline = 1 << 12
|
||||||
|
"""Tx"""
|
||||||
|
Password = 1 << 13
|
||||||
|
"""Tx"""
|
||||||
|
|
||||||
|
NoToggleToOff = 1 << 14
|
||||||
|
"""Btn"""
|
||||||
|
Radio = 1 << 15
|
||||||
|
"""Btn"""
|
||||||
|
Pushbutton = 1 << 16
|
||||||
|
"""Btn"""
|
||||||
|
|
||||||
|
Combo = 1 << 17
|
||||||
|
"""Ch"""
|
||||||
|
Edit = 1 << 18
|
||||||
|
"""Ch"""
|
||||||
|
Sort = 1 << 19
|
||||||
|
"""Ch"""
|
||||||
|
|
||||||
|
FileSelect = 1 << 20
|
||||||
|
"""Tx"""
|
||||||
|
|
||||||
|
MultiSelect = 1 << 21
|
||||||
|
"""Tx"""
|
||||||
|
|
||||||
|
DoNotSpellCheck = 1 << 22
|
||||||
|
"""Tx/Ch"""
|
||||||
|
DoNotScroll = 1 << 23
|
||||||
|
"""Tx"""
|
||||||
|
Comb = 1 << 24
|
||||||
|
"""Tx"""
|
||||||
|
|
||||||
|
RadiosInUnison = 1 << 25
|
||||||
|
"""Btn"""
|
||||||
|
|
||||||
|
RichText = 1 << 25
|
||||||
|
"""Tx"""
|
||||||
|
|
||||||
|
CommitOnSelChange = 1 << 26
|
||||||
|
"""Ch"""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def attributes(cls) -> tuple[str, ...]:
|
||||||
|
"""
|
||||||
|
Get a tuple of all the attributes present in a Field Dictionary.
|
||||||
|
|
||||||
|
This method returns a tuple of all the attribute constants defined in
|
||||||
|
the FieldDictionaryAttributes class. These attributes correspond to the
|
||||||
|
entries that are common to all field dictionaries as specified in the
|
||||||
|
PDF 1.7 reference.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A tuple containing all the attribute constants.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
cls.TM,
|
||||||
|
cls.T,
|
||||||
|
cls.FT,
|
||||||
|
cls.Parent,
|
||||||
|
cls.TU,
|
||||||
|
cls.Ff,
|
||||||
|
cls.V,
|
||||||
|
cls.DV,
|
||||||
|
cls.Kids,
|
||||||
|
cls.AA,
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def attributes_dict(cls) -> dict[str, str]:
|
||||||
|
"""
|
||||||
|
Get a dictionary of attribute keys and their human-readable names.
|
||||||
|
|
||||||
|
This method returns a dictionary where the keys are the attribute
|
||||||
|
constants defined in the FieldDictionaryAttributes class and the values
|
||||||
|
are their corresponding human-readable names. These attributes
|
||||||
|
correspond to the entries that are common to all field dictionaries as
|
||||||
|
specified in the PDF 1.7 reference.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary containing attribute keys and their names.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
cls.FT: "Field Type",
|
||||||
|
cls.Parent: "Parent",
|
||||||
|
cls.T: "Field Name",
|
||||||
|
cls.TU: "Alternate Field Name",
|
||||||
|
cls.TM: "Mapping Name",
|
||||||
|
cls.Ff: "Field Flags",
|
||||||
|
cls.V: "Value",
|
||||||
|
cls.DV: "Default Value",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CheckboxRadioButtonAttributes:
|
||||||
|
"""Table 8.76 Field flags common to all field types."""
|
||||||
|
|
||||||
|
Opt = "/Opt" # Options, Optional
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def attributes(cls) -> tuple[str, ...]:
|
||||||
|
"""
|
||||||
|
Get a tuple of all the attributes present in a Field Dictionary.
|
||||||
|
|
||||||
|
This method returns a tuple of all the attribute constants defined in
|
||||||
|
the CheckboxRadioButtonAttributes class. These attributes correspond to
|
||||||
|
the entries that are common to all field dictionaries as specified in
|
||||||
|
the PDF 1.7 reference.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A tuple containing all the attribute constants.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return (cls.Opt,)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def attributes_dict(cls) -> dict[str, str]:
|
||||||
|
"""
|
||||||
|
Get a dictionary of attribute keys and their human-readable names.
|
||||||
|
|
||||||
|
This method returns a dictionary where the keys are the attribute
|
||||||
|
constants defined in the CheckboxRadioButtonAttributes class and the
|
||||||
|
values are their corresponding human-readable names. These attributes
|
||||||
|
correspond to the entries that are common to all field dictionaries as
|
||||||
|
specified in the PDF 1.7 reference.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary containing attribute keys and their names.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
cls.Opt: "Options",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class FieldFlag(IntFlag):
|
||||||
|
"""Table 8.70 Field flags common to all field types."""
|
||||||
|
|
||||||
|
READ_ONLY = 1
|
||||||
|
REQUIRED = 2
|
||||||
|
NO_EXPORT = 4
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentInformationAttributes:
|
||||||
|
"""Table 10.2 Entries in the document information dictionary."""
|
||||||
|
|
||||||
|
TITLE = "/Title" # text string, optional
|
||||||
|
AUTHOR = "/Author" # text string, optional
|
||||||
|
SUBJECT = "/Subject" # text string, optional
|
||||||
|
KEYWORDS = "/Keywords" # text string, optional
|
||||||
|
CREATOR = "/Creator" # text string, optional
|
||||||
|
PRODUCER = "/Producer" # text string, optional
|
||||||
|
CREATION_DATE = "/CreationDate" # date, optional
|
||||||
|
MOD_DATE = "/ModDate" # date, optional
|
||||||
|
TRAPPED = "/Trapped" # name, optional
|
||||||
|
|
||||||
|
|
||||||
|
class PageLayouts:
|
||||||
|
"""
|
||||||
|
Page 84, PDF 1.4 reference.
|
||||||
|
Page 115, PDF 2.0 reference.
|
||||||
|
"""
|
||||||
|
|
||||||
|
SINGLE_PAGE = "/SinglePage"
|
||||||
|
ONE_COLUMN = "/OneColumn"
|
||||||
|
TWO_COLUMN_LEFT = "/TwoColumnLeft"
|
||||||
|
TWO_COLUMN_RIGHT = "/TwoColumnRight"
|
||||||
|
TWO_PAGE_LEFT = "/TwoPageLeft" # (PDF 1.5)
|
||||||
|
TWO_PAGE_RIGHT = "/TwoPageRight" # (PDF 1.5)
|
||||||
|
|
||||||
|
|
||||||
|
class GraphicsStateParameters:
|
||||||
|
"""Table 58 – Entries in a Graphics State Parameter Dictionary"""
|
||||||
|
|
||||||
|
TYPE = "/Type" # name, optional
|
||||||
|
LW = "/LW" # number, optional
|
||||||
|
LC = "/LC" # integer, optional
|
||||||
|
LJ = "/LJ" # integer, optional
|
||||||
|
ML = "/ML" # number, optional
|
||||||
|
D = "/D" # array, optional
|
||||||
|
RI = "/RI" # name, optional
|
||||||
|
OP = "/OP"
|
||||||
|
op = "/op"
|
||||||
|
OPM = "/OPM"
|
||||||
|
FONT = "/Font" # array, optional
|
||||||
|
BG = "/BG"
|
||||||
|
BG2 = "/BG2"
|
||||||
|
UCR = "/UCR"
|
||||||
|
UCR2 = "/UCR2"
|
||||||
|
TR = "/TR"
|
||||||
|
TR2 = "/TR2"
|
||||||
|
HT = "/HT"
|
||||||
|
FL = "/FL"
|
||||||
|
SM = "/SM"
|
||||||
|
SA = "/SA"
|
||||||
|
BM = "/BM"
|
||||||
|
S_MASK = "/SMask" # dictionary or name, optional
|
||||||
|
CA = "/CA"
|
||||||
|
ca = "/ca"
|
||||||
|
AIS = "/AIS"
|
||||||
|
TK = "/TK"
|
||||||
|
|
||||||
|
|
||||||
|
class CatalogDictionary:
|
||||||
|
"""§7.7.2 of the 1.7 and 2.0 references."""
|
||||||
|
|
||||||
|
TYPE = "/Type" # name, required; must be /Catalog
|
||||||
|
VERSION = "/Version" # name
|
||||||
|
EXTENSIONS = "/Extensions" # dictionary, optional; ISO 32000-1
|
||||||
|
PAGES = "/Pages" # dictionary, required
|
||||||
|
PAGE_LABELS = "/PageLabels" # number tree, optional
|
||||||
|
NAMES = "/Names" # dictionary, optional
|
||||||
|
DESTS = "/Dests" # dictionary, optional
|
||||||
|
VIEWER_PREFERENCES = "/ViewerPreferences" # dictionary, optional
|
||||||
|
PAGE_LAYOUT = "/PageLayout" # name, optional
|
||||||
|
PAGE_MODE = "/PageMode" # name, optional
|
||||||
|
OUTLINES = "/Outlines" # dictionary, optional
|
||||||
|
THREADS = "/Threads" # array, optional
|
||||||
|
OPEN_ACTION = "/OpenAction" # array or dictionary or name, optional
|
||||||
|
AA = "/AA" # dictionary, optional
|
||||||
|
URI = "/URI" # dictionary, optional
|
||||||
|
ACRO_FORM = "/AcroForm" # dictionary, optional
|
||||||
|
METADATA = "/Metadata" # stream, optional
|
||||||
|
STRUCT_TREE_ROOT = "/StructTreeRoot" # dictionary, optional
|
||||||
|
MARK_INFO = "/MarkInfo" # dictionary, optional
|
||||||
|
LANG = "/Lang" # text string, optional
|
||||||
|
SPIDER_INFO = "/SpiderInfo" # dictionary, optional
|
||||||
|
OUTPUT_INTENTS = "/OutputIntents" # array, optional
|
||||||
|
PIECE_INFO = "/PieceInfo" # dictionary, optional
|
||||||
|
OC_PROPERTIES = "/OCProperties" # dictionary, optional
|
||||||
|
PERMS = "/Perms" # dictionary, optional
|
||||||
|
LEGAL = "/Legal" # dictionary, optional
|
||||||
|
REQUIREMENTS = "/Requirements" # array, optional
|
||||||
|
COLLECTION = "/Collection" # dictionary, optional
|
||||||
|
NEEDS_RENDERING = "/NeedsRendering" # boolean, optional
|
||||||
|
DSS = "/DSS" # dictionary, optional
|
||||||
|
AF = "/AF" # array of dictionaries, optional
|
||||||
|
D_PART_ROOT = "/DPartRoot" # dictionary, optional
|
||||||
|
|
||||||
|
|
||||||
|
class OutlineFontFlag(IntFlag):
|
||||||
|
"""A class used as an enumerable flag for formatting an outline font."""
|
||||||
|
|
||||||
|
italic = 1
|
||||||
|
bold = 2
|
||||||
|
|
||||||
|
|
||||||
|
class PageLabelStyle:
|
||||||
|
"""
|
||||||
|
Table 8.10 in the 1.7 reference.
|
||||||
|
Table 161 in the 2.0 reference.
|
||||||
|
"""
|
||||||
|
|
||||||
|
DECIMAL = "/D" # Decimal Arabic numerals
|
||||||
|
UPPERCASE_ROMAN = "/R" # Uppercase Roman numerals
|
||||||
|
LOWERCASE_ROMAN = "/r" # Lowercase Roman numerals
|
||||||
|
UPPERCASE_LETTER = "/A" # Uppercase letters
|
||||||
|
LOWERCASE_LETTER = "/a" # Lowercase letters
|
||||||
|
|
||||||
|
|
||||||
|
class AnnotationFlag(IntFlag):
|
||||||
|
"""See §12.5.3 "Annotation Flags"."""
|
||||||
|
|
||||||
|
INVISIBLE = 1
|
||||||
|
HIDDEN = 2
|
||||||
|
PRINT = 4
|
||||||
|
NO_ZOOM = 8
|
||||||
|
NO_ROTATE = 16
|
||||||
|
NO_VIEW = 32
|
||||||
|
READ_ONLY = 64
|
||||||
|
LOCKED = 128
|
||||||
|
TOGGLE_NO_VIEW = 256
|
||||||
|
LOCKED_CONTENTS = 512
|
||||||
|
|
||||||
|
|
||||||
|
PDF_KEYS = (
|
||||||
|
AnnotationDictionaryAttributes,
|
||||||
|
CatalogAttributes,
|
||||||
|
CatalogDictionary,
|
||||||
|
CcittFaxDecodeParameters,
|
||||||
|
CheckboxRadioButtonAttributes,
|
||||||
|
ColorSpaces,
|
||||||
|
Core,
|
||||||
|
DocumentInformationAttributes,
|
||||||
|
EncryptionDictAttributes,
|
||||||
|
FieldDictionaryAttributes,
|
||||||
|
FileSpecificationDictionaryEntries,
|
||||||
|
FilterTypeAbbreviations,
|
||||||
|
FilterTypes,
|
||||||
|
GoToActionArguments,
|
||||||
|
GraphicsStateParameters,
|
||||||
|
ImageAttributes,
|
||||||
|
InteractiveFormDictEntries,
|
||||||
|
LzwFilterParameters,
|
||||||
|
PageAttributes,
|
||||||
|
PageLayouts,
|
||||||
|
PagesAttributes,
|
||||||
|
Resources,
|
||||||
|
StreamAttributes,
|
||||||
|
TrailerKeys,
|
||||||
|
TypArguments,
|
||||||
|
TypFitArguments,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ImageType(IntFlag):
    NONE = 0
    XOBJECT_IMAGES = auto()
    INLINE_IMAGES = auto()
    DRAWING_IMAGES = auto()
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    IMAGES = ALL  # for consistency with ObjectDeletionFlag

_INLINE_IMAGE_VALUE_MAPPING = {
    "/G": "/DeviceGray",
    "/RGB": "/DeviceRGB",
    "/CMYK": "/DeviceCMYK",
    "/I": "/Indexed",
    "/AHx": "/ASCIIHexDecode",
    "/A85": "/ASCII85Decode",
    "/LZW": "/LZWDecode",
    "/Fl": "/FlateDecode",
    "/RL": "/RunLengthDecode",
    "/CCF": "/CCITTFaxDecode",
    "/DCT": "/DCTDecode",
    "/DeviceGray": "/DeviceGray",
    "/DeviceRGB": "/DeviceRGB",
    "/DeviceCMYK": "/DeviceCMYK",
    "/Indexed": "/Indexed",
    "/ASCIIHexDecode": "/ASCIIHexDecode",
    "/ASCII85Decode": "/ASCII85Decode",
    "/LZWDecode": "/LZWDecode",
    "/FlateDecode": "/FlateDecode",
    "/RunLengthDecode": "/RunLengthDecode",
    "/CCITTFaxDecode": "/CCITTFaxDecode",
    "/DCTDecode": "/DCTDecode",
    "/RelativeColorimetric": "/RelativeColorimetric",
}

_INLINE_IMAGE_KEY_MAPPING = {
    "/BPC": "/BitsPerComponent",
    "/CS": "/ColorSpace",
    "/D": "/Decode",
    "/DP": "/DecodeParms",
    "/F": "/Filter",
    "/H": "/Height",
    "/W": "/Width",
    "/I": "/Interpolate",
    "/Intent": "/Intent",
    "/IM": "/ImageMask",
    "/BitsPerComponent": "/BitsPerComponent",
    "/ColorSpace": "/ColorSpace",
    "/Decode": "/Decode",
    "/DecodeParms": "/DecodeParms",
    "/Filter": "/Filter",
    "/Height": "/Height",
    "/Width": "/Width",
    "/Interpolate": "/Interpolate",
    "/ImageMask": "/ImageMask",
}


class AFRelationship:
    """
    Associated file relationship types, defining the relationship between
    the PDF component and the associated file.

    Defined in table 43 of the PDF 2.0 reference.
    """

    SOURCE = "/Source"  # Original content source
    DATA = "/Data"  # Base data for visual presentation
    ALTERNATIVE = "/Alternative"  # Alternative content representation
    SUPPLEMENT = "/Supplement"  # Supplemental representation of original source/data
    ENCRYPTED_PAYLOAD = "/EncryptedPayload"  # Encrypted payload document
    FORM_DATA = "/FormData"  # Data associated with AcroForm of this PDF
    SCHEMA = "/Schema"  # Schema definition for associated object
    UNSPECIFIED = "/Unspecified"  # Not known or cannot be described with values


class BorderStyles:
    """
    A class defining border styles used in PDF documents.

    Defined in table 168 of the PDF 2.0 reference.
    """

    BEVELED = "/B"
    DASHED = "/D"
    INSET = "/I"
    SOLID = "/S"
    UNDERLINED = "/U"
74  venv/lib/python3.12/site-packages/pypdf/errors.py  Normal file
@@ -0,0 +1,74 @@
"""
|
||||||
|
All errors/exceptions pypdf raises and all of the warnings it uses.
|
||||||
|
|
||||||
|
Please note that broken PDF files might cause other Exceptions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class DeprecationError(Exception):
|
||||||
|
"""Raised when a deprecated feature is used."""
|
||||||
|
|
||||||
|
|
||||||
|
class DependencyError(Exception):
|
||||||
|
"""
|
||||||
|
Raised when a required dependency (a library or module that pypdf depends on)
|
||||||
|
is not available or cannot be imported.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class PyPdfError(Exception):
|
||||||
|
"""Base class for all exceptions raised by pypdf."""
|
||||||
|
|
||||||
|
|
||||||
|
class PdfReadError(PyPdfError):
|
||||||
|
"""Raised when there is an issue reading a PDF file."""
|
||||||
|
|
||||||
|
|
||||||
|
class PageSizeNotDefinedError(PyPdfError):
|
||||||
|
"""Raised when the page size of a PDF document is not defined."""
|
||||||
|
|
||||||
|
|
||||||
|
class PdfReadWarning(UserWarning):
|
||||||
|
"""Issued when there is a potential issue reading a PDF file, but it can still be read."""
|
||||||
|
|
||||||
|
|
||||||
|
class PdfStreamError(PdfReadError):
|
||||||
|
"""Raised when there is an issue reading the stream of data in a PDF file."""
|
||||||
|
|
||||||
|
|
||||||
|
class ParseError(PyPdfError):
|
||||||
|
"""
|
||||||
|
Raised when there is an issue parsing (analyzing and understanding the
|
||||||
|
structure and meaning of) a PDF file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class FileNotDecryptedError(PdfReadError):
|
||||||
|
"""
|
||||||
|
Raised when a PDF file that has been encrypted
|
||||||
|
(meaning it requires a password to be accessed) has not been successfully
|
||||||
|
decrypted.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class WrongPasswordError(FileNotDecryptedError):
|
||||||
|
"""Raised when the wrong password is used to try to decrypt an encrypted PDF file."""
|
||||||
|
|
||||||
|
|
||||||
|
class EmptyFileError(PdfReadError):
|
||||||
|
"""Raised when a PDF file is empty or has no content."""
|
||||||
|
|
||||||
|
|
||||||
|
class EmptyImageDataError(PyPdfError):
|
||||||
|
"""Raised when trying to process an image that has no data."""
|
||||||
|
|
||||||
|
|
||||||
|
STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
|
||||||
|
|
||||||
|
|
||||||
|
class LimitReachedError(PyPdfError):
|
||||||
|
"""Raised when a limit is reached."""
|
||||||
|
|
||||||
|
|
||||||
|
class XmpDocumentError(PyPdfError, RuntimeError):
|
||||||
|
"""Raised when the XMP XML document context is invalid or missing."""
|
||||||
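# Editor's note: a hedged usage sketch (not part of the vendored file) showing
# how the hierarchy above lets callers catch narrowly or broadly:
#     from pypdf import PdfReader
#     from pypdf.errors import PdfReadError, WrongPasswordError
#     try:
#         reader = PdfReader("encrypted.pdf", password="guess")  # hypothetical file
#     except WrongPasswordError:
#         ...  # FileNotDecryptedError -> PdfReadError -> PyPdfError
#     except PdfReadError:
#         ...  # any other read problem, including PdfStreamError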
815  venv/lib/python3.12/site-packages/pypdf/filters.py  Normal file
@@ -0,0 +1,815 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


"""
Implementation of stream filters; §7.4 Filters of the PDF 2.0 specification.

§8.9.7 Inline images of the PDF 2.0 specification has abbreviations that can be
used for the names of filters in an inline image object.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

import math
import os
import shutil
import struct
import subprocess
import zlib
from base64 import a85decode
from dataclasses import dataclass
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Optional, Union, cast

from ._codecs._codecs import LzwCodec as _LzwCodec
from ._utils import (
    WHITESPACES_AS_BYTES,
    deprecation_with_replacement,
    logger_warning,
)
from .constants import CcittFaxDecodeParameters as CCITT
from .constants import FilterTypeAbbreviations as FTA
from .constants import FilterTypes as FT
from .constants import ImageAttributes as IA
from .constants import LzwFilterParameters as LZW
from .constants import StreamAttributes as SA
from .errors import DependencyError, LimitReachedError, PdfReadError, PdfStreamError
from .generic import (
    ArrayObject,
    DictionaryObject,
    IndirectObject,
    NullObject,
    StreamObject,
    is_null_or_none,
)

JBIG2_MAX_OUTPUT_LENGTH = 75_000_000
LZW_MAX_OUTPUT_LENGTH = 75_000_000
ZLIB_MAX_OUTPUT_LENGTH = 75_000_000


def _decompress_with_limit(data: bytes) -> bytes:
    decompressor = zlib.decompressobj()
    result = decompressor.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
    if decompressor.unconsumed_tail:
        raise LimitReachedError(
            f"Limit reached while decompressing. {len(decompressor.unconsumed_tail)} bytes remaining."
        )
    return result


def decompress(data: bytes) -> bytes:
    """
    Decompress the given data using zlib.

    Attempts to decompress the input data using zlib.
    If the decompression fails due to a zlib error, it falls back
    to using a decompression object with a larger window size.

    Please note that the output length is limited to avoid memory
    issues. If you need to process larger content streams, consider
    adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. In case you
    are only dealing with trusted inputs and/or want to disable these
    limits, set the value to `0`.

    Args:
        data: The input data to be decompressed.

    Returns:
        The decompressed data.

    """
    try:
        return _decompress_with_limit(data)
    except zlib.error:
        # First quick approach: There are known issues with faulty added bytes to the
        # tail of the encoded stream from early Adobe Distiller or Pitstop versions
        # with CR char as the default line separator (assumed by reverse engineering)
        # that breaks the decoding process in the end.
        #
        # Try first to cut off some of the tail byte by byte, but limited to not
        # iterate through too many loops and kill the performance for large streams,
        # to then allow the final fallback to run. Added this intermediate attempt,
        # because starting from the head of the stream byte by byte kills completely
        # the performance for large streams (e.g., 6 MB) with the tail-byte-issue
        # and takes ages. This solution is really fast:
        max_tail_cut_off_bytes: int = 8
        for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
            try:
                return _decompress_with_limit(data[:-i])
            except zlib.error:
                pass

        # If still failing, then try with increased window size.
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
        result_str = b""
        remaining_limit = ZLIB_MAX_OUTPUT_LENGTH
        data_single_bytes = [data[i : i + 1] for i in range(len(data))]
        known_errors = set()
        for index, b in enumerate(data_single_bytes):
            try:
                decompressed = decompressor.decompress(b, max_length=remaining_limit)
                result_str += decompressed
                remaining_limit -= len(decompressed)
                if remaining_limit <= 0:
                    raise LimitReachedError(
                        f"Limit reached while decompressing. {len(data_single_bytes) - index} bytes remaining."
                    )
            except zlib.error as error:
                error_str = str(error)
                if error_str in known_errors:
                    continue
                logger_warning(error_str, __name__)
                known_errors.add(error_str)
        return result_str

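# Editor's note: a hedged usage sketch (not part of the vendored file). Per the
# docstring above, the output cap is module-level and can be adapted:
#     import zlib
#     from pypdf import filters
#     filters.ZLIB_MAX_OUTPUT_LENGTH = 150_000_000  # raise the cap for huge, trusted streams
#     round_trip = filters.decompress(zlib.compress(b"x" * 10_000))
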
class FlateDecode:
    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode data which is flate-encoded.

        Args:
            data: flate-encoded data.
            decode_parms: a dictionary of values, understanding the
                "/Predictor":<int> key only

        Returns:
            The flate-decoded data.

        Raises:
            PdfReadError:

        """
        str_data = decompress(data)
        predictor = 1

        if decode_parms:
            try:
                predictor = decode_parms.get("/Predictor", 1)
            except (AttributeError, TypeError):  # TypeError is NullObject
                pass  # Usually an array with a null object was read
        # predictor 1 == no predictor
        if predictor != 1:
            # /Columns, the number of samples in each row, has a default value of 1;
            # §7.4.4.3, ISO 32000.
            DEFAULT_BITS_PER_COMPONENT = 8
            try:
                columns = cast(int, decode_parms[LZW.COLUMNS].get_object())  # type: ignore
            except (TypeError, KeyError):
                columns = 1
            try:
                colors = cast(int, decode_parms[LZW.COLORS].get_object())  # type: ignore
            except (TypeError, KeyError):
                colors = 1
            try:
                bits_per_component = cast(
                    int,
                    decode_parms[LZW.BITS_PER_COMPONENT].get_object(),  # type: ignore
                )
            except (TypeError, KeyError):
                bits_per_component = DEFAULT_BITS_PER_COMPONENT

            # PNG predictor can vary by row and so is the lead byte on each row
            rowlength = (
                math.ceil(columns * colors * bits_per_component / 8) + 1
            )  # number of bytes

            # TIFF prediction:
            if predictor == 2:
                rowlength -= 1  # remove the predictor byte
                bpp = rowlength // columns
                str_data = bytearray(str_data)
                for i in range(len(str_data)):
                    if i % rowlength >= bpp:
                        str_data[i] = (str_data[i] + str_data[i - bpp]) % 256
                str_data = bytes(str_data)
            # PNG prediction:
            elif 10 <= predictor <= 15:
                str_data = FlateDecode._decode_png_prediction(
                    str_data, columns, rowlength
                )
            else:
                raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}")
        return str_data

    @staticmethod
    def _decode_png_prediction(data: bytes, columns: int, rowlength: int) -> bytes:
        # PNG prediction can vary from row to row
        if (remainder := len(data) % rowlength) != 0:
            logger_warning("Image data is not rectangular. Adding padding.", __name__)
            data += b"\x00" * (rowlength - remainder)
        assert len(data) % rowlength == 0
        output = []
        prev_rowdata = (0,) * rowlength
        bpp = (rowlength - 1) // columns  # recomputed locally to not change params
        for row in range(0, len(data), rowlength):
            rowdata: list[int] = list(data[row : row + rowlength])
            filter_byte = rowdata[0]

            if filter_byte == 0:
                # PNG None Predictor
                pass
            elif filter_byte == 1:
                # PNG Sub Predictor
                for i in range(bpp + 1, rowlength):
                    rowdata[i] = (rowdata[i] + rowdata[i - bpp]) % 256
            elif filter_byte == 2:
                # PNG Up Predictor
                for i in range(1, rowlength):
                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
            elif filter_byte == 3:
                # PNG Average Predictor
                for i in range(1, bpp + 1):
                    floor = prev_rowdata[i] // 2
                    rowdata[i] = (rowdata[i] + floor) % 256
                for i in range(bpp + 1, rowlength):
                    left = rowdata[i - bpp]
                    floor = (left + prev_rowdata[i]) // 2
                    rowdata[i] = (rowdata[i] + floor) % 256
            elif filter_byte == 4:
                # PNG Paeth Predictor
                for i in range(1, bpp + 1):
                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
                for i in range(bpp + 1, rowlength):
                    left = rowdata[i - bpp]
                    up = prev_rowdata[i]
                    up_left = prev_rowdata[i - bpp]

                    p = left + up - up_left
                    dist_left = abs(p - left)
                    dist_up = abs(p - up)
                    dist_up_left = abs(p - up_left)

                    if dist_left <= dist_up and dist_left <= dist_up_left:
                        paeth = left
                    elif dist_up <= dist_up_left:
                        paeth = up
                    else:
                        paeth = up_left

                    rowdata[i] = (rowdata[i] + paeth) % 256
            else:
                raise PdfReadError(
                    f"Unsupported PNG filter {filter_byte!r}"
                )  # pragma: no cover
            prev_rowdata = tuple(rowdata)
            output.extend(rowdata[1:])
        return bytes(output)

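# Editor's note: an illustrative, self-contained sketch (not part of the
# vendored file) of the Paeth choice implemented above:
#     def paeth(left: int, up: int, up_left: int) -> int:
#         p = left + up - up_left  # linear estimate
#         if abs(p - left) <= abs(p - up) and abs(p - left) <= abs(p - up_left):
#             return left
#         return up if abs(p - up) <= abs(p - up_left) else up_left
#     paeth(10, 20, 15)  # -> 15: p equals up_left, so up_left is the closest predictor
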
    @staticmethod
    def encode(data: bytes, level: int = -1) -> bytes:
        """
        Compress the input data using zlib.

        Args:
            data: The data to be compressed.
            level: See https://docs.python.org/3/library/zlib.html#zlib.compress

        Returns:
            The compressed data.

        """
        return zlib.compress(data, level)

class ASCIIHexDecode:
    """
    The ASCIIHexDecode filter decodes data that has been encoded in ASCII
    hexadecimal form into a base-7 ASCII format.
    """

    @staticmethod
    def decode(
        data: Union[str, bytes],
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an ASCII-Hex encoded data stream.

        Args:
            data: a str sequence of hexadecimal-encoded values to be
                converted into a base-7 ASCII string
            decode_parms: this filter does not use parameters.

        Returns:
            A string conversion in base-7 ASCII, where each of its values
            v is such that 0 <= ord(v) <= 127.

        Raises:
            PdfStreamError:

        """
        if isinstance(data, str):
            data = data.encode()
        retval = b""
        hex_pair = b""
        index = 0
        while True:
            if index >= len(data):
                logger_warning(
                    "missing EOD in ASCIIHexDecode, check if output is OK", __name__
                )
                break  # Reached end of string without an EOD
            char = data[index : index + 1]
            if char == b">":
                break
            if char.isspace():
                index += 1
                continue
            hex_pair += char
            if len(hex_pair) == 2:
                retval += bytes((int(hex_pair, base=16),))
                hex_pair = b""
            index += 1
        # If the filter encounters the EOD marker after reading
        # an odd number of hexadecimal digits,
        # it shall behave as if a 0 (zero) followed the last digit.
        # For every even number of hexadecimal digits, hex_pair is reset to b"".
        if hex_pair != b"":
            hex_pair += b"0"
            retval += bytes((int(hex_pair, base=16),))
        return retval

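# Editor's note: hedged examples (not part of the vendored file):
#     ASCIIHexDecode.decode(b"48656C6C6F>")  # -> b"Hello"
#     ASCIIHexDecode.decode(b"7>")           # odd digit count, padded with 0 -> b"\x70"
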
class RunLengthDecode:
    """
    The RunLengthDecode filter decodes data that has been encoded in a
    simple byte-oriented format based on run length.
    The encoded data is a sequence of runs, where each run consists of
    a length byte followed by 1 to 128 bytes of data. If the length byte is
    in the range 0 to 127, the following length + 1 (1 to 128) bytes are
    copied literally during decompression.
    If length is in the range 129 to 255, the following single byte is to be
    copied 257 − length (2 to 128) times during decompression. A length value
    of 128 denotes EOD.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode a run length encoded data stream.

        Args:
            data: a bytes sequence of length/data
            decode_parms: this filter does not use parameters.

        Returns:
            A bytes decompressed sequence.

        Raises:
            PdfStreamError:

        """
        lst = []
        index = 0
        while True:
            if index >= len(data):
                logger_warning(
                    "missing EOD in RunLengthDecode, check if output is OK", __name__
                )
                break  # Reached end of string without an EOD
            length = data[index]
            index += 1
            if length == 128:
                data_length = len(data)
                if index < data_length:
                    # We should first check if we have an inner stream from a multi-encoded
                    # stream with a faulty trailing newline that we can decode properly.
                    # We will just ignore the last byte and raise a warning ...
                    if (index == data_length - 1) and (data[index : index + 1] == b"\n"):
                        logger_warning(
                            "Found trailing newline in stream data, check if output is OK", __name__
                        )
                        break
                    # Raising an exception here breaks all image extraction for this file, which might
                    # not be desirable. For this reason, indicate that the output is most likely wrong,
                    # as processing stopped after the first EOD marker. See issue #3517.
                    logger_warning(
                        "Early EOD in RunLengthDecode, check if output is OK", __name__
                    )
                break
            if length < 128:
                length += 1
                lst.append(data[index : (index + length)])
                index += length
            else:  # > 128
                length = 257 - length
                lst.append(bytes((data[index],)) * length)
                index += 1
        return b"".join(lst)

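# Editor's note: a hedged worked example (not part of the vendored file):
#     # 0x02 -> copy the next 2 + 1 = 3 bytes literally; 0xFE (254) -> repeat the
#     # next byte 257 - 254 = 3 times; 0x80 (128) -> EOD.
#     RunLengthDecode.decode(b"\x02abc\xfeZ\x80")  # -> b"abcZZZ"
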
class LZWDecode:
    class Decoder:
        STOP = 257
        CLEARDICT = 256

        def __init__(self, data: bytes) -> None:
            self.data = data

        def decode(self) -> bytes:
            return _LzwCodec(max_output_length=LZW_MAX_OUTPUT_LENGTH).decode(self.data)

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an LZW encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values.

        Returns:
            decoded data.

        """
        # decode_parms is unused here
        return LZWDecode.Decoder(data).decode()

class ASCII85Decode:
    """Decodes string ASCII85-encoded data into a byte format."""

    @staticmethod
    def decode(
        data: Union[str, bytes],
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an Ascii85 encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: this filter does not use parameters.

        Returns:
            decoded data.

        """
        if isinstance(data, str):
            data = data.encode()
        data = data.strip(WHITESPACES_AS_BYTES)
        if len(data) > 2 and data.endswith(b">"):
            data = data[:-1].rstrip(WHITESPACES_AS_BYTES) + data[-1:]
        try:
            return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
        except ValueError as error:
            if error.args[0] == "Ascii85 encoded byte sequences must end with b'~>'":
                logger_warning("Ignoring missing Ascii85 end marker.", __name__)
                return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES)
            raise

class DCTDecode:
    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decompresses data encoded using a DCT (discrete cosine transform)
        technique based on the JPEG standard (ISO/IEC 10918),
        reproducing image sample data that approximates the original data.

        Args:
            data: text to decode.
            decode_parms: this filter does not use parameters.

        Returns:
            decoded data.

        """
        return data


class JPXDecode:
    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decompresses data encoded using the wavelet-based JPEG 2000 standard,
        reproducing the original image data.

        Args:
            data: text to decode.
            decode_parms: this filter does not use parameters.

        Returns:
            decoded data.

        """
        return data

@dataclass
class CCITTParameters:
    """§7.4.6, optional parameters for the CCITTFaxDecode filter."""

    K: int = 0
    columns: int = 1728
    rows: int = 0
    EndOfLine: Union[bool, None] = False
    EncodedByteAlign: Union[bool, None] = False
    EndOfBlock: Union[bool, None] = True
    BlackIs1: bool = False
    DamagedRowsBeforeError: Union[int, None] = 0

    @property
    def group(self) -> int:
        if self.K < 0:
            # Pure two-dimensional encoding (Group 4)
            CCITTgroup = 4
        else:
            # K == 0: Pure one-dimensional encoding (Group 3, 1-D)
            # K > 0: Mixed one- and two-dimensional encoding (Group 3, 2-D)
            CCITTgroup = 3
        return CCITTgroup

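# Editor's note: hedged examples (not part of the vendored file) of the
# K -> group mapping implemented by the property above:
#     CCITTParameters(K=-1).group  # -> 4 (pure two-dimensional, Group 4)
#     CCITTParameters(K=0).group   # -> 3 (one-dimensional, Group 3)
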
def __create_old_class_instance(
    K: int = 0,
    columns: int = 0,
    rows: int = 0
) -> CCITTParameters:
    deprecation_with_replacement("CCITParameters", "CCITTParameters", "6.0.0")
    return CCITTParameters(K, columns, rows)


# Create an alias for the old class name
CCITParameters = __create_old_class_instance

class CCITTFaxDecode:
    """
    §7.4.6, CCITTFaxDecode filter (ISO 32000).

    Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
    CCITT encoding is bit-oriented, not byte-oriented.

    §7.4.6, optional parameters for the CCITTFaxDecode filter.
    """

    @staticmethod
    def _get_parameters(
        parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
        rows: Union[int, IndirectObject],
    ) -> CCITTParameters:
        ccitt_parameters = CCITTParameters(rows=int(rows))
        if parameters:
            parameters_unwrapped = cast(
                Union[ArrayObject, DictionaryObject], parameters.get_object()
            )
            if isinstance(parameters_unwrapped, ArrayObject):
                for decode_parm in parameters_unwrapped:
                    if CCITT.K in decode_parm:
                        ccitt_parameters.K = decode_parm[CCITT.K].get_object()
                    if CCITT.COLUMNS in decode_parm:
                        ccitt_parameters.columns = decode_parm[CCITT.COLUMNS].get_object()
                    if CCITT.BLACK_IS_1 in decode_parm:
                        ccitt_parameters.BlackIs1 = decode_parm[CCITT.BLACK_IS_1].get_object().value
            else:
                if CCITT.K in parameters_unwrapped:
                    ccitt_parameters.K = parameters_unwrapped[CCITT.K].get_object()  # type: ignore
                if CCITT.COLUMNS in parameters_unwrapped:
                    ccitt_parameters.columns = parameters_unwrapped[CCITT.COLUMNS].get_object()  # type: ignore
                if CCITT.BLACK_IS_1 in parameters_unwrapped:
                    ccitt_parameters.BlackIs1 = parameters_unwrapped[CCITT.BLACK_IS_1].get_object().value  # type: ignore
        return ccitt_parameters

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        height: int = 0,
        **kwargs: Any,
    ) -> bytes:
        params = CCITTFaxDecode._get_parameters(decode_parms, height)

        img_size = len(data)
        tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
        tiff_header = struct.pack(
            tiff_header_struct,
            b"II",  # Byte order indication: Little endian
            42,  # Version number (always 42)
            8,  # Offset to the first image file directory (IFD)
            8,  # Number of tags in IFD
            256, 4, 1, params.columns,  # ImageWidth, LONG, 1, width
            257, 4, 1, params.rows,  # ImageLength, LONG, 1, length
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, params.group,  # Compression, SHORT, 1, compression type
            262, 3, 1, int(params.BlackIs1),  # Thresholding, SHORT, 1, 0 = BlackIs1
            273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, length of header
            278, 4, 1, params.rows,  # RowsPerStrip, LONG, 1, length
            279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
            0,  # last IFD
        )

        return tiff_header + data

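# Editor's note: a hedged usage sketch (not part of the vendored file).
# decode() above does not decompress the CCITT payload; it prepends a minimal
# little-endian, single-strip TIFF header so the result opens as a .tif file:
#     tiff_bytes = CCITTFaxDecode.decode(ccitt_data, parms, height=1100)  # hypothetical inputs
#     Path("page.tif").write_bytes(tiff_bytes)
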
JBIG2DEC_BINARY = shutil.which("jbig2dec")


class JBIG2Decode:
    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        if JBIG2DEC_BINARY is None:
            raise DependencyError("jbig2dec binary is not available.")

        with TemporaryDirectory() as tempdir:
            directory = Path(tempdir)
            paths: list[Path] = []

            if decode_parms and "/JBIG2Globals" in decode_parms:
                jbig2_globals = decode_parms["/JBIG2Globals"]
                if not is_null_or_none(jbig2_globals) and not is_null_or_none(pointer := jbig2_globals.get_object()):
                    assert pointer is not None, "mypy"
                    if isinstance(pointer, StreamObject):
                        path = directory.joinpath("globals.jbig2")
                        path.write_bytes(pointer.get_data())
                        paths.append(path)

            path = directory.joinpath("image.jbig2")
            path.write_bytes(data)
            paths.append(path)

            environment = os.environ.copy()
            environment["LC_ALL"] = "C"
            result = subprocess.run(  # noqa: S603
                [
                    JBIG2DEC_BINARY,
                    "--embedded",
                    "--format", "png",
                    "--output", "-",
                    "-M", str(JBIG2_MAX_OUTPUT_LENGTH),
                    *paths
                ],
                capture_output=True,
                env=environment,
            )
            if b"unrecognized option '--embedded'" in result.stderr or b"unrecognized option '-M'" in result.stderr:
                raise DependencyError("jbig2dec>=0.19 is required.")
            if b"FATAL ERROR failed to allocate image data buffer" in result.stderr:
                raise LimitReachedError(
                    f"Memory limit reached while reading JBIG2 data:\n{result.stderr.decode('utf-8')}"
                )
            if result.stderr:
                for line in result.stderr.decode("utf-8").splitlines():
                    logger_warning(line, __name__)
            if result.returncode != 0:
                raise PdfStreamError(f"Unable to decode JBIG2 data. Exit code: {result.returncode}")
            return result.stdout

    @staticmethod
    def _is_binary_compatible() -> bool:
        if not JBIG2DEC_BINARY:  # pragma: no cover
            return False
        result = subprocess.run(  # noqa: S603
            [JBIG2DEC_BINARY, "--version"],
            capture_output=True,
            text=True,
        )
        version = result.stdout.split(" ", maxsplit=1)[1]

        from ._utils import Version  # noqa: PLC0415
        return Version(version) >= Version("0.19")

def decode_stream_data(stream: Any) -> bytes:
    """
    Decode the stream data based on the specified filters.

    This function decodes the stream data using the filters provided in the
    stream.

    Args:
        stream: The input stream object containing the data and filters.

    Returns:
        The decoded stream data.

    Raises:
        NotImplementedError: If an unsupported filter type is encountered.

    """
    filters = stream.get(SA.FILTER, ())
    if isinstance(filters, IndirectObject):
        filters = cast(ArrayObject, filters.get_object())
    if not isinstance(filters, ArrayObject):
        # We have a single filter instance
        filters = (filters,)
    decode_parms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
    if not isinstance(decode_parms, (list, tuple)):
        decode_parms = (decode_parms,)
    data: bytes = stream._data
    # If there is no data to decode, we should not try to decode it.
    if not data:
        return data
    for filter_name, params in zip(filters, decode_parms):
        if isinstance(params, NullObject):
            params = {}
        if filter_name in (FT.ASCII_HEX_DECODE, FTA.AHx):
            data = ASCIIHexDecode.decode(data)
        elif filter_name in (FT.ASCII_85_DECODE, FTA.A85):
            data = ASCII85Decode.decode(data)
        elif filter_name in (FT.LZW_DECODE, FTA.LZW):
            data = LZWDecode.decode(data, params)
        elif filter_name in (FT.FLATE_DECODE, FTA.FL):
            data = FlateDecode.decode(data, params)
        elif filter_name in (FT.RUN_LENGTH_DECODE, FTA.RL):
            data = RunLengthDecode.decode(data)
        elif filter_name == FT.CCITT_FAX_DECODE:
            height = stream.get(IA.HEIGHT, ())
            data = CCITTFaxDecode.decode(data, params, height)
        elif filter_name == FT.DCT_DECODE:
            data = DCTDecode.decode(data)
        elif filter_name == FT.JPX_DECODE:
            data = JPXDecode.decode(data)
        elif filter_name == FT.JBIG2_DECODE:
            data = JBIG2Decode.decode(data, params)
        elif filter_name == "/Crypt":
            if "/Name" in params or "/Type" in params:
                raise NotImplementedError(
                    "/Crypt filter with /Name or /Type not supported yet"
                )
        else:
            raise NotImplementedError(f"Unsupported filter {filter_name}")
    return data
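# Editor's note: a hedged usage sketch (not part of the vendored file):
#     # given an EncodedStreamObject `obj` from a parsed document (hypothetical):
#     raw = decode_stream_data(obj)  # applies each /Filter entry in order
#     # in pypdf, obj.get_data() is the usual public entry point to this logic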
115  venv/lib/python3.12/site-packages/pypdf/generic/__init__.py  Normal file
@@ -0,0 +1,115 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""Implementation of generic PDF objects (dictionary, number, string, ...)."""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

from ..constants import OutlineFontFlag
from ._base import (
    BooleanObject,
    ByteStringObject,
    FloatObject,
    IndirectObject,
    NameObject,
    NullObject,
    NumberObject,
    PdfObject,
    TextStringObject,
    encode_pdfdocencoding,
    is_null_or_none,
)
from ._data_structures import (
    ArrayObject,
    ContentStream,
    DecodedStreamObject,
    Destination,
    DictionaryObject,
    EncodedStreamObject,
    Field,
    StreamObject,
    TreeObject,
    read_object,
)
from ._files import EmbeddedFile
from ._fit import Fit
from ._link import DirectReferenceLink, NamedReferenceLink, ReferenceLink, extract_links
from ._outline import OutlineItem
from ._rectangle import RectangleObject
from ._utils import (
    create_string_object,
    decode_pdfdocencoding,
    hex_to_rgb,
    read_hex_string_from_stream,
    read_string_from_stream,
)
from ._viewerpref import ViewerPreferences

PAGE_FIT = Fit.fit()


__all__ = [
    "PAGE_FIT",
    "ArrayObject",
    "BooleanObject",
    "ByteStringObject",
    "ContentStream",
    "DecodedStreamObject",
    "Destination",
    "DictionaryObject",
    "DirectReferenceLink",
    "EmbeddedFile",
    "EncodedStreamObject",
    "Field",
    "Fit",
    "FloatObject",
    "IndirectObject",
    "NameObject",
    "NamedReferenceLink",
    "NullObject",
    "NumberObject",
    "OutlineFontFlag",
    "OutlineItem",
    "PdfObject",
    "RectangleObject",
    "ReferenceLink",
    "StreamObject",
    "TextStringObject",
    "TreeObject",
    "ViewerPreferences",
    # Utility functions
    "create_string_object",
    "decode_pdfdocencoding",
    "encode_pdfdocencoding",
    "extract_links",
    "hex_to_rgb",
    "is_null_or_none",
    "read_hex_string_from_stream",
    # Data structures core functions
    "read_object",
    "read_string_from_stream",
]
@@ -0,0 +1,547 @@
import re
from dataclasses import dataclass
from enum import IntEnum
from typing import Any, Optional, Union, cast

from .._codecs import fill_from_encoding
from .._codecs.core_fontmetrics import CORE_FONT_METRICS
from .._font import Font
from .._utils import logger_warning
from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes
from ..generic import (
    DecodedStreamObject,
    DictionaryObject,
    NameObject,
    NumberObject,
    RectangleObject,
)
from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none

DEFAULT_FONT_SIZE_IN_MULTILINE = 12


@dataclass
class BaseStreamConfig:
    """A container representing the basic layout of an appearance stream."""

    rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0)
    border_width: int = 1  # The width of the border in points
    border_style: str = BorderStyles.SOLID


class BaseStreamAppearance(DecodedStreamObject):
    """A class representing the very base of an appearance stream, that is, a rectangle and a border."""

    def __init__(self, layout: Optional[BaseStreamConfig] = None) -> None:
        """
        Takes the appearance stream layout as an argument.

        Args:
            layout: The basic layout parameters.
        """
        super().__init__()
        self._layout = layout or BaseStreamConfig()
        self[NameObject("/Type")] = NameObject("/XObject")
        self[NameObject("/Subtype")] = NameObject("/Form")
        self[NameObject("/BBox")] = RectangleObject(self._layout.rectangle)

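# Editor's note: a hedged construction sketch (not part of the vendored file):
#     config = BaseStreamConfig(rectangle=(0, 0, 120, 20), border_width=1)
#     xobject = BaseStreamAppearance(config)
#     # -> a form XObject: /Type /XObject, /Subtype /Form, /BBox [0 0 120 20]
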
class TextAlignment(IntEnum):
|
||||||
|
"""Defines the alignment options for text within a form field's appearance stream."""
|
||||||
|
|
||||||
|
LEFT = 0
|
||||||
|
CENTER = 1
|
||||||
|
RIGHT = 2
|
||||||
|
|
||||||
|
|
||||||
|
class TextStreamAppearance(BaseStreamAppearance):
|
||||||
|
"""
|
||||||
|
A class representing the appearance stream for a text-based form field.
|
||||||
|
|
||||||
|
This class generates the content stream (the `ap_stream_data`) that dictates
|
||||||
|
how text is rendered within a form field's bounding box. It handles properties
|
||||||
|
like font, font size, color, multiline text, and text selection highlighting.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _scale_text(
|
||||||
|
self,
|
||||||
|
font: Font,
|
||||||
|
font_size: float,
|
||||||
|
leading_factor: float,
|
||||||
|
field_width: float,
|
||||||
|
field_height: float,
|
||||||
|
text: str,
|
||||||
|
min_font_size: float,
|
||||||
|
font_size_step: float = 0.2
|
||||||
|
) -> tuple[list[tuple[float, str]], float]:
|
||||||
|
"""
|
||||||
|
Takes a piece of text and scales it to field_width or field_height, given font_name
|
||||||
|
and font_size. Wraps text where necessary.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
font: The font to be used.
|
||||||
|
font_size: The font size in points.
|
||||||
|
leading_factor: The line distance.
|
||||||
|
field_width: The width of the field in which to fit the text.
|
||||||
|
field_height: The height of the field in which to fit the text.
|
||||||
|
text: The text to fit with the field.
|
||||||
|
min_font_size: The minimum font size at which to scale the text.
|
||||||
|
font_size_step: The amount by which to decrement font size per step while scaling.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The text in the form of list of tuples, each tuple containing the length of a line
|
||||||
|
and its contents, and the font_size for these lines and lengths.
|
||||||
|
"""
|
||||||
|
orig_text = text
|
||||||
|
paragraphs = text.replace("\n", "\r").split("\r")
|
||||||
|
wrapped_lines = []
|
||||||
|
current_line_words: list[str] = []
|
||||||
|
current_line_width: float = 0
|
||||||
|
space_width = font.space_width * font_size / 1000
|
||||||
|
for paragraph in paragraphs:
|
||||||
|
if not paragraph.strip():
|
||||||
|
wrapped_lines.append((0.0, ""))
|
||||||
|
continue
|
||||||
|
words = paragraph.split(" ")
|
||||||
|
for i, word in enumerate(words):
|
||||||
|
word_width = font.text_width(word) * font_size / 1000
|
||||||
|
test_width = current_line_width + word_width + (space_width if i else 0)
|
||||||
|
if test_width > field_width and current_line_words:
|
||||||
|
wrapped_lines.append((current_line_width, " ".join(current_line_words)))
|
||||||
|
current_line_words = [word]
|
||||||
|
current_line_width = word_width
|
||||||
|
elif not current_line_words and word_width > field_width:
|
||||||
|
wrapped_lines.append((word_width, word))
|
||||||
|
current_line_words = []
|
||||||
|
current_line_width = 0
|
||||||
|
else:
|
||||||
|
if current_line_words:
|
||||||
|
current_line_width += space_width
|
||||||
|
current_line_words.append(word)
|
||||||
|
current_line_width += word_width
|
||||||
|
if current_line_words:
|
||||||
|
wrapped_lines.append((current_line_width, " ".join(current_line_words)))
|
||||||
|
current_line_words = []
|
||||||
|
current_line_width = 0
|
||||||
|
# Estimate total height.
|
||||||
|
estimated_total_height = font_size + (len(wrapped_lines) - 1) * leading_factor * font_size
|
||||||
|
if estimated_total_height > field_height:
|
||||||
|
# Text overflows height; Retry with smaller font size.
|
||||||
|
new_font_size = font_size - font_size_step
|
||||||
|
if new_font_size >= min_font_size:
|
||||||
|
return self._scale_text(
|
||||||
|
font,
|
||||||
|
new_font_size,
|
||||||
|
leading_factor,
|
||||||
|
field_width,
|
||||||
|
field_height,
|
||||||
|
orig_text,
|
||||||
|
min_font_size,
|
||||||
|
font_size_step
|
||||||
|
)
|
||||||
|
return wrapped_lines, round(font_size, 1)
|
||||||
|
|
||||||
|
def _generate_appearance_stream_data(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
selection: Union[list[str], None],
|
||||||
|
font: Font,
|
||||||
|
font_glyph_byte_map: Optional[dict[str, bytes]] = None,
|
||||||
|
font_name: str = "/Helv",
|
||||||
|
font_size: float = 0.0,
|
||||||
|
font_color: str = "0 g",
|
||||||
|
is_multiline: bool = False,
|
||||||
|
alignment: TextAlignment = TextAlignment.LEFT,
|
||||||
|
is_comb: bool = False,
|
||||||
|
max_length: Optional[int] = None
|
||||||
|
) -> bytes:
|
||||||
|
"""
|
||||||
|
Generates the raw bytes of the PDF appearance stream for a text field.
|
||||||
|
|
||||||
|
This private method assembles the PDF content stream operators to draw
|
||||||
|
the provided text within the specified rectangle. It handles text positioning,
|
||||||
|
font application, color, and special formatting like selected text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: The text to be rendered in the form field.
|
||||||
|
selection: An optional list of strings that should be highlighted as selected.
|
||||||
|
font: The font to use.
|
||||||
|
font_glyph_byte_map: An optional dictionary mapping characters to their
|
||||||
|
byte representation for glyph encoding.
|
||||||
|
font_name: The name of the font resource to use (e.g., "/Helv").
|
||||||
|
font_size: The font size. If 0, it is automatically calculated
|
||||||
|
based on whether the field is multiline or not.
|
||||||
|
font_color: The color to apply to the font, represented as a PDF
|
||||||
|
graphics state string (e.g., "0 g" for black).
|
||||||
|
is_multiline: A boolean indicating if the text field is multiline.
|
||||||
|
alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
|
||||||
|
is_comb: Boolean that designates fixed-length fields, where every character
|
||||||
|
fills one "cell", such as in a postcode.
|
||||||
|
max_length: Used if is_comb is set. The maximum number of characters for a fixed-
|
||||||
|
length field.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A byte string containing the PDF content stream data.
|
||||||
|
|
||||||
|
"""
|
||||||
|
rectangle = self._layout.rectangle
|
||||||
|
font_glyph_byte_map = font_glyph_byte_map or {}
|
||||||
|
if isinstance(rectangle, tuple):
|
||||||
|
rectangle = RectangleObject(rectangle)
|
||||||
|
leading_factor = (font.font_descriptor.bbox[3] - font.font_descriptor.bbox[1]) / 1000.0
|
||||||
|
|
||||||
|
# Set margins based on border width and style, but never less than 1 point
|
||||||
|
factor = 2 if self._layout.border_style in {"/B", "/I"} else 1
|
||||||
|
margin = max(self._layout.border_width * factor, 1)
|
||||||
|
field_height = rectangle.height - 2 * margin
|
||||||
|
field_width = rectangle.width - 4 * margin
|
||||||
|
|
||||||
|
# If font_size is 0, apply the logic for multiline or large-as-possible font
|
||||||
|
if font_size == 0:
|
||||||
|
min_font_size = 4.0 # The mininum font size
|
||||||
|
if selection: # Don't wrap text when dealing with a /Ch field, in order to prevent problems
|
||||||
|
is_multiline = False # with matching "selection" with "line" later on.
|
||||||
|
if is_multiline:
|
||||||
|
font_size = DEFAULT_FONT_SIZE_IN_MULTILINE
|
||||||
|
lines, font_size = self._scale_text(
|
||||||
|
font,
|
||||||
|
font_size,
|
||||||
|
leading_factor,
|
||||||
|
field_width,
|
||||||
|
field_height,
|
||||||
|
text,
|
||||||
|
min_font_size
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
max_vertical_size = field_height / leading_factor
|
||||||
|
text_width_unscaled = font.text_width(text) / 1000
|
||||||
|
max_horizontal_size = field_width / (text_width_unscaled or 1)
|
||||||
|
font_size = round(max(min(max_vertical_size, max_horizontal_size), min_font_size), 1)
|
||||||
|
lines = [(text_width_unscaled * font_size, text)]
|
||||||
|
elif is_comb:
|
||||||
|
if max_length and len(text) > max_length:
|
||||||
|
logger_warning (
|
||||||
|
f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.",
|
||||||
|
__name__
|
||||||
|
)
|
||||||
|
# We act as if each character is one line, because we draw it separately later on
|
||||||
|
lines = [(
|
||||||
|
font.text_width(char) * font_size / 1000,
|
||||||
|
char
|
||||||
|
) for index, char in enumerate(text) if index < (max_length or len(text))]
|
||||||
|
else:
|
||||||
|
lines = [(
|
||||||
|
font.text_width(line) * font_size / 1000,
|
||||||
|
line
|
||||||
|
) for line in text.replace("\n", "\r").split("\r")]
|
||||||
|
|
||||||
|
# Set the vertical offset
|
||||||
|
if is_multiline:
|
||||||
|
y_offset = rectangle.height + margin - font.font_descriptor.bbox[3] * font_size / 1000.0
|
||||||
|
else:
|
||||||
|
y_offset = margin + ((field_height - font.font_descriptor.ascent * font_size / 1000) / 2)
|
||||||
|
default_appearance = f"{font_name} {font_size} Tf {font_color}"
|
||||||
|
|
||||||
|
ap_stream = (
|
||||||
|
f"q\n/Tx BMC \nq\n{2 * margin} {margin} {field_width} {field_height} "
|
||||||
|
f"re\nW\nBT\n{default_appearance}\n"
|
||||||
|
).encode()
|
||||||
|
current_x_pos: float = 0 # Initial virtual position within the text object.
|
||||||
|
|
||||||
|
for line_number, (line_width, line) in enumerate(lines):
|
||||||
|
if selection and line in selection:
|
||||||
|
# Might be improved, but cannot find how to get fill working => replaced with lined box
|
||||||
|
ap_stream += (
|
||||||
|
f"1 {y_offset - (line_number * font_size * leading_factor) - 1} "
|
||||||
|
f"{rectangle.width - 2} {font_size + 2} re\n"
|
||||||
|
f"0.5 0.5 0.5 rg s\n{default_appearance}\n"
|
||||||
|
).encode()
|
||||||
|
|
||||||
|
# Calculate the desired absolute starting X for the current line
|
||||||
|
desired_abs_x_start: float = 0
|
||||||
|
if is_comb and max_length:
|
||||||
|
# Calculate the width of a cell for one character
|
||||||
|
cell_width = rectangle.width / max_length
|
||||||
|
# Space from the left edge of the cell to the character's baseline start
|
||||||
|
# line_width here is the *actual* character width in points for the single character 'line'
|
||||||
|
centering_offset_in_cell = (cell_width - line_width) / 2
|
||||||
|
# Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset
|
||||||
|
desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell
|
||||||
|
elif alignment == TextAlignment.RIGHT:
|
||||||
|
desired_abs_x_start = rectangle.width - margin * 2 - line_width
|
||||||
|
elif alignment == TextAlignment.CENTER:
|
||||||
|
desired_abs_x_start = (rectangle.width - line_width) / 2
|
||||||
|
else: # Left aligned; default
|
||||||
|
desired_abs_x_start = margin * 2
|
||||||
|
# Calculate x_rel_offset: how much to move from the current_x_pos
|
||||||
|
# to reach the desired_abs_x_start.
|
||||||
|
x_rel_offset = desired_abs_x_start - current_x_pos
|
||||||
|
|
||||||
|
# Y-offset:
|
||||||
|
y_rel_offset: float = 0
|
||||||
|
if line_number == 0:
|
||||||
|
y_rel_offset = y_offset # Initial vertical position
|
||||||
|
elif is_comb:
|
||||||
|
y_rel_offset = 0.0 # DO NOT move vertically for subsequent characters
|
||||||
|
else:
|
||||||
|
y_rel_offset = - font_size * leading_factor # Move down by line height
|
||||||
|
|
||||||
|
# Td is a relative translation (Tx and Ty).
|
||||||
|
# It updates the current text position.
|
||||||
|
ap_stream += f"{x_rel_offset} {y_rel_offset} Td\n".encode()
|
||||||
|
# Update current_x_pos based on the Td operation for the next iteration.
|
||||||
|
# This is the X position where the *current line* will start.
|
||||||
|
current_x_pos = desired_abs_x_start
|
||||||
|
|
||||||
|
encoded_line: list[bytes] = [
|
||||||
|
font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line
|
||||||
|
]
|
||||||
|
if any(len(c) >= 2 for c in encoded_line):
|
||||||
|
ap_stream += b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n"
|
||||||
|
else:
|
||||||
|
ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n"
|
||||||
|
ap_stream += b"ET\nQ\nEMC\nQ\n"
|
||||||
|
return ap_stream
|
||||||
|
|
||||||
|
    def __init__(
        self,
        layout: Optional[BaseStreamConfig] = None,
        text: str = "",
        selection: Optional[list[str]] = None,
        font_resource: Optional[DictionaryObject] = None,
        font_name: str = "/Helv",
        font_size: float = 0.0,
        font_color: str = "0 g",
        is_multiline: bool = False,
        alignment: TextAlignment = TextAlignment.LEFT,
        is_comb: bool = False,
        max_length: Optional[int] = None,
    ) -> None:
        """
        Initializes a TextStreamAppearance object.

        This constructor creates a new PDF stream object configured as an XObject
        of subtype Form. It uses the `_generate_appearance_stream_data` method to
        generate the content for the stream.

        Args:
            layout: The basic layout parameters.
            text: The text to be rendered in the form field.
            selection: An optional list of strings that should be highlighted as selected.
            font_resource: An optional PDF font dictionary.
            font_name: The name of the font resource, e.g. "/Helv".
            font_size: The font size. If 0, it is auto-calculated.
            font_color: The font color string.
            is_multiline: Whether the text field is multiline.
            alignment: Text alignment: TextAlignment.LEFT, .RIGHT, or .CENTER.
            is_comb: Designates fixed-length fields where every character fills
                one "cell", such as in a postcode.
            max_length: Used if is_comb is set. The maximum number of characters
                for a fixed-length field.

        """
        super().__init__(layout)

        # If a font resource was given, derive the font and its character map from it
        if font_resource:
            font_resource = cast(DictionaryObject, font_resource.get_object())
            font = Font.from_font_resource(font_resource)
        else:
            logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
            font_name = "/Helv"
            font_resource = DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject("/Helv"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject("/Helvetica"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding"),
            })
            font_descriptor = CORE_FONT_METRICS["Helvetica"]
            font_descriptor.character_widths["default"] = 2 * font_descriptor.character_widths[" "]
            font = Font(
                name="Helvetica",
                character_map={},
                encoding=dict(zip(range(256), fill_from_encoding("cp1252"))),  # WinAnsiEncoding
                sub_type="Type1",
                font_descriptor=font_descriptor,
                character_widths=font_descriptor.character_widths,
            )

        font_glyph_byte_map: dict[str, bytes]
        if isinstance(font.encoding, str):
            font_glyph_byte_map = {
                v: k.encode(font.encoding) for k, v in font.character_map.items()
            }
        else:
            font_glyph_byte_map = {v: bytes((k,)) for k, v in font.encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font.encoding.items()}
            for key, value in font.character_map.items():
                font_glyph_byte_map[value] = font_encoding_rev.get(key, key)

        ap_stream_data = self._generate_appearance_stream_data(
            text,
            selection,
            font,
            font_glyph_byte_map,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length,
        )

        self.set_data(ByteStringObject(ap_stream_data))
        self[NameObject("/Length")] = NumberObject(len(ap_stream_data))
        # Update /Resources with the font information
        self[NameObject("/Resources")] = DictionaryObject({
            NameObject("/Font"): DictionaryObject({
                NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource)
            })
        })
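    # --- Illustrative sketch (not part of pypdf): the glyph-byte map built above
    # decides between a literal-string Tj and a hex-string Tj. As soon as any
    # character maps to two or more bytes, the hex form is used. The mapping
    # below is an assumed example, not a real font's map.
    #
    #     font_glyph_byte_map = {"A": b"A", "\u20ac": b"\x20\xac"}  # euro -> 2 bytes
    #
    #     def encode_for_tj(line: str) -> bytes:
    #         encoded = [font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line]
    #         if any(len(c) >= 2 for c in encoded):
    #             return b"<" + b"".join(encoded).hex().encode() + b"> Tj\n"
    #         return b"(" + b"".join(encoded) + b") Tj\n"
    #
    #     print(encode_for_tj("A"))        # b'(A) Tj\n'
    #     print(encode_for_tj("A\u20ac"))  # b'<4120ac> Tj\n'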
    @classmethod
    def from_text_annotation(
        cls,
        acro_form: DictionaryObject,  # _root_object[CatalogDictionary.ACRO_FORM]
        field: DictionaryObject,
        annotation: DictionaryObject,
        user_font_name: str = "",
        user_font_size: float = -1,
    ) -> "TextStreamAppearance":
        """
        Creates a TextStreamAppearance object from a text field annotation.

        This class method is a factory for creating a `TextStreamAppearance`
        instance by extracting all necessary information (bounding box, font,
        text content, etc.) from the PDF field and annotation dictionaries.
        It respects inheritance for properties like the default appearance (`/DA`).

        Args:
            acro_form: The root AcroForm dictionary from the PDF catalog.
            field: The field dictionary object.
            annotation: The widget annotation dictionary object associated with the field.
            user_font_name: An optional user-provided font name to override the
                default. Defaults to an empty string.
            user_font_size: An optional user-provided font size to override the
                default. A value of -1 indicates no override.

        Returns:
            A new `TextStreamAppearance` instance configured for the given field.

        """
        # Calculate rectangle dimensions
        _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect])
        rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1])))

        # Get the default appearance string from the annotation
        default_appearance = annotation.get_inherited(
            AnnotationDictionaryAttributes.DA,
            acro_form.get(AnnotationDictionaryAttributes.DA, None),
        )
        if not default_appearance:
            # Create a default appearance if none was found in the annotation
            default_appearance = TextStringObject("/Helv 0 Tf 0 g")
        else:
            default_appearance = default_appearance.get_object()

        # Derive font name, size and color from the default appearance, then
        # override them with the user-provided font name and size, if given.
        # For a font name, this presumes that we can find an associated font
        # resource dictionary. Uses the variable font_properties as an intermediate.
        # As per the PDF spec:
        # "At a minimum, the string [that is, default_appearance] shall include a Tf
        # (text font) operator along with its two operands, font and size"
        # (section 12.7.4.3 "Variable text" of the PDF 2.0 specification).
        font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop]
        font_name = font_properties.pop(font_properties.index("Tf") - 2)
        font_size = float(font_properties.pop(font_properties.index("Tf") - 1))
        font_properties.remove("Tf")
        font_color = " ".join(font_properties)
        # Determine the font name to use, prioritizing the user's input
        if user_font_name:
            font_name = user_font_name
        # Determine the font size to use, prioritizing the user's input
        if user_font_size > 0:
            font_size = user_font_size

        # Try to find a resource dictionary for the font...
        document_resources: Any = cast(
            DictionaryObject,
            cast(
                DictionaryObject,
                annotation.get_inherited(
                    "/DR",
                    acro_form.get("/DR", DictionaryObject()),
                ),
            ).get_object(),
        )
        document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
        # CORE_FONT_METRICS is the dict with the standard font metrics
        if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
            # ...or in the AcroForm dictionary
            document_resources = cast(
                dict[Any, Any],
                acro_form.get("/DR", {}),
            )
            document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
        font_resource = document_font_resources.get(font_name, None)
        if not is_null_or_none(font_resource):
            font_resource = cast(DictionaryObject, font_resource.get_object())

        # Retrieve field text and selected values
        field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
        if (
            field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
            field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
        ):
            text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
            selection = field.get("/V", [])
            if not isinstance(selection, list):
                selection = [selection]
        else:  # /Tx
            text = field.get("/V", "")
            selection = []

        # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
        text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")

        # Retrieve formatting information
        is_comb = False
        max_length = None
        if field_flags & FieldDictionaryAttributes.FfBits.Comb:
            is_comb = True
            max_length = annotation.get("/MaxLen")
        is_multiline = False
        if field_flags & FieldDictionaryAttributes.FfBits.Multiline:
            is_multiline = True
        alignment = field.get("/Q", TextAlignment.LEFT)
        border_width = 1
        border_style = BorderStyles.SOLID
        if "/BS" in field:
            border_width = cast(DictionaryObject, field["/BS"]).get("/W", border_width)
            border_style = cast(DictionaryObject, field["/BS"]).get("/S", border_style)

        # Create the TextStreamAppearance instance
        layout = BaseStreamConfig(rectangle=rectangle, border_width=border_width, border_style=border_style)
        new_appearance_stream = cls(
            layout,
            text,
            selection,
            font_resource,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length,
        )
        if AnnotationDictionaryAttributes.AP in annotation:
            for key, value in (
                cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
            ):
                if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                    new_appearance_stream[key] = value

        return new_appearance_stream
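# --- Hypothetical usage sketch (not part of the diff): regenerating the
# appearance stream for the first text-field widget of a form. "form.pdf",
# the merged field/widget assumption, and the direct /AP assignment are all
# illustrative; real code would add the stream via a PdfWriter so that it
# gets an indirect reference.
#
#     from pypdf import PdfReader
#     from pypdf.generic import DictionaryObject, NameObject
#
#     reader = PdfReader("form.pdf")
#     acro_form = reader.root_object["/AcroForm"].get_object()
#     annotation = reader.pages[0]["/Annots"][0].get_object()
#     field = annotation  # for merged field/widget dictionaries these coincide
#
#     appearance = TextStreamAppearance.from_text_annotation(acro_form, field, annotation)
#     annotation[NameObject("/AP")] = DictionaryObject({NameObject("/N"): appearance})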
venv/lib/python3.12/site-packages/pypdf/generic/_base.py (new file, 937 lines)
@@ -0,0 +1,937 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import binascii
import codecs
import hashlib
import re
import sys
from binascii import unhexlify
from collections.abc import Sequence
from math import log10
from struct import iter_unpack
from typing import Any, Callable, ClassVar, Optional, Union, cast

if sys.version_info[:2] >= (3, 10):
    from typing import TypeGuard
else:
    from typing_extensions import TypeGuard  # PEP 647

from .._codecs import _pdfdoc_encoding_rev
from .._protocols import PdfObjectProtocol, PdfWriterProtocol
from .._utils import (
    StreamType,
    classproperty,
    deprecation_no_replacement,
    deprecation_with_replacement,
    logger_warning,
    read_non_whitespace,
    read_until_regex,
)
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError

__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"


class PdfObject(PdfObjectProtocol):
    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    indirect_reference: Optional["IndirectObject"]

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .hash_bin() so far"
        )

    def hash_value_data(self) -> bytes:
        return f"{self}".encode()

    def hash_value(self) -> bytes:
        return (
            f"{self.__class__.__name__}:"
            f"{self.hash_func(self.hash_value_data()).hexdigest()}"
        ).encode()

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        """
        Clone the object into pdf_dest (a PdfWriterProtocol, the interface of PdfWriter)
        without ensuring links. This is used in clone_document_from_root with incremental = True.

        Args:
            pdf_dest: Target to clone to.

        Returns:
            The cloned PdfObject.

        """
        return self.clone(pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone the object into pdf_dest (a PdfWriterProtocol, the interface of PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If a field is to be
                considered for a limited number of levels only, add the level as an integer;
                for example ``[1, "/B", "/TOTO"]`` means that ``"/B"`` will be ignored at the
                first level only, but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject.

        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest, but only if an
        indirect_reference attribute exists (which means the object was
        already identified in the xref/xobjstm). If the object has already
        been referenced, do nothing.

        Args:
            clone:
            pdf_dest:

        Returns:
            The clone.

        """
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            pass
        # if hasattr(clone, "indirect_reference"):
        try:
            ind = self.indirect_reference
        except AttributeError:
            return clone
        if (
            pdf_dest.incremental
            and ind is not None
            and ind.pdf == pdf_dest._reader
            and ind.idnum <= len(pdf_dest._objects)
        ):
            i = ind.idnum
        else:
            i = len(pdf_dest._objects) + 1
        if ind is not None:
            if id(ind.pdf) not in pdf_dest._id_translated:
                pdf_dest._id_translated[id(ind.pdf)] = {}
                pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf  # type: ignore[index]
            if (
                not force_duplicate
                and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
            ):
                obj = pdf_dest.get_object(
                    pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
                )
                assert obj is not None
                return obj
            pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
        try:
            pdf_dest._objects[i - 1] = clone
        except IndexError:
            pdf_dest._objects.append(clone)
            i = len(pdf_dest._objects)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        raise NotImplementedError
class NullObject(PdfObject):
    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NullObject":
        """Clone object into pdf_dest."""
        return cast(
            "NullObject", self._reference_clone(NullObject(), pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__,))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"null")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "NullObject":
        nulltxt = stream.read(4)
        if nulltxt != b"null":
            raise PdfReadError("Could not read Null object")
        return NullObject()

    def __repr__(self) -> str:
        return "NullObject"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, NullObject)

    def __hash__(self) -> int:
        return self.hash_bin()


class BooleanObject(PdfObject):
    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        return cast(
            "BooleanObject",
            self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.value))

    def __eq__(self, o: object, /) -> bool:
        if isinstance(o, BooleanObject):
            return self.value == o.value
        if isinstance(o, bool):
            return self.value == o
        return False

    def __hash__(self) -> int:
        return self.hash_bin()

    def __repr__(self) -> str:
        return "True" if self.value else "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        if self.value:
            stream.write(b"true")
        else:
            stream.write(b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        if word == b"fals":
            stream.read(1)
            return BooleanObject(False)
        raise PdfReadError("Could not read Boolean object")


class IndirectObject(PdfObject):
    def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def __hash__(self) -> int:
        return hash((self.idnum, self.generation, id(self.pdf)))

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

    def replicate(
        self,
        pdf_dest: PdfWriterProtocol,
    ) -> "PdfObject":
        return IndirectObject(self.idnum, self.generation, pdf_dest)

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "IndirectObject":
        """Clone object into pdf_dest."""
        if self.pdf == pdf_dest and not force_duplicate:
            # Already duplicated and no extra duplication required
            return self
        if id(self.pdf) not in pdf_dest._id_translated:
            pdf_dest._id_translated[id(self.pdf)] = {}
            pdf_dest._id_translated[id(self.pdf)]["PreventGC"] = self.pdf  # type: ignore[index]

        if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
            dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
            if force_duplicate:
                assert dup is not None
                assert dup.indirect_reference is not None
                idref = dup.indirect_reference
                return IndirectObject(idref.idnum, idref.generation, idref.pdf)
        else:
            obj = self.get_object()
            # Case observed in the wild: the pointed object cannot be found.
            # Substitute a NullObject so the clone still resolves.
            if obj is None:
                obj = NullObject()
                assert isinstance(self, (IndirectObject,))
                obj.indirect_reference = self
            dup = pdf_dest._add_object(
                obj.clone(pdf_dest, force_duplicate, ignore_fields)
            )
        assert dup is not None, "mypy"
        assert dup.indirect_reference is not None, "mypy"
        return dup.indirect_reference

    @property
    def indirect_reference(self) -> "IndirectObject":  # type: ignore[override]
        return self

    def get_object(self) -> Optional["PdfObject"]:
        return self.pdf.get_object(self)

    def __deepcopy__(self, memo: Any) -> "IndirectObject":
        return IndirectObject(self.idnum, self.generation, self.pdf)

    def _get_object_with_check(self) -> Optional["PdfObject"]:
        o = self.get_object()
        # the check is done here so as to not slow down get_object()
        if isinstance(o, IndirectObject):
            raise PdfStreamError(
                f"{self.__repr__()} references an IndirectObject {o.__repr__()}"
            )
        return o

    def __getattr__(self, name: str) -> Any:
        # Attribute not found in this object: look in the pointed object
        try:
            return getattr(self._get_object_with_check(), name)
        except AttributeError:
            raise AttributeError(
                f"No attribute {name} found in IndirectObject or pointed object"
            )

    def __getitem__(self, key: Any) -> Any:
        # items should be extracted from the pointed object
        return self._get_object_with_check()[key]  # type: ignore

    def __contains__(self, key: Any) -> bool:
        return key in self._get_object_with_check()  # type: ignore

    def __iter__(self) -> Any:
        return self._get_object_with_check().__iter__()  # type: ignore

    def __float__(self) -> float:
        # in this case we are looking for the pointed data
        return self.get_object().__float__()  # type: ignore

    def __int__(self) -> int:
        # in this case we are looking for the pointed data
        return self.get_object().__int__()  # type: ignore

    def __str__(self) -> str:
        # in this case we are looking for the pointed data
        return self.get_object().__str__()

    def __repr__(self) -> str:
        return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})"

    def __eq__(self, other: object) -> bool:
        return (
            other is not None
            and isinstance(other, IndirectObject)
            and self.idnum == other.idnum
            and self.generation == other.generation
            and self.pdf is other.pdf
        )

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(f"{self.idnum} {self.generation} R".encode())

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfReader
        idnum = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                break
            idnum += tok
        generation = b""
        while True:
            tok = stream.read(1)
            if not tok:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok.isspace():
                if not generation:
                    continue
                break
            generation += tok
        r = read_non_whitespace(stream)
        if r != b"R":
            raise PdfReadError(
                f"Error reading indirect object reference at byte {hex(stream.tell())}"
            )
        return IndirectObject(int(idnum), int(generation), pdf)
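# --- Illustrative round-trip sketch (not part of pypdf): parsing and writing
# a reference like "12 0 R". The pdf argument is None here because parsing
# the token itself needs no document access.
#
#     from io import BytesIO
#
#     ref = IndirectObject.read_from_stream(BytesIO(b"12 0 R"), None)
#     print(ref.idnum, ref.generation)  # 12 0
#
#     out = BytesIO()
#     ref.write_to_stream(out)
#     print(out.getvalue())  # b'12 0 R'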
FLOAT_WRITE_PRECISION = 8  # minimum of 5 significant digits; may be adjusted by the user


class FloatObject(float, PdfObject):
    def __new__(
        cls, value: Any = "0.0", context: Optional[Any] = None
    ) -> "FloatObject":
        try:
            value = float(value)
            return float.__new__(cls, value)
        except Exception as e:
            # If this isn't a valid decimal (happens in malformed PDFs),
            # fall back to 0
            logger_warning(
                f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__
            )
            return float.__new__(cls, 0.0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "FloatObject":
        """Clone object into pdf_dest."""
        return cast(
            "FloatObject",
            self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def myrepr(self) -> str:
        if self == 0:
            return "0.0"
        nb = FLOAT_WRITE_PRECISION - int(log10(abs(self)))
        return f"{self:.{max(1, nb)}f}".rstrip("0").rstrip(".")

    def __repr__(self) -> str:
        return self.myrepr()  # repr(float(self))

    def as_numeric(self) -> float:
        return float(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.myrepr().encode("utf8"))


class NumberObject(int, PdfObject):
    NumberPattern = re.compile(b"[^+-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.as_numeric()))

    def as_numeric(self) -> int:
        return int(repr(self).encode("utf8"))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if b"." in num:
            return FloatObject(num)
        return NumberObject(num)


class ByteStringObject(bytes, PdfObject):
    """
    Represents a string object where the text encoding could not be determined.

    This occurs quite often, as the PDF spec doesn't provide an alternate way to
    represent strings -- for example, the encryption data stored in files (like
    /O) is clearly not text, but is still stored in a "String" object.
    """

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "ByteStringObject":
        """Clone object into pdf_dest."""
        return cast(
            "ByteStringObject",
            self._reference_clone(
                ByteStringObject(bytes(self)), pdf_dest, force_duplicate
            ),
        )

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, bytes(self)))

    @property
    def original_bytes(self) -> bytes:
        """For compatibility with TextStringObject.original_bytes."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<")
        stream.write(binascii.hexlify(self))
        stream.write(b">")

    def __str__(self) -> str:
        charset_to_try = ["utf-16", *list(NameObject.CHARSETS)]
        for enc in charset_to_try:
            try:
                return self.decode(enc)
            except UnicodeDecodeError:
                pass
        raise PdfReadError("Cannot decode ByteStringObject.")
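# --- Behavior sketch (not part of pypdf): what FloatObject serialization
# produces under FLOAT_WRITE_PRECISION = 8. Trailing zeros are trimmed and
# roughly eight significant digits are kept; values chosen for illustration.
#
#     print(repr(FloatObject("3.14159265358979")))  # 3.14159265
#     print(repr(FloatObject("2.50000")))           # 2.5 (trailing zeros stripped)
#     print(repr(FloatObject("0.0")))               # 0.0
#     print(repr(FloatObject("not-a-number")))      # 0.0, with a logged warning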
class TextStringObject(str, PdfObject):  # noqa: SLOT000
    """
    A string object that has been decoded into a real unicode string.

    If read from a PDF document, this string appeared to match
    PDFDocEncoding, or contained a UTF-16 BOM mark that caused UTF-16
    decoding to occur.
    """

    autodetect_pdfdocencoding: bool
    autodetect_utf16: bool
    utf16_bom: bytes
    _original_bytes: Optional[bytes] = None

    def __new__(cls, value: Any) -> "TextStringObject":
        original_bytes = None
        if isinstance(value, bytes):
            original_bytes = value
            value = value.decode("charmap")
        text_string_object = str.__new__(cls, value)
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = False
        text_string_object.autodetect_pdfdocencoding = False
        text_string_object.utf16_bom = b""
        if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
            # The value of `original_bytes` is only set for `bytes` inputs.
            # If this is UTF-16 data according to the BOM (the first two bytes),
            # perform special handling. All other cases should not need any
            # special conversion, due to already being a string.
            try:
                text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
            except UnicodeDecodeError as exception:
                logger_warning(
                    f"{exception!s}\ninitial string:{exception.object!r}",
                    __name__,
                )
                text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = True
            text_string_object.utf16_bom = original_bytes[:2]
        else:
            try:
                encode_pdfdocencoding(text_string_object)
                text_string_object.autodetect_pdfdocencoding = True
            except UnicodeEncodeError:
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = codecs.BOM_UTF16_BE
        return text_string_object

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "TextStringObject":
        """Clone object into pdf_dest."""
        obj = TextStringObject(self)
        obj._original_bytes = self._original_bytes
        obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
        obj.autodetect_utf16 = self.autodetect_utf16
        obj.utf16_bom = self.utf16_bom
        return cast(
            "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
        )

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self.original_bytes))

    @property
    def original_bytes(self) -> bytes:
        """
        It is occasionally possible that a text string object gets created where
        a byte string object was expected due to the autodetection mechanism --
        if that occurs, this "original_bytes" property can be used to
        back-calculate what the original encoded bytes were.
        """
        if self._original_bytes is not None:
            return self._original_bytes
        return self.get_original_bytes()

    def get_original_bytes(self) -> bytes:
        # We're a text string object, but the library is trying to get our raw
        # bytes. This can happen if we auto-detected this string as text, but
        # we were wrong. It's pretty common. Return the original bytes that
        # would have been used to create this object, based upon the autodetect
        # method.
        if self.autodetect_utf16:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                return codecs.BOM_UTF16_LE + self.encode("utf-16le")
            if self.utf16_bom == codecs.BOM_UTF16_BE:
                return codecs.BOM_UTF16_BE + self.encode("utf-16be")
            return self.encode("utf-16be")
        if self.autodetect_pdfdocencoding:
            return encode_pdfdocencoding(self)
        raise Exception("no information about original bytes")  # pragma: no cover

    def get_encoded_bytes(self) -> bytes:
        # Try to write the string out as a PDFDocEncoding encoded string. It's
        # nicer to look at in the PDF file. Sadly, we take a performance hit
        # here for trying...
        try:
            if self._original_bytes is not None:
                return self._original_bytes
            if self.autodetect_utf16:
                raise UnicodeEncodeError("", "forced", -1, -1, "")
            bytearr = encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            if self.utf16_bom == codecs.BOM_UTF16_LE:
                bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le")
            elif self.utf16_bom == codecs.BOM_UTF16_BE:
                bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be")
            else:
                bytearr = self.encode("utf-16be")
        return bytearr

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        bytearr = self.get_encoded_bytes()
        stream.write(b"(")
        for c_ in iter_unpack("c", bytearr):
            c = cast(bytes, c_[0])
            if not c.isalnum() and c != b" ":
                # This:
                #   stream.write(rf"\{c:0>3o}".encode())
                # gives
                #   https://github.com/davidhalter/parso/issues/207
                stream.write(b"\\%03o" % ord(c))
            else:
                stream.write(c)
        stream.write(b")")


class NameObject(str, PdfObject):  # noqa: SLOT000
    delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
    prefix = b"/"
    renumber_table: ClassVar[dict[str, bytes]] = {
        **{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
        **{chr(i): f"#{i:02X}".encode() for i in range(33)},
    }

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NameObject":
        """Clone object into pdf_dest."""
        return cast(
            "NameObject",
            self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
        )

    def hash_bin(self) -> int:
        """
        Used to detect a modified object.

        Returns:
            Hash considering type and value.

        """
        return hash((self.__class__, self))

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(self.renumber())

    def renumber(self) -> bytes:
        out = self[0].encode("utf-8")
        if out != b"/":
            deprecation_no_replacement(
                f"Incorrect first char in NameObject, should start with '/': ({self})",
                "5.0.0",
            )
        for c in self[1:]:
            if c > "~":
                for x in c.encode("utf-8"):
                    out += f"#{x:02X}".encode()
            else:
                try:
                    out += self.renumber_table[c]
                except KeyError:
                    out += c.encode("utf-8")
        return out

    def _sanitize(self) -> "NameObject":
        """
        Sanitize the NameObject's name to be a valid PDF name part
        (alphanumeric, underscore, hyphen): spaces and any other
        non-alphanumeric, non-underscore, non-hyphen characters are
        replaced with underscores.

        Returns:
            NameObject with sanitized name.

        """
        name = str(self).removeprefix("/")
        name = re.sub(r"\ ", "_", name)
        name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
        return NameObject("/" + name)

    @classproperty
    def surfix(cls) -> bytes:  # noqa: N805
        deprecation_with_replacement("surfix", "prefix", "5.0.0")
        return b"/"

    @staticmethod
    def unnumber(sin: bytes) -> bytes:
        i = sin.find(b"#", 0)
        while i >= 0:
            try:
                sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :]
                i = sin.find(b"#", i + 1)
            except ValueError:
                # If the 2 characters after # cannot be converted to hex,
                # we change nothing and carry on.
                i = i + 1
        return sin

    CHARSETS = ("utf-8", "gbk", "latin1")

    @staticmethod
    def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
        name = stream.read(1)
        if name != NameObject.prefix:
            raise PdfReadError("Name read error")
        name += read_until_regex(stream, NameObject.delimiter_pattern)
        try:
            # Name objects should represent irregular characters
            # with a '#' followed by the symbol's hex number
            name = NameObject.unnumber(name)
            for enc in NameObject.CHARSETS:
                try:
                    ret = name.decode(enc)
                    return NameObject(ret)
                except Exception:
                    pass
            raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            if not pdf.strict:
                logger_warning(
                    f"Illegal character in NameObject ({name!r}), "
                    "you may need to adjust NameObject.CHARSETS",
                    __name__,
                )
                return NameObject(name.decode("charmap"))
            raise PdfReadError(
                f"Illegal character in NameObject ({name!r}). "
                "You may need to adjust NameObject.CHARSETS.",
            ) from e


def encode_pdfdocencoding(unicode_string: str) -> bytes:
    try:
        return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string])
    except KeyError:
        raise UnicodeEncodeError(
            "pdfdocencoding",
            unicode_string,
            -1,
            -1,
            "does not exist in translation table",
        )


def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]]:
    """
    Returns:
        True if x is None or a NullObject (possibly behind an indirect reference).

    """
    return x is None or (
        isinstance(x, PdfObject)
        and (x.get_object() is None or isinstance(x.get_object(), NullObject))
    )
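# --- Escape/unescape sketch (not part of pypdf) for PDF name objects; the
# example name is chosen to exercise both the one-byte table path (space,
# parentheses) and the multi-byte UTF-8 path (umlauts).
#
#     name = NameObject("/Zeichen (\u00e4\u00f6\u00fc)")
#     escaped = name.renumber()
#     print(escaped)  # b'/Zeichen#20#28#C3#A4#C3#B6#C3#BC#29'
#     print(NameObject.unnumber(escaped).decode("utf-8"))  # /Zeichen (\u00e4\u00f6\u00fc)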
venv/lib/python3.12/site-packages/pypdf/generic/_data_structures.py (new file, 1757 lines)
File diff suppressed because it is too large

venv/lib/python3.12/site-packages/pypdf/generic/_files.py (new file, 401 lines)
@@ -0,0 +1,401 @@
from __future__ import annotations

import bisect
from functools import cached_property
from typing import TYPE_CHECKING, cast

from pypdf._utils import format_iso8824_date, parse_iso8824_date
from pypdf.constants import CatalogAttributes as CA
from pypdf.constants import FileSpecificationDictionaryEntries
from pypdf.constants import PageAttributes as PG
from pypdf.errors import PdfReadError, PyPdfError
from pypdf.generic import (
    ArrayObject,
    ByteStringObject,
    DecodedStreamObject,
    DictionaryObject,
    NameObject,
    NullObject,
    NumberObject,
    StreamObject,
    TextStringObject,
    is_null_or_none,
)

if TYPE_CHECKING:
    import datetime
    from collections.abc import Generator

    from pypdf._writer import PdfWriter


class EmbeddedFile:
    """
    Container holding the information on an embedded file.

    Attributes are evaluated lazily if possible.

    Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
    """

    def __init__(self, name: str, pdf_object: DictionaryObject, parent: ArrayObject | None = None) -> None:
        """
        Args:
            name: The (primary) name as provided in the name tree.
            pdf_object: The corresponding PDF object to allow retrieving further data.
            parent: The parent list.

        """
        self._name = name
        self.pdf_object = pdf_object
        self._parent = parent

    @property
    def name(self) -> str:
        """The (primary) name of the embedded file as provided in the name tree."""
        return self._name

    @classmethod
    def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> EmbeddedFile:
        """
        Create a new embedded file and add it to the PdfWriter.

        Args:
            writer: The PdfWriter instance to add the embedded file to.
            name: The filename to display.
            content: The data in the file.

        Returns:
            EmbeddedFile instance for the newly created embedded file.

        """
        # Convert string content to bytes if needed
        if isinstance(content, str):
            content = content.encode("latin-1")

        # Create the file entry (the actual embedded file stream)
        file_entry = DecodedStreamObject()
        file_entry.set_data(content)
        file_entry.update({NameObject(PG.TYPE): NameObject("/EmbeddedFile")})

        # Create the /EF entry
        ef_entry = DictionaryObject()
        ef_entry.update({NameObject("/F"): writer._add_object(file_entry)})

        # Create the filespec dictionary
        from pypdf.generic import create_string_object  # noqa: PLC0415
        filespec = DictionaryObject()
        filespec_reference = writer._add_object(filespec)
        name_object = cast(TextStringObject, create_string_object(name))
        filespec.update(
            {
                NameObject(PG.TYPE): NameObject("/Filespec"),
                NameObject(FileSpecificationDictionaryEntries.F): name_object,
                NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
            }
        )

        # Add the name and filespec to the names array.
        # We insert in inverse order, as this allows us to reuse the same index.
        names_array = cls._get_names_array(writer)
        insertion_index = cls._get_insertion_index(names_array, name_object)
        names_array.insert(insertion_index, filespec_reference)
        names_array.insert(insertion_index, name_object)

        # Return an EmbeddedFile instance
        return cls(name=name, pdf_object=filespec, parent=names_array)
    @classmethod
    def _get_names_array(cls, writer: PdfWriter) -> ArrayObject:
        """Get the names array for embedded files, possibly creating and flattening it."""
        if CA.NAMES not in writer.root_object:
            # Add the /Names entry to the catalog.
            writer.root_object[NameObject(CA.NAMES)] = writer._add_object(DictionaryObject())

        names_dict = cast(DictionaryObject, writer.root_object[CA.NAMES])
        if "/EmbeddedFiles" not in names_dict:
            # We do not yet have an entry for embedded files. Create and return it.
            names = ArrayObject()
            embedded_files_names_dictionary = DictionaryObject(
                {NameObject(CA.NAMES): names}
            )
            names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
            return names

        # We have an existing embedded files entry.
        embedded_files_names_tree = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
        if "/Names" in embedded_files_names_tree:
            # Simple case: We already have a flat list.
            return cast(ArrayObject, embedded_files_names_tree[NameObject(CA.NAMES)])
        if "/Kids" not in embedded_files_names_tree:
            # Invalid case: This is not a name tree.
            raise PdfReadError("Got neither Names nor Kids in embedded files tree.")

        # Complex case: Convert a /Kids-based name tree to a /Names-based one.
        # /Names-based trees are much easier to handle and allow us to simplify
        # the actual insertion logic by only having to consider one case.
        names = ArrayObject()
        kids = cast(ArrayObject, embedded_files_names_tree["/Kids"].get_object())
        embedded_files_names_dictionary = DictionaryObject(
            {NameObject(CA.NAMES): names}
        )
        names_dict[NameObject("/EmbeddedFiles")] = writer._add_object(embedded_files_names_dictionary)
        for kid in kids:
            # Write the flattened file entries. As we do not change the actual files,
            # this should not have any impact on references to them.
            # There might be further (nested) kids here.
            # Wait for an example before evaluating an implementation.
            for name in kid.get_object().get("/Names", []):
                names.append(name)
        return names

    @classmethod
    def _get_insertion_index(cls, names_array: ArrayObject, name: str) -> int:
        keys = [names_array[i].encode("utf-8") for i in range(0, len(names_array), 2)]
        name_bytes = name.encode("utf-8")

        start = bisect.bisect_left(keys, name_bytes)
        end = bisect.bisect_right(keys, name_bytes)

        if start != end:
            return end * 2
        if start == 0:
            return 0
        if start == (key_count := len(keys)):
            return key_count * 2
        return end * 2
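    # --- Standalone sketch (not part of pypdf) of the insertion-index logic
    # above: the names array alternates [name, filespec, name, filespec, ...],
    # so key i lives at element 2*i. The keys below are assumed example values,
    # already sorted and without duplicates (the duplicate case is what the
    # bisect_left/bisect_right pair above handles).
    #
    #     import bisect
    #
    #     keys = [b"alpha.txt", b"beta.txt", b"delta.txt"]
    #     name = b"charlie.txt"
    #     index = bisect.bisect_right(keys, name) * 2  # index in the flat array
    #     print(index)  # 4 -> insert before the delta.txt pair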
    @property
    def alternative_name(self) -> str | None:
        """Retrieve the alternative name (file specification)."""
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            # PDF 2.0 reference, table 43:
            # > A PDF reader shall use the value of the UF key, when present, instead of the F key.
            if key in self.pdf_object:
                value = self.pdf_object[key].get_object()
                if not is_null_or_none(value):
                    return cast(str, value)
        return None

    @alternative_name.setter
    def alternative_name(self, value: TextStringObject | None) -> None:
        """Set the alternative name (file specification)."""
        if value is None:
            if FileSpecificationDictionaryEntries.UF in self.pdf_object:
                self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = NullObject()
            if FileSpecificationDictionaryEntries.F in self.pdf_object:
                self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = NullObject()
        else:
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.UF)] = value
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.F)] = value

    @property
    def description(self) -> str | None:
        """Retrieve the description."""
        value = self.pdf_object.get(FileSpecificationDictionaryEntries.DESC)
        if is_null_or_none(value):
            return None
        return value

    @description.setter
    def description(self, value: TextStringObject | None) -> None:
        """Set the description."""
        if value is None:
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = NullObject()
        else:
            self.pdf_object[NameObject(FileSpecificationDictionaryEntries.DESC)] = value

    @property
    def associated_file_relationship(self) -> str:
        """Retrieve the relationship of the referring document to this embedded file."""
        return self.pdf_object.get("/AFRelationship", "/Unspecified")

    @associated_file_relationship.setter
    def associated_file_relationship(self, value: NameObject) -> None:
        """Set the relationship of the referring document to this embedded file."""
        self.pdf_object[NameObject("/AFRelationship")] = value

    @property
    def _embedded_file(self) -> StreamObject:
        """Retrieve the actual embedded file stream."""
        if "/EF" not in self.pdf_object:
            raise PdfReadError(f"/EF entry not found: {self.pdf_object}")
        ef = cast(DictionaryObject, self.pdf_object["/EF"])
        for key in [FileSpecificationDictionaryEntries.UF, FileSpecificationDictionaryEntries.F]:
            if key in ef:
                return cast(StreamObject, ef[key].get_object())
        raise PdfReadError(f"No /(U)F key found in file dictionary: {ef}")

    @property
    def _params(self) -> DictionaryObject:
        """Retrieve the file-specific parameters."""
        return self._embedded_file.get("/Params", DictionaryObject()).get_object()

    @cached_property
    def _ensure_params(self) -> DictionaryObject:
        """Ensure the /Params dictionary exists and return it."""
        embedded_file = self._embedded_file
        if "/Params" not in embedded_file:
            embedded_file[NameObject("/Params")] = DictionaryObject()
        return cast(DictionaryObject, embedded_file["/Params"])

    @property
    def subtype(self) -> str | None:
        """Retrieve the subtype. This is a MIME media type, prefixed by a slash."""
        value = self._embedded_file.get("/Subtype")
        if is_null_or_none(value):
            return None
        return value

    @subtype.setter
    def subtype(self, value: NameObject | None) -> None:
        """Set the subtype. This should be a MIME media type, prefixed by a slash."""
        embedded_file = self._embedded_file
        if value is None:
            embedded_file[NameObject("/Subtype")] = NullObject()
        else:
            embedded_file[NameObject("/Subtype")] = value

    @property
    def content(self) -> bytes:
        """Retrieve the actual file content."""
        return self._embedded_file.get_data()

    @content.setter
    def content(self, value: str | bytes) -> None:
        """Set the file content."""
        if isinstance(value, str):
            value = value.encode("latin-1")
        self._embedded_file.set_data(value)

    @property
    def size(self) -> int | None:
        """Retrieve the size of the uncompressed file in bytes."""
        value = self._params.get("/Size")
        if is_null_or_none(value):
            return None
        return value

    @size.setter
    def size(self, value: NumberObject | None) -> None:
        """Set the size of the uncompressed file in bytes."""
        params = self._ensure_params
        if value is None:
            params[NameObject("/Size")] = NullObject()
        else:
            params[NameObject("/Size")] = value

    @property
    def creation_date(self) -> datetime.datetime | None:
        """Retrieve the file creation datetime."""
        return parse_iso8824_date(self._params.get("/CreationDate"))

    @creation_date.setter
    def creation_date(self, value: datetime.datetime | None) -> None:
        """Set the file creation datetime."""
        params = self._ensure_params
        if value is None:
            params[NameObject("/CreationDate")] = NullObject()
        else:
            date_str = format_iso8824_date(value)
            params[NameObject("/CreationDate")] = TextStringObject(date_str)

    @property
    def modification_date(self) -> datetime.datetime | None:
|
||||||
|
"""Retrieve the datetime of the last file modification."""
|
||||||
|
return parse_iso8824_date(self._params.get("/ModDate"))
|
||||||
|
|
||||||
|
@modification_date.setter
|
||||||
|
def modification_date(self, value: datetime.datetime | None) -> None:
|
||||||
|
"""Set the datetime of the last file modification."""
|
||||||
|
params = self._ensure_params
|
||||||
|
if value is None:
|
||||||
|
params[NameObject("/ModDate")] = NullObject()
|
||||||
|
else:
|
||||||
|
date_str = format_iso8824_date(value)
|
||||||
|
params[NameObject("/ModDate")] = TextStringObject(date_str)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def checksum(self) -> bytes | None:
|
||||||
|
"""Retrieve the MD5 checksum of the (uncompressed) file."""
|
||||||
|
value = self._params.get("/CheckSum")
|
||||||
|
if is_null_or_none(value):
|
||||||
|
return None
|
||||||
|
return value
|
||||||
|
|
||||||
|
@checksum.setter
|
||||||
|
def checksum(self, value: ByteStringObject | None) -> None:
|
||||||
|
"""Set the MD5 checksum of the (uncompressed) file."""
|
||||||
|
params = self._ensure_params
|
||||||
|
if value is None:
|
||||||
|
params[NameObject("/CheckSum")] = NullObject()
|
||||||
|
else:
|
||||||
|
params[NameObject("/CheckSum")] = value
|
||||||
|
|
||||||
|
def delete(self) -> None:
|
||||||
|
"""Delete the file from the document."""
|
||||||
|
if not self._parent:
|
||||||
|
raise PyPdfError("Parent required to delete file from document.")
|
||||||
|
if self.pdf_object in self._parent:
|
||||||
|
index = self._parent.index(self.pdf_object)
|
||||||
|
elif (
|
||||||
|
(indirect_reference := getattr(self.pdf_object, "indirect_reference", None)) is not None
|
||||||
|
and indirect_reference in self._parent
|
||||||
|
):
|
||||||
|
index = self._parent.index(indirect_reference)
|
||||||
|
else:
|
||||||
|
raise PyPdfError("File not found in parent object.")
|
||||||
|
self._parent.pop(index) # Reference.
|
||||||
|
self._parent.pop(index - 1) # Name.
|
||||||
|
self.pdf_object = DictionaryObject() # Invalidate.
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return f"<{self.__class__.__name__} name={self.name!r}>"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
|
||||||
|
"""
|
||||||
|
Convert the given name tree into class instances.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
names: The name tree to load the data from.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Iterable of class instances for the files found.
|
||||||
|
"""
|
||||||
|
# This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
|
||||||
|
for i, name in enumerate(names):
|
||||||
|
if not isinstance(name, str):
|
||||||
|
# Skip plain strings and retrieve them as `direct_name` by index.
|
||||||
|
file_dictionary = name.get_object()
|
||||||
|
direct_name = names[i - 1].get_object()
|
||||||
|
yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary, parent=names)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:
|
||||||
|
"""
|
||||||
|
Load the embedded files for the given document catalog.
|
||||||
|
|
||||||
|
This method and its signature are considered internal API and thus not exposed publicly for now.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
catalog: The document catalog to load from.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Iterable of class instances for the files found.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
container = cast(
|
||||||
|
DictionaryObject,
|
||||||
|
cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
|
||||||
|
)
|
||||||
|
except KeyError:
|
||||||
|
return
|
||||||
|
|
||||||
|
if "/Kids" in container:
|
||||||
|
for kid in cast(ArrayObject, container["/Kids"].get_object()):
|
||||||
|
# There might be further (nested) kids here.
|
||||||
|
# Wait for an example before evaluating an implementation.
|
||||||
|
kid = kid.get_object()
|
||||||
|
if "/Names" in kid:
|
||||||
|
yield from cls._load_from_names(cast(ArrayObject, kid["/Names"]))
|
||||||
|
if "/Names" in container:
|
||||||
|
yield from cls._load_from_names(cast(ArrayObject, container["/Names"]))
|
||||||
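The loader above is what backs attachment access on the reader side. A minimal sketch of how it surfaces through the public API; the file name is hypothetical and `PdfReader.attachments` is assumed to be the documented mapping of name to content versions in current pypdf:

    from pypdf import PdfReader

    reader = PdfReader("input.pdf")  # hypothetical file
    # Mapping of attachment name -> list of content versions (bytes):
    for name, contents in reader.attachments.items():
        print(name, len(contents[0]), "bytes")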
174 venv/lib/python3.12/site-packages/pypdf/generic/_fit.py (Normal file)
@@ -0,0 +1,174 @@
from typing import Any, Optional, Union

from ._base import is_null_or_none


class Fit:
    def __init__(
        self, fit_type: str, fit_args: tuple[Union[None, float, Any], ...] = ()
    ) -> None:
        from ._base import FloatObject, NameObject, NullObject, NumberObject  # noqa: PLC0415

        self.fit_type = NameObject(fit_type)
        self.fit_args: list[Union[NullObject, FloatObject, NumberObject]] = [
            NullObject() if is_null_or_none(a) else FloatObject(a) for a in fit_args
        ]

    @classmethod
    def xyz(
        cls,
        left: Optional[float] = None,
        top: Optional[float] = None,
        zoom: Optional[float] = None,
    ) -> "Fit":
        """
        Display the page designated by page, with the coordinates (left, top)
        positioned at the upper-left corner of the window and the contents
        of the page magnified by the factor zoom.

        A null value for any of the parameters left, top, or zoom specifies
        that the current value of that parameter is to be retained unchanged.

        A zoom value of 0 has the same meaning as a null value.

        Args:
            left:
            top:
            zoom:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/XYZ", fit_args=(left, top, zoom))

    @classmethod
    def fit(cls) -> "Fit":
        """
        Display the page designated by page, with its contents magnified just
        enough to fit the entire page within the window both horizontally and
        vertically.

        If the required horizontal and vertical magnification factors are
        different, use the smaller of the two, centering the page within the
        window in the other dimension.
        """
        return Fit(fit_type="/Fit")

    @classmethod
    def fit_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Display the page designated by page, with the vertical coordinate top
        positioned at the top edge of the window and the contents of the page
        magnified just enough to fit the entire width of the page within the
        window.

        A null value for ``top`` specifies that the current value of that
        parameter is to be retained unchanged.

        Args:
            top:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitH", fit_args=(top,))

    @classmethod
    def fit_vertically(cls, left: Optional[float] = None) -> "Fit":
        """
        Display the page designated by page, with the horizontal coordinate
        left positioned at the left edge of the window and the contents of
        the page magnified just enough to fit the entire height of the page
        within the window.

        A null value for ``left`` specifies that the current value of that
        parameter is to be retained unchanged.

        Args:
            left:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitV", fit_args=(left,))

    @classmethod
    def fit_rectangle(
        cls,
        left: Optional[float] = None,
        bottom: Optional[float] = None,
        right: Optional[float] = None,
        top: Optional[float] = None,
    ) -> "Fit":
        """
        Display the page designated by page, with its contents magnified
        just enough to fit the rectangle specified by the coordinates
        left, bottom, right, and top entirely within the window
        both horizontally and vertically.

        If the required horizontal and vertical magnification factors are
        different, use the smaller of the two, centering the rectangle within
        the window in the other dimension.

        A null value for any of the parameters may result in unpredictable
        behavior.

        Args:
            left:
            bottom:
            right:
            top:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitR", fit_args=(left, bottom, right, top))

    @classmethod
    def fit_box(cls) -> "Fit":
        """
        Display the page designated by page, with its contents magnified just
        enough to fit its bounding box entirely within the window both
        horizontally and vertically.

        If the required horizontal and vertical magnification factors are
        different, use the smaller of the two, centering the bounding box
        within the window in the other dimension.
        """
        return Fit(fit_type="/FitB")

    @classmethod
    def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit":
        """
        Display the page designated by page, with the vertical coordinate top
        positioned at the top edge of the window and the contents of the page
        magnified just enough to fit the entire width of its bounding box
        within the window.

        A null value for top specifies that the current value of that parameter
        is to be retained unchanged.

        Args:
            top:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitBH", fit_args=(top,))

    @classmethod
    def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit":
        """
        Display the page designated by page, with the horizontal coordinate
        left positioned at the left edge of the window and the contents of the
        page magnified just enough to fit the entire height of its bounding box
        within the window.

        A null value for left specifies that the current value of that
        parameter is to be retained unchanged.

        Args:
            left:

        Returns:
            The created fit object.

        """
        return Fit(fit_type="/FitBV", fit_args=(left,))

    def __str__(self) -> str:
        if not self.fit_args:
            return f"Fit({self.fit_type})"
        return f"Fit({self.fit_type}, {self.fit_args})"


DEFAULT_FIT = Fit.fit()
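A minimal usage sketch for these factory methods; the file names are hypothetical, and it assumes `PdfWriter.add_outline_item` accepts a `fit` argument as in current pypdf:

    from pypdf import PdfReader, PdfWriter
    from pypdf.generic import Fit

    reader = PdfReader("input.pdf")  # hypothetical file
    writer = PdfWriter()
    writer.append(reader)
    # Open page 0 with only the top coordinate pinned; the width is fitted.
    writer.add_outline_item("Introduction", 0, fit=Fit.fit_horizontally(top=792))
    writer.write("output.pdf")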
314 venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py (Normal file)
@@ -0,0 +1,314 @@
# Copyright (c) 2024, pypdf contributors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import logging
from io import BytesIO
from typing import IO

from .._utils import (
    WHITESPACES,
    WHITESPACES_AS_BYTES,
    StreamType,
    logger_warning,
    read_non_whitespace,
)
from ..errors import PdfReadError

logger = logging.getLogger(__name__)

# An inline image should be used only for small images (4096 bytes or less),
# but allow twice this for cases where this has been exceeded.
BUFFER_SIZE = 8192


def _check_end_image_marker(stream: StreamType) -> bool:
    """Peek at the next token and check whether it is an ``EI`` end-of-image
    marker followed by whitespace or EOF; the stream is rewound onto the token.
    """
    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    stream.seek(-3, 1)
    return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES)
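# Illustrative peek behaviour (hypothetical byte stream, run separately; not
# part of the module logic):
#
#   from io import BytesIO
#   s = BytesIO(b"  EI Q")
#   _check_end_image_marker(s)  # -> True: "EI" followed by whitespace
#   s.read(2)                   # -> b"EI": the stream was rewound onto the marker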


def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
    """
    Extract HexEncoded stream from inline image.
    The stream will be moved onto the EI.
    """
    data_out: bytes = b""
    # Read data until delimiter > and EI as backup.
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b">")
        if pos_tok >= 0:  # found >
            data_out += data_buffered[: pos_tok + 1]
            stream.seek(-len(data_buffered) + pos_tok + 1, 1)
            break
        pos_ei = data_buffered.find(b"EI")
        if pos_ei >= 0:  # found EI
            stream.seek(-len(data_buffered) + pos_ei - 1, 1)
            c = stream.read(1)
            while c in WHITESPACES:
                stream.seek(-2, 1)
                c = stream.read(1)
                pos_ei -= 1
            data_out += data_buffered[:pos_ei]
            break
        if len(data_buffered) == 2:
            data_out += data_buffered
            raise PdfReadError("Unexpected end of stream")
        # Neither > nor EI found
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return data_out


def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
    """
    Extract A85 stream from inline image.
    The stream will be moved onto the EI.
    """
    data_out: bytes = b""
    # Read data until delimiter ~>
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b"~>")
        if pos_tok >= 0:  # found!
            data_out += data_buffered[: pos_tok + 2]
            stream.seek(-len(data_buffered) + pos_tok + 2, 1)
            break
        if len(data_buffered) == 2:  # end of buffer
            data_out += data_buffered
            raise PdfReadError("Unexpected end of stream")
        # Hold back the last two chars in case the "~>" delimiter straddles
        # the buffer boundary.
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return data_out


def extract_inline__run_length_decode(stream: StreamType) -> bytes:
    """
    Extract RL (RunLengthDecode) stream from inline image.
    The stream will be moved onto the EI.
    """
    data_out: bytes = b""
    # Read data until delimiter 128
    while True:
        data_buffered = stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b"\x80")
        if pos_tok >= 0:  # found
            # Ideally, we could just use plain run-length decoding here, where 80_16 = 128_10
            # marks the EOD. But there apparently are cases like in issue #3517, where we have
            # an inline image with up to 51 EOD markers. In these cases, be resilient here and
            # use the default `EI` marker detection instead. Please note that this fallback
            # still omits special `EI` handling within the stream, but for now assume that having
            # both of these cases occur at the same time is very unlikely (and the image stream
            # is broken anyway).
            # For now, do not skip over more than one whitespace character.
            after_token = data_buffered[pos_tok + 1 : pos_tok + 4]
            if after_token.startswith(b"EI") or after_token.endswith(b"EI"):
                data_out += data_buffered[: pos_tok + 1]
                stream.seek(-len(data_buffered) + pos_tok + 1, 1)
            else:
                logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
                ei_marker = data_buffered.find(b"EI")
                if ei_marker > 0:
                    data_out += data_buffered[:ei_marker]
                    stream.seek(-len(data_buffered) + ei_marker - 1, 1)
            break
        data_out += data_buffered

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return data_out


def extract_inline__dct_decode(stream: StreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.
    The stream will be moved onto the EI.
    """
    def read(length: int) -> bytes:
        # If 0 bytes are returned, and *size* was not 0, this indicates end of file.
        # If the object is in non-blocking mode and no bytes are available, `None` is returned.
        _result = stream.read(length)
        if _result is None or len(_result) != length:
            raise PdfReadError("Unexpected end of stream")
        return _result

    data_out: bytes = b""
    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    not_first = False
    while True:
        c = read(1)
        if not_first or (c == b"\xff"):
            data_out += c
        if c != b"\xff":
            continue
        not_first = True
        c = read(1)
        data_out += c
        if c == b"\xff":
            stream.seek(-1, 1)  # pragma: no cover
        elif c == b"\x00":  # stuffing
            pass
        elif c == b"\xd9":  # end
            break
        elif c in (
            # Markers followed by a two-byte segment length.
            b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
            b"\xda\xdb\xdc\xdd\xde\xdf"
            b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
        ):
            c = read(2)
            data_out += c
            sz = c[0] * 256 + c[1]
            data_out += read(sz - 2)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return data_out


def extract_inline_default(stream: StreamType) -> bytes:
    """Legacy method, used by default"""
    stream_out = BytesIO()
    # Read the inline image, while checking for EI (End Image) operator.
    while True:
        data_buffered = stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_ei = data_buffered.find(
            b"E"
        )  # We can not look straight for "EI" because it may not have been loaded in the buffer

        if pos_ei == -1:
            stream_out.write(data_buffered)
        else:
            # Write out everything including E (the one from EI to be removed)
            stream_out.write(data_buffered[0 : pos_ei + 1])
            sav_pos_ei = stream_out.tell() - 1
            # Seek back in the stream to read the E next
            stream.seek(pos_ei + 1 - len(data_buffered), 1)
            saved_pos = stream.tell()
            # Check for End Image
            tok2 = stream.read(1)  # I of "EI"
            if tok2 != b"I":
                stream.seek(saved_pos, 0)
                continue
            tok3 = stream.read(1)  # possible space after "EI"
            if tok3 not in WHITESPACES:
                stream.seek(saved_pos, 0)
                continue
            while tok3 in WHITESPACES:
                tok3 = stream.read(1)
            if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in {
                b"Q",
                b"E",
            }:  # for Q or EMC
                stream.seek(saved_pos, 0)
                continue
            if is_followed_by_binary_data(stream):
                # Inline image contains `EI ` sequence usually marking the end of it, but
                # is followed by binary data which does not make sense for the actual end.
                stream.seek(saved_pos, 0)
                continue
            # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient
            # remove E(I) wrongly inserted earlier
            stream.seek(saved_pos - 1, 0)
            stream_out.truncate(sav_pos_ei)
            break

    return stream_out.getvalue()


def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just a heuristic, as the PDF specification is too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java
    """
    position = stream.tell()
    data = stream.read(length)
    stream.seek(position)
    if not data:
        return False
    operator_start = None
    operator_end = None

    for index, byte in enumerate(data):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # This covers all characters not being displayable directly, although omitting whitespace
            # to allow for operator detection.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Interpret all other non-whitespace characters as the start of an operation.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # A whitespace stops an operation.
            # Assume that having an inline image with tons of whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Inline images should not have tons of whitespaces, which would lead to no operator start.
        return False
    if operator_end is None:
        # We probably are inside an operation.
        operator_end = length
    operator_length = operator_end - operator_start
    operator = data[operator_start:operator_end]
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    if operator.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    if operator_length > 3:  # noqa: SIM103
        # Usually, the operators inside a content stream should not have more than three characters,
        # especially after an inline image.
        return True
    return False
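A short sketch of the heuristic in isolation, importing the helper from the module path shown above; the byte strings are made-up page content:

    from io import BytesIO
    from pypdf.generic._image_inline import is_followed_by_binary_data

    # Short operators or numbers read like page content resuming after "EI":
    print(is_followed_by_binary_data(BytesIO(b"Q q 0 0 1 re")))  # False
    # Control bytes outside the whitespace set read like raw image data:
    print(is_followed_by_binary_data(BytesIO(b"\x01\x9c\xfe\x02")))  # True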
118 venv/lib/python3.12/site-packages/pypdf/generic/_link.py (Normal file)
@@ -0,0 +1,118 @@
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


# This module contains code used by _writer.py to track links in pages
# being added to the writer until the links can be resolved.

from typing import TYPE_CHECKING, Optional, Union, cast

from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject

if TYPE_CHECKING:
    from .._page import PageObject
    from .._reader import PdfReader
    from .._writer import PdfWriter


class NamedReferenceLink:
    """Named reference link being preserved until we can resolve it correctly."""

    def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
        """reference: TextStringObject with named reference"""
        self._reference = reference
        self._source_pdf = source_pdf

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        destination = self._source_pdf.named_destinations.get(str(self._reference))
        return destination.page if destination else None

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """target_pdf: PdfWriter which the new link went into"""
        # point named destination in new PDF to the new page
        if str(self._reference) not in target_pdf.named_destinations:
            target_pdf.add_named_destination(str(self._reference), new_page.page_number)


class DirectReferenceLink:
    """Direct reference link being preserved until we can resolve it correctly."""

    def __init__(self, reference: ArrayObject) -> None:
        """reference: an ArrayObject whose first element is the Page indirect object"""
        self._reference = reference

    def find_referenced_page(self) -> IndirectObject:
        return self._reference[0]

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """target_pdf: PdfWriter which the new link went into"""
        self._reference[0] = new_page


ReferenceLink = Union[NamedReferenceLink, DirectReferenceLink]


def extract_links(new_page: "PageObject", old_page: "PageObject") -> list[tuple[ReferenceLink, ReferenceLink]]:
    """Extracts links from two pages on the assumption that the two pages are
    the same. Produces one list of (new link, old link) tuples.
    """
    new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])]
    old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])]

    return [
        (new_link, old_link) for (new_link, old_link)
        in zip(new_links, old_links)
        if new_link and old_link
    ]


def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
    src = cast("PdfReader", page.pdf)
    link = cast(DictionaryObject, indirect_object.get_object())
    if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link":
        return None

    if "/A" in link:
        action = cast(DictionaryObject, link["/A"])
        if action.get("/S") != "/GoTo":
            return None

        if "/D" not in action:
            return None
        return _create_link(action["/D"], src)

    if "/Dest" in link:
        return _create_link(link["/Dest"], src)

    return None  # Nothing to do here


def _create_link(reference: PdfObject, source_pdf: "PdfReader") -> Optional[ReferenceLink]:
    if isinstance(reference, TextStringObject):
        return NamedReferenceLink(reference, source_pdf)
    if isinstance(reference, ArrayObject):
        return DirectReferenceLink(reference)
    return None
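A minimal sketch of the direct-link flow; the `NullObject` page slot and the patched name are placeholders, and `None` is passed for the writer argument purely for illustration since direct links do not use it:

    from pypdf.generic import ArrayObject, NameObject, NullObject
    from pypdf.generic._link import DirectReferenceLink

    # A /Dest array is [page_reference, fit_type, *fit_args].
    dest = ArrayObject([NullObject(), NameObject("/Fit")])
    link = DirectReferenceLink(dest)
    print(link.find_referenced_page())          # the current first element
    link.patch_reference(None, NameObject("/NewPageRef"))  # type: ignore[arg-type]
    print(dest[0])                              # now the patched reference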
33 venv/lib/python3.12/site-packages/pypdf/generic/_outline.py (Normal file)
@@ -0,0 +1,33 @@
from typing import Union

from .._utils import StreamType, deprecation_no_replacement
from ._base import NameObject
from ._data_structures import Destination


class OutlineItem(Destination):
    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(b"<<\n")
        for key in [
            NameObject(x)
            for x in ["/Title", "/Parent", "/First", "/Last", "/Next", "/Prev"]
            if x in self
        ]:
            key.write_to_stream(stream)
            stream.write(b" ")
            value = self.raw_get(key)
            value.write_to_stream(stream)
            stream.write(b"\n")
        key = NameObject("/Dest")
        key.write_to_stream(stream)
        stream.write(b" ")
        value = self.dest_array
        value.write_to_stream(stream)
        stream.write(b"\n")
        stream.write(b">>")
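A sketch of the serialization, assuming `OutlineItem` keeps `Destination`'s `(title, page, fit)` constructor; the exact byte layout may differ between versions:

    from io import BytesIO
    from pypdf.generic import Fit, NumberObject, TextStringObject
    from pypdf.generic._outline import OutlineItem

    item = OutlineItem(TextStringObject("Chapter 1"), NumberObject(0), Fit.fit())
    buffer = BytesIO()
    item.write_to_stream(buffer)
    print(buffer.getvalue())  # roughly: b'<<\n/Title (Chapter 1)\n/Dest [0 /Fit]\n>>'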
132 venv/lib/python3.12/site-packages/pypdf/generic/_rectangle.py (Normal file)
@@ -0,0 +1,132 @@
from typing import Any, Union

from ._base import FloatObject, NumberObject
from ._data_structures import ArrayObject


class RectangleObject(ArrayObject):
    """
    This class is used to represent *page boxes* in pypdf.

    These boxes include:

    * :attr:`artbox <pypdf._page.PageObject.artbox>`
    * :attr:`bleedbox <pypdf._page.PageObject.bleedbox>`
    * :attr:`cropbox <pypdf._page.PageObject.cropbox>`
    * :attr:`mediabox <pypdf._page.PageObject.mediabox>`
    * :attr:`trimbox <pypdf._page.PageObject.trimbox>`
    """

    def __init__(
        self, arr: Union["RectangleObject", tuple[float, float, float, float]]
    ) -> None:
        # must have four points
        assert len(arr) == 4
        # automatically convert arr[x] into NumberObject(arr[x]) if necessary
        ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr])

    def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
        if not isinstance(value, (FloatObject, NumberObject)):
            value = FloatObject(value)
        return value

    def scale(self, sx: float, sy: float) -> "RectangleObject":
        return RectangleObject(
            (
                float(self.left) * sx,
                float(self.bottom) * sy,
                float(self.right) * sx,
                float(self.top) * sy,
            )
        )

    def __repr__(self) -> str:
        return f"RectangleObject({list(self)!r})"

    @property
    def left(self) -> FloatObject:
        return self[0]

    @left.setter
    def left(self, f: float) -> None:
        self[0] = FloatObject(f)

    @property
    def bottom(self) -> FloatObject:
        return self[1]

    @bottom.setter
    def bottom(self, f: float) -> None:
        self[1] = FloatObject(f)

    @property
    def right(self) -> FloatObject:
        return self[2]

    @right.setter
    def right(self, f: float) -> None:
        self[2] = FloatObject(f)

    @property
    def top(self) -> FloatObject:
        return self[3]

    @top.setter
    def top(self, f: float) -> None:
        self[3] = FloatObject(f)

    @property
    def lower_left(self) -> tuple[float, float]:
        """
        Property to read and modify the lower left coordinate of this box
        in (x,y) form.
        """
        return self.left, self.bottom

    @lower_left.setter
    def lower_left(self, value: tuple[float, float]) -> None:
        self[0], self[1] = (self._ensure_is_number(x) for x in value)

    @property
    def lower_right(self) -> tuple[float, float]:
        """
        Property to read and modify the lower right coordinate of this box
        in (x,y) form.
        """
        return self.right, self.bottom

    @lower_right.setter
    def lower_right(self, value: tuple[float, float]) -> None:
        self[2], self[1] = (self._ensure_is_number(x) for x in value)

    @property
    def upper_left(self) -> tuple[float, float]:
        """
        Property to read and modify the upper left coordinate of this box
        in (x,y) form.
        """
        return self.left, self.top

    @upper_left.setter
    def upper_left(self, value: tuple[float, float]) -> None:
        self[0], self[3] = (self._ensure_is_number(x) for x in value)

    @property
    def upper_right(self) -> tuple[float, float]:
        """
        Property to read and modify the upper right coordinate of this box
        in (x,y) form.
        """
        return self.right, self.top

    @upper_right.setter
    def upper_right(self, value: tuple[float, float]) -> None:
        self[2], self[3] = (self._ensure_is_number(x) for x in value)

    @property
    def width(self) -> float:
        return self.right - self.left

    @property
    def height(self) -> float:
        return self.top - self.bottom
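A quick usage sketch; the values are just US Letter dimensions in points:

    from pypdf.generic import RectangleObject

    box = RectangleObject((0, 0, 612, 792))
    print(box.width, box.height)    # 612.0 792.0
    box.upper_right = (306, 396)    # setters coerce plain numbers
    print(box.scale(2, 2))          # scaled back to the full page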
208 venv/lib/python3.12/site-packages/pypdf/generic/_utils.py (Normal file)
@@ -0,0 +1,208 @@
import codecs
from typing import Union

from .._codecs import _pdfdoc_encoding
from .._utils import StreamType, logger_warning, read_non_whitespace
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
from ._base import ByteStringObject, TextStringObject


def hex_to_rgb(value: str) -> tuple[float, float, float]:
    return tuple(int(value.lstrip("#")[i : i + 2], 16) / 255.0 for i in (0, 2, 4))  # type: ignore


def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    stream.read(1)  # Skip the opening "<".
    arr = []
    x = b""
    while True:
        tok = read_non_whitespace(stream)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b">":
            break
        x += tok
        if len(x) == 2:
            arr.append(int(x, base=16))
            x = b""
    if len(x) == 1:
        x += b"0"
    if x != b"":
        arr.append(int(x, base=16))
    return create_string_object(bytes(arr), forced_encoding)
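# Illustrative only (run separately; not part of the module logic): the
# stream is positioned on the opening "<", which the first read(1) consumes;
# whitespace between digits is ignored and an odd trailing digit is padded
# with "0".
#
#   from io import BytesIO
#   read_hex_string_from_stream(BytesIO(b"<48 65 6C 6C 6F>"))  # -> 'Hello'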


__ESCAPE_DICT__ = {
    b"n": ord(b"\n"),
    b"r": ord(b"\r"),
    b"t": ord(b"\t"),
    b"b": ord(b"\b"),
    b"f": ord(b"\f"),
    b"(": ord(b"("),
    b")": ord(b")"),
    b"/": ord(b"/"),
    b"\\": ord(b"\\"),
    b" ": ord(b" "),
    b"%": ord(b"%"),
    b"<": ord(b"<"),
    b">": ord(b">"),
    b"[": ord(b"["),
    b"]": ord(b"]"),
    b"#": ord(b"#"),
    b"_": ord(b"_"),
    b"&": ord(b"&"),
    b"$": ord(b"$"),
}
__BACKSLASH_CODE__ = 92


def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    tok = stream.read(1)  # Skip the opening "(".
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            try:
                txt.append(__ESCAPE_DICT__[tok])
                continue
            except KeyError:
                if b"0" <= tok <= b"7":
                    # "The number ddd may consist of one, two, or three
                    # octal digits; high-order overflow shall be ignored.
                    # Three octal digits shall be used, with leading zeros
                    # as needed, if the next character of the string is also
                    # a digit." (PDF reference 7.3.4.2, p 16)
                    sav = stream.tell() - 1
                    for _ in range(2):
                        ntok = stream.read(1)
                        if b"0" <= ntok <= b"7":
                            tok += ntok
                        else:
                            stream.seek(-1, 1)  # ntok has to be analyzed
                            break
                    i = int(tok, base=8)
                    if i > 255:
                        txt.append(__BACKSLASH_CODE__)
                        stream.seek(sav)
                    else:
                        txt.append(i)
                    continue
                if tok in b"\n\r":
                    # This case is hit when a backslash followed by a line
                    # break occurs. If it's a multi-char EOL, consume the
                    # second character:
                    tok = stream.read(1)
                    if tok not in b"\n\r":
                        stream.seek(-1, 1)
                    # Then don't add anything to the actual string, since this
                    # line break was escaped:
                    continue
                msg = f"Unexpected escaped string: {tok.decode('utf-8', 'ignore')}"
                logger_warning(msg, __name__)
                txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)
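# Illustrative only (run separately; not part of the module logic): escapes
# from __ESCAPE_DICT__ and octal escapes are resolved, and the matching
# closing parenthesis ends the string.
#
#   from io import BytesIO
#   read_string_from_stream(BytesIO(b"(Line\\nbreak \\050x\\051)"))
#   # -> 'Line\nbreak (x)'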


def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Create a ByteStringObject or a TextStringObject from a string to represent the string.

    Args:
        string: The data being used
        forced_encoding: Typically None, or an encoding string

    Returns:
        A TextStringObject if the data could be decoded to text, otherwise a ByteStringObject.

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if isinstance(string, bytes):
        if isinstance(forced_encoding, (list, dict)):
            out = ""
            for x in string:
                try:
                    out += forced_encoding[x]
                except Exception:
                    out += bytes((x,)).decode("charmap")
            obj = TextStringObject(out)
            obj._original_bytes = string
            return obj
        if isinstance(forced_encoding, str):
            if forced_encoding == "bytes":
                return ByteStringObject(string)
            obj = TextStringObject(string.decode(forced_encoding))
            obj._original_bytes = string
            return obj
        try:
            if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
                retval = TextStringObject(string.decode("utf-16"))
                retval._original_bytes = string
                retval.autodetect_utf16 = True
                retval.utf16_bom = string[:2]
                return retval
            if string.startswith(b"\x00"):
                retval = TextStringObject(string.decode("utf-16be"))
                retval._original_bytes = string
                retval.autodetect_utf16 = True
                retval.utf16_bom = codecs.BOM_UTF16_BE
                return retval
            if string[1:2] == b"\x00":
                retval = TextStringObject(string.decode("utf-16le"))
                retval._original_bytes = string
                retval.autodetect_utf16 = True
                retval.utf16_bom = codecs.BOM_UTF16_LE
                return retval

            # This is probably a big performance hit here, but we need
            # to convert string objects into the text/unicode-aware
            # version if possible... and the only way to check if that's
            # possible is to try.
            # Some strings are strings, some are just byte arrays.
            retval = TextStringObject(decode_pdfdocencoding(string))
            retval._original_bytes = string
            retval.autodetect_pdfdocencoding = True
            return retval
        except UnicodeDecodeError:
            return ByteStringObject(string)
    else:
        raise TypeError("create_string_object should have str or unicode arg")


def decode_pdfdocencoding(byte_array: bytes) -> str:
    retval = ""
    for b in byte_array:
        c = _pdfdoc_encoding[b]
        if c == "\u0000":
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytearray(b),
                -1,
                -1,
                "does not exist in translation table",
            )
        retval += c
    return retval
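The autodetection order matters: a UTF-16 BOM wins, then the null-byte heuristics, then PDFDocEncoding, with raw bytes as the last resort. A small sketch:

    from pypdf.generic import create_string_object

    print(create_string_object(b"Hello"))            # PDFDocEncoding -> 'Hello'
    s = create_string_object(b"\xfe\xff\x00H\x00i")  # UTF-16 with BE BOM
    print(s, s.autodetect_utf16)                     # Hi True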
163 venv/lib/python3.12/site-packages/pypdf/generic/_viewerpref.py (Normal file)
@@ -0,0 +1,163 @@
# Copyright (c) 2023, Pubpub-ZZ
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
#   derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from typing import (
    Any,
    Optional,
)

from ._base import BooleanObject, NameObject, NumberObject, is_null_or_none
from ._data_structures import ArrayObject, DictionaryObject

f_obj = BooleanObject(False)


class ViewerPreferences(DictionaryObject):
    def __init__(self, obj: Optional[DictionaryObject] = None) -> None:
        super().__init__(self)
        if not is_null_or_none(obj):
            self.update(obj.items())  # type: ignore
            try:
                self.indirect_reference = obj.indirect_reference  # type: ignore
            except AttributeError:
                pass

    def _get_bool(self, key: str, default: Optional[BooleanObject]) -> Optional[BooleanObject]:
        return self.get(key, default)

    def _set_bool(self, key: str, v: bool) -> None:
        self[NameObject(key)] = BooleanObject(v is True)

    def _get_name(self, key: str, default: Optional[NameObject]) -> Optional[NameObject]:
        return self.get(key, default)

    def _set_name(self, key: str, lst: list[str], v: NameObject) -> None:
        if v[0] != "/":
            raise ValueError(f"{v} does not start with '/'")
        if lst != [] and v not in lst:
            raise ValueError(f"{v} is an unacceptable value")
        self[NameObject(key)] = NameObject(v)

    def _get_arr(self, key: str, default: Optional[list[Any]]) -> Optional[ArrayObject]:
        return self.get(key, None if default is None else ArrayObject(default))

    def _set_arr(self, key: str, v: Optional[ArrayObject]) -> None:
        if v is None:
            try:
                del self[NameObject(key)]
            except KeyError:
                pass
            return
        if not isinstance(v, ArrayObject):
            raise ValueError("ArrayObject is expected")
        self[NameObject(key)] = v

    def _get_int(self, key: str, default: Optional[NumberObject]) -> Optional[NumberObject]:
        return self.get(key, default)

    def _set_int(self, key: str, v: int) -> None:
        self[NameObject(key)] = NumberObject(v)

    @property
    def PRINT_SCALING(self) -> NameObject:
        return NameObject("/PrintScaling")

    def __new__(cls: Any, value: Any = None) -> "ViewerPreferences":
        def _add_prop_bool(key: str, default: Optional[BooleanObject]) -> property:
            return property(
                lambda self: self._get_bool(key, default),
                lambda self, v: self._set_bool(key, v),
                None,
                f"""
                Returns/Modify the status of {key}, Returns {default} if not defined
                """,
            )

        def _add_prop_name(
            key: str, lst: list[str], default: Optional[NameObject]
        ) -> property:
            return property(
                lambda self: self._get_name(key, default),
                lambda self, v: self._set_name(key, lst, v),
                None,
                f"""
                Returns/Modify the status of {key}, Returns {default} if not defined.
                Acceptable values: {lst}
                """,
            )

        def _add_prop_arr(key: str, default: Optional[ArrayObject]) -> property:
            return property(
                lambda self: self._get_arr(key, default),
                lambda self, v: self._set_arr(key, v),
                None,
                f"""
                Returns/Modify the status of {key}, Returns {default} if not defined
                """,
            )

        def _add_prop_int(key: str, default: Optional[int]) -> property:
            return property(
                lambda self: self._get_int(key, default),
                lambda self, v: self._set_int(key, v),
                None,
                f"""
                Returns/Modify the status of {key}, Returns {default} if not defined
                """,
            )

        cls.hide_toolbar = _add_prop_bool("/HideToolbar", f_obj)
        cls.hide_menubar = _add_prop_bool("/HideMenubar", f_obj)
        cls.hide_windowui = _add_prop_bool("/HideWindowUI", f_obj)
        cls.fit_window = _add_prop_bool("/FitWindow", f_obj)
        cls.center_window = _add_prop_bool("/CenterWindow", f_obj)
        cls.display_doctitle = _add_prop_bool("/DisplayDocTitle", f_obj)

        cls.non_fullscreen_pagemode = _add_prop_name(
            "/NonFullScreenPageMode",
            ["/UseNone", "/UseOutlines", "/UseThumbs", "/UseOC"],
            NameObject("/UseNone"),
        )
        cls.direction = _add_prop_name(
            "/Direction", ["/L2R", "/R2L"], NameObject("/L2R")
        )
        cls.view_area = _add_prop_name("/ViewArea", [], None)
        cls.view_clip = _add_prop_name("/ViewClip", [], None)
        cls.print_area = _add_prop_name("/PrintArea", [], None)
        cls.print_clip = _add_prop_name("/PrintClip", [], None)
        cls.print_scaling = _add_prop_name("/PrintScaling", [], None)
        cls.duplex = _add_prop_name(
            "/Duplex", ["/Simplex", "/DuplexFlipShortEdge", "/DuplexFlipLongEdge"], None
        )
        cls.pick_tray_by_pdfsize = _add_prop_bool("/PickTrayByPDFSize", None)
        cls.print_pagerange = _add_prop_arr("/PrintPageRange", None)
        cls.num_copies = _add_prop_int("/NumCopies", None)

        cls.enforce = _add_prop_arr("/Enforce", ArrayObject())

        return DictionaryObject.__new__(cls)
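A usage sketch, assuming `PdfWriter.create_viewer_preferences()` is available as in recent pypdf; name values are validated against the lists wired up in `__new__`:

    from pypdf import PdfWriter

    writer = PdfWriter()
    prefs = writer.create_viewer_preferences()
    prefs.hide_toolbar = True    # stored as /HideToolbar true
    prefs.direction = "/R2L"     # must be one of ["/L2R", "/R2L"]
    print(prefs.get("/HideToolbar"), prefs.get("/Direction"))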
200 venv/lib/python3.12/site-packages/pypdf/pagerange.py (Normal file)
@@ -0,0 +1,200 @@
"""
Representation and utils for ranges of PDF file pages.

Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
All rights reserved. This software is available under a BSD license;
see https://github.com/py-pdf/pypdf/blob/main/LICENSE
"""

import re
from typing import Any, Union

from .errors import ParseError

_INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
PAGE_RANGE_RE = f"^({_INT_RE}|({_INT_RE}?(:{_INT_RE}?(:{_INT_RE}?)?)))$"
# groups:        12        34          5 6          7 8


class PageRange:
    """
    A slice-like representation of a range of page indices.

    That is, page numbers, starting at zero.

    The syntax is like what you would put between brackets [ ].
    The slice is one of the few Python types that can't be subclassed,
    but this class converts to and from slices, and allows similar use.

    - PageRange(str) parses a string representing a page range.
    - PageRange(slice) directly "imports" a slice.
    - to_slice() gives the equivalent slice.
    - str() and repr() allow printing.
    - indices(n) is like slice.indices(n).
    """

    def __init__(self, arg: Union[slice, "PageRange", str]) -> None:
        """
        Initialize with either a slice -- giving the equivalent page range,
        or a PageRange object -- making a copy,
        or a string like
        "int", "[int]:[int]" or "[int]:[int]:[int]",
        where the brackets indicate optional ints.
        Remember, page indices start with zero.
        Page range expression examples:

            :      all pages.                   -1     last page.
            22     just the 23rd page.          :-1    all but the last page.
            0:3    the first three pages.       -2     second-to-last page.
            :3     the first three pages.       -2:    last two pages.
            5:     from the sixth page onward.  -3:-1  third & second to last.

        The third, "stride" or "step" number is also recognized.

            ::2       0 2 4 ... to the end.     3:0:-1    3 2 1 but not 0.
            1:10:2    1 3 5 7 9                 2::-1     2 1 0.
            ::-1      all pages in reverse order.

        Note the difference between this notation and arguments to slice():
        slice(3) means the first three pages;
        PageRange("3") means the range of only the fourth page.
        However PageRange(slice(3)) means the first three pages.
        """
        if isinstance(arg, slice):
            self._slice = arg
            return

        if isinstance(arg, PageRange):
            self._slice = arg.to_slice()
            return

        m = isinstance(arg, str) and re.match(PAGE_RANGE_RE, arg)
        if not m:
            raise ParseError(arg)
        if m.group(2):
            # Special case: just an int means a range of one page.
            start = int(m.group(2))
            stop = start + 1 if start != -1 else None
            self._slice = slice(start, stop)
        else:
            self._slice = slice(*[int(g) if g else None for g in m.group(4, 6, 8)])
|
|
||||||
|
@staticmethod
|
||||||
|
def valid(input: Any) -> bool:
|
||||||
|
"""
|
||||||
|
True if input is a valid initializer for a PageRange.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input: A possible PageRange string or a PageRange object.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True, if the ``input`` is a valid PageRange.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return isinstance(input, (slice, PageRange)) or (
|
||||||
|
isinstance(input, str) and bool(re.match(PAGE_RANGE_RE, input))
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_slice(self) -> slice:
|
||||||
|
"""Return the slice equivalent of this page range."""
|
||||||
|
return self._slice
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
"""A string like "1:2:3"."""
|
||||||
|
s = self._slice
|
||||||
|
indices: Union[tuple[int, int], tuple[int, int, int]]
|
||||||
|
if s.step is None:
|
||||||
|
if s.start is not None and s.stop == s.start + 1:
|
||||||
|
return str(s.start)
|
||||||
|
|
||||||
|
indices = s.start, s.stop
|
||||||
|
else:
|
||||||
|
indices = s.start, s.stop, s.step
|
||||||
|
return ":".join("" if i is None else str(i) for i in indices)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
"""A string like "PageRange('1:2:3')"."""
|
||||||
|
return "PageRange(" + repr(str(self)) + ")"
|
||||||
|
|
||||||
|
def indices(self, n: int) -> tuple[int, int, int]:
|
||||||
|
"""
|
||||||
|
Assuming a sequence of length n, calculate the start and stop indices,
|
||||||
|
and the stride length of the PageRange.
|
||||||
|
|
||||||
|
See help(slice.indices).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
n: the length of the list of pages to choose from.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Arguments for range().
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self._slice.indices(n)
|
||||||
|
|
||||||
|
def __eq__(self, other: object) -> bool:
|
||||||
|
if not isinstance(other, PageRange):
|
||||||
|
return False
|
||||||
|
return self._slice == other._slice
|
||||||
|
|
||||||
|
def __hash__(self) -> int:
|
||||||
|
return hash((self.__class__, (self._slice.start, self._slice.stop, self._slice.step)))
|
||||||
|
|
||||||
|
def __add__(self, other: "PageRange") -> "PageRange":
|
||||||
|
if not isinstance(other, PageRange):
|
||||||
|
raise TypeError(f"Can't add PageRange and {type(other)}")
|
||||||
|
if self._slice.step is not None or other._slice.step is not None:
|
||||||
|
raise ValueError("Can't add PageRange with stride")
|
||||||
|
a = self._slice.start, self._slice.stop
|
||||||
|
b = other._slice.start, other._slice.stop
|
||||||
|
|
||||||
|
if a[0] > b[0]:
|
||||||
|
a, b = b, a
|
||||||
|
|
||||||
|
# Now a[0] is the smallest
|
||||||
|
if b[0] > a[1]:
|
||||||
|
# There is a gap between a and b.
|
||||||
|
raise ValueError("Can't add PageRanges with gap")
|
||||||
|
return PageRange(slice(a[0], max(a[1], b[1])))
|
||||||
|
|
||||||
|
|
||||||
|
PAGE_RANGE_ALL = PageRange(":") # The range of all pages.
|
||||||
|
|
||||||
|
|
||||||
|
def parse_filename_page_ranges(
|
||||||
|
args: list[Union[str, PageRange, None]]
|
||||||
|
) -> list[tuple[str, PageRange]]:
|
||||||
|
"""
|
||||||
|
Given a list of filenames and page ranges, return a list of (filename, page_range) pairs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
args: A list where the first element is a filename. The other elements are
|
||||||
|
filenames, page-range expressions, slice objects, or PageRange objects.
|
||||||
|
A filename not followed by a page range indicates all pages of the file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of (filename, page_range) pairs.
|
||||||
|
|
||||||
|
"""
|
||||||
|
pairs: list[tuple[str, PageRange]] = []
|
||||||
|
pdf_filename: Union[str, None] = None
|
||||||
|
did_page_range = False
|
||||||
|
for arg in [*args, None]:
|
||||||
|
if PageRange.valid(arg):
|
||||||
|
if not pdf_filename:
|
||||||
|
raise ValueError(
|
||||||
|
"The first argument must be a filename, not a page range."
|
||||||
|
)
|
||||||
|
|
||||||
|
assert arg is not None
|
||||||
|
pairs.append((pdf_filename, PageRange(arg)))
|
||||||
|
did_page_range = True
|
||||||
|
else:
|
||||||
|
# New filename or end of list - use the complete previous file?
|
||||||
|
if pdf_filename and not did_page_range:
|
||||||
|
pairs.append((pdf_filename, PAGE_RANGE_ALL))
|
||||||
|
|
||||||
|
assert not isinstance(arg, PageRange), arg
|
||||||
|
pdf_filename = arg
|
||||||
|
did_page_range = False
|
||||||
|
return pairs
|
||||||
|
|
||||||
|
|
||||||
|
PageRangeSpec = Union[str, PageRange, tuple[int, int], tuple[int, int, int], list[int]]
|
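
The slice notation documented above can be sanity-checked directly; a short editor's sketch (the file names are hypothetical):

from pypdf import PageRange
from pypdf.pagerange import parse_filename_page_ranges

pr = PageRange("1:10:2")               # zero-based pages 1, 3, 5, 7, 9
print(pr.to_slice())                   # slice(1, 10, 2)
print(list(range(*pr.indices(6))))     # clamped to a 6-page document: [1, 3, 5]

# Pair CLI-style arguments with ranges; a bare filename means "all pages".
print(parse_filename_page_ranges(["a.pdf", "b.pdf", "0:3"]))
# [('a.pdf', PageRange(':')), ('b.pdf', PageRange('0:3'))]
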
venv/lib/python3.12/site-packages/pypdf/papersizes.py (new file, 52 lines)
@@ -0,0 +1,52 @@
"""Helper to get paper sizes."""
|
||||||
|
|
||||||
|
from typing import NamedTuple
|
||||||
|
|
||||||
|
|
||||||
|
class Dimensions(NamedTuple):
|
||||||
|
width: int
|
||||||
|
height: int
|
||||||
|
|
||||||
|
|
||||||
|
class PaperSize:
|
||||||
|
"""(width, height) of the paper in portrait mode in pixels at 72 ppi."""
|
||||||
|
|
||||||
|
# Notes of how to calculate it:
|
||||||
|
# 1. Get the size of the paper in millimeters
|
||||||
|
# 2. Convert it to inches (25.4 millimeters is equal to 1 inch)
|
||||||
|
# 3. Convert it to pixels at 72dpi (1 inch is equal to 72 pixels)
|
||||||
|
|
||||||
|
# All Din-A paper sizes follow this pattern:
|
||||||
|
# 2 x A(n - 1) = A(n)
|
||||||
|
# So the height of the next bigger one is the width of the smaller one
|
||||||
|
# The ratio is always approximately 1:2**0.5
|
||||||
|
# Additionally, A0 is defined to have an area of 1 m**2
|
||||||
|
# https://en.wikipedia.org/wiki/ISO_216
|
||||||
|
# Be aware of rounding issues!
|
||||||
|
A0 = Dimensions(2384, 3370) # 841mm x 1189mm
|
||||||
|
A1 = Dimensions(1684, 2384)
|
||||||
|
A2 = Dimensions(1191, 1684)
|
||||||
|
A3 = Dimensions(842, 1191)
|
||||||
|
A4 = Dimensions(
|
||||||
|
595, 842
|
||||||
|
) # Printer paper, documents - this is by far the most common
|
||||||
|
A5 = Dimensions(420, 595) # Paperback books
|
||||||
|
A6 = Dimensions(298, 420) # Postcards
|
||||||
|
A7 = Dimensions(210, 298)
|
||||||
|
A8 = Dimensions(147, 210)
|
||||||
|
|
||||||
|
# Envelopes
|
||||||
|
C4 = Dimensions(649, 918)
|
||||||
|
|
||||||
|
|
||||||
|
_din_a = (
|
||||||
|
PaperSize.A0,
|
||||||
|
PaperSize.A1,
|
||||||
|
PaperSize.A2,
|
||||||
|
PaperSize.A3,
|
||||||
|
PaperSize.A4,
|
||||||
|
PaperSize.A5,
|
||||||
|
PaperSize.A6,
|
||||||
|
PaperSize.A7,
|
||||||
|
PaperSize.A8,
|
||||||
|
)
|
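
The conversion described in the comments is straightforward to reproduce; a quick editor's sketch confirming the A4 entry:

def mm_to_points(mm: float) -> int:
    # 25.4 mm per inch, 72 pixels per inch at 72 ppi
    return round(mm / 25.4 * 72)

print(mm_to_points(210), mm_to_points(297))  # 595 842, matching PaperSize.A4
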
venv/lib/python3.12/site-packages/pypdf/py.typed (new file, empty)

venv/lib/python3.12/site-packages/pypdf/types.py (new file, 80 lines)
@@ -0,0 +1,80 @@
"""Helpers for working with PDF types."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from typing import Literal, Union
|
||||||
|
|
||||||
|
if sys.version_info[:2] >= (3, 10):
|
||||||
|
# Python 3.10+: https://www.python.org/dev/peps/pep-0484
|
||||||
|
from typing import TypeAlias
|
||||||
|
else:
|
||||||
|
from typing_extensions import TypeAlias
|
||||||
|
|
||||||
|
from .generic._base import NameObject, NullObject, NumberObject
|
||||||
|
from .generic._data_structures import ArrayObject, Destination
|
||||||
|
from .generic._outline import OutlineItem
|
||||||
|
|
||||||
|
BorderArrayType: TypeAlias = list[Union[NameObject, NumberObject, ArrayObject]]
|
||||||
|
|
||||||
|
OutlineItemType: TypeAlias = Union[OutlineItem, Destination]
|
||||||
|
|
||||||
|
FitType: TypeAlias = Literal[
|
||||||
|
"/XYZ", "/Fit", "/FitH", "/FitV", "/FitR", "/FitB", "/FitBH", "/FitBV"
|
||||||
|
]
|
||||||
|
# These go with the FitType, they specify values for the fit
|
||||||
|
ZoomArgType: TypeAlias = Union[NumberObject, NullObject, float]
|
||||||
|
ZoomArgsType: TypeAlias = list[ZoomArgType]
|
||||||
|
|
||||||
|
# Recursive types like the following are not yet supported by Sphinx:
|
||||||
|
# OutlineType = List[Union[Destination, "OutlineType"]]
|
||||||
|
# Hence use this for the moment:
|
||||||
|
OutlineType = list[Union[Destination, list[Union[Destination, list[Destination]]]]]
|
||||||
|
|
||||||
|
LayoutType: TypeAlias = Literal[
|
||||||
|
"/NoLayout",
|
||||||
|
"/SinglePage",
|
||||||
|
"/OneColumn",
|
||||||
|
"/TwoColumnLeft",
|
||||||
|
"/TwoColumnRight",
|
||||||
|
"/TwoPageLeft",
|
||||||
|
"/TwoPageRight",
|
||||||
|
]
|
||||||
|
|
||||||
|
PagemodeType: TypeAlias = Literal[
|
||||||
|
"/UseNone",
|
||||||
|
"/UseOutlines",
|
||||||
|
"/UseThumbs",
|
||||||
|
"/FullScreen",
|
||||||
|
"/UseOC",
|
||||||
|
"/UseAttachments",
|
||||||
|
]
|
||||||
|
|
||||||
|
AnnotationSubtype: TypeAlias = Literal[
|
||||||
|
"/Text",
|
||||||
|
"/Link",
|
||||||
|
"/FreeText",
|
||||||
|
"/Line",
|
||||||
|
"/Square",
|
||||||
|
"/Circle",
|
||||||
|
"/Polygon",
|
||||||
|
"/PolyLine",
|
||||||
|
"/Highlight",
|
||||||
|
"/Underline",
|
||||||
|
"/Squiggly",
|
||||||
|
"/StrikeOut",
|
||||||
|
"/Caret",
|
||||||
|
"/Stamp",
|
||||||
|
"/Ink",
|
||||||
|
"/Popup",
|
||||||
|
"/FileAttachment",
|
||||||
|
"/Sound",
|
||||||
|
"/Movie",
|
||||||
|
"/Screen",
|
||||||
|
"/Widget",
|
||||||
|
"/PrinterMark",
|
||||||
|
"/TrapNet",
|
||||||
|
"/Watermark",
|
||||||
|
"/3D",
|
||||||
|
"/Redact",
|
||||||
|
"/Projection",
|
||||||
|
"/RichMedia",
|
||||||
|
]
|
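
These aliases exist so user-facing signatures can constrain their string arguments. A hypothetical helper (editor's sketch, not part of pypdf) showing the intended use:

from pypdf.types import PagemodeType


def describe_page_mode(mode: PagemodeType) -> str:
    # Literal[...] lets static type checkers reject values outside the allowed set.
    return f"viewer opens with {mode.lstrip('/')}"


print(describe_page_mode("/UseOutlines"))  # viewer opens with UseOutlines
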
venv/lib/python3.12/site-packages/pypdf/xmp.py (new file, 748 lines)
@@ -0,0 +1,748 @@
"""
|
||||||
|
Anything related to Extensible Metadata Platform (XMP) metadata.
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
|
||||||
|
"""
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import decimal
|
||||||
|
import re
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from typing import (
|
||||||
|
Any,
|
||||||
|
Callable,
|
||||||
|
Optional,
|
||||||
|
TypeVar,
|
||||||
|
Union,
|
||||||
|
)
|
||||||
|
from xml.dom.minidom import Document, parseString
|
||||||
|
from xml.dom.minidom import Element as XmlElement
|
||||||
|
from xml.parsers.expat import ExpatError
|
||||||
|
|
||||||
|
from ._protocols import XmpInformationProtocol
|
||||||
|
from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement
|
||||||
|
from .errors import PdfReadError, XmpDocumentError
|
||||||
|
from .generic import ContentStream, PdfObject
|
||||||
|
|
||||||
|
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||||
|
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
|
||||||
|
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
|
||||||
|
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
|
||||||
|
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
|
||||||
|
|
||||||
|
# What is the PDFX namespace, you might ask?
|
||||||
|
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
|
||||||
|
# This namespace is used to place "custom metadata"
|
||||||
|
# properties, which are arbitrary metadata properties with no semantic or
|
||||||
|
# documented meaning.
|
||||||
|
#
|
||||||
|
# Elements in the namespace are key/value-style storage,
|
||||||
|
# where the element name is the key and the content is the value. The keys
|
||||||
|
# are transformed into valid XML identifiers by substituting an invalid
|
||||||
|
# identifier character with \u2182 followed by the unicode hex ID of the
|
||||||
|
# original character. A key like "my car" is therefore "my\u21820020car".
|
||||||
|
#
|
||||||
|
# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND}
|
||||||
|
#
|
||||||
|
# The pdfx namespace should be avoided.
|
||||||
|
# A custom data schema and sensical XML elements could be used instead, as is
|
||||||
|
# suggested by Adobe's own documentation on XMP under "Extensibility of
|
||||||
|
# Schemas".
|
||||||
|
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
|
||||||
|
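
# Editor's illustration of the escaping rule above (not part of pypdf):
# a space is U+0020 and is not a valid XML name character, so
#   "my car"  ->  "my" + "\u2182" + "0020" + "car"  ==  "my\u21820020car"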

# PDF/A
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Internal mapping of namespace URI → prefix
_NAMESPACE_PREFIX_MAP = {
    DC_NAMESPACE: "dc",
    XMP_NAMESPACE: "xmp",
    PDF_NAMESPACE: "pdf",
    XMPMM_NAMESPACE: "xmpMM",
    PDFAID_NAMESPACE: "pdfaid",
    PDFX_NAMESPACE: "pdfx",
}

iso8601 = re.compile(
    """
    (?P<year>[0-9]{4})
    (-
     (?P<month>[0-9]{2})
     (-
      (?P<day>[0-9]+)
      (T
       (?P<hour>[0-9]{2}):
       (?P<minute>[0-9]{2})
       (:(?P<second>[0-9]{2}(.[0-9]+)?))?
       (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
      )?
     )?
    )?
    """,
    re.VERBOSE,
)


K = TypeVar("K")

# Minimal XMP template
_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
  <rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
    <rdf:Description rdf:about=""
        xmlns:dc="{DC_NAMESPACE}"
        xmlns:xmp="{XMP_NAMESPACE}"
        xmlns:pdf="{PDF_NAMESPACE}"
        xmlns:xmpMM="{XMPMM_NAMESPACE}"
        xmlns:pdfaid="{PDFAID_NAMESPACE}"
        xmlns:pdfx="{PDFX_NAMESPACE}">
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>
<?xpacket end="r"?>"""


def _identity(value: K) -> K:
    return value


def _converter_date(value: str) -> datetime.datetime:
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")
    second = decimal.Decimal(matches.group("second") or "0")
    seconds_dec = second.to_integral(decimal.ROUND_FLOOR)
    # Despite the name, this feeds datetime's "microsecond" argument:
    # the fractional part of a second scaled by 1_000_000.
    milliseconds_dec = (second - seconds_dec) * 1_000_000

    seconds = int(seconds_dec)
    milliseconds = int(milliseconds_dec)

    tzd = matches.group("tzd") or "Z"
    dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
    if tzd != "Z":
        tzd_hours, tzd_minutes = (int(x) for x in tzd.split(":"))
        tzd_hours *= -1
        if tzd_hours < 0:
            tzd_minutes *= -1
        dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
    return dt
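
# Editor's illustration (not part of pypdf): fractional seconds become the
# datetime's microseconds and the timezone offset is folded into UTC, e.g.
#   _converter_date("2024-01-02T03:04:05.5+02:00")
#   -> datetime.datetime(2024, 1, 2, 1, 4, 5, 500000)
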
def _format_datetime_utc(value: datetime.datetime) -> str:
    """Format a datetime as UTC with trailing 'Z'.

    - If the input is timezone-aware, convert to UTC first.
    - If naive, assume UTC.
    """
    if value.tzinfo is not None and value.utcoffset() is not None:
        value = value.astimezone(datetime.timezone.utc)

    value = value.replace(tzinfo=None)
    return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ")


def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    retval: list[Any] = []
    if len(containers):
        for container in containers:
            for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                value = self._get_text(item)
                value = converter(value)
                retval.append(value)
        return retval
    return None
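
# Editor's illustration (not part of pypdf): aware datetimes are normalized
# to UTC before serialization, e.g.
#   tz = datetime.timezone(datetime.timedelta(hours=2))
#   _format_datetime_utc(datetime.datetime(2024, 1, 2, 12, 0, tzinfo=tz))
#   -> "2024-01-02T10:00:00.000000Z"
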
class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.

    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
        PdfReadError: if XML is invalid

    """

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}")
        self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
            RDF_NAMESPACE, "RDF"
        )[0]
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.

        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        text = ""
        for child in element.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text += child.data
        return text

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        value = None
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break
        if value is not None:
            value = converter(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if (bags := _generic_get(element, self, list_type="Bag")) is not None:
                retval.extend(bags)
            else:
                value = self._get_text(element)
                retval.append(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                retval.extend(bags)
            else:
                value = converter(self._get_text(element))
                retval.append(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if len(alts):
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        value = self._get_text(item)
                        retval[item.getAttribute("xml:lang")] = value
            else:
                retval["x-default"] = self._get_text(element)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
        else:
            date_strings = []
            for value in values:
                if isinstance(value, datetime.datetime):
                    date_strings.append(_format_datetime_utc(value))
                else:
                    date_strings.append(str(value))
            self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        if value:
            date_str = _format_datetime_utc(value)
            self._set_single_value(XMP_NAMESPACE, "CreateDate", date_str)
        else:
            self._set_single_value(XMP_NAMESPACE, "CreateDate", None)

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        if value:
            date_str = _format_datetime_utc(value)
            self._set_single_value(XMP_NAMESPACE, "ModifyDate", date_str)
        else:
            self._set_single_value(XMP_NAMESPACE, "ModifyDate", None)

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        if value:
            date_str = _format_datetime_utc(value)
            self._set_single_value(XMP_NAMESPACE, "MetadataDate", date_str)
        else:
            self._set_single_value(XMP_NAMESPACE, "MetadataDate", None)

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find("\u2182")
                    if idx == -1:
                        break
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                self._custom_properties[key] = value
        return self._custom_properties
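
    # Editor's illustration (not pypdf code): the loop above reverses the
    # escaping documented next to PDFX_NAMESPACE, e.g.
    #   "my\u21820020car" -> "my" + chr(0x0020) + "car" == "my car"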

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                return desc

        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        ns_cache = self.cache.get(namespace)
        if ns_cache and name in ns_cache:
            del ns_cache[name]

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if existing_attr := desc.getAttributeNodeNS(namespace, name):
            desc.removeAttributeNode(existing_attr)

        if value is not None:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            text_node = doc.createTextNode(str(value))
            elem.appendChild(text_node)
            desc.appendChild(elem)

        self._update_stream()

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if values:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            bag = doc.createElementNS(RDF_NAMESPACE, "rdf:Bag")

            for value in values:
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                text_node = doc.createTextNode(str(value))
                li.appendChild(text_node)
                bag.appendChild(li)

            elem.appendChild(bag)
            desc.appendChild(elem)

        self._update_stream()

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if values:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            seq = doc.createElementNS(RDF_NAMESPACE, "rdf:Seq")

            for value in values:
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                text_node = doc.createTextNode(str(value))
                li.appendChild(text_node)
                seq.appendChild(li)

            elem.appendChild(seq)
            desc.appendChild(elem)

        self._update_stream()

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        existing_elements = list(desc.getElementsByTagNameNS(namespace, name))
        for elem in existing_elements:
            desc.removeChild(elem)

        if values:
            doc = self.rdf_root.ownerDocument
            if doc is None:
                raise XmpDocumentError("XMP Document is None")
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            alt = doc.createElementNS(RDF_NAMESPACE, "rdf:Alt")

            for lang, value in values.items():
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                li.setAttribute("xml:lang", lang)
                text_node = doc.createTextNode(str(value))
                li.appendChild(text_node)
                alt.appendChild(li)

            elem.appendChild(alt)
            desc.appendChild(elem)

        self._update_stream()

    def _get_namespace_prefix(self, namespace: str) -> str:
        """Get the appropriate namespace prefix for a given namespace URI."""
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")

        xml_data = doc.toxml(encoding="utf-8")
        self.stream.set_data(xml_data)
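
Taken together, the generated getters and setters round-trip through `_update_stream`. A minimal end-to-end sketch, assuming a writable `PdfWriter.xmp_metadata` as the deprecation notice above implies (the output file name is hypothetical):

from pypdf import PdfWriter
from pypdf.xmp import XmpInformation

writer = PdfWriter()
writer.add_blank_page(width=595, height=842)

xmp = XmpInformation.create()           # minimal packet built from _MINIMAL_XMP
xmp.dc_title = {"x-default": "Demo"}    # serialized as an rdf:Alt language array
xmp.dc_creator = ["Alice", "Bob"]       # serialized as an ordered rdf:Seq
writer.xmp_metadata = xmp

with open("demo.pdf", "wb") as fh:
    writer.write(fh)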