Update ashboard, dashboard, memory +1 more (+2 ~3)
This commit is contained in:
338
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
338
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
@@ -0,0 +1,338 @@
|
||||
import binascii
|
||||
from binascii import Error as BinasciiError
|
||||
from binascii import unhexlify
|
||||
from math import ceil
|
||||
from typing import Any, Union, cast
|
||||
|
||||
from ._codecs import adobe_glyphs, charset_encoding
|
||||
from ._utils import logger_error, logger_warning
|
||||
from .generic import (
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
NullObject,
|
||||
StreamObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
# Maps predefined CMap names (as referenced by a font's /Encoding entry)
# to the Python codec used to decode strings of fonts that declare them.
# NOTE(review): the codec choices presumably follow the Adobe predefined
# CMap definitions — confirm against the CMap/CIDFont specification.
_predefined_cmap: dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",
    "/GB-EUC-V": "gbk",
    "/GBpc-EUC-H": "gb2312",
    "/GBpc-EUC-V": "gb2312",
    "/GBK-EUC-H": "gbk",
    "/GBK-EUC-V": "gbk",
    "/GBK2K-H": "gb18030",
    "/GBK2K-V": "gb18030",
    "/ETen-B5-H": "cp950",
    "/ETen-B5-V": "cp950",
    "/ETenms-B5-H": "cp950",
    "/ETenms-B5-V": "cp950",
    "/UniCNS-UTF16-H": "utf-16-be",
    "/UniCNS-UTF16-V": "utf-16-be",
    "/UniGB-UTF16-H": "gb18030",
    "/UniGB-UTF16-V": "gb18030",
    # UCS2 in code
}
|
||||
|
||||
|
||||
def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Compute the character encoding and the ToUnicode translation of a font.

    Args:
        ft: the font dictionary to analyze.

    Returns:
        A tuple ``(encoding, map_dict)``: ``encoding`` is either a codec name
        or a code-point -> character dict; ``map_dict`` translates source
        codes to their Unicode strings.
    """
    enc_result = _parse_encoding(ft)
    to_unicode, cmap_codes = _parse_to_unicode(ft)

    # PDF ref 1.7 §5.9.1, 1st bullet: where the cmap is non-empty, the
    # encoding must be discarded for those characters — here replaced by the
    # identity so the cmap translation takes precedence.  When the encoding
    # is a plain string it is already treated as an identity translation.
    if isinstance(enc_result, dict):
        for code in cmap_codes:
            if code <= 255:
                enc_result[code] = chr(code)

    return enc_result, to_unicode
|
||||
|
||||
|
||||
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Determine the character encoding declared by a font dictionary.

    Args:
        ft: the font dictionary.

    Returns:
        Either a codec/charmap name (``str``) or a dict mapping the byte
        values 0-255 to their unicode characters.
    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: use the table registered for the base font
        # when one exists, otherwise let the caller decode with "charmap".
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            elif "-UCS2-" in enc:
                encoding = "utf-16-be"
            else:
                # unknown named encoding: report and fall back to the raw
                # name via the except branch below
                raise Exception("not found")
        except Exception:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            logger_error(
                f"Advanced encoding {encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    # /Differences overrides individual slots of the base encoding: an int
    # item sets the current code, each name fills consecutive codes.
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        # glyph name -> character via the Adobe glyph list
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # unknown glyph name: keep the raw name as placeholder
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding
|
||||
|
||||
|
||||
def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Extract the /ToUnicode translation of a font.

    Args:
        ft: the font dictionary.

    Returns:
        A tuple ``(map_dict, int_entry)``: ``map_dict`` holds all translation
        codes (``map_dict[-1]`` stores the number of bytes to convert per
        source code), ``int_entry`` lists the cmap keys as ints so the caller
        can correct the encoding.
    """
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        # A Type1 font may still reveal its encoding in the font program.
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []

    in_range = False
    in_char = False
    # (current_char, remaining size) when a bfrange spans several lines;
    # cf #1285 for an example of such a file
    pending: Union[None, tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_range, in_char, pending = process_cm_line(
            raw_line.strip(b" \t"),
            in_range,
            in_char,
            pending,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry
|
||||
|
||||
|
||||
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode cmap content of *ft* as normalized bytes.

    The raw stream is rewritten so that each bfchar/bfrange keyword sits on
    its own line and every ``<...>`` hex string becomes a plain
    space-delimited token, which lets the caller parse it line by line.

    Args:
        ft: font dictionary holding a ``/ToUnicode`` entry.

    Returns:
        The normalized cmap bytes.
    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    # Convert each "<hex>" group into a bare token followed by a space.
    ll = cm.split(b"<")
    for i in range(len(ll)):
        j = ll[i].find(b">")
        if j >= 0:
            if j == 0:
                # string is empty: stash a placeholder here (see below)
                # see https://github.com/py-pdf/pypdf/issues/1111
                content = b"."
            else:
                # drop spaces inside the hex string itself
                content = ll[i][:j].replace(b" ", b"")
            ll[i] = content + b" " + ll[i][j + 1 :]
    cm = (
        (b" ".join(ll))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
    return cm
|
||||
|
||||
|
||||
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Dispatch one prepared cmap line and update the parsing state.

    Args:
        line: the stripped line to process.
        process_rg: True while inside a ``bfrange`` section.
        process_char: True while inside a ``bfchar`` section.
        multiline_rg: pending (current_char, end) of a range spanning lines.
        map_dict: translation map, filled in place.
        int_entry: list of mapped codes, filled in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)`` state.
    """
    # Skip blank lines and PostScript comments (b"%" is byte 37).
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            # A malformed hex run must not abort the whole cmap.
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
|
||||
|
||||
|
||||
def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one bfrange line, filling ``map_dict`` and ``int_entry`` in place.

    Args:
        line: tokenized line, either ``<lo> <hi> <dst>`` or
            ``<lo> <hi> [ <dst> ... ]`` (the list may span several lines).
        map_dict: translation map; ``map_dict[-1]`` holds bytes per code.
        int_entry: list of mapped source codes.
        multiline_rg: ``(current_char, end)`` carried over when a previous
            list-form range was not closed; cf #1285 for an example of file.

    Returns:
        ``None`` when the range is complete, else the ``(current_char, end)``
        state to resume on the next line.
    """
    tokens = [token for token in line.split(b" ") if token]
    closed = False

    def _src_key(code: int, fmt: bytes) -> str:
        # Render the source code point exactly as wide as the cmap declares:
        # one byte -> charmap, wider -> UTF-16BE.
        return unhexlify(fmt % code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )

    if multiline_rg is None:
        a = int(tokens[0], 16)
        b = int(tokens[1], 16)
        # Width of the widest bound fixes the bytes-per-code of this cmap.
        map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if tokens[2] == b"[":
            # List form: one destination per source code.
            for sq in tokens[3:]:
                if sq == b"]":
                    closed = True
                    break
                map_dict[_src_key(a, fmt)] = unhexlify(sq).decode(
                    "utf-16-be", "surrogatepass"
                )
                int_entry.append(a)
                a += 1
        else:
            # Contiguous form: destinations increment along with sources.
            c = int(tokens[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(tokens[2]))
            closed = True
            while a <= b:
                map_dict[_src_key(a, fmt)] = unhexlify(fmt2 % c).decode(
                    "utf-16-be", "surrogatepass"
                )
                int_entry.append(a)
                a += 1
                c += 1
    else:
        # Continuation of a list form started on a previous line.
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        a, b = multiline_rg  # a, b not in the current line
        for sq in tokens:
            if sq == b"]":
                closed = True
                break
            map_dict[_src_key(a, fmt)] = unhexlify(sq).decode(
                "utf-16-be", "surrogatepass"
            )
            int_entry.append(a)
            a += 1

    return None if closed else (a, b)
|
||||
|
||||
|
||||
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one bfchar line of alternating ``<src> <dst>`` hex pairs.

    Args:
        line: tokenized line to parse.
        map_dict: translation map, filled in place; ``map_dict[-1]`` is set
            to the number of bytes per source code.
        int_entry: list of mapped source codes, filled in place.
    """
    tokens = [token for token in line.split(b" ") if token]
    map_dict[-1] = len(tokens[0]) // 2
    # Source codes decode as charmap when single-byte, else as UTF-16BE.
    src_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"
    while len(tokens) > 1:
        src, dst = tokens[0], tokens[1]
        mapped = ""
        # b"." is the placeholder inserted for an empty hex string (see
        # prepare_cm) and means "maps to the empty string".
        if dst != b".":
            try:
                mapped = unhexlify(dst).decode(
                    "charmap" if len(dst) < 4 else "utf-16-be", "surrogatepass"
                )  # join is here as some cases where the code was split
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({dst!r})", __name__)
        map_dict[unhexlify(src).decode(src_codec, "surrogatepass")] = mapped
        int_entry.append(int(src, 16))
        tokens = tokens[2:]
|
||||
|
||||
|
||||
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Recover a character map from an embedded Type1 font program.

    Used when a Type1 font has no /ToUnicode entry: the clear-text part of
    its /FontFile is scanned for ``dup <code> <glyphname> put`` entries of
    the font's /Encoding array.

    Args:
        ft: the font dictionary.
        map_dict: translation map, filled in place.
        int_entry: list of mapped codes, filled in place.

    Returns:
        The ``(map_dict, int_entry)`` pair, possibly unchanged when no
        usable font file is present.
    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    txt = txt.split(b"/Encoding")[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if li.startswith(b"dup"):
            words = [_w for _w in li.split(b" ") if _w != b""]
            # expected shape: dup <code> <glyphname> put
            if len(words) > 3 and words[3] != b"put":
                continue
            try:
                i = int(words[1])
            except ValueError:  # pragma: no cover
                continue
            try:
                # glyph name -> unicode via the Adobe glyph list
                v = adobe_glyphs[words[2].decode()]
            except KeyError:
                if words[2].startswith(b"/uni"):
                    # /uniXXXX names carry the code point directly
                    try:
                        v = chr(int(words[2][4:], 16))
                    except ValueError:  # pragma: no cover
                        continue
                else:
                    continue
            map_dict[chr(i)] = v
            int_entry.append(i)
    return map_dict, int_entry
|
||||
Reference in New Issue
Block a user