diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 9f413660b..c1a5b927d 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -3,6 +3,7 @@ from abc import abstractmethod from pathlib import Path from typing import IO, Any, Dict, List, Optional, Tuple, Union +from weakref import WeakKeyDictionary try: # Python 3.8+: https://peps.python.org/pep-0586 @@ -78,7 +79,7 @@ def trailer(self) -> Dict[str, Any]: class PdfWriterProtocol(PdfCommonDocProtocol, Protocol): _objects: List[Any] - _id_translated: Dict[int, Dict[int, int]] + _id_translated: "WeakKeyDictionary[PdfReaderProtocol, Dict[int, int]]" @abstractmethod def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index c7569e31e..6a2659865 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -52,7 +52,9 @@ Union, cast, ) +from weakref import WeakKeyDictionary +from ._protocols import PdfReaderProtocol from ._cmap import build_char_map_from_dict from ._doc_common import PdfDocCommon from ._encryption import EncryptAlgorithm, Encryption @@ -178,7 +180,7 @@ def __init__( self._idnum_hash: Dict[bytes, IndirectObject] = {} """Maps hash values of indirect objects to their IndirectObject instances.""" - self._id_translated: Dict[int, Dict[int, int]] = {} + self._id_translated: "WeakKeyDictionary[PdfReaderProtocol, dict[int, int]]" = WeakKeyDictionary() # The root of our page tree node. pages = DictionaryObject() @@ -389,7 +391,7 @@ def _add_page( # page, we need to create a new dictionary for the page, however the # objects below (including content) are not duplicated: try: # delete an already existing page - del self._id_translated[id(page_org.indirect_reference.pdf)][ # type: ignore + del self._id_translated[page_org.indirect_reference.pdf][ # type: ignore page_org.indirect_reference.idnum # type: ignore ] except Exception: @@ -2451,7 +2453,7 @@ def merge( ArrayObject, cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"], ) - trslat = self._id_translated[id(reader)] + trslat = self._id_translated[reader] try: for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore try: @@ -2564,7 +2566,7 @@ def add_filtered_articles( else: thr = thr.get_object() if thr.indirect_reference.idnum not in self._id_translated[ - id(reader) + reader ] and fltr.search((thr["/I"] if "/I" in thr else {}).get("/Title", "")): self._add_articles_thread(thr, pages, reader) @@ -2780,15 +2782,15 @@ def reset_translation( if set to None or omitted, all tables will be reset. """ if reader is None: - self._id_translated = {} + self._id_translated = WeakKeyDictionary() elif isinstance(reader, PdfReader): try: - del self._id_translated[id(reader)] + del self._id_translated[reader] except Exception: pass elif isinstance(reader, IndirectObject): try: - del self._id_translated[id(reader.pdf)] + del self._id_translated[reader.pdf] except Exception: pass else: diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 3fde174b0..52de94643 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -124,19 +124,18 @@ def _reference_clone( return clone i = len(pdf_dest._objects) + 1 if ind is not None: - if id(ind.pdf) not in pdf_dest._id_translated: - pdf_dest._id_translated[id(ind.pdf)] = {} - pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf # type: ignore + if ind.pdf not in pdf_dest._id_translated: + pdf_dest._id_translated[ind.pdf] = {} if ( not force_duplicate - and ind.idnum in pdf_dest._id_translated[id(ind.pdf)] + and ind.idnum in pdf_dest._id_translated[ind.pdf] ): obj = pdf_dest.get_object( - pdf_dest._id_translated[id(ind.pdf)][ind.idnum] + pdf_dest._id_translated[ind.pdf][ind.idnum] ) assert obj is not None return obj - pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i + pdf_dest._id_translated[ind.pdf][ind.idnum] = i pdf_dest._objects.append(clone) clone.indirect_reference = IndirectObject(i, 0, pdf_dest) return clone @@ -250,11 +249,11 @@ def clone( if self.pdf == pdf_dest and not force_duplicate: # Already duplicated and no extra duplication required return self - if id(self.pdf) not in pdf_dest._id_translated: - pdf_dest._id_translated[id(self.pdf)] = {} + if self.pdf not in pdf_dest._id_translated: + pdf_dest._id_translated[self.pdf] = {} - if self.idnum in pdf_dest._id_translated[id(self.pdf)]: - dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum]) + if self.idnum in pdf_dest._id_translated[self.pdf]: + dup = pdf_dest.get_object(pdf_dest._id_translated[self.pdf][self.idnum]) if force_duplicate: assert dup is not None assert dup.indirect_reference is not None