Skip to content
Open
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 31 additions & 27 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,8 +355,8 @@ def _get_object_from_stream(
) -> Union[int, PdfObject, str]:
# indirect reference to object in object stream
# read the entire object stream into memory
stmnum, _idx = self.xref_objStm[indirect_reference.idnum]
obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore
stream_num, _idx = self.xref_objStm[indirect_reference.idnum]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to increase readability, we should avoid abbreviations wherever possible, thus using number is preferred over num and index over idx.

stream_num might not really be obvious enough here as well - it is the object number of the corresponding stream object.

obj_stm: EncodedStreamObject = IndirectObject(stream_num, 0, self).get_object() # type: ignore
# This is an xref to a stream, so its type better be a stream
assert cast(str, obj_stm["/Type"]) == "/ObjStm"
# Parse ALL objects in this stream in one pass and cache them.
Expand All @@ -373,28 +373,29 @@ def _get_object_from_stream(
stream_data.seek(0)
if n > max_n:
if self.strict:
raise LimitReachedError(f"Value /N {n} for object {stmnum} exceeds maximum allowed value {max_n}.")
raise LimitReachedError(f"Value /N {n} for object {stream_num} exceeds maximum allowed value {max_n}.")
logger_warning(
"Value /N %(n)d for object %(stmnum)d exceeds maximum allowed value %(max_n)d. Limiting to %(max_n)d.",
"Value /N %(n)d for object %(stream_num)d exceeds maximum allowed value %(max_n)d. "
"Limiting to %(max_n)d.",
source=__name__,
n=n,
stmnum=stmnum,
stream_num=stream_num,
max_n=max_n,
)
n = max_n

# Phase 1: Read the index (objnum, offset) pairs from the header.
# Phase 1: Read the index (obj_num, offset) pairs from the header.
obj_index: list[tuple[int, int]] = []
for _i in range(n):
read_non_whitespace(stream_data)
stream_data.seek(-1, 1)
objnum = NumberObject.read_from_stream(stream_data)
obj_num = int(NumberObject.read_from_stream(stream_data))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does the direct conversion to an integer increase the readability?

Copy link
Copy Markdown
Author

@semav-techdev semav-techdev May 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The direct conversion to int was mainly to satisfy the type checker rather than improve readability.

When the code was:

obj_num = NumberObject.read_from_stream(stream_data)

the CI failed with mypy errors because the returned type was inferred as NumberObject | FloatObject, while later usages expected int.

Converting it explicitly with:

obj_num = int(NumberObject.read_from_stream(stream_data))

resolved the type mismatch errors in _reader.py and made the expected type explicit for later calls such as cache_get_indirect_object() and dictionary lookups.
@stefan6419846

read_non_whitespace(stream_data)
stream_data.seek(-1, 1)
offset = NumberObject.read_from_stream(stream_data)
offset = int(NumberObject.read_from_stream(stream_data))
read_non_whitespace(stream_data)
stream_data.seek(-1, 1)
obj_index.append((int(objnum), int(offset)))
obj_index.append((obj_num, offset))

# Phase 2: Parse each object and cache it.
target_obj: Union[int, PdfObject, str] = NullObject()
Expand Down Expand Up @@ -436,7 +437,7 @@ def _get_object_from_stream(
# Incremental updates may override objects originally in the stream;
# caching those stale versions would shadow the newer xref entry.
authoritative_stm, _idx = self.xref_objStm.get(obj_num, (None, None))
if authoritative_stm == stmnum:
if authoritative_stm == stream_num:
self.cache_indirect_object(0, obj_num, obj) # type: ignore[arg-type]

if obj_num == indirect_reference.idnum:
Expand Down Expand Up @@ -745,18 +746,21 @@ def _basic_validation(self, stream: StreamType) -> None:
"""Ensure the stream is valid and not empty."""
stream.seek(0, os.SEEK_SET)
try:
header_byte = stream.read(5)
header_bytes = stream.read(5)
except UnicodeDecodeError:
raise UnsupportedOperation("cannot read header")
if header_byte == b"":
if header_bytes == b"":
raise EmptyFileError("Cannot read an empty file")
if header_byte != b"%PDF-":
if header_bytes != b"%PDF-":
if self.strict:
raise PdfReadError(
f"PDF starts with '{header_byte.decode('utf8')}', "
f"PDF starts with '{header_bytes.decode('utf8')}', "
"but '%PDF-' expected"
)
logger_warning("invalid pdf header: %(header_byte)r", source=__name__, header_byte=header_byte)
logger_warning(
"invalid pdf header: %(header_bytes)r",
source=__name__,
header_bytes=header_bytes)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
header_bytes=header_bytes)
header_bytes=header_bytes
)

stream.seek(0, os.SEEK_END)

def _find_eof_marker(self, stream: StreamType) -> None:
Expand Down Expand Up @@ -851,8 +855,8 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
return
read_non_whitespace(stream)
stream.seek(-1, 1)
cnt = 0
while cnt < size:
count_number = 0
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have never seen count_number used as a loop variable.

while count_number < size:
line = stream.read(20)
if not line:
raise PdfReadError("Unexpected empty line in Xref table.")
Expand Down Expand Up @@ -929,7 +933,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
self.xref_free_entry[65535][num] = entry_type_b == b"f"
except Exception:
pass
cnt += 1
count_number += 1
num += 1
read_non_whitespace(stream)
stream.seek(-1, 1)
Expand Down Expand Up @@ -1008,10 +1012,10 @@ def _process_xref_stream(self, xrefstream: DictionaryObject) -> None:
if key in xrefstream and key not in self.trailer:
self.trailer[NameObject(key)] = xrefstream.raw_get(key)
if "/XRefStm" in xrefstream:
p = self.stream.tell()
stream_position = self.stream.tell()
self.stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0)
self._read_pdf15_xref_stream(self.stream)
self.stream.seek(p, 0)
self.stream.seek(stream_position, 0)

def _read_xref(self, stream: StreamType) -> Optional[int]:
self._read_standard_xref_table(stream)
Expand All @@ -1025,7 +1029,7 @@ def _read_xref(self, stream: StreamType) -> Optional[int]:
if key not in self.trailer:
self.trailer[key] = value
if "/XRefStm" in new_trailer:
p = stream.tell()
stream_position = stream.tell()
stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0)
try:
self._read_pdf15_xref_stream(stream)
Expand All @@ -1035,7 +1039,7 @@ def _read_xref(self, stream: StreamType) -> Optional[int]:
source=__name__,
xref_stm=int(new_trailer["/XRefStm"]),
)
stream.seek(p, 0)
stream.seek(stream_position, 0)
if "/Prev" in new_trailer:
return cast(int, new_trailer["/Prev"])
return None
Expand All @@ -1058,8 +1062,8 @@ def _read_xref_other_error(
# the xref table nearby, as we've observed this error with an
# off-by-one before.
stream.seek(-11, 1)
tmp = stream.read(20)
xref_loc = tmp.find(b"xref")
xref_data = stream.read(20)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
xref_data = stream.read(20)
xref_data = stream.read(20)

xref_loc = xref_data .find(b"xref")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
xref_loc = xref_data .find(b"xref")
xref_loc = xref_data.find(b"xref")

if xref_loc != -1:
startxref -= 10 - xref_loc
return startxref
Expand Down Expand Up @@ -1353,11 +1357,11 @@ def _read_xref_subsections(
self.xref[generation][num] = byte_offset # type: ignore
elif xref_type == 2:
# compressed objects
objstr_num = get_entry(1)
obstr_idx = get_entry(2)
obj_str_num = get_entry(1)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is another bulk of abbreviations. Preferably, when we are reviewing specific methods for such improvements, this should be done for all possible names, not only some of them.

obj_str_idx = get_entry(2)
generation = 0 # PDF spec table 18, generation is 0
if not used_before(num, generation):
self.xref_objStm[num] = (objstr_num, obstr_idx)
self.xref_objStm[num] = (obj_str_num, obj_str_idx)
elif self.strict:
raise PdfReadError(f"Unknown xref type: {xref_type}")

Expand Down
Loading