-
Notifications
You must be signed in to change notification settings - Fork 1.6k
MAINT: Improve readability in _reader.py by refactoring variable names (PEP8 cleanup). #3772
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 10 commits
96923c5
5fd8160
59cb566
0ebff4c
e90561b
febd5f4
f86255b
3c5d7ae
a9562de
82c8308
a880cb8
65ec7d4
36c177d
a7f9204
6947e89
d7de24a
9acb9f4
f3b1111
a529901
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -355,8 +355,8 @@ def _get_object_from_stream( | |||||||
| ) -> Union[int, PdfObject, str]: | ||||||||
| # indirect reference to object in object stream | ||||||||
| # read the entire object stream into memory | ||||||||
| stmnum, _idx = self.xref_objStm[indirect_reference.idnum] | ||||||||
| obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore | ||||||||
| stream_num, _idx = self.xref_objStm[indirect_reference.idnum] | ||||||||
| obj_stm: EncodedStreamObject = IndirectObject(stream_num, 0, self).get_object() # type: ignore | ||||||||
| # This is an xref to a stream, so its type better be a stream | ||||||||
| assert cast(str, obj_stm["/Type"]) == "/ObjStm" | ||||||||
| # Parse ALL objects in this stream in one pass and cache them. | ||||||||
|
|
@@ -373,28 +373,29 @@ def _get_object_from_stream( | |||||||
| stream_data.seek(0) | ||||||||
| if n > max_n: | ||||||||
| if self.strict: | ||||||||
| raise LimitReachedError(f"Value /N {n} for object {stmnum} exceeds maximum allowed value {max_n}.") | ||||||||
| raise LimitReachedError(f"Value /N {n} for object {stream_num} exceeds maximum allowed value {max_n}.") | ||||||||
| logger_warning( | ||||||||
| "Value /N %(n)d for object %(stmnum)d exceeds maximum allowed value %(max_n)d. Limiting to %(max_n)d.", | ||||||||
| "Value /N %(n)d for object %(stream_num)d exceeds maximum allowed value %(max_n)d. " | ||||||||
| "Limiting to %(max_n)d.", | ||||||||
| source=__name__, | ||||||||
| n=n, | ||||||||
| stmnum=stmnum, | ||||||||
| stream_num=stream_num, | ||||||||
| max_n=max_n, | ||||||||
| ) | ||||||||
| n = max_n | ||||||||
|
|
||||||||
| # Phase 1: Read the index (objnum, offset) pairs from the header. | ||||||||
| # Phase 1: Read the index (obj_num, offset) pairs from the header. | ||||||||
| obj_index: list[tuple[int, int]] = [] | ||||||||
| for _i in range(n): | ||||||||
| read_non_whitespace(stream_data) | ||||||||
| stream_data.seek(-1, 1) | ||||||||
| objnum = NumberObject.read_from_stream(stream_data) | ||||||||
| obj_num = int(NumberObject.read_from_stream(stream_data)) | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How does the direct conversion to an integer increase the readability?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The direct conversion to int was mainly to satisfy the type checker rather than improve readability. When the code was: obj_num = NumberObject.read_from_stream(stream_data) the CI failed with mypy errors because the returned type was inferred as NumberObject | FloatObject, while later usages expected int. Converting it explicitly with: obj_num = int(NumberObject.read_from_stream(stream_data)) resolved the type mismatch errors in _reader.py and made the expected type explicit for later calls such as cache_get_indirect_object() and dictionary lookups. |
||||||||
| read_non_whitespace(stream_data) | ||||||||
| stream_data.seek(-1, 1) | ||||||||
| offset = NumberObject.read_from_stream(stream_data) | ||||||||
| offset = int(NumberObject.read_from_stream(stream_data)) | ||||||||
| read_non_whitespace(stream_data) | ||||||||
| stream_data.seek(-1, 1) | ||||||||
| obj_index.append((int(objnum), int(offset))) | ||||||||
| obj_index.append((obj_num, offset)) | ||||||||
|
|
||||||||
| # Phase 2: Parse each object and cache it. | ||||||||
| target_obj: Union[int, PdfObject, str] = NullObject() | ||||||||
|
|
@@ -436,7 +437,7 @@ def _get_object_from_stream( | |||||||
| # Incremental updates may override objects originally in the stream; | ||||||||
| # caching those stale versions would shadow the newer xref entry. | ||||||||
| authoritative_stm, _idx = self.xref_objStm.get(obj_num, (None, None)) | ||||||||
| if authoritative_stm == stmnum: | ||||||||
| if authoritative_stm == stream_num: | ||||||||
| self.cache_indirect_object(0, obj_num, obj) # type: ignore[arg-type] | ||||||||
|
|
||||||||
| if obj_num == indirect_reference.idnum: | ||||||||
|
|
@@ -745,18 +746,21 @@ def _basic_validation(self, stream: StreamType) -> None: | |||||||
| """Ensure the stream is valid and not empty.""" | ||||||||
| stream.seek(0, os.SEEK_SET) | ||||||||
| try: | ||||||||
| header_byte = stream.read(5) | ||||||||
| header_bytes = stream.read(5) | ||||||||
| except UnicodeDecodeError: | ||||||||
| raise UnsupportedOperation("cannot read header") | ||||||||
| if header_byte == b"": | ||||||||
| if header_bytes == b"": | ||||||||
| raise EmptyFileError("Cannot read an empty file") | ||||||||
| if header_byte != b"%PDF-": | ||||||||
| if header_bytes != b"%PDF-": | ||||||||
| if self.strict: | ||||||||
| raise PdfReadError( | ||||||||
| f"PDF starts with '{header_byte.decode('utf8')}', " | ||||||||
| f"PDF starts with '{header_bytes.decode('utf8')}', " | ||||||||
| "but '%PDF-' expected" | ||||||||
| ) | ||||||||
| logger_warning("invalid pdf header: %(header_byte)r", source=__name__, header_byte=header_byte) | ||||||||
| logger_warning( | ||||||||
| "invalid pdf header: %(header_bytes)r", | ||||||||
| source=__name__, | ||||||||
| header_bytes=header_bytes) | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| stream.seek(0, os.SEEK_END) | ||||||||
|
|
||||||||
| def _find_eof_marker(self, stream: StreamType) -> None: | ||||||||
|
|
@@ -851,8 +855,8 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: | |||||||
| return | ||||||||
| read_non_whitespace(stream) | ||||||||
| stream.seek(-1, 1) | ||||||||
| cnt = 0 | ||||||||
| while cnt < size: | ||||||||
| count_number = 0 | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have never seen |
||||||||
| while count_number < size: | ||||||||
| line = stream.read(20) | ||||||||
| if not line: | ||||||||
| raise PdfReadError("Unexpected empty line in Xref table.") | ||||||||
|
|
@@ -929,7 +933,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: | |||||||
| self.xref_free_entry[65535][num] = entry_type_b == b"f" | ||||||||
| except Exception: | ||||||||
| pass | ||||||||
| cnt += 1 | ||||||||
| count_number += 1 | ||||||||
| num += 1 | ||||||||
| read_non_whitespace(stream) | ||||||||
| stream.seek(-1, 1) | ||||||||
|
|
@@ -1008,10 +1012,10 @@ def _process_xref_stream(self, xrefstream: DictionaryObject) -> None: | |||||||
| if key in xrefstream and key not in self.trailer: | ||||||||
| self.trailer[NameObject(key)] = xrefstream.raw_get(key) | ||||||||
| if "/XRefStm" in xrefstream: | ||||||||
| p = self.stream.tell() | ||||||||
| stream_position = self.stream.tell() | ||||||||
| self.stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) | ||||||||
| self._read_pdf15_xref_stream(self.stream) | ||||||||
| self.stream.seek(p, 0) | ||||||||
| self.stream.seek(stream_position, 0) | ||||||||
|
|
||||||||
| def _read_xref(self, stream: StreamType) -> Optional[int]: | ||||||||
| self._read_standard_xref_table(stream) | ||||||||
|
|
@@ -1025,7 +1029,7 @@ def _read_xref(self, stream: StreamType) -> Optional[int]: | |||||||
| if key not in self.trailer: | ||||||||
| self.trailer[key] = value | ||||||||
| if "/XRefStm" in new_trailer: | ||||||||
| p = stream.tell() | ||||||||
| stream_position = stream.tell() | ||||||||
| stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) | ||||||||
| try: | ||||||||
| self._read_pdf15_xref_stream(stream) | ||||||||
|
|
@@ -1035,7 +1039,7 @@ def _read_xref(self, stream: StreamType) -> Optional[int]: | |||||||
| source=__name__, | ||||||||
| xref_stm=int(new_trailer["/XRefStm"]), | ||||||||
| ) | ||||||||
| stream.seek(p, 0) | ||||||||
| stream.seek(stream_position, 0) | ||||||||
| if "/Prev" in new_trailer: | ||||||||
| return cast(int, new_trailer["/Prev"]) | ||||||||
| return None | ||||||||
|
|
@@ -1058,8 +1062,8 @@ def _read_xref_other_error( | |||||||
| # the xref table nearby, as we've observed this error with an | ||||||||
| # off-by-one before. | ||||||||
| stream.seek(-11, 1) | ||||||||
| tmp = stream.read(20) | ||||||||
| xref_loc = tmp.find(b"xref") | ||||||||
| xref_data = stream.read(20) | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| xref_loc = xref_data .find(b"xref") | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| if xref_loc != -1: | ||||||||
| startxref -= 10 - xref_loc | ||||||||
| return startxref | ||||||||
|
|
@@ -1353,11 +1357,11 @@ def _read_xref_subsections( | |||||||
| self.xref[generation][num] = byte_offset # type: ignore | ||||||||
| elif xref_type == 2: | ||||||||
| # compressed objects | ||||||||
| objstr_num = get_entry(1) | ||||||||
| obstr_idx = get_entry(2) | ||||||||
| obj_str_num = get_entry(1) | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is another bulk of abbreviations. Preferably, when we are reviewing specific methods for such improvements, this should be done for all possible names, not only some of them. |
||||||||
| obj_str_idx = get_entry(2) | ||||||||
| generation = 0 # PDF spec table 18, generation is 0 | ||||||||
| if not used_before(num, generation): | ||||||||
| self.xref_objStm[num] = (objstr_num, obstr_idx) | ||||||||
| self.xref_objStm[num] = (obj_str_num, obj_str_idx) | ||||||||
| elif self.strict: | ||||||||
| raise PdfReadError(f"Unknown xref type: {xref_type}") | ||||||||
|
|
||||||||
|
|
||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we want to increase readability, we should avoid abbreviations wherever possible, thus using
numberis preferred overnumandindexoveridx.stream_nummight not really be obvious enough here as well - it is the object number of the corresponding stream object.