Skip to content

Commit e2ebf10

Browse files
authored
BUG: Title sometimes is bytes and not str (#2930)
Closes #2929.
1 parent 5b50f47 commit e2ebf10

File tree

4 files changed

+20
-0
lines changed

4 files changed

+20
-0
lines changed

pypdf/_doc_common.py

+2
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ def _get_text(self, key: str) -> Optional[str]:
121121
retval = self.get(key, None)
122122
if isinstance(retval, TextStringObject):
123123
return retval
124+
if isinstance(retval, ByteStringObject):
125+
return str(retval)
124126
return None
125127

126128
@property

pypdf/generic/_base.py

+9
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,15 @@ def write_to_stream(
619619
stream.write(binascii.hexlify(self))
620620
stream.write(b">")
621621

622+
def __str__(self) -> str:
    """
    Decode the raw bytes into text by trying several encodings in turn.

    "utf-16" is attempted first, followed by every entry of
    ``NameObject.CHARSETS`` in its declared order. The result of the first
    codec that decodes without a ``UnicodeDecodeError`` is returned.

    Returns:
        The decoded text.

    Raises:
        PdfReadError: If none of the candidate encodings can decode the
            bytes.
    """
    # NOTE(review): Python's "utf-16" codec falls back to the platform's
    # native byte order when the data carries no BOM, so BOM-less input may
    # decode "successfully" to unintended text here -- confirm that trying
    # it first is intended.
    candidates = ["utf-16", *NameObject.CHARSETS]
    for encoding in candidates:
        try:
            decoded = self.decode(encoding)
        except UnicodeDecodeError:
            # This codec does not fit; move on to the next candidate.
            continue
        return decoded
    raise PdfReadError("Cannot decode ByteStringObject.")
630+
622631

623632
class TextStringObject(str, PdfObject): # noqa: SLOT000
624633
"""

resources/bytes.pdf

1.06 KB
Binary file not shown.

tests/test_reader.py

+9
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,18 @@ def test_read_metadata(pdf_path, expected):
108108
docinfo.modification_date
109109
docinfo.modification_date_raw
110110
if "/Title" in metadict:
111+
assert isinstance(docinfo.title, str)
111112
assert metadict["/Title"] == docinfo.title
112113

113114

115+
def test_read_metadata_title_is_utf8():
    """
    Check that the title of ``bytes.pdf`` is exposed as decoded text.

    Regression test for a document title that is sometimes bytes and not
    str.
    """
    with open(RESOURCE_ROOT / "bytes.pdf", "rb") as inputfile:
        reader = PdfReader(inputfile)
        title = reader.metadata.title
        # Assert the type explicitly so that a bytes title fails with a clear
        # message instead of only through the equality comparison below.
        assert isinstance(title, str)
        assert title == "Microsoft Word - トランスバース社買収電話会議英語Final.docx"
121+
122+
114123
def test_iss1943():
115124
with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader:
116125
docinfo = reader.metadata

0 commit comments

Comments
 (0)