Skip to content

Commit ff7e40f

Browse files
committed
MAINT: Address PR review comments
- DOC: standardize language. use "layout", not "structure/structural".
- BUG: address bug introduced by ruff refactoring (remove "TYPE_CHECKING" block for Literal import)
- DEV: use sys.version_info based import switch (not try/except) for Literal and TypedDict to correct vscode colors and prevent odd mypy errors
- TST: add test created by @MartinThoma in py-pdf#2390
- ENH: add remaining standard fonts and aliases
1 parent cefbfc6 commit ff7e40f

File tree

6 files changed

+48
-124
lines changed

6 files changed

+48
-124
lines changed

docs/user/extract-text.md

+15
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,21 @@ print(page.extract_text(0))
2020
print(page.extract_text((0, 90)))
2121
```
2222

23+
You can also extract text in "layout" mode:
24+
25+
```python
26+
# extract text in a fixed width format that closely adheres to the rendered
27+
# layout in the source pdf
28+
print(page.extract_text(extraction_mode="layout"))
29+
30+
# extract text preserving horizontal positioning without excess vertical
31+
# whitespace (removes blank and "whitespace only" lines)
32+
print(page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False))
33+
34+
# adjust horizontal spacing
35+
print(page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0))
36+
```
37+
2338
Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extract_text) for more details.
2439

2540
## Using a visitor

pypdf/_page.py

+10-12
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@
3030
import json
3131
import math
3232
import re
33+
import sys
3334
from decimal import Decimal
3435
from pathlib import Path
3536
from typing import (
36-
TYPE_CHECKING,
3737
Any,
3838
Callable,
3939
Dict,
@@ -87,12 +87,10 @@
8787
StreamObject,
8888
)
8989

90-
if TYPE_CHECKING:
91-
try:
92-
# Python 3.8+: https://peps.python.org/pep-0586
93-
from typing import Literal # type: ignore[attr-defined,unused-ignore]
94-
except ImportError:
95-
from typing_extensions import Literal # type: ignore[assignment,unused-ignore]
90+
if sys.version_info >= (3, 8):
91+
from typing import Literal
92+
else:
93+
from typing_extensions import Literal
9694

9795

9896
MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'
@@ -1935,8 +1933,8 @@ def _layout_mode_text(
19351933
Defaults to None.
19361934
19371935
Returns:
1938-
str: multiline string containing page text structured as it appeared in the
1939-
source pdf.
1936+
str: multiline string containing page text in a fixed width format that
1937+
closely adheres to the rendered layout in the source pdf.
19401938
"""
19411939
fonts = self._layout_mode_fonts()
19421940
if debug_path:
@@ -1965,6 +1963,7 @@ def extract_text(
19651963
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
19661964
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
19671965
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
1966+
extraction_mode: Literal["plain", "layout"] = "plain",
19681967
**kwargs: Any,
19691968
) -> str:
19701969
"""
@@ -2003,12 +2002,12 @@ def extract_text(
20032002
text matrix, font-dictionary and font-size.
20042003
The font-dictionary may be None in case of unknown fonts.
20052004
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
2006-
2007-
KwArgs:
20082005
extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality
20092006
"layout" for experimental layout mode functionality.
20102007
NOTE: orientations, space_width, and visitor_* parameters are NOT respected
20112008
in "layout" mode.
2009+
2010+
KwArgs:
20122011
layout_mode_space_vertically: include blank lines inferred from y distance + font
20132012
height. Defaults to True.
20142013
layout_mode_scale_weight: multiplier for string length when calculating weighted
@@ -2024,7 +2023,6 @@ def extract_text(
20242023
Returns:
20252024
The extracted text
20262025
"""
2027-
extraction_mode: Literal["plain", "layout"] = kwargs.get("extraction_mode", "plain")
20282026
if extraction_mode not in ["plain", "layout"]:
20292027
raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
20302028
if extraction_mode == "layout":

pypdf/_text_extraction/_layout_mode/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Structured text extraction extension for pypdf"""
1+
"""layout mode text extraction extension for pypdf"""
22
from ._fixed_width_page import fixed_char_width, fixed_width_page, text_show_operations, y_coordinate_groups
33
from ._fonts import Font
44

pypdf/_text_extraction/_layout_mode/_fixed_width_page.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
1-
"""Extract pdf text preserving the structural layout of the source PDF"""
1+
"""Extract pdf text preserving the layout of the source PDF"""
22

33
import json
4+
import sys
45
from itertools import groupby
6+
from math import ceil
57
from pathlib import Path
68
from typing import Any, Dict, Iterator, List, Tuple, Union
79

8-
try:
9-
# Python 3.8+: https://peps.python.org/pep-0586
10-
from typing import Literal, TypedDict # type: ignore[attr-defined,unused-ignore]
11-
except ImportError:
12-
from typing_extensions import Literal, TypedDict # type: ignore[assignment,unused-ignore]
13-
1410
from ._fonts import Font, TextStateParams
1511
from ._xform_stack import XformStack
1612

13+
if sys.version_info >= (3, 8):
14+
from typing import Literal, TypedDict
15+
else:
16+
from typing_extensions import Literal, TypedDict
17+
1718

1819
class BTGroup(TypedDict):
1920
"""
@@ -367,15 +368,16 @@ def fixed_width_page(
367368
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
368369
) -> str:
369370
"""
370-
Generate page text by grouping text operations by y coordinate
371+
Generate page text from text operations grouped by rendered y coordinate
371372
372373
Args:
373374
ty_groups: dict of text show ops as returned by y_coordinate_groups()
374375
char_width: fixed character width
375376
space_vertically: include blank lines inferred from y distance + font height.
376377
377378
Returns:
378-
str: page text structured as it was rendered in the source PDF.
379+
str: page text in a fixed width format that closely adheres to the rendered
380+
layout in the source pdf.
379381
"""
380382
lines: List[str] = []
381383
last_y_coord = 0
@@ -387,9 +389,7 @@ def fixed_width_page(
387389
last_disp = 0.0
388390
for bt_op in line_data:
389391
offset = int(bt_op["tx"] // char_width)
390-
spaces = (offset - len(line)) * (
391-
round(last_disp + (char_width / 2.0)) < round(bt_op["tx"])
392-
)
392+
spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"]))
393393
line = f"{line}{' ' * spaces}{bt_op['text']}"
394394
last_disp = bt_op["displaced_tx"]
395395
if line.strip() or lines:

pypdf/_text_extraction/_layout_mode/_fonts.py

+10-98
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
190190
return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"}
191191

192192

193-
# Widths for some of the standard 14 fonts
193+
# Widths for the standard 14 fonts as described on page 416 of the PDF 1.7 standard
194194
STANDARD_WIDTHS = {
195195
"Helvetica": { # 4 fonts, includes bold, oblique and boldoblique variants
196196
" ": 278,
@@ -386,101 +386,13 @@ def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
386386
"}": 348,
387387
"~": 469,
388388
},
389-
"Courier": { # 4 fonts, includes bold, oblique and boldoblique variants
390-
" ": 600,
391-
"!": 600,
392-
'"': 600,
393-
"#": 600,
394-
"$": 600,
395-
"%": 600,
396-
"&": 600,
397-
"'": 600,
398-
"(": 600,
399-
")": 600,
400-
"*": 600,
401-
"+": 600,
402-
",": 600,
403-
"-": 600,
404-
".": 600,
405-
"/": 600,
406-
"0": 600,
407-
"1": 600,
408-
"2": 600,
409-
"3": 600,
410-
"4": 600,
411-
"5": 600,
412-
"6": 600,
413-
"7": 600,
414-
"8": 600,
415-
"9": 600,
416-
":": 600,
417-
";": 600,
418-
"<": 600,
419-
"=": 600,
420-
">": 600,
421-
"?": 600,
422-
"@": 600,
423-
"A": 600,
424-
"B": 600,
425-
"C": 600,
426-
"D": 600,
427-
"E": 600,
428-
"F": 600,
429-
"G": 600,
430-
"H": 600,
431-
"I": 600,
432-
"J": 600,
433-
"K": 600,
434-
"L": 600,
435-
"M": 600,
436-
"N": 600,
437-
"O": 600,
438-
"P": 600,
439-
"Q": 600,
440-
"R": 600,
441-
"S": 600,
442-
"T": 600,
443-
"U": 600,
444-
"V": 600,
445-
"W": 600,
446-
"X": 600,
447-
"Y": 600,
448-
"Z": 600,
449-
"[": 600,
450-
"\\": 600,
451-
"]": 600,
452-
"^": 600,
453-
"_": 600,
454-
"`": 600,
455-
"a": 600,
456-
"b": 600,
457-
"c": 600,
458-
"d": 600,
459-
"e": 600,
460-
"f": 600,
461-
"g": 600,
462-
"h": 600,
463-
"i": 600,
464-
"j": 600,
465-
"k": 600,
466-
"l": 600,
467-
"m": 600,
468-
"n": 600,
469-
"o": 600,
470-
"p": 600,
471-
"q": 600,
472-
"r": 600,
473-
"s": 600,
474-
"t": 600,
475-
"u": 600,
476-
"v": 600,
477-
"w": 600,
478-
"x": 600,
479-
"y": 600,
480-
"z": 600,
481-
"{": 600,
482-
"|": 600,
483-
"}": 600,
484-
"~": 600,
485-
},
486389
}
390+
STANDARD_WIDTHS["Courier"] = { # 4 fonts, includes bold, oblique and boldoblique variants
391+
c: 600 for c in STANDARD_WIDTHS["Times"] # fixed width
392+
}
393+
STANDARD_WIDTHS["ZapfDingbats"] = {c: 1000 for c in STANDARD_WIDTHS["Times"]} # 1 font
394+
STANDARD_WIDTHS["Symbol"] = {c: 500 for c in STANDARD_WIDTHS["Times"]} # 1 font
395+
# add aliases per table H.3 on page 1110 of the PDF 1.7 standard
396+
STANDARD_WIDTHS["CourierNew"] = STANDARD_WIDTHS["Courier"]
397+
STANDARD_WIDTHS["Arial"] = STANDARD_WIDTHS["Helvetica"]
398+
STANDARD_WIDTHS["TimesNewRoman"] = STANDARD_WIDTHS["Times"]

tests/test_workflows.py

-1
Original file line numberDiff line numberDiff line change
@@ -1043,7 +1043,6 @@ def test_cr_with_cm_operation():
10431043

10441044

10451045
@pytest.mark.samples()
1046-
@pytest.mark.xfail(reason="#2388 implements this")
10471046
def test_text_extraction_layout_mode():
10481047
pdf_path = SAMPLE_ROOT / "026-latex-multicolumn/multicolumn.pdf"
10491048
reader = PdfReader(pdf_path)

0 commit comments

Comments
 (0)