MAINT: Address PR review comments

shartzog · shartzog · commit ff7e40f039c6 · 2024-01-04T00:41:29.000-05:00
- DOC: standardize language. Use "layout", not "structure/structural".
- BUG: address bug introduced by ruff refactoring (remove "TYPE_CHECKING" block for Literal import)
- DEV: use sys.version_info based import switch (not try/except) for Literal and TypedDict to correct vscode colors and prevent odd mypy errors
- TST: add test created by @MartinThoma in py-pdf#2390
- ENH: add remaining standard fonts and aliases
diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
@@ -20,6 +20,21 @@ print(page.extract_text(0))
 print(page.extract_text((0, 90)))
 ```
 
+You can also extract text in "layout" mode:
+
+```python
+# extract text in a fixed width format that closely adheres to the rendered
+# layout in the source pdf
+print(page.extract_text(extraction_mode="layout"))
+
+# extract text preserving horizontal positioning without excess vertical
+# whitespace (removes blank and "whitespace only" lines)
+print(page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False))
+
+# adjust horizontal spacing
+print(page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0))
+```
+
 Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extract_text) for more details.
 
 ## Using a visitor
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -30,10 +30,10 @@
 import json
 import math
 import re
+import sys
 from decimal import Decimal
 from pathlib import Path
 from typing import (
-    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -87,12 +87,10 @@
     StreamObject,
 )
 
-if TYPE_CHECKING:
-    try:
-        # Python 3.8+: https://peps.python.org/pep-0586
-        from typing import Literal  # type: ignore[attr-defined,unused-ignore]
-    except ImportError:
-        from typing_extensions import Literal  # type: ignore[assignment,unused-ignore]
+if sys.version_info >= (3, 8):
+    from typing import Literal
+else:
+    from typing_extensions import Literal
 
 
 MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'
@@ -1935,8 +1933,8 @@ def _layout_mode_text(
                 Defaults to None.
 
         Returns:
-            str: multiline string containing page text structured as it appeared in the
-            source pdf.
+            str: multiline string containing page text in a fixed width format that
+                closely adheres to the rendered layout in the source pdf.
         """
         fonts = self._layout_mode_fonts()
         if debug_path:
@@ -1965,6 +1963,7 @@ def extract_text(
         visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+        extraction_mode: Literal["plain", "layout"] = "plain",
         **kwargs: Any,
     ) -> str:
         """
@@ -2003,12 +2002,12 @@ def extract_text(
                 text matrix, font-dictionary and font-size.
                 The font-dictionary may be None in case of unknown fonts.
                 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
-
-        KwArgs:
             extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality
                 "layout" for experimental layout mode functionality.
                 NOTE: orientations, space_width, and visitor_* parameters are NOT respected
                 in "layout" mode.
+
+        KwArgs:
             layout_mode_space_vertically: include blank lines inferred from y distance + font
                 height. Defaults to True.
             layout_mode_scale_weight: multiplier for string length when calculating weighted
@@ -2024,7 +2023,6 @@ def extract_text(
         Returns:
             The extracted text
         """
-        extraction_mode: Literal["plain", "layout"] = kwargs.get("extraction_mode", "plain")
         if extraction_mode not in ["plain", "layout"]:
             raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
         if extraction_mode == "layout":
diff --git a/pypdf/_text_extraction/_layout_mode/__init__.py b/pypdf/_text_extraction/_layout_mode/__init__.py
@@ -1,4 +1,4 @@
-"""Structured text extraction extension for pypdf"""
+"""layout mode text extraction extension for pypdf"""
 from ._fixed_width_page import fixed_char_width, fixed_width_page, text_show_operations, y_coordinate_groups
 from ._fonts import Font
 
diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -1,19 +1,20 @@
-"""Extract pdf text preserving the structural layout of the source PDF"""
+"""Extract pdf text preserving the layout of the source PDF"""
 
 import json
+import sys
 from itertools import groupby
+from math import ceil
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Tuple, Union
 
-try:
-    # Python 3.8+: https://peps.python.org/pep-0586
-    from typing import Literal, TypedDict  # type: ignore[attr-defined,unused-ignore]
-except ImportError:
-    from typing_extensions import Literal, TypedDict  # type: ignore[assignment,unused-ignore]
-
 from ._fonts import Font, TextStateParams
 from ._xform_stack import XformStack
 
+if sys.version_info >= (3, 8):
+    from typing import Literal, TypedDict
+else:
+    from typing_extensions import Literal, TypedDict
+
 
 class BTGroup(TypedDict):
     """
@@ -367,15 +368,16 @@ def fixed_width_page(
     ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
 ) -> str:
     """
-    Generate page text by grouping text operations by y coordinate
+    Generate page text from text operations grouped by rendered y coordinate
 
     Args:
         ty_groups: dict of text show ops as returned by y_coordinate_groups()
         char_width: fixed character width
         space_vertically: include blank lines inferred from y distance + font height.
 
     Returns:
-        str: page text structured as it was rendered in the source PDF.
+        str: page text in a fixed width format that closely adheres to the rendered
+            layout in the source pdf.
     """
     lines: List[str] = []
     last_y_coord = 0
@@ -387,9 +389,7 @@ def fixed_width_page(
         last_disp = 0.0
         for bt_op in line_data:
             offset = int(bt_op["tx"] // char_width)
-            spaces = (offset - len(line)) * (
-                round(last_disp + (char_width / 2.0)) < round(bt_op["tx"])
-            )
+            spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"]))
             line = f"{line}{' ' * spaces}{bt_op['text']}"
             last_disp = bt_op["displaced_tx"]
         if line.strip() or lines:
diff --git a/pypdf/_text_extraction/_layout_mode/_fonts.py b/pypdf/_text_extraction/_layout_mode/_fonts.py
@@ -190,7 +190,7 @@ def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
         return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"}
 
 
-# Widths for some of the standard 14 fonts
+# Widths for the standard 14 fonts as described on page 416 of the PDF 1.7 standard
 STANDARD_WIDTHS = {
     "Helvetica": {  # 4 fonts, includes bold, oblique and boldoblique variants
         " ": 278,
@@ -386,101 +386,13 @@ def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
         "}": 348,
         "~": 469,
     },
-    "Courier": {  # 4 fonts, includes bold, oblique and boldoblique variants
-        " ": 600,
-        "!": 600,
-        '"': 600,
-        "#": 600,
-        "$": 600,
-        "%": 600,
-        "&": 600,
-        "'": 600,
-        "(": 600,
-        ")": 600,
-        "*": 600,
-        "+": 600,
-        ",": 600,
-        "-": 600,
-        ".": 600,
-        "/": 600,
-        "0": 600,
-        "1": 600,
-        "2": 600,
-        "3": 600,
-        "4": 600,
-        "5": 600,
-        "6": 600,
-        "7": 600,
-        "8": 600,
-        "9": 600,
-        ":": 600,
-        ";": 600,
-        "<": 600,
-        "=": 600,
-        ">": 600,
-        "?": 600,
-        "@": 600,
-        "A": 600,
-        "B": 600,
-        "C": 600,
-        "D": 600,
-        "E": 600,
-        "F": 600,
-        "G": 600,
-        "H": 600,
-        "I": 600,
-        "J": 600,
-        "K": 600,
-        "L": 600,
-        "M": 600,
-        "N": 600,
-        "O": 600,
-        "P": 600,
-        "Q": 600,
-        "R": 600,
-        "S": 600,
-        "T": 600,
-        "U": 600,
-        "V": 600,
-        "W": 600,
-        "X": 600,
-        "Y": 600,
-        "Z": 600,
-        "[": 600,
-        "\\": 600,
-        "]": 600,
-        "^": 600,
-        "_": 600,
-        "`": 600,
-        "a": 600,
-        "b": 600,
-        "c": 600,
-        "d": 600,
-        "e": 600,
-        "f": 600,
-        "g": 600,
-        "h": 600,
-        "i": 600,
-        "j": 600,
-        "k": 600,
-        "l": 600,
-        "m": 600,
-        "n": 600,
-        "o": 600,
-        "p": 600,
-        "q": 600,
-        "r": 600,
-        "s": 600,
-        "t": 600,
-        "u": 600,
-        "v": 600,
-        "w": 600,
-        "x": 600,
-        "y": 600,
-        "z": 600,
-        "{": 600,
-        "|": 600,
-        "}": 600,
-        "~": 600,
-    },
 }
+STANDARD_WIDTHS["Courier"] = {  # 4 fonts, includes bold, oblique and boldoblique variants
+    c: 600 for c in STANDARD_WIDTHS["Times"]  # fixed width
+}
+STANDARD_WIDTHS["ZapfDingbats"] = {c: 1000 for c in STANDARD_WIDTHS["Times"]}  # 1 font
+STANDARD_WIDTHS["Symbol"] = {c: 500 for c in STANDARD_WIDTHS["Times"]}  # 1 font
+# add aliases per table H.3 on page 1110 of the PDF 1.7 standard
+STANDARD_WIDTHS["CourierNew"] = STANDARD_WIDTHS["Courier"]
+STANDARD_WIDTHS["Arial"] = STANDARD_WIDTHS["Helvetica"]
+STANDARD_WIDTHS["TimesNewRoman"] = STANDARD_WIDTHS["Times"]
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -1043,7 +1043,6 @@ def test_cr_with_cm_operation():
 
 
 @pytest.mark.samples()
-@pytest.mark.xfail(reason="#2388 implements this")
 def test_text_extraction_layout_mode():
     pdf_path = SAMPLE_ROOT / "026-latex-multicolumn/multicolumn.pdf"
     reader = PdfReader(pdf_path)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-"""Structured text extraction extension for pypdf"""`
	`1`	`+"""layout mode text extraction extension for pypdf"""`
`2`	`2`	`from ._fixed_width_page import fixed_char_width, fixed_width_page, text_show_operations, y_coordinate_groups`
`3`	`3`	`from ._fonts import Font`
`4`	`4`