30
30
import json
31
31
import math
32
32
import re
33
+ import sys
33
34
from decimal import Decimal
34
35
from pathlib import Path
35
36
from typing import (
36
- TYPE_CHECKING ,
37
37
Any ,
38
38
Callable ,
39
39
Dict ,
87
87
StreamObject ,
88
88
)
89
89
90
- if TYPE_CHECKING :
91
- try :
92
- # Python 3.8+: https://peps.python.org/pep-0586
93
- from typing import Literal # type: ignore[attr-defined,unused-ignore]
94
- except ImportError :
95
- from typing_extensions import Literal # type: ignore[assignment,unused-ignore]
90
+ if sys .version_info >= (3 , 8 ):
91
+ from typing import Literal
92
+ else :
93
+ from typing_extensions import Literal
96
94
97
95
98
96
# NOTE(review): presumably names the page box applied when merging pages —
# confirm against the call sites elsewhere in this file.
MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'
@@ -1935,8 +1933,8 @@ def _layout_mode_text(
1935
1933
Defaults to None.
1936
1934
1937
1935
Returns:
1938
- str: multiline string containing page text structured as it appeared in the
1939
- source pdf.
1936
+ str: multiline string containing page text in a fixed width format that
1937
+ closely adheres to the rendered layout in the source pdf.
1940
1938
"""
1941
1939
fonts = self ._layout_mode_fonts ()
1942
1940
if debug_path :
@@ -1965,6 +1963,7 @@ def extract_text(
1965
1963
visitor_operand_before : Optional [Callable [[Any , Any , Any , Any ], None ]] = None ,
1966
1964
visitor_operand_after : Optional [Callable [[Any , Any , Any , Any ], None ]] = None ,
1967
1965
visitor_text : Optional [Callable [[Any , Any , Any , Any , Any ], None ]] = None ,
1966
+ extraction_mode : Literal ["plain" , "layout" ] = "plain" ,
1968
1967
** kwargs : Any ,
1969
1968
) -> str :
1970
1969
"""
@@ -2003,12 +2002,12 @@ def extract_text(
2003
2002
text matrix, font-dictionary and font-size.
2004
2003
The font-dictionary may be None in case of unknown fonts.
2005
2004
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
2006
-
2007
- KwArgs:
2008
2005
extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
2009
2006
"layout" for experimental layout mode functionality.
2010
2007
NOTE: orientations, space_width, and visitor_* parameters are NOT respected
2011
2008
in "layout" mode.
2009
+
2010
+ KwArgs:
2012
2011
layout_mode_space_vertically: include blank lines inferred from y distance + font
2013
2012
height. Defaults to True.
2014
2013
layout_mode_scale_weight: multiplier for string length when calculating weighted
@@ -2024,7 +2023,6 @@ def extract_text(
2024
2023
Returns:
2025
2024
The extracted text
2026
2025
"""
2027
- extraction_mode : Literal ["plain" , "layout" ] = kwargs .get ("extraction_mode" , "plain" )
2028
2026
if extraction_mode not in ["plain" , "layout" ]:
2029
2027
raise ValueError (f"Invalid text extraction mode '{ extraction_mode } '" )
2030
2028
if extraction_mode == "layout" :
0 commit comments