Skip to content

Commit e2da09f

Browse files
authored
ENH: Add layout_mode_font_height_weight argument to PageObject.extract_text() (#2920)
Closes #2915.
1 parent dd18967 commit e2da09f

5 files changed

+95
-3
lines changed

pypdf/_page.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -2210,6 +2210,7 @@ def _layout_mode_text(
22102210
scale_weight: float = 1.25,
22112211
strip_rotated: bool = True,
22122212
debug_path: Optional[Path] = None,
2213+
font_height_weight: float = 1,
22132214
) -> str:
22142215
"""
22152216
Get text preserving fidelity to source PDF text layout.
@@ -2229,6 +2230,8 @@ def _layout_mode_text(
22292230
- bts.json: text render ops left justified and grouped by BT/ET operators
22302231
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
22312232
Defaults to None.
2233+
font_height_weight: multiplier for font height when calculating
2234+
blank lines (values below 1 infer more blank lines). Defaults to 1.
22322235
22332236
Returns:
22342237
str: multiline string containing page text in a fixed width format that
@@ -2260,7 +2263,7 @@ def _layout_mode_text(
22602263

22612264
char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
22622265

2263-
return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
2266+
return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
22642267

22652268
def extract_text(
22662269
self,
@@ -2335,6 +2338,8 @@ def extract_text(
23352338
- tjs.json: individual text render ops with corresponding transform matrices
23362339
- bts.json: text render ops left justified and grouped by BT/ET operators
23372340
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
2341+
layout_mode_font_height_weight (float): multiplier for font height when calculating
2342+
blank lines (values below 1 infer more blank lines). Defaults to 1.
23382343
23392344
Returns:
23402345
The extracted text
@@ -2358,6 +2363,7 @@ def extract_text(
23582363
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
23592364
strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
23602365
debug_path=kwargs.get("layout_mode_debug_path"),
2366+
font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
23612367
)
23622368
if len(args) >= 1:
23632369
if isinstance(args[0], str):

pypdf/_text_extraction/_layout_mode/_fixed_width_page.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> fl
343343

344344

345345
def fixed_width_page(
346-
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
346+
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
347347
) -> str:
348348
"""
349349
Generate page text from text operations grouped by rendered y coordinate.
@@ -352,6 +352,7 @@ def fixed_width_page(
352352
ty_groups: dict of text show ops as returned by y_coordinate_groups()
353353
char_width: fixed character width
354354
space_vertically: include blank lines inferred from y distance + font height.
355+
font_height_weight: multiplier for font height when calculating blank lines (values below 1 infer more blank lines).
355356
356357
Returns:
357358
str: page text in a fixed width format that closely adheres to the rendered
@@ -363,7 +364,7 @@ def fixed_width_page(
363364
for y_coord, line_data in ty_groups.items():
364365
if space_vertically and lines:
365366
blank_lines = (
366-
int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1
367+
int(abs(y_coord - last_y_coord) / (line_data[0]["font_height"] * font_height_weight)) - 1
367368
)
368369
lines.extend([""] * blank_lines)
369370
line = ""
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
The Crazy Ones
2+
October 14, 1998
3+
4+
Heres to the crazy ones. The misfits. The rebels. The troublemakers.
5+
The round pegs in the square holes.
6+
The ones who see things differently. Theyre not fond of rules. And
7+
they have no respect for the status quo. You can quote them,
8+
disagree with them, glorify or vilify them.
9+
About the only thing you cant do is ignore them. Because they change
10+
things. They invent. They imagine. They heal. They explore. They
11+
create. They inspire. They push the human race forward.
12+
Maybe they have to be crazy.
13+
How else can you stare at an empty canvas and see a work of art? Or
14+
sit in silence and hear a song thats never been written? Or gaze at
15+
a red planet and see a laboratory on wheels?
16+
We make tools for these kinds of people.
17+
While some see them as the crazy ones, we see genius. Because the
18+
people who are crazy enough to think they can change the world,
19+
are the ones who do.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
The Crazy Ones
2+
October 14, 1998
3+
4+
Heres to the crazy ones. The misfits. The rebels. The troublemakers.
5+
The round pegs in the square holes.
6+
7+
The ones who see things differently. Theyre not fond of rules. And
8+
they have no respect for the status quo. You can quote them,
9+
disagree with them, glorify or vilify them.
10+
11+
About the only thing you cant do is ignore them. Because they change
12+
things. They invent. They imagine. They heal. They explore. They
13+
create. They inspire. They push the human race forward.
14+
15+
Maybe they have to be crazy.
16+
17+
How else can you stare at an empty canvas and see a work of art? Or
18+
sit in silence and hear a song thats never been written? Or gaze at
19+
a red planet and see a laboratory on wheels?
20+
21+
We make tools for these kinds of people.
22+
23+
While some see them as the crazy ones, we see genius. Because the
24+
people who are crazy enough to think they can change the world,
25+
are the ones who do.

tests/test_text_extraction.py

+41
Original file line numberDiff line numberDiff line change
@@ -219,3 +219,44 @@ def test_text_leading_height_unit():
219219
page = reader.pages[0]
220220
extracted = page.extract_text()
221221
assert "Something[cited]\n" in extracted
222+
223+
224+
def test_layout_mode_space_vertically_font_height_weight():
225+
"""Tests layout mode with vertical space and font height weight (issue #2915)"""
226+
with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as inputfile:
227+
# Load PDF file from file
228+
reader = PdfReader(inputfile)
229+
page = reader.pages[0]
230+
231+
# Normal behaviour
232+
with open(RESOURCE_ROOT / "crazyones_layout_vertical_space.txt", "rb") as pdftext_file:
233+
pdftext = pdftext_file.read()
234+
235+
text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=True).encode("utf-8")
236+
237+
# Compare the text of the PDF to a known source
238+
for expected_line, actual_line in zip(text.splitlines(), pdftext.splitlines()):
239+
assert expected_line == actual_line
240+
241+
pdftext = pdftext.replace(b"\r\n", b"\n") # fix for windows
242+
assert text == pdftext, (
243+
"PDF extracted text differs from expected value.\n\n"
244+
"Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, text)
245+
)
246+
247+
# Blank lines are added to truly separate paragraphs
248+
with open(RESOURCE_ROOT / "crazyones_layout_vertical_space_font_height_weight.txt", "rb") as pdftext_file:
249+
pdftext = pdftext_file.read()
250+
251+
text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=True,
252+
layout_mode_font_height_weight=0.85).encode("utf-8")
253+
254+
# Compare the text of the PDF to a known source
255+
for expected_line, actual_line in zip(text.splitlines(), pdftext.splitlines()):
256+
assert expected_line == actual_line
257+
258+
pdftext = pdftext.replace(b"\r\n", b"\n") # fix for windows
259+
assert text == pdftext, (
260+
"PDF extracted text differs from expected value.\n\n"
261+
"Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, text)
262+
)

0 commit comments

Comments
 (0)