Skip to content

Commit 445d866

Browse files
committed
community: support advanced text extraction options for pdf documents
- Updated constructors in PyPDFParser and PyPDFLoader to handle `extraction_mode` and additional kwargs, aligning with the capabilities of `PageObject.extract_text()` from pypdf.
- Added `test_pypdf_loader_with_layout` along with a corresponding example text file to validate layout extraction from PDFs.
1 parent b35e68c commit 445d866

File tree

11 files changed

+104
-22
lines changed

11 files changed

+104
-22
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ env.bak/
133133

134134
# mypy
135135
.mypy_cache/
136+
.mypy_cache_test/
136137
.dmypy.json
137138
dmypy.json
138139

libs/community/langchain_community/document_loaders/parsers/pdf.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,16 @@ class PyPDFParser(BaseBlobParser):
8282
"""Load `PDF` using `pypdf`"""
8383

8484
def __init__(
85-
self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
85+
self,
86+
password: Optional[Union[str, bytes]] = None,
87+
extract_images: bool = False,
88+
extraction_mode: str = "plain",
89+
**kwargs: Any,
8690
):
8791
self.password = password
8892
self.extract_images = extract_images
93+
self.extraction_mode = extraction_mode
94+
self.extraction_kwargs = kwargs
8995

9096
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
9197
"""Lazily parse the blob."""
@@ -95,7 +101,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
95101
pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
96102
yield from [
97103
Document(
98-
page_content=page.extract_text()
104+
page_content=page.extract_text(
105+
extraction_mode=self.extraction_mode, **self.extraction_kwargs
106+
)
99107
+ self._extract_images_from_page(page),
100108
metadata={"source": blob.source, "page": page_number},
101109
)

libs/community/langchain_community/document_loaders/pdf.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ def __init__(
171171
password: Optional[Union[str, bytes]] = None,
172172
headers: Optional[Dict] = None,
173173
extract_images: bool = False,
174+
extraction_mode: str = "plain",
175+
kwargs: Optional[Dict] = None,
174176
) -> None:
175177
"""Initialize with a file path."""
176178
try:
@@ -180,7 +182,12 @@ def __init__(
180182
"pypdf package not found, please install it with " "`pip install pypdf`"
181183
)
182184
super().__init__(file_path, headers=headers)
183-
self.parser = PyPDFParser(password=password, extract_images=extract_images)
185+
self.parser = PyPDFParser(
186+
password=password,
187+
extract_images=extract_images,
188+
extraction_mode=extraction_mode,
189+
kwargs=kwargs,
190+
)
184191

185192
def lazy_load(
186193
self,

libs/community/poetry.lock

+7-7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

libs/community/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jinja2 = {version = "^3", optional = true}
2626
cohere = {version = "^4", optional = true}
2727
openai = {version = "<2", optional = true}
2828
arxiv = {version = "^1.4", optional = true}
29-
pypdf = {version = "^3.4.0", optional = true}
29+
pypdf = {version = "^4.0.0", optional = true}
3030
aleph-alpha-client = {version="^2.15.0", optional = true}
3131
gradientai = {version="^1.4.0", optional = true}
3232
pgvector = {version = "^0.1.6", optional = true}

libs/community/tests/integration_tests/document_loaders/test_pdf.py

+17
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
from pathlib import Path
23
from typing import Sequence, Union
34

@@ -100,6 +101,22 @@ def test_pypdf_loader() -> None:
100101
assert len(docs) == 16
101102

102103

104+
def test_pypdf_loader_with_layout() -> None:
105+
"""Test PyPDFLoader with layout mode."""
106+
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
107+
loader = PyPDFLoader(str(file_path), extraction_mode="layout")
108+
109+
docs = loader.load()
110+
first_page = docs[0].page_content
111+
112+
expected = (
113+
Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt"
114+
).read_text(encoding="utf-8")
115+
cleaned_first_page = re.sub(r"\x00", "", first_page)
116+
cleaned_expected = re.sub(r"\x00", "", expected)
117+
assert cleaned_first_page == cleaned_expected
118+
119+
103120
def test_pypdfium2_loader() -> None:
104121
"""Test PyPDFium2Loader."""
105122
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
libs/community/tests/integration_tests/examples/layout-parser-paper-page-1.txt

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
LayoutParser : A Unified Toolkit for Deep
2+
Learning Based Document Image Analysis
3+
4+
5+
Zejiang Shen 1 ( ), Ruochen Zhang 2, Melissa Dell 3, Benjamin Charles Germain
6+
Lee 4, Jacob Carlson 3, and Weining Li 5
7+
8+
1 Allen Institute for AI
9+
shannons@allenai.org
10+
2 Brown University
11+
ruochen zhang@brown.edu
12+
3 Harvard University
13+
{melissadell,jacob carlson }@fas.harvard.edu
14+
4 University of Washington
15+
bcgl@cs.washington.edu
16+
5 University of Waterloo
17+
w422li@uwaterloo.ca
18+
19+
20+
21+
Abstract. Recentadvancesindocumentimageanalysis(DIA)havebeen
22+
primarily driven by the application of neural networks. Ideally, research
23+
outcomes could be easily deployed in production and extended for further
24+
investigation. However, various factors like loosely organized codebases
25+
and sophisticated model configurations complicate the easy reuse of im-
26+
portant innovations by awide audience. Though there havebeen on-going
27+
efforts to improve reusability and simplify deep learning (DL) model
28+
development in disciplines like natural language processing and computer
29+
vision, none of them are optimized for challenges in the domain of DIA.
30+
This represents a major gap in the existing toolkit, as DIA is central to
31+
academic research across a wide range of disciplines in the social sciences
32+
and humanities. This paper introduces LayoutParser , an open-source
33+
library for streamlining the usage of DL in DIA research and applica-
34+
tions. The core LayoutParser library comes with a set of simple and
35+
intuitive interfaces for applying and customizing DL models for layout de-
36+
tection,characterrecognition,andmanyotherdocumentprocessingtasks.
37+
To promote extensibility, LayoutParser also incorporates a community
38+
platform for sharing both pre-trained models and full document digiti-
39+
zation pipelines. We demonstrate that LayoutParser is helpful for both
40+
lightweight and large-scale digitization pipelines in real-word use cases.
41+
The library is publicly available at https://layout-parser.github.io .
42+
43+
Keywords: DocumentImageAnalysis ·DeepLearning ·LayoutAnalysis
44+
· Character Recognition · Open Source library · Toolkit.
45+
46+
1 Introduction
47+
48+
Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of
49+
documentimageanalysis(DIA)tasksincludingdocumentimageclassification[ 11 ,

libs/langchain/poetry.lock

+8-8
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

libs/langchain/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ nlpcloud = {version = "^1", optional = true}
4242
huggingface_hub = {version = "^0", optional = true}
4343
sentence-transformers = {version = "^2", optional = true}
4444
arxiv = {version = "^1.4", optional = true}
45-
pypdf = {version = "^3.4.0", optional = true}
45+
pypdf = {version = "^4.0.0", optional = true}
4646
aleph-alpha-client = {version="^2.15.0", optional = true}
4747
pgvector = {version = "^0.1.6", optional = true}
4848
async-timeout = {version = "^4.0.0", python = "<3.11"}

templates/mongo-parent-document-retrieval/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0"
1010
langchain = "^0.1"
1111
openai = "<2"
1212
pymongo = "^4.6.0"
13-
pypdf = "^3.17.0"
13+
pypdf = "^4.0.0"
1414
tiktoken = "^0.5.1"
1515
langchain-text-splitters = ">=0.0.1,<0.1"
1616

templates/nvidia-rag-canonical/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ python = ">=3.8.1,<4.0"
1010
langchain = "^0.1"
1111
pymilvus = ">=2.3.0"
1212
langchain-nvidia-aiplay = "^0.0.2"
13-
pypdf = ">=3.1"
13+
pypdf = ">=4.0.1"
1414
langchain-text-splitters = ">=0.0.1,<0.1"
1515

1616
[tool.poetry.group.dev.dependencies]

0 commit comments

Comments
 (0)