Skip to content

Commit 18445f2

Browse files
authored
Minor reader bugfixes and fortified regression testing (#1205)
1 parent c91b09c commit 18445f2

File tree

13 files changed

+8967
-8427
lines changed

13 files changed

+8967
-8427
lines changed

packages/paper-qa-docling/src/paperqa_docling/reader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ def parse_pdf_to_pages( # noqa: PLR0912
5454
parse_media: Flag to also parse media (e.g. images, tables).
5555
pipeline_cls: Optional custom pipeline class for document conversion.
5656
Default is Docling's standard PDF pipeline.
57-
dpi: Optional DPI (dots per inch) for image resolution.
58-
Default PDF resolution is 72 DPI, so dpi of 144 would render at 2x scale.
57+
dpi: Optional DPI (dots per inch) for image resolution.
58+
If left unspecified, Docling's default 1.0 scale will be employed.
5959
custom_pipeline_options: Optional keyword arguments to use to construct the
6060
PDF pipeline's options.
6161
page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)

packages/paper-qa-docling/tests/test_paperqa_docling.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,12 @@ async def test_parse_pdf_to_pages() -> None:
3232
# Weird spaces are because 'Pa S a' is bolded in the original PDF
3333
matches = re.findall(
3434
r"Abstract\n+We introduce PaSa, an advanced Pa ?per S ?e ?a ?rch"
35-
r" agent powered by large language models.",
35+
r" agent powered by large language models\.",
3636
parsed_text.content["1"][0],
3737
)
38-
assert len(matches) == 1, "Parsing failed to handle abstract"
38+
assert (
39+
len(matches) == 1
40+
), f"Parsing failed to handle abstract in {parsed_text.content['1'][0]}."
3941

4042
# Check the images in Figure 1
4143
assert not isinstance(parsed_text.content["2"], str)
@@ -46,6 +48,12 @@ async def test_parse_pdf_to_pages() -> None:
4648
(p2_image,) = [m for m in p2_media if m.info["type"] == "picture"]
4749
assert p2_image.index == 0
4850
assert p2_image.info["page_num"] == 2
51+
assert p2_image.info["height"] == pytest.approx(130, rel=0.1)
52+
assert p2_image.info["width"] == pytest.approx(452, rel=0.1)
53+
p2_bbox = p2_image.info["bbox"]
54+
assert isinstance(p2_bbox, tuple)
55+
for i, value in enumerate((71, 643.90, 522, 770.35)):
56+
assert p2_bbox[i] == pytest.approx(value, rel=0.1)
4957
assert isinstance(p2_image.data, bytes)
5058

5159
# Check the image is valid base64
@@ -185,10 +193,10 @@ def test_media_deduplication() -> None:
185193
# We allow for one table to be misinterpreted as an image
186194
assert (
187195
10 <= len(all_images) <= 11
188-
), "Expected each image (one/page) and formula (one/page) to be read"
196+
), "Expected each image (one/page) and equation (one/page) to be read"
189197
assert (
190198
len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 2
191-
), "Expected images/formulas on all pages beyond 1 to be deduplicated"
199+
), "Expected images/equations on all pages beyond 1 to be deduplicated"
192200

193201
all_tables = [m for m in all_media if m.info.get("type") == "table"]
194202
assert len(all_tables) == 5, "Expected each table (one/page) to be read"
@@ -222,6 +230,12 @@ def test_invalid_pdf_is_denied(tmp_path) -> None:
222230
parse_pdf_to_pages(bad_pdf_path)
223231

224232

233+
def test_nonexistent_file_failure() -> None:
234+
filename = "/nonexistent/path/file.pdf"
235+
with pytest.raises(FileNotFoundError, match=filename):
236+
parse_pdf_to_pages(filename)
237+
238+
225239
def test_table_parsing() -> None:
226240
# pylint: disable=duplicate-code
227241
filepath = STUB_DATA_DIR / "influence.pdf"
@@ -264,3 +278,16 @@ def test_document_timeout_denial() -> None:
264278
assert (
265279
time.perf_counter() - tic < 10
266280
), "Expected document timeout to have taken much less time than a normal read"
281+
282+
283+
def test_equation_parsing() -> None:
284+
parsed_text = parse_pdf_to_pages(STUB_DATA_DIR / "duplicate_media.pdf")
285+
assert isinstance(parsed_text.content, dict)
286+
assert isinstance(parsed_text.content["1"], tuple)
287+
p1_text, p1_media = parsed_text.content["1"]
288+
# SEE: https://regex101.com/r/pyOHLq/1
289+
assert re.search(
290+
r"[_*]*E[_*]* ?= ?[_*]*mc[_*]*(?:<sup>)?[ ^]?[2²] ?(?:<\/sup>)?", p1_text
291+
), "Expected inline equation in page 1 text"
292+
assert re.search(r"n ?\+ ?a", p1_text), "Expected block equation in page 1 text"
293+
assert p1_media

packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
5353
parse_media: bool = True,
5454
full_page: bool = False,
5555
image_cluster_tolerance: float | tuple[float, float] = 25,
56-
image_dpi: float | None = 150,
56+
dpi: float | None = None,
5757
**_,
5858
) -> ParsedText:
5959
"""Parse a PDF.
@@ -70,7 +70,9 @@ def parse_pdf_to_pages( # noqa: PLR0912
7070
Can be a single value to apply to both X and Y directions,
7171
or a two-tuple to specify X and Y directions separately.
7272
The default was chosen to perform well on image extraction from LitQA2 PDFs.
73-
image_dpi: Dots per inch for images captured from the PDF.
73+
dpi: Optional DPI (dots per inch) for image resolution.
74+
If left unspecified, PyMuPDF's default resolution from
75+
pymupdf.Page.get_pixmap will be applied.
7476
page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
7577
to parse only specific pages, where pages are one-indexed.
7678
Leaving as the default of None will parse all pages.
@@ -131,7 +133,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
131133
media: list[ParsedMedia] = []
132134
if parse_media:
133135
if full_page: # Capture the entire page as one image
134-
pix = page.get_pixmap(dpi=image_dpi)
136+
pix = page.get_pixmap(dpi=dpi)
135137
media_metadata: dict[str, JsonValue] = {"type": "screenshot"} | {
136138
a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS
137139
}
@@ -153,7 +155,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
153155
y_tolerance=y_tol,
154156
)
155157
):
156-
pix = page.get_pixmap(clip=box, dpi=image_dpi)
158+
pix = page.get_pixmap(clip=box, dpi=dpi)
157159
media_metadata = {"bbox": tuple(box), "type": "drawing"} | {
158160
a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS
159161
}
@@ -171,7 +173,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
171173

172174
# Capture tables
173175
for table_i, table in enumerate(t for t in page.find_tables()):
174-
pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi)
176+
pix = page.get_pixmap(clip=table.bbox, dpi=dpi)
175177
media_metadata = {
176178
"bbox": tuple(table.bbox),
177179
"type": "table",
@@ -201,7 +203,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
201203
total_length += len(text)
202204
count_media += len(media)
203205

204-
multimodal_string = f"|multimodal|dpi={image_dpi}" + (
206+
multimodal_string = f"|multimodal|dpi={dpi}" + (
205207
"|mode=full-page"
206208
if full_page
207209
else f"|mode=individual|x-tol={x_tol}|y-tol={y_tol}"

packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import base64
22
import json
3+
import re
34
from pathlib import Path
45
from typing import cast
56
from unittest.mock import MagicMock, patch
@@ -39,6 +40,12 @@ async def test_parse_pdf_to_pages() -> None:
3940
(p2_image,) = [m for m in p2_media if m.info["type"] == "drawing"]
4041
assert p2_image.index == 0
4142
assert p2_image.info["page_num"] == 2
43+
assert p2_image.info["height"] == pytest.approx(130, rel=0.1)
44+
assert p2_image.info["width"] == pytest.approx(452, rel=0.1)
45+
p2_bbox = p2_image.info["bbox"]
46+
assert isinstance(p2_bbox, tuple)
47+
for i, value in enumerate((71, 70.87, 522, 202.98)):
48+
assert p2_bbox[i] == pytest.approx(value, rel=0.1)
4249
assert isinstance(p2_image.data, bytes)
4350

4451
# Check the image is valid base64
@@ -118,7 +125,10 @@ async def test_parse_pdf_to_pages() -> None:
118125
page_text, (full_page_image,) = page_content
119126
assert page_text
120127
assert full_page_image.index == 0, "Full page image should have index 0"
128+
assert full_page_image.info["type"] == "screenshot"
121129
assert full_page_image.info["page_num"] == int(page_num)
130+
assert full_page_image.info["height"] == pytest.approx(842, rel=0.01)
131+
assert full_page_image.info["width"] == pytest.approx(596, rel=0.01)
122132
assert isinstance(full_page_image.data, bytes)
123133
assert full_page_image.data, "Full page image should have data"
124134
# Check useful attributes are present and are JSON serializable
@@ -202,6 +212,12 @@ async def test_invalid_pdf_is_denied(tmp_path) -> None:
202212
)
203213

204214

215+
def test_nonexistent_file_failure() -> None:
216+
filename = "/nonexistent/path/file.pdf"
217+
with pytest.raises((pymupdf.FileNotFoundError, FileNotFoundError), match=filename):
218+
parse_pdf_to_pages(filename)
219+
220+
205221
def test_table_parsing() -> None:
206222
spy_to_markdown = MagicMock(side_effect=pymupdf.table.Table.to_markdown)
207223
zeroth_raw_table_text = ""
@@ -258,3 +274,16 @@ def custom_to_markdown(self, clean=False, fill_empty=True) -> str:
258274
"\n|**2.0** <br>|3/6 (50%)|2/6 (33%)|1/6 (17%)|"
259275
"\n\n" # NOTE: this is before strip, so there can be trailing whitespace
260276
)
277+
278+
279+
def test_equation_parsing() -> None:
280+
parsed_text = parse_pdf_to_pages(STUB_DATA_DIR / "duplicate_media.pdf")
281+
assert isinstance(parsed_text.content, dict)
282+
assert isinstance(parsed_text.content["1"], tuple)
283+
p1_text, p1_media = parsed_text.content["1"]
284+
# SEE: https://regex101.com/r/pyOHLq/1
285+
assert re.search(
286+
r"[_*]*E[_*]* ?= ?[_*]*mc[_*]*(?:<sup>)?[ ^]?[2²] ?(?:<\/sup>)?", p1_text
287+
), "Expected inline equation in page 1 text"
288+
assert re.search(r"n ?\+ ?a", p1_text), "Expected block equation in page 1 text"
289+
assert p1_media

packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@ async def test_parse_pdf_to_pages() -> None:
2929
assert isinstance(parsed_text_full_page.content["1"], tuple)
3030
matches = re.findall(
3131
r"Abstract\nWe introduce PaSa, an advanced Paper ?Search"
32-
r"\nagent powered by large language models.",
32+
r"\nagent powered by large language models\.",
3333
parsed_text_full_page.content["1"][0],
3434
)
35-
assert len(matches) == 1, "Parsing failed to handle abstract"
35+
assert (
36+
len(matches) == 1
37+
), f"Parsing failed to handle abstract in {parsed_text_full_page.content['1'][0]}."
3638

3739
# Check the images in Figure 1
3840
assert not isinstance(parsed_text_full_page.content["2"], str)
@@ -41,7 +43,10 @@ async def test_parse_pdf_to_pages() -> None:
4143
assert "Crawler" in p2_text, "Expected Figure 1 contents"
4244
(p2_image,) = p2_media
4345
assert p2_image.index == 0
46+
assert p2_image.info["type"] == "screenshot"
4447
assert p2_image.info["page_num"] == 2
48+
assert p2_image.info["page_height"] == pytest.approx(842, rel=0.1)
49+
assert p2_image.info["page_width"] == pytest.approx(596, rel=0.1)
4550
assert isinstance(p2_image.data, bytes)
4651

4752
# Check the image is valid base64
@@ -176,3 +181,9 @@ def test_invalid_pdf_is_denied(tmp_path) -> None:
176181

177182
with pytest.raises(ImpossibleParsingError, match="corrupt"):
178183
parse_pdf_to_pages(bad_pdf_path)
184+
185+
186+
def test_nonexistent_file_failure() -> None:
187+
filename = "/nonexistent/path/file.pdf"
188+
with pytest.raises(FileNotFoundError, match=filename):
189+
parse_pdf_to_pages(filename)

src/paperqa/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,6 +1003,7 @@ def get_index_name(self) -> str:
10031003
str(self.parsing.parse_pdf), # Don't use __name__ as lambda wouldn't differ
10041004
str(self.parsing.reader_config["chunk_chars"]),
10051005
str(self.parsing.reader_config["overlap"]),
1006+
str(self.parsing.reader_config.get("full_page", False)),
10061007
self.parsing.chunking_algorithm,
10071008
str(self.parsing.multimodal),
10081009
]

tests/cassettes/test_equations[parse_pdf_to_pages0].yaml

Lines changed: 140 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)