Future-House
diff --git a/packages/paper-qa-docling/src/paperqa_docling/reader.py b/packages/paper-qa-docling/src/paperqa_docling/reader.py
Lines changed: 2 additions & 2 deletions
diff --git a/packages/paper-qa-docling/tests/test_paperqa_docling.py b/packages/paper-qa-docling/tests/test_paperqa_docling.py
Lines changed: 31 additions & 4 deletions
diff --git a/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py b/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
Lines changed: 8 additions & 6 deletions
diff --git a/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py b/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
Lines changed: 29 additions & 0 deletions
diff --git a/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py b/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py
Lines changed: 13 additions & 2 deletions
diff --git a/src/paperqa/settings.py b/src/paperqa/settings.py
Lines changed: 1 addition & 0 deletions
diff --git a/tests/cassettes/test_equations[parse_pdf_to_pages0].yaml b/tests/cassettes/test_equations[parse_pdf_to_pages0].yaml
Lines changed: 140 additions & 0 deletions
@@ -54,8 +54,8 @@ def parse_pdf_to_pages(  # noqa: PLR0912
         parse_media: Flag to also parse media (e.g. images, tables).
         pipeline_cls: Optional custom pipeline class for document conversion.
             Default is Docling's standard PDF pipeline.
-        dpi: Optional DPI (dots per inch) for image resolution.
-            Default PDF resolution is 72 DPI, so dpi of 144 would render at 2x scale.
+        dpi: Optional DPI (dots per inch) for image resolution,
+            if left unspecified Docling's default 1.0 scale will be employed.
         custom_pipeline_options: Optional keyword arguments to use to construct the
             PDF pipeline's options.
         page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
 
@@ -32,10 +32,12 @@ async def test_parse_pdf_to_pages() -> None:
     # Weird spaces are because 'Pa S a' is bolded in the original PDF
     matches = re.findall(
         r"Abstract\n+We introduce PaSa, an advanced Pa ?per S ?e ?a ?rch"
-        r" agent powered by large language models.",
+        r" agent powered by large language models\.",
         parsed_text.content["1"][0],
     )
-    assert len(matches) == 1, "Parsing failed to handle abstract"
+    assert (
+        len(matches) == 1
+    ), f"Parsing failed to handle abstract in {parsed_text.content['1'][0]}."
 
     # Check the images in Figure 1
     assert not isinstance(parsed_text.content["2"], str)
@@ -46,6 +48,12 @@ async def test_parse_pdf_to_pages() -> None:
     (p2_image,) = [m for m in p2_media if m.info["type"] == "picture"]
     assert p2_image.index == 0
     assert p2_image.info["page_num"] == 2
+    assert p2_image.info["height"] == pytest.approx(130, rel=0.1)
+    assert p2_image.info["width"] == pytest.approx(452, rel=0.1)
+    p2_bbox = p2_image.info["bbox"]
+    assert isinstance(p2_bbox, tuple)
+    for i, value in enumerate((71, 643.90, 522, 770.35)):
+        assert p2_bbox[i] == pytest.approx(value, rel=0.1)
     assert isinstance(p2_image.data, bytes)
 
     # Check the image is valid base64
@@ -185,10 +193,10 @@ def test_media_deduplication() -> None:
     # We allow for one table to be misinterpreted as an image
     assert (
         10 <= len(all_images) <= 11
-    ), "Expected each image (one/page) and formula (one/page) to be read"
+    ), "Expected each image (one/page) and equation (one/page) to be read"
     assert (
         len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 2
-    ), "Expected images/formulas on all pages beyond 1 to be deduplicated"
+    ), "Expected images/equations on all pages beyond 1 to be deduplicated"
 
     all_tables = [m for m in all_media if m.info.get("type") == "table"]
     assert len(all_tables) == 5, "Expected each table (one/page) to be read"
@@ -222,6 +230,12 @@ def test_invalid_pdf_is_denied(tmp_path) -> None:
         parse_pdf_to_pages(bad_pdf_path)
 
 
+def test_nonexistent_file_failure() -> None:
+    filename = "/nonexistent/path/file.pdf"
+    with pytest.raises(FileNotFoundError, match=filename):
+        parse_pdf_to_pages(filename)
+
+
 def test_table_parsing() -> None:
     # pylint: disable=duplicate-code
     filepath = STUB_DATA_DIR / "influence.pdf"
@@ -264,3 +278,16 @@ def test_document_timeout_denial() -> None:
         assert (
             time.perf_counter() - tic < 10
         ), "Expected document timeout to have taken much less time than a normal read"
+
+
+def test_equation_parsing() -> None:
+    parsed_text = parse_pdf_to_pages(STUB_DATA_DIR / "duplicate_media.pdf")
+    assert isinstance(parsed_text.content, dict)
+    assert isinstance(parsed_text.content["1"], tuple)
+    p1_text, p1_media = parsed_text.content["1"]
+    # SEE: https://regex101.com/r/pyOHLq/1
+    assert re.search(
+        r"[_*]*E[_*]* ?= ?[_*]*mc[_*]*(?:<sup>)?[ ^]?[2²] ?(?:<\/sup>)?", p1_text
+    ), "Expected inline equation in page 1 text"
+    assert re.search(r"n ?\+ ?a", p1_text), "Expected block equation in page 1 text"
+    assert p1_media
@@ -53,7 +53,7 @@ def parse_pdf_to_pages(  # noqa: PLR0912
     parse_media: bool = True,
     full_page: bool = False,
     image_cluster_tolerance: float | tuple[float, float] = 25,
-    image_dpi: float | None = 150,
+    dpi: float | None = None,
     **_,
 ) -> ParsedText:
     """Parse a PDF.
@@ -70,7 +70,9 @@ def parse_pdf_to_pages(  # noqa: PLR0912
             Can be a single value to apply to both X and Y directions,
             or a two-tuple to specify X and Y directions separately.
             The default was chosen to perform well on image extraction from LitQA2 PDFs.
-        image_dpi: Dots per inch for images captured from the PDF.
+        dpi: Optional DPI (dots per inch) for image resolution,
+            if left unspecified PyMuPDF's default resolution from
+            pymupdf.Page.get_pixmap will be applied.
         page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
             to parse only specific pages, where pages are one-indexed.
             Leaving as the default of None will parse all pages.
@@ -131,7 +133,7 @@ def parse_pdf_to_pages(  # noqa: PLR0912
             media: list[ParsedMedia] = []
             if parse_media:
                 if full_page:  # Capture the entire page as one image
-                    pix = page.get_pixmap(dpi=image_dpi)
+                    pix = page.get_pixmap(dpi=dpi)
                     media_metadata: dict[str, JsonValue] = {"type": "screenshot"} | {
                         a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS
                     }
@@ -153,7 +155,7 @@ def parse_pdf_to_pages(  # noqa: PLR0912
                             y_tolerance=y_tol,
                         )
                     ):
-                        pix = page.get_pixmap(clip=box, dpi=image_dpi)
+                        pix = page.get_pixmap(clip=box, dpi=dpi)
                         media_metadata = {"bbox": tuple(box), "type": "drawing"} | {
                             a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS
                         }
@@ -171,7 +173,7 @@ def parse_pdf_to_pages(  # noqa: PLR0912
 
                     # Capture tables
                     for table_i, table in enumerate(t for t in page.find_tables()):
-                        pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi)
+                        pix = page.get_pixmap(clip=table.bbox, dpi=dpi)
                         media_metadata = {
                             "bbox": tuple(table.bbox),
                             "type": "table",
@@ -201,7 +203,7 @@ def parse_pdf_to_pages(  # noqa: PLR0912
             total_length += len(text)
             count_media += len(media)
 
-    multimodal_string = f"|multimodal|dpi={image_dpi}" + (
+    multimodal_string = f"|multimodal|dpi={dpi}" + (
         "|mode=full-page"
         if full_page
         else f"|mode=individual|x-tol={x_tol}|y-tol={y_tol}"
 
@@ -1,5 +1,6 @@
 import base64
 import json
+import re
 from pathlib import Path
 from typing import cast
 from unittest.mock import MagicMock, patch
@@ -39,6 +40,12 @@ async def test_parse_pdf_to_pages() -> None:
     (p2_image,) = [m for m in p2_media if m.info["type"] == "drawing"]
     assert p2_image.index == 0
     assert p2_image.info["page_num"] == 2
+    assert p2_image.info["height"] == pytest.approx(130, rel=0.1)
+    assert p2_image.info["width"] == pytest.approx(452, rel=0.1)
+    p2_bbox = p2_image.info["bbox"]
+    assert isinstance(p2_bbox, tuple)
+    for i, value in enumerate((71, 70.87, 522, 202.98)):
+        assert p2_bbox[i] == pytest.approx(value, rel=0.1)
     assert isinstance(p2_image.data, bytes)
 
     # Check the image is valid base64
@@ -118,7 +125,10 @@ async def test_parse_pdf_to_pages() -> None:
         page_text, (full_page_image,) = page_content
         assert page_text
         assert full_page_image.index == 0, "Full page image should have index 0"
+        assert full_page_image.info["type"] == "screenshot"
         assert full_page_image.info["page_num"] == int(page_num)
+        assert full_page_image.info["height"] == pytest.approx(842, rel=0.01)
+        assert full_page_image.info["width"] == pytest.approx(596, rel=0.01)
         assert isinstance(full_page_image.data, bytes)
         assert full_page_image.data, "Full page image should have data"
         # Check useful attributes are present and are JSON serializable
@@ -202,6 +212,12 @@ async def test_invalid_pdf_is_denied(tmp_path) -> None:
         )
 
 
+def test_nonexistent_file_failure() -> None:
+    filename = "/nonexistent/path/file.pdf"
+    with pytest.raises((pymupdf.FileNotFoundError, FileNotFoundError), match=filename):
+        parse_pdf_to_pages(filename)
+
+
 def test_table_parsing() -> None:
     spy_to_markdown = MagicMock(side_effect=pymupdf.table.Table.to_markdown)
     zeroth_raw_table_text = ""
@@ -258,3 +274,16 @@ def custom_to_markdown(self, clean=False, fill_empty=True) -> str:
             "\n|**2.0** <br>|3/6 (50%)|2/6 (33%)|1/6 (17%)|"
             "\n\n"  # NOTE: this is before strip, so there can be trailing whitespace
         )
+
+
+def test_equation_parsing() -> None:
+    parsed_text = parse_pdf_to_pages(STUB_DATA_DIR / "duplicate_media.pdf")
+    assert isinstance(parsed_text.content, dict)
+    assert isinstance(parsed_text.content["1"], tuple)
+    p1_text, p1_media = parsed_text.content["1"]
+    # SEE: https://regex101.com/r/pyOHLq/1
+    assert re.search(
+        r"[_*]*E[_*]* ?= ?[_*]*mc[_*]*(?:<sup>)?[ ^]?[2²] ?(?:<\/sup>)?", p1_text
+    ), "Expected inline equation in page 1 text"
+    assert re.search(r"n ?\+ ?a", p1_text), "Expected block equation in page 1 text"
+    assert p1_media
@@ -29,10 +29,12 @@ async def test_parse_pdf_to_pages() -> None:
     assert isinstance(parsed_text_full_page.content["1"], tuple)
     matches = re.findall(
         r"Abstract\nWe introduce PaSa, an advanced Paper ?Search"
-        r"\nagent powered by large language models.",
+        r"\nagent powered by large language models\.",
         parsed_text_full_page.content["1"][0],
     )
-    assert len(matches) == 1, "Parsing failed to handle abstract"
+    assert (
+        len(matches) == 1
+    ), f"Parsing failed to handle abstract in {parsed_text_full_page.content['1'][0]}."
 
     # Check the images in Figure 1
     assert not isinstance(parsed_text_full_page.content["2"], str)
@@ -41,7 +43,10 @@ async def test_parse_pdf_to_pages() -> None:
     assert "Crawler" in p2_text, "Expected Figure 1 contents"
     (p2_image,) = p2_media
     assert p2_image.index == 0
+    assert p2_image.info["type"] == "screenshot"
     assert p2_image.info["page_num"] == 2
+    assert p2_image.info["page_height"] == pytest.approx(842, rel=0.1)
+    assert p2_image.info["page_width"] == pytest.approx(596, rel=0.1)
     assert isinstance(p2_image.data, bytes)
 
     # Check the image is valid base64
@@ -176,3 +181,9 @@ def test_invalid_pdf_is_denied(tmp_path) -> None:
 
     with pytest.raises(ImpossibleParsingError, match="corrupt"):
         parse_pdf_to_pages(bad_pdf_path)
+
+
+def test_nonexistent_file_failure() -> None:
+    filename = "/nonexistent/path/file.pdf"
+    with pytest.raises(FileNotFoundError, match=filename):
+        parse_pdf_to_pages(filename)
@@ -1003,6 +1003,7 @@ def get_index_name(self) -> str:
             str(self.parsing.parse_pdf),  # Don't use __name__ as lambda wouldn't differ
             str(self.parsing.reader_config["chunk_chars"]),
             str(self.parsing.reader_config["overlap"]),
+            str(self.parsing.reader_config.get("full_page", False)),
             self.parsing.chunking_algorithm,
             str(self.parsing.multimodal),
         ]
Original file line number	Diff line number	Diff line change
`@@ -1003,6 +1003,7 @@ def get_index_name(self) -> str:`
`1003`	`1003`	`str(self.parsing.parse_pdf), # Don't use __name__ as lambda wouldn't differ`
`1004`	`1004`	`str(self.parsing.reader_config["chunk_chars"]),`
`1005`	`1005`	`str(self.parsing.reader_config["overlap"]),`
	`1006`	`+ str(self.parsing.reader_config.get("full_page", False)),`
`1006`	`1007`	`self.parsing.chunking_algorithm,`
`1007`	`1008`	`str(self.parsing.multimodal),`
`1008`	`1009`	`]`