Skip to content

Commit caa8978

Browse files
Dev/leverage experimental serializer (#4)
* cleaned up the logging

  Signed-off-by: Peter Staar <[email protected]>

* working on refactoring the writer

  Signed-off-by: Peter Staar <[email protected]>

* refactoring the writing agent

  Signed-off-by: Peter Staar <[email protected]>

* added initial test

  Signed-off-by: Peter Staar <[email protected]>

* working on tests and CI/CD

  Signed-off-by: Peter Staar <[email protected]>

* working on tests and CI/CD (2)

  Signed-off-by: Peter Staar <[email protected]>

* reformatted the code

  Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
1 parent 709e9f9 commit caa8978

File tree

15 files changed

+445
-388
lines changed

15 files changed

+445
-388
lines changed

.github/workflows/checks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
runs-on: ubuntu-latest
1919
strategy:
2020
matrix:
21-
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
21+
python-version: ['3.10', '3.11', '3.12', '3.13']
2222
steps:
2323
- uses: actions/checkout@v4
2424
- name: Cache Hugging Face models

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ on:
1010

1111
jobs:
1212
code-checks:
13-
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling-agent' && github.event.pull_request.head.repo.full_name != 'docling-project/docling-core') }}
13+
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling-agent' && github.event.pull_request.head.repo.full_name != 'docling-project/docling-agent') }}
1414
uses: ./.github/workflows/checks.yml
1515
secrets:
1616
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

docling_agent/agent/base.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from __future__ import annotations
22

3-
import logging
43
from abc import abstractmethod
54
from enum import Enum
5+
from pathlib import Path
66
from typing import TYPE_CHECKING
77

88
# from smolagents import MCPClient, Tool, ToolCollection
@@ -13,11 +13,7 @@
1313
if TYPE_CHECKING:
1414
from docling_core.types.doc.document import DoclingDocument
1515

16-
# Configure logging
17-
logging.basicConfig(
18-
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
19-
)
20-
logger = logging.getLogger(__name__)
16+
# Use shared logger from docling_agent.logging
2117

2218

2319
class DoclingAgentType(Enum):
@@ -63,7 +59,7 @@ def run(
6359
self,
6460
task: str,
6561
document: DoclingDocument | None = None,
66-
sources: list[DoclingDocument] = [],
62+
sources: list[DoclingDocument | Path] = [],
6763
**kwargs,
6864
) -> "DoclingDocument":
6965
"""Execute the agent for a task and return a document."""

docling_agent/agent/base_functions.py

Lines changed: 9 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import json
2-
import logging
32
import re
43
from io import BytesIO
54

@@ -9,7 +8,6 @@
98
from docling.datamodel.document import ConversionResult
109
from docling.document_converter import DocumentConverter
1110
from docling_core.types.doc.document import (
12-
BaseMeta,
1311
DocItemLabel,
1412
DoclingDocument,
1513
GroupItem,
@@ -19,19 +17,15 @@
1917
PictureItem,
2018
RefItem,
2119
SectionHeaderItem,
22-
SummaryMetaField,
23-
TableData,
2420
TableItem,
2521
TextItem,
2622
TitleItem,
2723
)
2824
from docling_core.types.io import DocumentStream
2925

30-
# Configure logging
31-
logging.basicConfig(
32-
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
33-
)
34-
logger = logging.getLogger(__name__)
26+
from docling_agent.logging import logger
27+
28+
# Use shared logger from docling_agent.logging
3529

3630

3731
def find_json_dicts(text: str) -> list[dict]:
@@ -110,136 +104,6 @@ def create_document_outline(doc: DoclingDocument) -> str:
110104
return outline
111105

112106

113-
def find_outline_v1(text: str) -> DoclingDocument | None:
114-
starts = ["paragraph", "list", "table", "figure", "picture"]
115-
116-
md = find_markdown_code_block(text)
117-
118-
if md:
119-
converter = DocumentConverter(allowed_formats=[InputFormat.MD])
120-
121-
buff = BytesIO(md.encode("utf-8"))
122-
doc_stream = DocumentStream(name="tmp.md", stream=buff)
123-
124-
conv: ConversionResult = converter.convert(doc_stream)
125-
126-
lines: list[str] = []
127-
for item, level in conv.document.iterate_items(with_groups=True):
128-
if isinstance(item, TitleItem) or isinstance(item, SectionHeaderItem):
129-
continue
130-
elif isinstance(item, TextItem):
131-
pattern = rf"^({'|'.join(starts)}):\s(.*)\.$"
132-
match = bool(re.match(pattern, text, re.DOTALL))
133-
if match is None:
134-
lines.append(item.text)
135-
else:
136-
continue
137-
138-
if len(lines) > 0:
139-
message = f"Every content line should start with one out of the following choices: {starts}. The following lines need to be updated: {'\n'.join(lines)}"
140-
logger.error(message)
141-
142-
return None
143-
else:
144-
return conv.document
145-
else:
146-
return None
147-
148-
149-
def find_outline_v2(text: str) -> DoclingDocument | None:
150-
starts = ["paragraph", "list", "table", "figure", "picture"]
151-
152-
md = find_markdown_code_block(text)
153-
154-
if not md:
155-
return None
156-
157-
converter = DocumentConverter(allowed_formats=[InputFormat.MD])
158-
159-
buff = BytesIO(md.encode("utf-8"))
160-
doc_stream = DocumentStream(name="tmp.md", stream=buff)
161-
162-
conv: ConversionResult = converter.convert(doc_stream)
163-
164-
# Build a fresh outline document rather than deep-copying content
165-
outline = DoclingDocument(name=f"outline for: {conv.document.name}")
166-
167-
invalid_lines: list[str] = []
168-
169-
for item, level in conv.document.iterate_items(with_groups=True):
170-
if isinstance(item, TitleItem):
171-
outline.add_title(text=item.text)
172-
173-
elif isinstance(item, SectionHeaderItem):
174-
outline.add_heading(text=item.text, level=item.level)
175-
176-
elif isinstance(item, TextItem):
177-
pattern = rf"^({'|'.join(starts)}):\s(.*)\.$"
178-
match = re.match(pattern, item.text, re.DOTALL)
179-
180-
if not match:
181-
invalid_lines.append(item.text)
182-
continue
183-
184-
label = match[1]
185-
summary = match[2]
186-
187-
meta = BaseMeta(summary=SummaryMetaField(text=summary))
188-
189-
if label == "paragraph":
190-
_ = outline.add_text(label=DocItemLabel.TEXT, text=item.text)
191-
_.meta = meta
192-
193-
elif label == "table":
194-
# Create an empty placeholder table with summary in meta
195-
caption = outline.add_text(label=DocItemLabel.CAPTION, text="")
196-
data = TableData(table_cells=[], num_rows=0, num_cols=0)
197-
_ = outline.add_table(
198-
label=DocItemLabel.TABLE, data=data, caption=caption
199-
)
200-
_.meta = meta
201-
202-
elif label in ["figure", "picture"]:
203-
# Add a picture with a caption derived from the summary
204-
caption = outline.add_text(label=DocItemLabel.CAPTION, text="")
205-
_ = outline.add_picture(caption=caption)
206-
_.meta = meta
207-
208-
elif label == "list":
209-
# Add a group placeholder for a list; attach summary via meta
210-
try:
211-
_ = outline.add_group(
212-
name="list", label=GroupLabel.UNSPECIFIED, parent=None
213-
)
214-
except TypeError:
215-
# Fallback for API variants that don't require explicit parent
216-
_ = outline.add_group(name="list", label=GroupLabel.UNSPECIFIED)
217-
_.meta = meta
218-
219-
else:
220-
logger.warning(f"NOT SUPPORTED: {label}")
221-
else:
222-
continue
223-
224-
if len(invalid_lines) > 0:
225-
message = (
226-
"Every content line should start with one of: "
227-
f"{starts}. The following lines need to be updated: "
228-
+ "\n".join(invalid_lines)
229-
)
230-
logger.error(message)
231-
return None
232-
233-
# print(outline.export_to_markdown())
234-
235-
return outline
236-
237-
238-
def validate_outline_format(text: str) -> bool:
239-
logger.info(f"testing validate_outline_format for {text[0:64]}")
240-
return find_outline_v2(text) is not None
241-
242-
243107
def serialize_item_to_markdown(item: TextItem, doc: DoclingDocument) -> str:
244108
"""Serialize a text item to markdown format using existing serializer."""
245109
from docling_core.transforms.serializer.markdown import (
@@ -286,7 +150,7 @@ def has_html_code_block(text: str) -> bool:
286150
"""
287151
Check if a string contains a html code block pattern anywhere in the text
288152
"""
289-
logger.info(f"testing has_html_code_block for {text[0:64]}")
153+
# logger.info(f"testing has_html_code_block for {text[0:64]}")
290154
return find_html_code_block(text) is not None
291155

292156

@@ -303,7 +167,7 @@ def has_markdown_code_block(text: str) -> bool:
303167
"""
304168
Check if a string contains a markdown code block pattern anywhere in the text
305169
"""
306-
logger.info(f"testing has_markdown_code_block for {text[0:64]}")
170+
# logger.info(f"testing has_markdown_code_block for {text[0:64]}")
307171
return find_markdown_code_block(text) is not None
308172

309173

@@ -331,7 +195,7 @@ def convert_html_to_docling_table(text: str) -> list[TableItem] | None:
331195

332196

333197
def validate_html_to_docling_table(text: str) -> bool:
334-
logger.info(f"validate_html_to_docling_table for {text[0:64]}")
198+
# logger.info(f"validate_html_to_docling_table for {text[0:64]}")
335199
return convert_html_to_docling_table(text) is not None
336200

337201

@@ -357,7 +221,7 @@ def convert_markdown_to_docling_document(text: str) -> DoclingDocument | None:
357221

358222

359223
def validate_markdown_to_docling_document(text: str) -> bool:
360-
logger.info(f"testing validate_markdown_docling_document for {text[0:64]}")
224+
# logger.info(f"testing validate_markdown_docling_document for {text[0:64]}")
361225
return convert_markdown_to_docling_document(text) is not None
362226

363227

@@ -384,14 +248,14 @@ def convert_html_to_docling_document(text: str) -> DoclingDocument | None:
384248

385249

386250
def validate_html_to_docling_document(text: str) -> bool:
387-
logger.info(f"testing validate_html_docling_document for {text[0:64]}")
251+
# logger.info(f"testing validate_html_docling_document for {text[0:64]}")
388252
return convert_html_to_docling_document(text) is not None
389253

390254

391255
def insert_document(
392256
*, item: NodeItem, doc: DoclingDocument, updated_doc: DoclingDocument
393257
) -> DoclingDocument:
394-
logger.info(f"inserting new document at item {item.self_ref}")
258+
# logger.info(f"inserting new document at item {item.self_ref}")
395259

396260
group_item = GroupItem(
397261
label=GroupLabel.UNSPECIFIED,

docling_agent/agent/editor.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import logging
1+
from pathlib import Path
22
from typing import Any, ClassVar
33

44
# from smolagents import MCPClient, Tool, ToolCollection
@@ -29,6 +29,7 @@
2929
validate_html_to_docling_table,
3030
)
3131
from docling_agent.agent_models import setup_local_session, view_linear_context
32+
from docling_agent.logging import logger
3233

3334
# from examples.smolagents.agent_tools import MCPConfig, setup_mcp_tools
3435
from docling_agent.resources.prompts import (
@@ -37,11 +38,7 @@
3738
SYSTEM_PROMPT_FOR_EDITING_TABLE,
3839
)
3940

40-
# Configure logging
41-
logging.basicConfig(
42-
level=logging.INFO, format="%(asctime)s - %(levelname)s: %(name)s - %(message)s"
43-
)
44-
logger = logging.getLogger(__name__)
41+
# Use shared logger from docling_agent.logging
4542

4643

4744
class DoclingEditingAgent(BaseDoclingAgent):
@@ -63,7 +60,7 @@ def run(
6360
self,
6461
task: str,
6562
document: DoclingDocument | None = None,
66-
sources: list[DoclingDocument] = [],
63+
sources: list[DoclingDocument | Path] = [],
6764
**kwargs,
6865
) -> DoclingDocument:
6966
if document is None:

0 commit comments

Comments
 (0)