11import json
2- import logging
32import re
43from io import BytesIO
54
98from docling .datamodel .document import ConversionResult
109from docling .document_converter import DocumentConverter
1110from docling_core .types .doc .document import (
12- BaseMeta ,
1311 DocItemLabel ,
1412 DoclingDocument ,
1513 GroupItem ,
1917 PictureItem ,
2018 RefItem ,
2119 SectionHeaderItem ,
22- SummaryMetaField ,
23- TableData ,
2420 TableItem ,
2521 TextItem ,
2622 TitleItem ,
2723)
2824from docling_core .types .io import DocumentStream
2925
30- # Configure logging
31- logging .basicConfig (
32- level = logging .INFO , format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
33- )
34- logger = logging .getLogger (__name__ )
26+ from docling_agent .logging import logger
27+
28+ # Use shared logger from docling_agent.logging
3529
3630
3731def find_json_dicts (text : str ) -> list [dict ]:
@@ -110,136 +104,6 @@ def create_document_outline(doc: DoclingDocument) -> str:
110104 return outline
111105
112106
def find_outline_v1(text: str) -> DoclingDocument | None:
    """Convert the markdown code block found in *text* and validate its lines.

    The markdown block is extracted with ``find_markdown_code_block`` and
    converted with a markdown-only ``DocumentConverter``. Every ``TextItem``
    that is not a title or section header must look like
    ``<kind>: <summary>.`` where ``<kind>`` is one of ``starts``.

    Args:
        text: Raw text expected to contain a markdown code block.

    Returns:
        The converted document when every content line is well-formed;
        ``None`` when no markdown block is found or when at least one content
        line is malformed (the offending lines are logged at ERROR level).
    """
    starts = ["paragraph", "list", "table", "figure", "picture"]

    md = find_markdown_code_block(text)
    if not md:
        return None

    converter = DocumentConverter(allowed_formats=[InputFormat.MD])

    buff = BytesIO(md.encode("utf-8"))
    doc_stream = DocumentStream(name="tmp.md", stream=buff)

    conv: ConversionResult = converter.convert(doc_stream)

    # Compiled once: the pattern does not depend on the item being checked.
    pattern = re.compile(rf"^({'|'.join(starts)}):\s(.*)\.$", re.DOTALL)

    lines: list[str] = []
    for item, _level in conv.document.iterate_items(with_groups=True):
        # Headings are tested first so they are never validated as content.
        if isinstance(item, (TitleItem, SectionHeaderItem)):
            continue
        # Validate the item's own text. The previous code matched the whole
        # input ``text`` and compared ``bool(...)`` with ``None``, so no line
        # could ever be reported as invalid.
        if isinstance(item, TextItem) and pattern.match(item.text) is None:
            lines.append(item.text)

    if lines:
        # The join is done outside the f-string: a backslash inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        message = (
            "Every content line should start with one out of the following "
            f"choices: {starts}. The following lines need to be updated: "
            + "\n ".join(lines)
        )
        logger.error(message)
        return None

    return conv.document
147-
148-
def find_outline_v2(text: str) -> DoclingDocument | None:
    """Build a placeholder outline document from the markdown block in *text*.

    The markdown code block is converted with a markdown-only
    ``DocumentConverter``. Titles and section headers are copied into a fresh
    ``DoclingDocument``; every other ``TextItem`` must have the form
    ``<kind>: <summary>.`` and is turned into a placeholder of that kind
    (text, table, picture or list group) whose ``meta`` carries the summary.

    Args:
        text: Raw text expected to contain a markdown code block.

    Returns:
        The outline document, or ``None`` when no markdown block is found or
        when any content line does not match the expected form (the offending
        lines are logged at ERROR level).
    """
    starts = ["paragraph", "list", "table", "figure", "picture"]

    markdown = find_markdown_code_block(text)
    if not markdown:
        return None

    converter = DocumentConverter(allowed_formats=[InputFormat.MD])
    stream = DocumentStream(
        name="tmp.md", stream=BytesIO(markdown.encode("utf-8"))
    )
    conv: ConversionResult = converter.convert(stream)
    source_doc = conv.document

    # A new document is populated item by item instead of copying content.
    outline = DoclingDocument(name=f"outline for: {source_doc.name}")

    content_line = re.compile(rf"^({'|'.join(starts)}):\s(.*)\.$", re.DOTALL)
    rejected: list[str] = []

    for item, _level in source_doc.iterate_items(with_groups=True):
        if isinstance(item, TitleItem):
            outline.add_title(text=item.text)
            continue

        if isinstance(item, SectionHeaderItem):
            outline.add_heading(text=item.text, level=item.level)
            continue

        if not isinstance(item, TextItem):
            continue

        found = content_line.match(item.text)
        if found is None:
            rejected.append(item.text)
            continue

        kind = found.group(1)
        summary = found.group(2)
        meta = BaseMeta(summary=SummaryMetaField(text=summary))

        if kind == "paragraph":
            node = outline.add_text(label=DocItemLabel.TEXT, text=item.text)

        elif kind == "table":
            # Empty placeholder table; the summary lives in ``meta``.
            caption = outline.add_text(label=DocItemLabel.CAPTION, text="")
            node = outline.add_table(
                label=DocItemLabel.TABLE,
                data=TableData(table_cells=[], num_rows=0, num_cols=0),
                caption=caption,
            )

        elif kind in ("figure", "picture"):
            # Picture placeholder with an empty caption.
            caption = outline.add_text(label=DocItemLabel.CAPTION, text="")
            node = outline.add_picture(caption=caption)

        elif kind == "list":
            # Group placeholder standing in for the list.
            try:
                node = outline.add_group(
                    name="list", label=GroupLabel.UNSPECIFIED, parent=None
                )
            except TypeError:
                # Fallback for API variants that don't accept ``parent``.
                node = outline.add_group(
                    name="list", label=GroupLabel.UNSPECIFIED
                )

        else:
            logger.warning(f"NOT SUPPORTED: {kind}")
            continue

        node.meta = meta

    if rejected:
        message = (
            "Every content line should start with one of: "
            f"{starts}. The following lines need to be updated: "
            + "\n ".join(rejected)
        )
        logger.error(message)
        return None

    return outline
236-
237-
def validate_outline_format(text: str) -> bool:
    """Report whether ``find_outline_v2`` can build an outline from *text*.

    The first 64 characters of *text* are logged at INFO level before the
    check runs.
    """
    preview = text[0:64]
    logger.info(f"testing validate_outline_format for {preview}")
    outline = find_outline_v2(text)
    return outline is not None
241-
242-
243107def serialize_item_to_markdown (item : TextItem , doc : DoclingDocument ) -> str :
244108 """Serialize a text item to markdown format using existing serializer."""
245109 from docling_core .transforms .serializer .markdown import (
@@ -286,7 +150,7 @@ def has_html_code_block(text: str) -> bool:
286150 """
287151 Check if a string contains a html code block pattern anywhere in the text
288152 """
289- logger .info (f"testing has_html_code_block for { text [0 :64 ]} " )
153+ # logger.info(f"testing has_html_code_block for {text[0:64]}")
290154 return find_html_code_block (text ) is not None
291155
292156
@@ -303,7 +167,7 @@ def has_markdown_code_block(text: str) -> bool:
303167 """
304168 Check if a string contains a markdown code block pattern anywhere in the text
305169 """
306- logger .info (f"testing has_markdown_code_block for { text [0 :64 ]} " )
170+ # logger.info(f"testing has_markdown_code_block for {text[0:64]}")
307171 return find_markdown_code_block (text ) is not None
308172
309173
@@ -331,7 +195,7 @@ def convert_html_to_docling_table(text: str) -> list[TableItem] | None:
331195
332196
def validate_html_to_docling_table(text: str) -> bool:
    """Return True when ``convert_html_to_docling_table(text)`` is not None."""
    tables = convert_html_to_docling_table(text)
    return tables is not None
336200
337201
@@ -357,7 +221,7 @@ def convert_markdown_to_docling_document(text: str) -> DoclingDocument | None:
357221
358222
def validate_markdown_to_docling_document(text: str) -> bool:
    """Return True when ``convert_markdown_to_docling_document(text)`` is not None."""
    document = convert_markdown_to_docling_document(text)
    return document is not None
362226
363227
@@ -384,14 +248,14 @@ def convert_html_to_docling_document(text: str) -> DoclingDocument | None:
384248
385249
def validate_html_to_docling_document(text: str) -> bool:
    """Return True when ``convert_html_to_docling_document(text)`` is not None."""
    document = convert_html_to_docling_document(text)
    return document is not None
389253
390254
391255def insert_document (
392256 * , item : NodeItem , doc : DoclingDocument , updated_doc : DoclingDocument
393257) -> DoclingDocument :
394- logger .info (f"inserting new document at item { item .self_ref } " )
258+ # logger.info(f"inserting new document at item {item.self_ref}")
395259
396260 group_item = GroupItem (
397261 label = GroupLabel .UNSPECIFIED ,
0 commit comments