Improvements and corrections

Maksym Lysak · Maksym Lysak · commit bdc7b0a14275 · 2025-11-14T12:54:11.000+01:00
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
diff --git a/docling/utils/api_image_request.py b/docling/utils/api_image_request.py
@@ -23,51 +23,68 @@ def api_image_request(
     **params,
 ) -> Tuple[str, Optional[int], VlmStopReason]:
     img_io = BytesIO()
-    image.save(img_io, "PNG")
-    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-    messages = [
-        {
-            "role": "user",
-            "content": [
+    image = image.copy()
+    image = image.convert("RGBA")
+    good_image = True
+    try:
+        image.save(img_io, "PNG")
+    except:
+        good_image = False
+        _log.error("Error, corrupter PNG of size: {}".format(image.size))
+
+    if good_image:
+        try:
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+            messages = [
                 {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                },
-                {
-                    "type": "text",
-                    "text": prompt,
-                },
-            ],
-        }
-    ]
-
-    payload = {
-        "messages": messages,
-        **params,
-    }
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                        },
+                        {
+                            "type": "text",
+                            "text": prompt,
+                        },
+                    ],
+                }
+            ]
+
+            payload = {
+                "messages": messages,
+                **params,
+            }
+
+            headers = headers or {}
+
+            r = requests.post(
+                str(url),
+                headers=headers,
+                json=payload,
+                timeout=timeout,
+            )
+            if not r.ok:
+                _log.error(f"Error calling the API. Response was {r.text}")
+                # image.show()
+            # r.raise_for_status()
+
+            api_resp = OpenAiApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.choices[0].message.content.strip()
+            num_tokens = api_resp.usage.total_tokens
+            stop_reason = (
+                VlmStopReason.LENGTH
+                if api_resp.choices[0].finish_reason == "length"
+                else VlmStopReason.END_OF_SEQUENCE
+            )
 
-    headers = headers or {}
-
-    r = requests.post(
-        str(url),
-        headers=headers,
-        json=payload,
-        timeout=timeout,
-    )
-    if not r.ok:
-        _log.error(f"Error calling the API. Response was {r.text}")
-    r.raise_for_status()
-
-    api_resp = OpenAiApiResponse.model_validate_json(r.text)
-    generated_text = api_resp.choices[0].message.content.strip()
-    num_tokens = api_resp.usage.total_tokens
-    stop_reason = (
-        VlmStopReason.LENGTH
-        if api_resp.choices[0].finish_reason == "length"
-        else VlmStopReason.END_OF_SEQUENCE
-    )
-
-    return generated_text, num_tokens, stop_reason
+            return generated_text, num_tokens, 
+        except Exception as e:
+            _log.error(f"Error, could not process request: {e}")
+            return "", 0, "bad request"
+    else:
+        return "", 0, "bad image"
 
 
 def api_image_request_streaming(
diff --git a/docs/examples/post_process_ocr_with_vlm.py b/docs/examples/post_process_ocr_with_vlm.py
@@ -1,6 +1,7 @@
 import argparse
 import logging
 import os
+import re
 from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
@@ -17,6 +18,7 @@
     DocItem,
     GraphCell,
     KeyValueItem,
+    FormItem,
     PictureItem,
     RichTableCell,
     TableCell,
@@ -54,10 +56,29 @@
 LM_STUDIO_MODEL = "nanonets-ocr2-3b"
 
 DEFAULT_PROMPT = "Extract the text from the above document as if you were reading it naturally. Output pure text, no html and no markdown. Pay attention on line breaks and don't miss text after line break. Put all text in one line."
-VERBOSE = False
+VERBOSE = True
 SHOW_IMAGE = False
 
 
+def safe_crop(img: Image.Image, bbox):  # crop `img` to `bbox`, first clamping the box to the image bounds
+    left, top, right, bottom = bbox  # bbox must unpack into exactly four values, in this order
+    # Clamp each coordinate into [0, img.width] (x edges) or [0, img.height] (y edges)
+    left   = max(0, min(left,   img.width))
+    top    = max(0, min(top,    img.height))
+    right  = max(0, min(right,  img.width))
+    bottom = max(0, min(bottom, img.height))
+    return img.crop((left, top, right, bottom))  # NOTE(review): edges are clamped independently; nothing here enforces left <= right or top <= bottom
+
+
+def no_long_repeats(s: str, threshold: int) -> bool:
+    """
+    Returns False if the string `s` contains more than `threshold`
+    identical characters in a row, otherwise True.
+    """
+    pattern = r'(.)\1{' + str(threshold) + ',}'  # one captured char followed by >= threshold copies of it; no re.DOTALL, so `.` never matches a newline
+    return re.search(pattern, s) is None  # None means no such run was found anywhere in `s`
+
+
 class PostOcrEnrichmentElement(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -136,7 +157,7 @@ def prepare_element(
         allowed = (DocItem, TableItem, GraphCell)
         assert isinstance(element, allowed)
 
-        if isinstance(element, KeyValueItem):
+        if isinstance(element, (KeyValueItem, FormItem)):
             # Yield from the graphCells inside here.
             result = []
             for c in element.graph.cells:
@@ -164,6 +185,9 @@ def prepare_element(
                     cropped_image = conv_res.document.pages[
                         page_ix
                     ].image.pil_image.crop(expanded_bbox.as_tuple())
+
+                    # cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
+
                     # cropped_image.show()
                     result.append(
                         PostOcrEnrichmentElement(item=c, image=[cropped_image])
@@ -202,6 +226,8 @@ def prepare_element(
                                 cropped_image = conv_res.document.pages[
                                     page_ix
                                 ].image.pil_image.crop(expanded_bbox.as_tuple())
+
+                                # cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
                                 # cropped_image.show()
                                 result.append(
                                     PostOcrEnrichmentElement(
@@ -234,15 +260,27 @@ def prepare_element(
                 ):
                     good_bbox = False
 
-                if good_bbox:
-                    cropped_image = conv_res.document.pages[
-                        page_ix
-                    ].image.pil_image.crop(expanded_bbox.as_tuple())
-                    multiple_crops.append(cropped_image)
-                    # cropped_image.show()
+                if hasattr(element, "text"):
+                    if good_bbox:
+                        cropped_image = conv_res.document.pages[
+                            page_ix
+                        ].image.pil_image.crop(expanded_bbox.as_tuple())
+                        # cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
+
+                        multiple_crops.append(cropped_image)
+                        print("")
+                        print("cropped image size: {}".format(cropped_image.size))
+                        print(type(element))
+                        if hasattr(element, "text"):
+                            print("OLD TEXT: {}".format(element.text))
+                        # cropped_image.show()
+                else:
+                    print("Not a text element")
             if len(multiple_crops) > 0:
+                # good crops
                 return [PostOcrEnrichmentElement(item=element, image=multiple_crops)]
             else:
+                # nothing
                 return []
 
     @classmethod
@@ -260,8 +298,9 @@ def __init__(
     ):
         self.enabled = enabled
         self.options = options
-        self.concurrency = 4
+        self.concurrency = 2
         self.expansion_factor = 0.05
+        # self.expansion_factor = 0.0
         self.elements_batch_size = 4
         self._accelerator_options = accelerator_options
         self._artifacts_path = (
@@ -282,7 +321,8 @@ def _api_request(image: Image.Image) -> str:
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
-                timeout=self.options.timeout,
+                # timeout=self.options.timeout,
+                timeout=30,
                 headers=self.options.headers,
                 **self.options.params,
             )
@@ -343,36 +383,36 @@ def clean_html_tags(text):
                 return text
 
             output = clean_html_tags(output).strip()
-
-            if VERBOSE:
-                if isinstance(item, (TextItem)):
-                    print(f"OLD TEXT: {item.text}")
-
-            # Re-populate text
-            if isinstance(item, (TextItem, GraphCell)):
-                if img_ind > 0:
-                    # Concat texts across several provenances
-                    item.text += " " + output
-                    item.orig += " " + output
-                else:
+            if no_long_repeats(output, 50):
+                if VERBOSE:
+                    if isinstance(item, (TextItem)):
+                        print(f"OLD TEXT: {item.text}")
+
+                # Re-populate text
+                if isinstance(item, (TextItem, GraphCell)):
+                    if img_ind > 0:
+                        # Concat texts across several provenances
+                        item.text += " " + output
+                        item.orig += " " + output
+                    else:
+                        item.text = output
+                        item.orig = output
+                elif isinstance(item, (TableCell, RichTableCell)):
                     item.text = output
-                    item.orig = output
-            elif isinstance(item, (TableCell, RichTableCell)):
-                item.text = output
-            elif isinstance(item, PictureItem):
-                pass
-            else:
-                raise ValueError(f"Unknown item type: {type(item)}")
+                elif isinstance(item, PictureItem):
+                    pass
+                else:
+                    raise ValueError(f"Unknown item type: {type(item)}")
 
-            if VERBOSE:
-                if isinstance(item, (TextItem)):
-                    print(f"NEW TEXT: {item.text}")
+                if VERBOSE:
+                    if isinstance(item, (TextItem)):
+                        print(f"NEW TEXT: {item.text}")
 
-            # Take care of charspans for relevant types
-            if isinstance(item, GraphCell):
-                item.prov.charspan = (0, len(item.text))
-            elif isinstance(item, TextItem):
-                item.prov[0].charspan = (0, len(item.text))
+                # Take care of charspans for relevant types
+                if isinstance(item, GraphCell):
+                    item.prov.charspan = (0, len(item.text))
+                elif isinstance(item, TextItem):
+                    item.prov[0].charspan = (0, len(item.text))
 
             yield item
 
@@ -382,7 +422,8 @@ def convert_pdf(pdf_path: Path, out_intermediate_json: Path):
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
     pipeline_options.generate_picture_images = True
-    pipeline_options.images_scale = 4.0
+    # pipeline_options.images_scale = 4.0
+    pipeline_options.images_scale = 2.0
 
     doc_converter = (
         DocumentConverter(  # all of the below is optional, has internal defaults.
@@ -424,6 +465,7 @@ def post_process_json(in_json: Path, out_final_json: Path):
         )
     )
 
+    # try:
     doc_converter = DocumentConverter(
         format_options={
             InputFormat.JSON_DOCLING: FormatOption(
@@ -440,6 +482,8 @@ def post_process_json(in_json: Path, out_final_json: Path):
     md = result.document.export_to_markdown()
     print("*** MARKDOWN ***")
     print(md)
+    # except:
+    #     print("ERROR IN OCR for: {}".format(in_json))
 
 
 def process_pdf(pdf_path: Path, scratch_dir: Path, out_dir: Path):