Merge pull request #383 from VikParuchuri/dev

VikParuchuri · web-flow · commit 21b029f305b8 · 2025-06-02T08:36:58.000-07:00
Dev
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.14.3"
+version = "0.14.4"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"
diff --git a/surya/recognition/__init__.py b/surya/recognition/__init__.py
@@ -76,17 +76,17 @@ class RecognitionPrompt:
 class RecognitionPredictor(BasePredictor):
     model_loader_cls = RecognitionModelLoader
     batch_size = settings.RECOGNITION_BATCH_SIZE
-    torch_dtype = None      # No default, loader picks the dtype based on device properties - bf16/fp16
+    torch_dtype = None  # No default, loader picks the dtype based on device properties - bf16/fp16
     default_batch_sizes = {"cpu": 32, "mps": 64, "cuda": 256, "xla": 128}
-    encoder_chunk_size: int = 4096
+    encoder_chunk_size: int = 4096  # Default chunk size
     encoder_chunk_sizes = {"cpu": 4096, "mps": 4096, "cuda": 32768, "xla": 32768}
     min_prefill_ratio: int = 0.2
     min_trim_length: int = 50
     tasks = {
         TaskNames.ocr_with_boxes: {
             "needs_bboxes": True,
             "img_size": (1024, 256),  # 370 max tokens
-            "max_tokens": 256,
+            "max_tokens": 224,
         },
         TaskNames.ocr_without_boxes: {
             "needs_bboxes": False,
@@ -111,8 +111,11 @@ def __init__(self, checkpoint=None, device=settings.TORCH_DEVICE_MODEL, dtype=No
             self.processor.pad_token_id, device=self.model.device, dtype=torch.long
         )
 
-    def get_encoder_chunk_size(self):
-        chunk_size = self.encoder_chunk_size
+    def get_encoder_chunk_size(self) -> int:
+        if settings.RECOGNITION_CHUNK_SIZE is not None:
+            return settings.RECOGNITION_CHUNK_SIZE  # Explicit settings override takes precedence
+
+        chunk_size = self.encoder_chunk_size  # Class-level default; a per-device entry below may replace it
         if settings.TORCH_DEVICE_MODEL in self.encoder_chunk_sizes:
             if settings.TORCH_DEVICE_MODEL in self.encoder_chunk_sizes:
                 chunk_size = self.encoder_chunk_sizes[settings.TORCH_DEVICE_MODEL]
@@ -239,6 +242,8 @@ def slice_bboxes(
             == len(all_polygons)
             == len(all_text)
             == len(all_task_names)
+        ), (
+            f"Mismatch in lengths: {len(all_slices)}, {sum(slice_map)}, {len(all_polygons)}, {len(all_text)}, {len(all_task_names)}"
         )
 
         return {
@@ -593,7 +598,7 @@ def prediction_loop(
             current_inputs = self.maybe_trim_cache_padding(current_inputs)
             mark_step()
         pbar.close()
-        
+
         del self.kv_cache
         self.kv_cache = None
         torch.cuda.empty_cache()
@@ -636,12 +641,14 @@ def get_bboxes_text(
             # If the image is very out of distribution, we can get nonsense repeats, and we may need to drop the text entirely
             if drop_repeated_text and detect_repeat_token(image_tokens):
                 char_predictions.append(
-                    TextChar(
-                        text="",
-                        polygon=blank_bbox,
-                        confidence=0,
-                        bbox_valid=False,
-                    )
+                    [
+                        TextChar(
+                            text="",
+                            polygon=blank_bbox,
+                            confidence=0,
+                            bbox_valid=False,
+                        )
+                    ]
                 )
                 continue
 
@@ -772,7 +779,7 @@ def __call__(
         highres_images: List[Image.Image] | None = None,
         bboxes: List[List[List[int]]] | None = None,
         polygons: List[List[List[List[int]]]] | None = None,
-        input_text: List[str | None] | None = None,
+        input_text: List[List[str | None]] | None = None,
         sort_lines: bool = False,
         math_mode: bool = True,
         return_words: bool = False,
@@ -857,7 +864,11 @@ def __call__(
             batch_bboxes, image_sizes, bbox_size, bbox_size // 2
         )
         char_predictions = self.get_bboxes_text(
-            flat, predicted_tokens, scores, predicted_polygons
+            flat,
+            predicted_tokens,
+            scores,
+            predicted_polygons,
+            drop_repeated_text=drop_repeated_text,
         )
 
         char_predictions = sorted(zip(indices, char_predictions), key=lambda x: x[0])
@@ -886,7 +897,11 @@ def __call__(
                         )
                     )
                 else:
-                    confidence = float(np.mean([char.confidence for char in text_line]))
+                    confidence = (
+                        float(np.mean([char.confidence for char in text_line]))
+                        if len(text_line) > 0
+                        else 0
+                    )
                     poly_box = PolygonBox(polygon=polygon)
                     for char in text_line:
                         char.rescale(
diff --git a/surya/recognition/schema.py b/surya/recognition/schema.py
@@ -1,26 +1,36 @@
+import math
+import numpy as np
 from typing import Optional, List
 
-from pydantic import BaseModel
+from pydantic import BaseModel, field_validator
 
 from surya.common.polygon import PolygonBox
 
 
-class TextChar(PolygonBox):
+class BaseChar(PolygonBox):  # Shared base for TextChar/TextWord/TextLine: text plus a sanitized confidence
     text: str
+    confidence: Optional[float] = 0  # Defaults to 0; None/NaN inputs are mapped to 0 by validate_confidence
+
+    @field_validator("confidence", mode="before")  # mode="before": receives the raw input value
+    @classmethod
+    def validate_confidence(cls, v: float) -> float:
+        if v is None:  # An explicit None is treated as zero confidence
+            return 0
+        elif math.isnan(v) or np.isnan(v):  # NaN -> 0. NOTE(review): np.isnan looks redundant after math.isnan
+            return 0
+        return v
+
+
+class TextChar(BaseChar):  # text and confidence are inherited from BaseChar
     bbox_valid: bool = True  # This is false when the given bbox is not valid
-    confidence: Optional[float] = None
 
 
-class TextWord(PolygonBox):
-    text: str
+class TextWord(BaseChar):  # text and confidence are inherited from BaseChar
     bbox_valid: bool = True
-    confidence: Optional[float] = None
 
 
-class TextLine(PolygonBox):
-    text: str
+class TextLine(BaseChar):  # text/confidence inherited from BaseChar (confidence default is now 0, not None)
     chars: List[TextChar]  # Individual characters in the line
-    confidence: Optional[float] = None
     original_text_good: bool = False
     words: List[TextWord] | None = None
 
diff --git a/surya/settings.py b/surya/settings.py
@@ -82,6 +82,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
     RECOGNITION_BATCH_SIZE: Optional[int] = (
         None  # Defaults to 8 for CPU/MPS, 256 otherwise
     )
+    RECOGNITION_CHUNK_SIZE: Optional[int] = None
     RECOGNITION_RENDER_FONTS: Dict[str, str] = {
         "all": os.path.join(FONT_DIR, "GoNotoCurrent-Regular.ttf"),
         "zh": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),
diff --git a/tests/test_recognition.py b/tests/test_recognition.py
@@ -1,4 +1,5 @@
 import time
+from PIL import ImageDraw, Image
 
 
 def test_recognition(recognition_predictor, detection_predictor, test_image):
@@ -34,3 +35,17 @@ def test_recognition_input_text(recognition_predictor, detection_predictor, test
     text_lines = recognition_results[0].text_lines
     assert len(text_lines) == 4
     assert "Hello World" in text_lines[0].text
+
+
+def test_recognition_drop_repeats(recognition_predictor, detection_predictor):  # detection_predictor is not used in the body
+    image = Image.new("RGB", (1024, 128), "white")
+    draw = ImageDraw.Draw(image)
+    text = "a" * 80  # One character repeated 80 times; presumably meant to trip repeat detection
+    draw.text((5, 5), text, fill="black", font_size=24)  # NOTE(review): font_size= needs a recent Pillow (10.1+?) - confirm
+
+    recognition_results = recognition_predictor(
+        [image], None, bboxes=[[[0, 0, 1024, 128]]], drop_repeated_text=True  # Single bbox spanning the whole 1024x128 image
+    )
+    assert len(recognition_results) == 1  # One result per input image
+    result = recognition_results[0].text_lines
+    assert result[0].text == ""  # The repeated text is expected to be dropped, leaving an empty line

Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@ def TORCH_DEVICE_MODEL(self) -> str:`
`82`	`82`	`RECOGNITION_BATCH_SIZE: Optional[int] = (`
`83`	`83`	`None # Defaults to 8 for CPU/MPS, 256 otherwise`
`84`	`84`	`)`
	`85`	`+ RECOGNITION_CHUNK_SIZE: Optional[int] = None`
`85`	`86`	`RECOGNITION_RENDER_FONTS: Dict[str, str] = {`
`86`	`87`	`"all": os.path.join(FONT_DIR, "GoNotoCurrent-Regular.ttf"),`
`87`	`88`	`"zh": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),`