Merge pull request #329 from VikParuchuri/dev

VikParuchuri · web-flow · commit 7e5ac9d5afce · 2025-02-28T12:16:51.000-08:00
Dev
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.12.1"
+version = "0.13.0"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"
diff --git a/surya/common/polygon.py b/surya/common/polygon.py
@@ -86,6 +86,21 @@ def merge(self, other):
         y2 = max(self.bbox[3], other.bbox[3])
         self.polygon = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
 
+    def expand(self, x_margin: float, y_margin: float):
+        new_polygon = []
+        x_margin = x_margin * self.width
+        y_margin = y_margin * self.height
+        for idx, poly in enumerate(self.polygon):
+            if idx == 0:
+                new_polygon.append([int(poly[0] - x_margin), int(poly[1] - y_margin)])
+            elif idx == 1:
+                new_polygon.append([int(poly[0] + x_margin), int(poly[1] - y_margin)])
+            elif idx == 2:
+                new_polygon.append([int(poly[0] + x_margin), int(poly[1] + y_margin)])
+            elif idx == 3:
+                new_polygon.append([int(poly[0] - x_margin), int(poly[1] + y_margin)])
+        self.polygon = new_polygon
+
     def intersection_polygon(self, other) -> List[List[float]]:
         new_poly = []
         for i in range(4):
diff --git a/surya/common/s3.py b/surya/common/s3.py
@@ -129,6 +129,4 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
                     print(f"Failed to download {pretrained_model_name_or_path} after {retries} attempts.")
                     raise e  # Reraise exception after max retries
 
-            pretrained_model_name_or_path = local_path
-        
-        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return super().from_pretrained(local_path, *args, **kwargs)
diff --git a/surya/detection/heatmap.py b/surya/detection/heatmap.py
@@ -2,7 +2,7 @@
 
 import cv2
 import numpy as np
-from PIL import Image, ImageDraw
+from PIL import Image
 
 from surya.common.util import clean_boxes, rescale_bbox
 from surya.detection.affinity import get_vertical_lines
@@ -122,6 +122,7 @@ def get_and_clean_boxes(textmap, processor_size, image_size, text_threshold=None
     bboxes = clean_boxes(bboxes)
     return bboxes
 
+
 def parallel_get_lines(preds, orig_sizes, include_maps=False):
     heatmap, affinity_map = preds
     heat_img, aff_img = None, None
@@ -143,18 +144,24 @@ def parallel_get_lines(preds, orig_sizes, include_maps=False):
     return result
 
 def parallel_get_boxes(preds, orig_sizes, include_maps=False):
-    heatmap, _ = preds
+    heatmap, affinity_map = preds
     heat_img, aff_img = None, None
+
     if include_maps:
         heat_img = Image.fromarray((heatmap * 255).astype(np.uint8))
+        aff_img = Image.fromarray((affinity_map * 255).astype(np.uint8))
     heatmap_size = list(reversed(heatmap.shape))
     bboxes = get_and_clean_boxes(heatmap, heatmap_size, orig_sizes)
+    for box in bboxes:
+        # Only expand horizontal boxes; vertical boxes (height >= 3x width) are left as-is
+        if box.height < 3 * box.width:
+            box.expand(x_margin=0, y_margin=settings.DETECTOR_BOX_Y_EXPAND_MARGIN)
 
     result = TextDetectionResult(
         bboxes=bboxes,
         vertical_lines=[],
         heatmap=heat_img,
-        affinity_map=None,
+        affinity_map=aff_img,
         image_bbox=[0, 0, orig_sizes[0], orig_sizes[1]]
     )
     return result
diff --git a/surya/scripts/streamlit_app.py b/surya/scripts/streamlit_app.py
@@ -75,7 +75,7 @@ def text_detection(img) -> (Image.Image, TextDetectionResult):
 def layout_detection(img) -> (Image.Image, LayoutResult):
     pred = predictors["layout"]([img])[0]
     polygons = [p.polygon for p in pred.bboxes]
-    labels = [f"{p.label}-{p.position}" for p in pred.bboxes]
+    labels = [f"{p.label}-{p.position}-{round(p.top_k[p.label], 2)}" for p in pred.bboxes]
     layout_img = draw_polys_on_image(polygons, img.copy(), labels=labels, label_font_size=18)
     return layout_img, pred
 
diff --git a/surya/settings.py b/surya/settings.py
@@ -48,13 +48,14 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # Text detection
     DETECTOR_BATCH_SIZE: Optional[int] = None # Defaults to 2 for CPU/MPS, 32 otherwise
-    DETECTOR_MODEL_CHECKPOINT: str = "s3://text_detection/2025_02_18"
+    DETECTOR_MODEL_CHECKPOINT: str = "s3://text_detection/2025_02_28"
     DETECTOR_BENCH_DATASET_NAME: str = "vikp/doclaynet_bench"
     DETECTOR_IMAGE_CHUNK_HEIGHT: int = 1400 # Height at which to slice images vertically
     DETECTOR_TEXT_THRESHOLD: float = 0.6 # Threshold for text detection (above this is considered text)
     DETECTOR_BLANK_THRESHOLD: float = 0.35 # Threshold for blank space (below this is considered blank)
     DETECTOR_POSTPROCESSING_CPU_WORKERS: int = min(8, os.cpu_count()) # Number of workers for postprocessing
     DETECTOR_MIN_PARALLEL_THRESH: int = 3 # Minimum number of images before we parallelize
+    DETECTOR_BOX_Y_EXPAND_MARGIN: float = 0.025  # Fraction of box height added to both the top and bottom of each non-vertical detected box
     COMPILE_DETECTOR: bool = False
 
     # Inline math detection