
Commit d349f30

Merge pull request #315 from VikParuchuri/dev

Inline math fixes

2 parents: db63214 + 53007e2

5 files changed: +40 -10 lines


.github/workflows/benchmarks.yml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ jobs:
         run: |
           poetry run python benchmark/detection.py --max_rows 2
           poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/det_bench/results.json --bench_type detection
-      - name: Run inline detection benchmarj
+      - name: Run inline detection benchmark
         run: |
           poetry run python benchmark/inline_detection.py --max_rows 5
           poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/inline_math_bench/results.json --bench_type inline_detection

surya/detection/heatmap.py

Lines changed: 16 additions & 2 deletions
@@ -166,8 +166,22 @@ def parallel_get_inline_boxes(preds, orig_sizes, text_boxes, include_maps=False)
     for text_box in text_boxes:
         text_box_reshaped = rescale_bbox(text_box, orig_sizes, heatmap_size)
         x1, y1, x2, y2 = text_box_reshaped
-        heatmap[y2:y2+3, x1:x2] = 0
-    bboxes = get_and_clean_boxes(heatmap, heatmap_size, orig_sizes, text_threshold=settings.INLINE_MATH_THRESHOLD)
+
+        # Blank out above and below text boxes, so we avoid merging inline math blocks together
+        heatmap[y2:y2+settings.INLINE_MATH_TEXT_BLANK_PX, x1:x2] = 0
+        heatmap[y1-settings.INLINE_MATH_TEXT_BLANK_PX:y1, x1:x2] = 0
+        heatmap[y1:y2, x2:x2+settings.INLINE_MATH_TEXT_BLANK_PX] = 0
+        heatmap[y1:y2, x1-settings.INLINE_MATH_TEXT_BLANK_PX:x1] = 0
+
+    bboxes = get_and_clean_boxes(
+        heatmap,
+        heatmap_size,
+        orig_sizes,
+        text_threshold=settings.INLINE_MATH_THRESHOLD,
+        low_text=settings.INLINE_MATH_BLANK_THRESHOLD
+    )
+
+    bboxes = [bbox for bbox in bboxes if bbox.area > settings.INLINE_MATH_MIN_AREA]
 
     heat_img, aff_img = None, None
     if include_maps:
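The new blanking step above can be pictured on a toy heatmap. The sketch below is not the surya implementation: it uses numpy, a stand-in constant for settings.INLINE_MATH_TEXT_BLANK_PX, and made-up box coordinates, and only shows how zeroing a thin margin around each text box keeps inline-math activations from adjacent lines from merging into one region.

import numpy as np

BLANK_PX = 2  # stands in for settings.INLINE_MATH_TEXT_BLANK_PX (hypothetical value matching the new default)

heatmap = np.random.rand(100, 200).astype(np.float32)   # fake inline-math heatmap, indexed [y, x]
text_boxes = [(20, 10, 180, 30), (20, 34, 180, 54)]     # made-up (x1, y1, x2, y2) text boxes

for x1, y1, x2, y2 in text_boxes:
    heatmap[y2:y2 + BLANK_PX, x1:x2] = 0                 # thin strip just below the text box
    heatmap[max(y1 - BLANK_PX, 0):y1, x1:x2] = 0         # thin strip just above it
    heatmap[y1:y2, x2:x2 + BLANK_PX] = 0                 # thin strip just to the right
    heatmap[y1:y2, max(x1 - BLANK_PX, 0):x1] = 0         # thin strip just to the left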

surya/scripts/streamlit_app.py

Lines changed: 18 additions & 5 deletions
@@ -55,15 +55,20 @@ def ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=1
     return label, results.labels
 
 
-def text_detection(img) -> (Image.Image, TextDetectionResult):
+def inline_detection(img) -> (Image.Image, TextDetectionResult):
     text_pred = predictors["detection"]([img])[0]
-    text_polygons = [p.polygon for p in text_pred.bboxes]
     text_boxes = [p.bbox for p in text_pred.bboxes]
-    det_img = draw_polys_on_image(text_polygons, img.copy())
-
+
     inline_pred = predictors["inline_detection"]([img], [text_boxes], include_maps=True)[0]
     inline_polygons = [p.polygon for p in inline_pred.bboxes]
-    det_img = draw_polys_on_image(inline_polygons, det_img, color='blue')
+    det_img = draw_polys_on_image(inline_polygons, img.copy(), color='blue')
+    return det_img, text_pred, inline_pred
+
+
+def text_detection(img) -> (Image.Image, TextDetectionResult):
+    text_pred = predictors["detection"]([img])[0]
+    text_polygons = [p.polygon for p in text_pred.bboxes]
+    det_img = draw_polys_on_image(text_polygons, img.copy())
     return det_img, text_pred, inline_pred
 
 
@@ -193,6 +198,7 @@ def page_counter(pdf_file):
     page_number = None
 
 run_text_det = st.sidebar.button("Run Text Detection")
+run_inline_det = st.sidebar.button("Run Inline Math Detection")
 run_text_rec = st.sidebar.button("Run OCR")
 run_layout_det = st.sidebar.button("Run Layout Analysis")
 run_table_rec = st.sidebar.button("Run Table Rec")
@@ -211,6 +217,13 @@ def page_counter(pdf_file):
         st.json(text_pred.model_dump(exclude=["heatmap", "affinity_map"]), expanded=True)
         st.json(inline_pred.model_dump(exclude=["heatmap", "affinity_map"]), expanded=True)
 
+if run_inline_det:
+    det_img, text_pred, inline_pred = inline_detection(pil_image)
+    with col1:
+        st.image(det_img, caption="Detected Inline Math", use_container_width=True)
+        st.json(text_pred.model_dump(exclude=["heatmap", "affinity_map"]), expanded=True)
+        st.json(inline_pred.model_dump(exclude=["heatmap", "affinity_map"]), expanded=True)
+
 
 # Run layout
 if run_layout_det:
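For reference, the call pattern inside the new inline_detection() helper looks roughly like the sketch below when used outside Streamlit. It is not self-contained: predictors is assumed to be the same dict of loaded models the app builds at startup, and page_image a PIL image of a document page.

# Pseudo-usage of the inline math predictor, mirroring the helper above.
text_pred = predictors["detection"]([page_image])[0]
text_boxes = [p.bbox for p in text_pred.bboxes]

# The inline math predictor is given the text boxes so it can blank them out of its heatmap.
inline_pred = predictors["inline_detection"]([page_image], [text_boxes], include_maps=True)[0]

for box in inline_pred.bboxes:
    print(box.bbox)  # [x1, y1, x2, y2] of each detected inline-math region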

surya/settings.py

Lines changed: 4 additions & 1 deletion
@@ -57,8 +57,11 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # Inline math detection
     INLINE_MATH_MODEL_CHECKPOINT: str = "datalab-to/inline_math_det0@75aafc7aa3d494ece6496d28038c91f0d2518a43"
-    INLINE_MATH_THRESHOLD: float = 0.9  # Threshold for inline math detection (above this is considered inline-math)
+    INLINE_MATH_THRESHOLD: float = 0.8  # Threshold for inline math detection (above this is considered inline-math)
+    INLINE_MATH_BLANK_THRESHOLD: float = 0.5  # Threshold for blank space (below this is considered blank)
     INLINE_MATH_BENCH_DATASET_NAME: str = "datalab-to/inline_detection_bench"
+    INLINE_MATH_TEXT_BLANK_PX: int = 2  # How many pixels to blank out at the bottom of each text line
+    INLINE_MATH_MIN_AREA: int = 100  # Minimum area for inline math detection
 
     # Text recognition
     RECOGNITION_MODEL_CHECKPOINT: str = "vikp/surya_rec2@6611509b2c3a32c141703ce19adc899d9d0abf41"
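The two thresholds added above feed the text_threshold and low_text arguments of get_and_clean_boxes in heatmap.py. The sketch below shows the usual CRAFT-style double-threshold reading of such a pair, as an illustration only; the actual logic inside get_and_clean_boxes may differ.

import numpy as np
from scipy import ndimage

def double_threshold_regions(heatmap, text_threshold=0.8, low_text=0.5):
    # Grow connected regions from the permissive (low_text) threshold...
    labels, num = ndimage.label(heatmap > low_text)
    regions = []
    for i in range(1, num + 1):
        mask = labels == i
        # ...but keep a region only if its peak score clears the strict (text_threshold) one.
        if heatmap[mask].max() >= text_threshold:
            regions.append(mask)
    return regions

# Example with a random map; real heatmaps come from the inline math detection model.
print(len(double_threshold_regions(np.random.rand(64, 64))))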

surya/texify/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ def batch_texify(self, images: List[Image.Image], batch_size: int | None) -> Tup
             batch_confidences = torch.sum(sequence_scores, dim=-1) / torch.sum(sequence_scores != 0, dim=-1)
             batch_confidences = batch_confidences.cpu()[:current_batch_size]
             batch_predictions = batch_predictions.cpu()[:current_batch_size, 1:]  # Cut off initial token
-            detected_text = self.processor.tokenizer.batch_decode(batch_predictions)
+            detected_text = self.processor.tokenizer.batch_decode(batch_predictions, skip_special_tokens=True)
 
             batch_confidences = batch_confidences.tolist()
 
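The texify change only adds skip_special_tokens=True to batch_decode, which keeps markers like the end-of-sequence token out of the decoded LaTeX. The sketch below illustrates the flag's effect with a generic Hugging Face tokenizer (t5-small here, not the texify tokenizer, and a plain string standing in for LaTeX output).

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
ids = tokenizer(["Hello world"], return_tensors="pt").input_ids

print(tokenizer.batch_decode(ids))                            # e.g. ['Hello world</s>']
print(tokenizer.batch_decode(ids, skip_special_tokens=True))  # e.g. ['Hello world']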