Merge pull request #392 from datalab-to/dev

VikParuchuri · web-flow · commit 80e9a7e986ae · 2025-06-11T15:53:44.000-04:00
Dev
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -7,10 +7,7 @@ env:
 
 jobs:
   build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, windows-latest]
+    runs-on: t4_gpu
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python 3.11
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -2,12 +2,12 @@ name: Unit tests
 
 on: [push]
 
-env:
-  TORCH_DEVICE: "cpu"
-
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [t4_gpu, ubuntu-latest, windows-latest]
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python 3.11
diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml
@@ -2,12 +2,9 @@ name: Test CLI scripts
 
 on: [push]
 
-env:
-  TORCH_DEVICE: "cpu"
-
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: t4_gpu
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python 3.11
diff --git a/benchmark/texify.py b/benchmark/texify.py
@@ -38,9 +38,12 @@ def score_text(predictions, references):
     return sum(lev_dist) / len(lev_dist)
 
 
-def inference_texify(source_data, predictor: RecognitionPredictor):
+def inference_texify(
+    source_data, predictor: RecognitionPredictor, line_mode: bool = False
+):
     images = [sd["image"] for sd in source_data]
-    tasks = [TaskNames.block_without_boxes] * len(images)
+    mode = TaskNames.ocr_with_boxes if line_mode else TaskNames.block_without_boxes
+    tasks = [mode] * len(images)
     bboxes = [[[0, 0, image.width, image.height]] for image in images]
     texify_predictions: List[OCRResult] = predictor(images, tasks, bboxes=bboxes)
     out_data = [
@@ -70,15 +73,18 @@ def inference_texify(source_data, predictor: RecognitionPredictor):
 @click.option(
     "--max_rows", type=int, help="Maximum number of images to benchmark.", default=None
 )
-def main(ds_name: str, results_dir: str, max_rows: int):
+@click.option(
+    "--line_mode", is_flag=True, help="Use line mode for texify.", default=False
+)
+def main(ds_name: str, results_dir: str, max_rows: int, line_mode: bool):
     predictor = RecognitionPredictor()
     ds = datasets.load_dataset(ds_name, split="train")
 
     if max_rows:
         ds = ds.filter(lambda x, idx: idx < max_rows, with_indices=True)
 
     start = time.time()
-    predictions = inference_texify(ds, predictor)
+    predictions = inference_texify(ds, predictor, line_mode)
     time_taken = time.time() - start
 
     text = [p["text"] for p in predictions]
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.14.5"
+version = "0.14.6"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"
diff --git a/surya/recognition/__init__.py b/surya/recognition/__init__.py
@@ -34,6 +34,7 @@
     detect_repeat_token,
     prediction_to_polygon_batch,
     unwrap_math,
+    clean_math_tags,
 )
 from surya.recognition.schema import TextLine, OCRResult, TextChar
 from surya.common.surya.schema import TaskNames
@@ -917,6 +918,7 @@ def __call__(
                     )
                     text = "".join([char.text for char in text_line])
                     text = unwrap_math(text)
+                    text = clean_math_tags(text)
                     lines.append(
                         TextLine(
                             text=text,
diff --git a/surya/recognition/util.py b/surya/recognition/util.py
@@ -27,6 +27,52 @@ def unwrap_math(text: str) -> str:
     return text
 
 
+MATH_BLOCK = re.compile(r"(<math\b[^>]*>)(.*?)</math>", flags=re.I | re.S)
+STRIP_TAGS = re.compile(r"</?(?:br|u|del|mark|i|b|sup|sub)\b[^>]*>", flags=re.I | re.S)
+# Any opening or closing math tag. The \b keeps look-alike names such as
+# <mathbb> from being counted, matching the boundary MATH_BLOCK already uses.
+MATH_TAG = re.compile(r"(</?math\b[^>]*>)", flags=re.I)
+
+
+def clean_math_tags(html: str) -> str:
+    """Remove formatting tags inside math blocks and drop orphan closing math tags.
+
+    Every well-formed ``<math ...>...</math>`` span has the tags listed in
+    ``STRIP_TAGS`` removed from its contents; a span whose contents are empty
+    or whitespace-only after stripping is removed entirely. Closing math
+    tags that have no preceding unmatched opening tag are then discarded.
+    Unmatched opening tags and all text outside math blocks are left as-is.
+
+    Args:
+        html: Text that may contain ``<math>`` markup.
+
+    Returns:
+        The cleaned text.
+    """
+
+    def _strip_inner(match):
+        inner = STRIP_TAGS.sub("", match.group(2))
+        return f"{match.group(1)}{inner}</math>" if inner.strip() else ""
+
+    cleaned = MATH_BLOCK.sub(_strip_inner, html)
+
+    depth = 0  # number of currently unclosed opening math tags
+    parts = []
+    # split() with a capturing group alternates plain text (even index)
+    # and math tags (odd index).
+    for index, token in enumerate(MATH_TAG.split(cleaned)):
+        if index % 2 == 0:
+            parts.append(token)
+        elif token.startswith("</"):
+            if depth:  # keep a closing tag only if it matches an open one
+                depth -= 1
+                parts.append(token)
+        else:
+            depth += 1
+            parts.append(token)
+    return "".join(parts)
+
+
 def detect_repeat_token(predicted_tokens: List[int], max_repeats: int = 40):
     if len(predicted_tokens) < max_repeats:
         return False
diff --git a/tests/test_recognition.py b/tests/test_recognition.py
@@ -1,5 +1,6 @@
 import time
 from PIL import ImageDraw, Image
+from surya.recognition.util import clean_math_tags
 
 
 def test_recognition(recognition_predictor, detection_predictor, test_image):
@@ -49,3 +50,22 @@ def test_recognition_drop_repeats(recognition_predictor, detection_predictor):
     assert len(recognition_results) == 1
     result = recognition_results[0].text_lines
     assert result[0].text == ""
+
+
+def test_recognition_clean_math():
+    """Tags inside a math block are stripped and an orphan closing tag is dropped."""
+    raw = """<math display="block">na_n^{1+2r} \\text{cov}(\\hat{f}_n^{(r)}(x), \\hat{f}_n^{(r)}(y)) = \\frac{1}{n} \\sum_{j=1}^n \\frac{a_n^{1+2r}}{a_j^{1+2r}} \\text{cov}\\left(K^{(r)}\\left(\\frac{x-X_j}{a_j}\\right), K^{(r)}\\left(\\frac{y-X_j}{a_j}\\right)\\right) <br>+ \\frac{a_n^{1+2r}}{n} \\sum_{\\substack{j \\neq k \\\\ 1 \\le j, k \\le n}} \\frac{1}{(a_j a_k)^{1+r}} \\text{cov}\\left(K^{(r)}\\left(\\frac{x-X_j}{a_j}\\right), K^{(r)}\\left(\\frac{y-X_k}{a_k}\\right)\\right) <br>=: I_1 + I_2.</math> (1.7)</math>'"""
+    cleaned = clean_math_tags(raw)
+
+    assert cleaned.count("<math") == 1, "Should keep the opening math tag"
+    assert cleaned.count("</math>") == 1, "Should have one closing math tag"
+    assert "<br>" not in cleaned, "Should not have <br> tags in cleaned math"
+    assert cleaned.endswith("</math> (1.7)'"), "Should keep text after the math block"
+
+
+def test_recognition_clean_math_preserve_text():
+    """Text outside math blocks, including unknown tags, passes through unchanged."""
+    text = """Hello, this is a sentence with <math display="inline">x^2 + y^2 = z^2</math> and some text after it, with a weird tag <hello> and <goodbye>."""
+    clean_text = clean_math_tags(text)
+
+    assert clean_text == text