
Commit db63214

Merge pull request #312 from VikParuchuri/dev
Inline math model, new text detection model
2 parents 06a3cc6 + 4eb67cf commit db63214

36 files changed (+906 lines, -358 lines)
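
The headline change is a new inline math detection model that runs on top of text detection. A minimal usage sketch follows, inferred from benchmark/inline_detection.py in this commit rather than from documentation; the input image path is a hypothetical example.

# Usage sketch, inferred from benchmark/inline_detection.py in this commit (not official docs).
# "page.png" is a hypothetical input page.
from PIL import Image

from surya.detection import DetectionPredictor, InlineDetectionPredictor

det_predictor = DetectionPredictor()
inline_det_predictor = InlineDetectionPredictor()

images = [Image.open("page.png").convert("RGB")]

# Text line detection runs first; its boxes are reformatted into the inline math input format.
det_results = det_predictor(images)
text_boxes = [[b.bbox for b in result.bboxes] for result in det_results]

# Inline math detection, conditioned on the detected text lines.
inline_results = inline_det_predictor(images, text_boxes)
for result in inline_results:
    for box in result.bboxes:
        print(box.bbox)  # assumed to be [x1, y1, x2, y2] pixel coordinates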

.github/workflows/benchmarks.yml

Lines changed: 4 additions & 0 deletions

@@ -25,6 +25,10 @@ jobs:
         run: |
           poetry run python benchmark/detection.py --max_rows 2
           poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/det_bench/results.json --bench_type detection
+      - name: Run inline detection benchmark
+        run: |
+          poetry run python benchmark/inline_detection.py --max_rows 5
+          poetry run python benchmark/utils/verify_benchmark_scores.py results/benchmark/inline_math_bench/results.json --bench_type inline_detection
       - name: Run recognition benchmark test
         run: |
           poetry run python benchmark/recognition.py --max_rows 2

README.md

Lines changed: 1 addition & 1 deletion

@@ -388,7 +388,7 @@ For Google Cloud, I aligned the output from Google Cloud with the ground truth.

 | Model     | Time (s)   | Time per page (s)   | precision   | recall   |
 |-----------|------------|---------------------|-------------|----------|
-| surya     | 50.2099    | 0.196133            | 0.821061    | 0.956556 |
+| surya     | 47.2285    | 0.094452            | 0.835857    | 0.960807 |
 | tesseract | 74.4546    | 0.290838            | 0.631498    | 0.997694 |


benchmark/inline_detection.py

Lines changed: 107 additions & 0 deletions

@@ -0,0 +1,107 @@
+import collections
+import copy
+import json
+from pathlib import Path
+
+import click
+
+from benchmark.utils.metrics import precision_recall
+from surya.debug.draw import draw_polys_on_image
+from surya.input.processing import convert_if_not_rgb
+from surya.common.util import rescale_bbox
+from surya.settings import settings
+from surya.detection import DetectionPredictor, InlineDetectionPredictor
+
+import os
+import time
+from tabulate import tabulate
+import datasets
+
+
+@click.command(help="Benchmark inline math detection model.")
+@click.option("--results_dir", type=str, help="Path to JSON file with OCR results.", default=os.path.join(settings.RESULT_DIR, "benchmark"))
+@click.option("--max_rows", type=int, help="Maximum number of pdf pages to OCR.", default=100)
+@click.option("--debug", is_flag=True, help="Enable debug mode.", default=False)
+def main(results_dir: str, max_rows: int, debug: bool):
+    det_predictor = DetectionPredictor()
+    inline_det_predictor = InlineDetectionPredictor()
+
+    dataset = datasets.load_dataset(settings.INLINE_MATH_BENCH_DATASET_NAME, split=f"train[:{max_rows}]")
+    images = list(dataset["image"])
+    images = convert_if_not_rgb(images)
+    correct_boxes = []
+    for i, boxes in enumerate(dataset["bboxes"]):
+        img_size = images[i].size
+        # Rescale from normalized 0-1 vals to image size
+        correct_boxes.append([rescale_bbox(b, (1, 1), img_size) for b in boxes])
+
+    if settings.DETECTOR_STATIC_CACHE:
+        # Run through one batch to compile the model
+        det_predictor(images[:1])
+        inline_det_predictor(images[:1], [[]])
+
+    start = time.time()
+    det_results = det_predictor(images)
+
+    # Reformat text boxes to inline math input format
+    text_boxes = []
+    for result in det_results:
+        text_boxes.append([b.bbox for b in result.bboxes])
+
+    inline_results = inline_det_predictor(images, text_boxes)
+    surya_time = time.time() - start
+
+    result_path = Path(results_dir) / "inline_math_bench"
+    result_path.mkdir(parents=True, exist_ok=True)
+
+    page_metrics = collections.OrderedDict()
+    for idx, (sb, cb) in enumerate(zip(inline_results, correct_boxes)):
+        surya_boxes = [s.bbox for s in sb.bboxes]
+        surya_polys = [s.polygon for s in sb.bboxes]
+
+        surya_metrics = precision_recall(surya_boxes, cb)
+
+        page_metrics[idx] = {
+            "surya": surya_metrics,
+        }
+
+        if debug:
+            bbox_image = draw_polys_on_image(surya_polys, copy.deepcopy(images[idx]))
+            bbox_image.save(result_path / f"{idx}_bbox.png")
+
+    mean_metrics = {}
+    metric_types = sorted(page_metrics[0]["surya"].keys())
+    models = ["surya"]
+
+    for k in models:
+        for m in metric_types:
+            metric = []
+            for page in page_metrics:
+                metric.append(page_metrics[page][k][m])
+            if k not in mean_metrics:
+                mean_metrics[k] = {}
+            mean_metrics[k][m] = sum(metric) / len(metric)
+
+    out_data = {
+        "times": {
+            "surya": surya_time,
+        },
+        "metrics": mean_metrics,
+        "page_metrics": page_metrics
+    }
+
+    with open(result_path / "results.json", "w+", encoding="utf-8") as f:
+        json.dump(out_data, f, indent=4)
+
+    table_headers = ["Model", "Time (s)", "Time per page (s)"] + metric_types
+    table_data = [
+        ["surya", surya_time, surya_time / len(images)] + [mean_metrics["surya"][m] for m in metric_types],
+    ]
+
+    print(tabulate(table_data, headers=table_headers, tablefmt="github"))
+    print("Precision and recall are over the mutual coverage of the detected boxes and the ground truth boxes at a .5 threshold. There is a precision penalty for multiple boxes overlapping reference lines.")
+    print(f"Wrote results to {result_path}")
+
+
+if __name__ == "__main__":
+    main()
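
One detail worth noting in the benchmark above: ground-truth boxes in the dataset are stored normalized to 0-1, and rescale_bbox maps them onto each page's pixel size before scoring. A small illustration, with made-up values:

# Illustration of the rescaling step used in the benchmark; the box and page size are made-up values.
from surya.common.util import rescale_bbox

normalized_box = [0.25, 0.10, 0.75, 0.15]  # [x1, y1, x2, y2] in 0-1 coordinates
img_size = (1654, 2339)                    # (width, height) of the page in pixels

pixel_box = rescale_bbox(normalized_box, (1, 1), img_size)
print(pixel_box)  # roughly [413, 234, 1240, 351] after scaling from the (1, 1) reference frame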

benchmark/recognition.py

Lines changed: 34 additions & 5 deletions

@@ -10,6 +10,7 @@
 from surya.settings import settings
 from surya.recognition.languages import CODE_TO_LANGUAGE
 from benchmark.utils.tesseract import tesseract_ocr_parallel, surya_lang_to_tesseract, TESS_CODE_TO_LANGUAGE
+from benchmark.utils.textract import textract_ocr_parallel
 import os
 import datasets
 import json
@@ -22,22 +23,24 @@
 @click.option("--results_dir", type=str, help="Path to JSON file with OCR results.", default=os.path.join(settings.RESULT_DIR, "benchmark"))
 @click.option("--max_rows", type=int, help="Maximum number of pdf pages to OCR.", default=None)
 @click.option("--debug", is_flag=True, help="Enable debug mode.", default=False)
-@click.option("--tesseract", is_flag=True, help="Run tesseract instead of surya.", default=False)
+@click.option("--tesseract", is_flag=True, help="Run benchmarks on tesseract.", default=False)
+@click.option("--textract", is_flag=True, help="Run benchmarks on textract.", default=False)
 @click.option("--langs", type=str, help="Specify certain languages to benchmark.", default=None)
 @click.option("--tess_cpus", type=int, help="Number of CPUs to use for tesseract.", default=28)
+@click.option("--textract_cpus", type=int, help="Number of CPUs to use for textract.", default=28)
 @click.option("--specify_language", is_flag=True, help="Pass language codes into the model.", default=False)
-def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, langs: str, tess_cpus: int, specify_language: bool):
+def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, textract: bool, langs: str, tess_cpus: int, textract_cpus:int, specify_language: bool):
     rec_predictor = RecognitionPredictor()

     split = "train"
-    if max_rows:
-        split = f"train[:{max_rows}]"
-
     dataset = datasets.load_dataset(settings.RECOGNITION_BENCH_DATASET_NAME, split=split)

     if langs:
         langs = langs.split(",")
         dataset = dataset.filter(lambda x: x["language"] in langs, num_proc=4)
+
+    if max_rows and max_rows<len(dataset):
+        dataset = dataset.shuffle().select(range(max_rows))

     images = list(dataset["image"])
     images = convert_if_not_rgb(images)
@@ -121,6 +124,28 @@ def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, langs: s
         with open(os.path.join(result_path, "tesseract_scores.json"), "w+") as f:
             json.dump(tess_scores, f)

+    if textract:
+        start = time.time()
+        textract_predictions = textract_ocr_parallel(images, cpus=textract_cpus)
+        textract_time = time.time()-start
+
+        textract_scores = defaultdict(list)
+        for idx, (pred, ref_text, lang) in enumerate(zip(textract_predictions, line_text, lang_list)):
+            image_score = overlap_score(pred, ref_text)
+            for l in lang:
+                textract_scores[CODE_TO_LANGUAGE[l]].append(image_score)
+
+        flat_textract_scores = [s for l in textract_scores for s in textract_scores[l]]
+        benchmark_stats["textract"] = {
+            "avg_score": sum(flat_textract_scores) / len(flat_textract_scores),
+            "lang_scores": {l: sum(scores) / len(scores) for l, scores in textract_scores.items()},
+            "time_per_img": textract_time / len(images)
+        }
+        print(len(flat_textract_scores))
+
+        with open(os.path.join(result_path, "textract_scores.json"), "w+") as f:
+            json.dump(textract_scores, f)
+
     with open(os.path.join(result_path, "results.json"), "w+", encoding="utf-8") as f:
         json.dump(benchmark_stats, f)

@@ -133,6 +158,10 @@ def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, langs: s
         table_data.append(
             ["tesseract", benchmark_stats["tesseract"]["time_per_img"], benchmark_stats["tesseract"]["avg_score"]] + [benchmark_stats["tesseract"]["lang_scores"].get(l, 0) for l in key_languages]
         )
+    if textract:
+        table_data.append(
+            ["textract", benchmark_stats["textract"]["time_per_img"], benchmark_stats["textract"]["avg_score"]] + [benchmark_stats["textract"]["lang_scores"][l] for l in key_languages],
+        )

     print(tabulate(table_data, headers=table_headers, tablefmt="github"))
     print("Only a few major languages are displayed. See the result path for additional languages.")

benchmark/utils/textract.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+import os
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+import traceback
+
+from surya.input.processing import slice_bboxes_from_image
+from surya.recognition import RecognitionPredictor
+
+def textract_ocr(extractor, img):
+    try:
+        document = extractor.detect_document_text(file_source=img)
+        return [line.text for line in document.lines]
+    except:
+        traceback.print_exc()
+        return [None]
+
+def textract_ocr_parallel(imgs, cpus=None):
+    from textractor import Textractor  # Optional dependency
+
+    extractor = Textractor(profile_name='default')
+    parallel_cores = min(len(imgs), RecognitionPredictor().get_batch_size())
+    if not cpus:
+        cpus = os.cpu_count()
+    parallel_cores = min(parallel_cores, cpus)
+
+    with ThreadPoolExecutor(max_workers=parallel_cores) as executor:
+        textract_text = tqdm(executor.map(textract_ocr, [extractor]*len(imgs), imgs), total=len(imgs), desc="Running textract OCR")
+        textract_text = list(textract_text)
+    return textract_text
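
A usage sketch for the new helper, mirroring how benchmark/recognition.py calls it in this PR; it assumes AWS credentials exist for the 'default' profile that Textractor loads, and the image path is illustrative.

# Sketch only: mirrors the call in benchmark/recognition.py; "page_0.png" is a hypothetical file.
from PIL import Image

from benchmark.utils.textract import textract_ocr_parallel

images = [Image.open("page_0.png").convert("RGB")]

# cpus=28 mirrors the --textract_cpus default added to recognition.py in this PR.
predictions = textract_ocr_parallel(images, cpus=28)
for lines in predictions:
    print(lines)  # recognized text lines per page, or [None] if the Textract call failed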

benchmark/utils/verify_benchmark_scores.py

Lines changed: 7 additions & 0 deletions

@@ -18,6 +18,11 @@ def verify_det(data):
         raise ValueError("Scores do not meet the required threshold")


+def verify_inline_det(data):
+    scores = data["metrics"]["surya"]
+    if scores["precision"] <= 0.5 or scores["recall"] <= 0.5:
+        raise ValueError("Scores do not meet the required threshold")
+
 def verify_rec(data):
     scores = data["surya"]
     if scores["avg_score"] <= 0.9:
@@ -62,6 +67,8 @@ def main(file_path, bench_type):
         verify_table_rec(data)
     elif bench_type == "texify":
         verify_texify(data)
+    elif bench_type == "inline_detection":
+        verify_inline_det(data)
     else:
         raise ValueError("Invalid benchmark type")
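
For reference, verify_inline_det reads the results.json that benchmark/inline_detection.py writes. A small sketch of that check with an illustrative payload (the metric values are made up):

# Sketch of the check verify_inline_det performs on the results.json written by
# benchmark/inline_detection.py; the numbers below are illustrative only.
example_results = {
    "times": {"surya": 12.3},
    "metrics": {"surya": {"precision": 0.83, "recall": 0.96}},
    "page_metrics": {},
}

scores = example_results["metrics"]["surya"]
if scores["precision"] <= 0.5 or scores["recall"] <= 0.5:
    raise ValueError("Scores do not meet the required threshold")
print("inline_detection benchmark passed")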
