datalab-to
diff --git a/‎poetry.lock
Lines changed: 105 additions & 110 deletions b/‎poetry.lock
Lines changed: 105 additions & 110 deletions
diff --git a/‎pyproject.toml
Lines changed: 3 additions & 2 deletions b/‎pyproject.toml
Lines changed: 3 additions & 2 deletions
diff --git a/‎surya/common/__init__.py
Lines changed: 3 additions & 0 deletions b/‎surya/common/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎surya/common/donut/processor.py
Lines changed: 3 additions & 2 deletions b/‎surya/common/donut/processor.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎surya/common/load.py
Lines changed: 1 addition & 8 deletions b/‎surya/common/load.py
Lines changed: 1 addition & 8 deletions
diff --git a/‎surya/common/s3.py
Lines changed: 134 additions & 0 deletions b/‎surya/common/s3.py
Lines changed: 134 additions & 0 deletions
diff --git a/‎surya/detection/loader.py
Lines changed: 2 additions & 5 deletions b/‎surya/detection/loader.py
Lines changed: 2 additions & 5 deletions
diff --git a/‎surya/detection/model/config.py
Lines changed: 3 additions & 1 deletion b/‎surya/detection/model/config.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎surya/detection/model/encoderdecoder.py
Lines changed: 2 additions & 1 deletion b/‎surya/detection/model/encoderdecoder.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎surya/detection/processor.py
Lines changed: 3 additions & 1 deletion b/‎surya/detection/processor.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎surya/layout/loader.py
Lines changed: 2 additions & 4 deletions b/‎surya/layout/loader.py
Lines changed: 2 additions & 4 deletions
diff --git a/‎surya/layout/model/config.py
Lines changed: 2 additions & 1 deletion b/‎surya/layout/model/config.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎surya/layout/model/encoderdecoder.py
Lines changed: 2 additions & 1 deletion b/‎surya/layout/model/encoderdecoder.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎surya/ocr_error/loader.py
Lines changed: 2 additions & 5 deletions b/‎surya/ocr_error/loader.py
Lines changed: 2 additions & 5 deletions
diff --git a/‎surya/ocr_error/model/config.py
Lines changed: 3 additions & 1 deletion b/‎surya/ocr_error/model/config.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎surya/ocr_error/model/encoder.py
Lines changed: 2 additions & 1 deletion b/‎surya/ocr_error/model/encoder.py
Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.11.1"
+version = "0.12.0"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"
@@ -20,9 +20,10 @@ pydantic-settings = "^2.1.0"
 python-dotenv = "^1.0.0"
 pillow = "^10.2.0"
 pypdfium2 = "=4.30.0"
-opencv-python = "^4.9.0.80"
 filetype = "^1.2.0"
 click = "^8.1.8"
+platformdirs = "^4.3.6"
+opencv-python-headless = "^4.11.0.86"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"
 
@@ -0,0 +1,3 @@
+
+
+
@@ -9,10 +9,11 @@
 import numpy as np
 from PIL import Image
 import PIL
-from surya.settings import settings
 
+from surya.common.s3 import S3DownloaderMixin
+from surya.settings import settings
 
-class SuryaEncoderImageProcessor(DonutImageProcessor):
+class SuryaEncoderImageProcessor(S3DownloaderMixin, DonutImageProcessor):
     def __init__(self, *args, max_size=None, align_long_axis=False, **kwargs):
         super().__init__(*args, **kwargs)
 
 
@@ -18,11 +18,4 @@ def model(
     def processor(
             self
     ) -> Any:
-        raise NotImplementedError()
-
-    @staticmethod
-    def split_checkpoint_revision(checkpoint: str) -> tuple[str, str | None]:
-        parts = checkpoint.rsplit("@", 1)
-        if len(parts) == 1:
-            return parts[0], "main" # Default revision is main
-        return parts[0], parts[1]
+        raise NotImplementedError()
@@ -0,0 +1,134 @@
+import json
+import os
+import shutil
+import tempfile
+import time
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import requests
+from platformdirs import user_cache_dir
+from tqdm import tqdm
+
+from surya.settings import settings
+
+def join_urls(url1: str, url2: str):
+    """Join two URL fragments with exactly one ``/`` at the seam.
+
+    Trailing slashes are stripped from ``url1`` and leading slashes from
+    ``url2`` before the two pieces are glued together, so callers do not
+    have to care whether either side already carries a separator.
+    """
+    left, right = url1.rstrip("/"), url2.lstrip("/")
+    return "/".join((left, right))
+
+
+def get_model_name(pretrained_model_name_or_path: str):
+    """Return the part of the given path that precedes its first ``/``.
+
+    A path without any ``/`` is returned unchanged.
+    """
+    head, _sep, _rest = pretrained_model_name_or_path.partition("/")
+    return head
+
+
+def download_file(remote_path: str, local_path: str, chunk_size: int = 1024 * 1024, timeout: float = 60.0):
+    """Stream a single remote file to disk.
+
+    Args:
+        remote_path: URL to fetch (redirects are followed).
+        local_path: Destination file; missing parent directories are created.
+        chunk_size: Number of bytes requested per streamed chunk.
+        timeout: Seconds passed to ``requests`` as the connect / read
+            timeout, so a stalled server cannot block a worker forever.
+
+    Returns:
+        The destination as a ``Path``.
+
+    Raises:
+        Exception: Whatever the request or the file write raised. Any
+            partially written file is removed before re-raising.
+    """
+    local_path = Path(local_path)
+    try:
+        # Manifest entries may live in sub-folders that do not exist yet
+        local_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # The context manager releases the streamed connection even on failure
+        with requests.get(remote_path, stream=True, allow_redirects=True, timeout=timeout) as response:
+            response.raise_for_status()  # Raise an exception for bad status codes
+
+            with open(local_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    if chunk:
+                        f.write(chunk)
+
+        return local_path
+    except Exception as e:
+        # Never leave a truncated file behind
+        if local_path.exists():
+            local_path.unlink()
+        print(f"Download error for file {remote_path}: {str(e)}")
+        raise
+
+def check_manifest(local_dir: str):
+    """Report whether ``local_dir`` holds a manifest and every file it lists.
+
+    Returns ``False`` when ``manifest.json`` is absent, when any entry of its
+    ``"files"`` list is missing on disk, or when the manifest cannot be read
+    or parsed; returns ``True`` otherwise.
+    """
+    root = Path(local_dir)
+    manifest_file = root / "manifest.json"
+    if not os.path.exists(manifest_file):
+        return False
+
+    try:
+        with open(manifest_file, "r") as handle:
+            listed = json.load(handle)["files"]
+        # Any unreadable / malformed manifest counts as "not downloaded"
+        return all(os.path.exists(root / name) for name in listed)
+    except Exception:
+        return False
+
+
+def download_directory(remote_path: str, local_dir: str):
+    """Download every file listed in a remote manifest into ``local_dir``.
+
+    The remote folder is ``settings.S3_BASE_URL`` joined with ``remote_path``
+    and must expose a ``manifest.json`` whose ``"files"`` entry lists the
+    files to fetch. Files are first downloaded in parallel into a temporary
+    directory and only then moved into ``local_dir``.
+
+    Args:
+        remote_path: Folder path relative to ``settings.S3_BASE_URL``.
+        local_dir: Destination directory (created if missing).
+
+    Raises:
+        Exception: Propagates any download, parse or move failure; the
+            temporary directory is cleaned up in every case.
+    """
+    model_name = get_model_name(remote_path)
+    s3_url = join_urls(settings.S3_BASE_URL, remote_path)
+    # Nothing to do when a previous run already fetched every listed file
+    if check_manifest(local_dir):
+        return
+
+    # Use tempfile.TemporaryDirectory to automatically clean up
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Download the manifest file
+        manifest_path = os.path.join(temp_dir, "manifest.json")
+        download_file(join_urls(s3_url, "manifest.json"), manifest_path)
+
+        with open(manifest_path, "r") as f:
+            manifest = json.load(f)
+
+        # De-duplicate (order preserved) and keep the manifest itself out of
+        # the payload list: it is already on disk and is installed last.
+        files = list(dict.fromkeys(name for name in manifest["files"] if name != "manifest.json"))
+
+        # `with` closes the progress bar even when a download fails
+        with tqdm(desc=f"Downloading {model_name} model...", total=len(files)) as pbar:
+            with ThreadPoolExecutor(max_workers=settings.PARALLEL_DOWNLOAD_WORKERS) as executor:
+                futures = []
+                for file in files:
+                    local_file = os.path.join(temp_dir, file)
+                    # Entries may sit in sub-folders of the model directory
+                    os.makedirs(os.path.dirname(local_file), exist_ok=True)
+                    futures.append(executor.submit(download_file, join_urls(s3_url, file), local_file))
+
+                for future in futures:
+                    future.result()  # re-raises the worker's exception, if any
+                    pbar.update(1)
+
+        # Move file-by-file to an explicit destination path so stale files
+        # from an earlier partial install are overwritten instead of making
+        # shutil.move fail. The manifest goes last: its presence is what
+        # check_manifest uses to consider the directory complete.
+        os.makedirs(local_dir, exist_ok=True)
+        for file in files + ["manifest.json"]:
+            destination = os.path.join(local_dir, file)
+            parent = os.path.dirname(destination)
+            if parent:
+                os.makedirs(parent, exist_ok=True)
+            shutil.move(os.path.join(temp_dir, file), destination)
+
+
+class S3DownloaderMixin:
+    """Mixin that lets ``from_pretrained`` accept ``s3://`` checkpoint paths.
+
+    Paths without the ``s3://`` prefix are handed unchanged to the next
+    ``from_pretrained`` in the MRO. Prefixed paths are first downloaded into
+    ``<user_cache_dir('datalab')>/models/<path>`` and then loaded from there.
+    """
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """Load a checkpoint, downloading it first when it is an ``s3://`` path.
+
+        Args:
+            pretrained_model_name_or_path: Checkpoint identifier; an
+                ``s3://`` prefix selects the download path.
+            *args, **kwargs: Forwarded untouched to the parent
+                ``from_pretrained``.
+
+        Raises:
+            Exception: The last download error once all attempts failed.
+        """
+        scheme = "s3://"
+        # Allow loading models directly from the hub, or using s3
+        if not str(pretrained_model_name_or_path).startswith(scheme):
+            return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+
+        # Strip only the leading scheme; the remainder is the remote folder
+        remote_path = str(pretrained_model_name_or_path)[len(scheme):]
+        cache_dir = Path(user_cache_dir('datalab')) / "models"
+        local_path = os.path.join(cache_dir, remote_path)
+        os.makedirs(local_path, exist_ok=True)
+
+        # Retry logic for downloading the model folder. The remote path is
+        # kept in its own variable so every attempt targets the same URL.
+        retries = 3
+        delay = 5
+        for attempt in range(1, retries + 1):
+            try:
+                download_directory(remote_path, local_path)
+                break
+            except Exception as e:
+                print(f"Error downloading model from {remote_path}. Attempt {attempt} of {retries}. Error: {e}")
+                if attempt == retries:
+                    print(f"Failed to download {remote_path} after {retries} attempts.")
+                    raise  # Reraise with the original traceback
+                print(f"Retrying in {delay} seconds...")
+                time.sleep(delay)  # Wait before retrying
+
+        return super().from_pretrained(local_path, *args, **kwargs)
@@ -17,8 +17,6 @@ def __init__(self, checkpoint: Optional[str] = None):
         if self.checkpoint is None:
             self.checkpoint = settings.DETECTOR_MODEL_CHECKPOINT
 
-        self.checkpoint, self.revision = self.split_checkpoint_revision(self.checkpoint)
-
     def model(
             self,
             device: Optional[torch.device | str] = None,
@@ -29,12 +27,11 @@ def model(
         if dtype is None:
             dtype = settings.MODEL_DTYPE
 
-        config = EfficientViTConfig.from_pretrained(self.checkpoint, revision=self.revision)
+        config = EfficientViTConfig.from_pretrained(self.checkpoint)
         model = EfficientViTForSemanticSegmentation.from_pretrained(
             self.checkpoint,
             torch_dtype=dtype,
             config=config,
-            revision=self.revision
         )
         model = model.to(device)
         model = model.eval()
@@ -52,7 +49,7 @@ def model(
         return model
 
     def processor(self) -> SegformerImageProcessor:
-        return SegformerImageProcessor.from_pretrained(self.checkpoint, revision=self.revision)
+        return SegformerImageProcessor.from_pretrained(self.checkpoint)
 
 class InlineDetectionModelLoader(DetectionModelLoader):
     def __init__(self, checkpoint: Optional[str] = None):
 
@@ -1,7 +1,9 @@
 from transformers import PretrainedConfig
 
+from surya.common.s3 import S3DownloaderMixin
 
-class EfficientViTConfig(PretrainedConfig):
+
+class EfficientViTConfig(S3DownloaderMixin, PretrainedConfig):
     r"""
     ```"""
 
 
@@ -18,6 +18,7 @@
 from transformers import PreTrainedModel
 from transformers.modeling_outputs import SemanticSegmenterOutput
 
+from surya.common.s3 import S3DownloaderMixin
 from surya.detection.model.config import EfficientViTConfig
 
 
@@ -721,7 +722,7 @@ def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor:
         return logits
 
 
-class EfficientViTForSemanticSegmentation(EfficientViTPreTrainedModel):
+class EfficientViTForSemanticSegmentation(S3DownloaderMixin, EfficientViTPreTrainedModel):
     def __init__(self, config, **kwargs):
         super().__init__(config)
         self.vit = EfficientVitLarge(config)
 
@@ -20,8 +20,10 @@
 import PIL.Image
 import torch
 
+from surya.common.s3 import S3DownloaderMixin
 
-class SegformerImageProcessor(BaseImageProcessor):
+
+class SegformerImageProcessor(S3DownloaderMixin, BaseImageProcessor):
     r"""
     Constructs a Segformer image processor.
 
 
@@ -16,8 +16,6 @@ def __init__(self, checkpoint: Optional[str] = None):
         if self.checkpoint is None:
             self.checkpoint = settings.LAYOUT_MODEL_CHECKPOINT
 
-        self.checkpoint, self.revision = self.split_checkpoint_revision(self.checkpoint)
-
     def model(
         self,
         device=settings.TORCH_DEVICE_MODEL,
@@ -28,7 +26,7 @@ def model(
         if dtype is None:
             dtype = settings.MODEL_DTYPE
 
-        config = SuryaLayoutConfig.from_pretrained(self.checkpoint, revision=self.revision)
+        config = SuryaLayoutConfig.from_pretrained(self.checkpoint)
         decoder_config = config.decoder
         decoder = SuryaLayoutDecoderConfig(**decoder_config)
         config.decoder = decoder
@@ -37,7 +35,7 @@ def model(
         encoder = DonutSwinLayoutConfig(**encoder_config)
         config.encoder = encoder
 
-        model = SuryaLayoutModel.from_pretrained(self.checkpoint, config=config, torch_dtype=dtype, revision=self.revision)
+        model = SuryaLayoutModel.from_pretrained(self.checkpoint, config=config, torch_dtype=dtype)
         model = model.to(device)
         model = model.eval()
 
 
@@ -4,6 +4,7 @@
 from transformers import PretrainedConfig
 from transformers.modeling_outputs import CausalLMOutput
 from transformers.utils import ModelOutput
+from surya.common.s3 import S3DownloaderMixin
 from surya.settings import settings
 
 SPECIAL_TOKENS = 3
@@ -36,7 +37,7 @@
 LABEL_COUNT = len(ID_TO_LABEL)
 
 
-class SuryaLayoutConfig(PretrainedConfig):
+class SuryaLayoutConfig(S3DownloaderMixin, PretrainedConfig):
     model_type = "vision-encoder-decoder"
     is_composition = True
 
 
@@ -4,6 +4,7 @@
 import torch
 from transformers import PreTrainedModel, VisionEncoderDecoderConfig, PretrainedConfig
 from transformers.modeling_outputs import BaseModelOutput
+from surya.common.s3 import S3DownloaderMixin
 from surya.layout.model.encoder import DonutSwinLayoutModel
 from surya.layout.model.decoder import SuryaLayoutDecoder
 from transformers.utils import ModelOutput
@@ -16,7 +17,7 @@ class LayoutBboxOutput(ModelOutput):
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
 
 
-class SuryaLayoutModel(PreTrainedModel):
+class SuryaLayoutModel(S3DownloaderMixin, PreTrainedModel):
     config_class = VisionEncoderDecoderConfig
     base_model_prefix = "vision_encoder_decoder"
     main_input_name = "pixel_values"
 
@@ -16,8 +16,6 @@ def __init__(self, checkpoint: Optional[str] = None):
         if self.checkpoint is None:
             self.checkpoint = settings.OCR_ERROR_MODEL_CHECKPOINT
 
-        self.checkpoint, self.revision = self.split_checkpoint_revision(self.checkpoint)
-
     def model(
         self,
         device=settings.TORCH_DEVICE_MODEL,
@@ -28,12 +26,11 @@ def model(
         if dtype is None:
             dtype = settings.MODEL_DTYPE
 
-        config = DistilBertConfig.from_pretrained(self.checkpoint, revision=self.revision)
+        config = DistilBertConfig.from_pretrained(self.checkpoint)
         model = DistilBertForSequenceClassification.from_pretrained(
             self.checkpoint,
             torch_dtype=dtype,
             config=config,
-            revision=self.revision
         ).to(device).eval()
 
         if settings.COMPILE_ALL or settings.COMPILE_OCR_ERROR:
@@ -50,4 +47,4 @@ def model(
     def processor(
             self
     ) -> DistilBertTokenizer:
-        return DistilBertTokenizer.from_pretrained(self.checkpoint, revision=self.revision)
+        return DistilBertTokenizer.from_pretrained(self.checkpoint)
@@ -4,12 +4,14 @@
 from transformers.configuration_utils import PretrainedConfig
 from transformers.onnx import OnnxConfig
 
+from surya.common.s3 import S3DownloaderMixin
+
 ID2LABEL = {
     0: 'good',
     1: 'bad'
 }
 
-class DistilBertConfig(PretrainedConfig):
+class DistilBertConfig(S3DownloaderMixin, PretrainedConfig):
     model_type = "distilbert"
     attribute_map = {
         "hidden_size": "dim",
 
@@ -21,6 +21,7 @@
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 
+from surya.common.s3 import S3DownloaderMixin
 from surya.ocr_error.model.config import DistilBertConfig
 
 
@@ -693,7 +694,7 @@ def forward(
         )
 
 
-class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
+class DistilBertForSequenceClassification(S3DownloaderMixin, DistilBertPreTrainedModel):
     def __init__(self, config: DistilBertConfig):
         super().__init__(config)
         self.num_labels = config.num_labels