Merge pull request #374 from VikParuchuri/dev

VikParuchuri · web-flow · commit 8a63dfc08733 · 2025-05-20T09:03:31.000-07:00
Fix large image issue
diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ There is a hosted API for all surya models available [here](https://www.datalab.
 
 I want surya to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.
 
-The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. You also must not be competitive with the [Datalab API](https://www.datalab.to/).  If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
+The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under \$2M USD in gross revenue in the most recent 12-month period AND under \$2M in lifetime VC/angel funding raised. You also must not be competitive with the [Datalab API](https://www.datalab.to/).  If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
 
 # Installation
 
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.14.1"
+version = "0.14.2"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"
@@ -14,7 +14,7 @@ packages = [
 [tool.poetry.dependencies]
 python = "^3.10"
 transformers = "^4.51.2"
-torch = "^2.5.1"
+torch = "^2.7.0"
 pydantic = "^2.5.3"
 pydantic-settings = "^2.1.0"
 python-dotenv = "^1.0.0"
@@ -25,8 +25,8 @@ click = "^8.1.8"
 platformdirs = "^4.3.6"
 opencv-python-headless = "^4.11.0.86"
 einops = "^0.8.1"
-
 pre-commit = "^4.2.0"
+
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"
 pytesseract = "^0.3.10"
diff --git a/surya/common/s3.py b/surya/common/s3.py
@@ -85,7 +85,8 @@ def download_directory(remote_path: str, local_dir: str):
             manifest = json.load(f)
 
         pbar = tqdm(
-            desc=f"Downloading {model_name} model...", total=len(manifest["files"])
+            desc=f"Downloading {model_name} model to {local_dir}",
+            total=len(manifest["files"]),
         )
 
         with ThreadPoolExecutor(
diff --git a/surya/common/surya/__init__.py b/surya/common/surya/__init__.py
@@ -10,15 +10,19 @@
 
 from surya.common.s3 import S3DownloaderMixin
 from surya.common.surya.config import SuryaModelConfig
-from surya.common.surya.decoder.__init__ import SuryaDecoderModel
-from surya.common.surya.embedder.__init__ import SimpleTokenEmbedder
-from surya.common.surya.encoder.__init__ import SuryaEncoderModel
+from surya.common.surya.decoder import SuryaDecoderModel
+from surya.common.surya.embedder import SimpleTokenEmbedder
+from surya.common.surya.encoder import SuryaEncoderModel
 
 from transformers.utils import is_flash_attn_2_available
 
+from surya.logging import get_logger
+
 if is_flash_attn_2_available():
     from surya.common.surya.flash_attn_utils import _get_unpad_data
 
+logger = get_logger()
+
 
 @dataclass
 class SuryaModelOutput(CausalLMOutputWithPast):
@@ -123,11 +127,57 @@ def set_output_embeddings(self, new_embeddings: nn.Module):
     def set_input_embeddings(self, new_embeddings: nn.Module):
         self.embedder.token_embed = new_embeddings
 
-    def get_image_embeddings(self, pixel_values: torch.Tensor, grid_thw: torch.Tensor):
+    def get_image_embeddings(
+        self,
+        pixel_values: torch.Tensor,
+        grid_thw: torch.Tensor,
+        encoder_chunk_size: int | None,
+    ):
         # embed all images with the vision encoder after they have already been tiled and flattened into a single batch
-        embeddings = self.vision_encoder.embed_images(
-            image_batch=pixel_values, grid_thw=grid_thw
+        chunks = [0]
+        grid_chunks = [0]
+        curr_chunk_len = 0
+        curr_seq_len = 0
+        for i in range(len(grid_thw)):
+            curr_chunk_len += (grid_thw[i][0] * grid_thw[i][1] * grid_thw[i][2]).item()
+            if curr_chunk_len > encoder_chunk_size:
+                chunks.append(curr_chunk_len + curr_seq_len)
+                curr_seq_len += curr_chunk_len
+                curr_chunk_len = 0
+                grid_chunks.append(i + 1)
+
+        if curr_chunk_len > 0:
+            chunks.append(pixel_values.shape[0])
+            grid_chunks.append(len(grid_thw))
+
+        assert curr_chunk_len + curr_seq_len == pixel_values.shape[0], (
+            f"Mismatch in encoder chunking, {curr_chunk_len} + {curr_seq_len} != {pixel_values.shape[0]}"
+        )
+
+        logger.debug(
+            f"Chunking encoder sequence into {len(chunks) - 1} chunks of size {encoder_chunk_size} with lengths {chunks} and grids {grid_chunks}"
         )
+        embeddings = []
+        for i in range(len(chunks) - 1):
+            start = chunks[i]
+            end = chunks[i + 1]
+            grid_start = grid_chunks[i]
+            grid_end = grid_chunks[i + 1]
+            chunk_embeddings = self.vision_encoder.embed_images(
+                image_batch=pixel_values[start:end],
+                grid_thw=grid_thw[grid_start:grid_end],
+            )
+            embeddings.append(chunk_embeddings)
+
+        if len(embeddings) == 0:
+            raise ValueError(
+                "No image embeddings were generated. Check the input images and grid sizes."
+            )
+        elif len(embeddings) == 1:
+            embeddings = embeddings[0]
+        else:
+            embeddings = torch.cat(embeddings, dim=0)
+
         encoding_2d = self.get_2d_learned_embeddings(
             grid_thw,
             device=embeddings.device,
@@ -144,7 +194,9 @@ def get_image_embeddings(self, pixel_values: torch.Tensor, grid_thw: torch.Tenso
 
         return embeddings
 
-    def embed_ids_boxes_images(self, input_ids, pixel_values, grid_thw):
+    def embed_ids_boxes_images(
+        self, input_ids, pixel_values, grid_thw, encoder_chunk_size: int
+    ):
         """
         Insert embedded image tiles into the corresponding positions into the full input sequence
 
@@ -154,7 +206,9 @@ def embed_ids_boxes_images(self, input_ids, pixel_values, grid_thw):
         inputs_embeds = self.embedder.embed(input_tokens=input_ids)
         if pixel_values is not None:
             image_features = self.get_image_embeddings(
-                pixel_values=pixel_values, grid_thw=grid_thw
+                pixel_values=pixel_values,
+                grid_thw=grid_thw,
+                encoder_chunk_size=encoder_chunk_size,
             )
 
             special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
@@ -230,12 +284,13 @@ def forward(
         output_attentions=False,
         use_cache=False,
         logits_to_keep=None,
+        encoder_chunk_size=None,
         **kwargs: KwargsForCausalLM,
     ):
         # Process the mixed batch if provided
         if inputs_embeds is None:
             inputs_embeds = self.embed_ids_boxes_images(
-                input_ids, image_tiles, grid_thw
+                input_ids, image_tiles, grid_thw, encoder_chunk_size
             )
 
         # Handling flash attention kwargs outside the decoder to speed up + avoid graph breaks inside the decoder
diff --git a/surya/common/surya/processor/__init__.py b/surya/common/surya/processor/__init__.py
@@ -168,8 +168,8 @@ def scale_to_fit(
         elif current_pixels < min_pixels:
             scale_factor = (min_pixels / current_pixels) ** 0.5
 
-            new_width = int(width * scale_factor)
-            new_height = int(height * scale_factor)
+            new_width = math.ceil(width * scale_factor)
+            new_height = math.ceil(height * scale_factor)
         else:
             return img
 
diff --git a/surya/logging.py b/surya/logging.py
@@ -1,10 +1,11 @@
 import logging
 import warnings
+from surya.settings import settings
 
 
 def configure_logging():
     # Setup surya logger
-    logger = logging.getLogger("surya")
+    logger = get_logger()
 
     if not logger.handlers:
         handler = logging.StreamHandler()
@@ -14,7 +15,7 @@ def configure_logging():
         handler.setFormatter(formatter)
         logger.addHandler(handler)
 
-    logger.setLevel(logging.DEBUG)
+    logger.setLevel(settings.LOGLEVEL)
     warnings.simplefilter(action="ignore", category=FutureWarning)
 
 
diff --git a/surya/recognition/__init__.py b/surya/recognition/__init__.py
@@ -41,6 +41,10 @@
     ContinuousBatchingQuantizedCache,
 )
 from surya.settings import settings
+from surya.logging import get_logger, configure_logging
+
+configure_logging()
+logger = get_logger()
 
 
 @dataclass
@@ -73,6 +77,8 @@ class RecognitionPredictor(BasePredictor):
     batch_size = settings.RECOGNITION_BATCH_SIZE
     torch_dtype = settings.MODEL_DTYPE_BFLOAT
     default_batch_sizes = {"cpu": 32, "mps": 64, "cuda": 256, "xla": 128}
+    encoder_chunk_size: int = 4096
+    encoder_chunk_sizes = {"cpu": 4096, "mps": 4096, "cuda": 32768, "xla": 32768}
     min_prefill_ratio: int = 0.2
     min_trim_length: int = 50
     tasks = {
@@ -104,6 +110,13 @@ def __init__(self, checkpoint=None, device=settings.TORCH_DEVICE_MODEL, dtype=No
             self.processor.pad_token_id, device=self.model.device, dtype=torch.long
         )
 
+    def get_encoder_chunk_size(self):
+        # Use the per-device encoder chunk size if one is registered, else the class default.
+        device = settings.TORCH_DEVICE_MODEL
+        if device in self.encoder_chunk_sizes:
+            return self.encoder_chunk_sizes[device]
+        return self.encoder_chunk_size
+
     def setup_cache(self, batch_size: int):
         self.kv_cache = None
         self.prompt_queue.clear()
@@ -328,6 +341,7 @@ def decode(self, current_inputs: Optional[ContinuousBatchInput] = None):
         return new_input, processed_output
 
     def prefill(self, current_inputs: Optional[ContinuousBatchInput] = None):
+        logger.debug(f"Prefilling {self.num_empty_slots} slots")
         prompts: List[RecognitionPrompt] = [
             self.prompt_queue.popleft()
             for _ in range(min(self.num_empty_slots, len(self.prompt_queue)))
@@ -380,6 +394,7 @@ def prefill(self, current_inputs: Optional[ContinuousBatchInput] = None):
                 past_key_values=prefill_cache,
                 use_cache=True,
                 logits_to_keep=1,
+                encoder_chunk_size=self.get_encoder_chunk_size(),
             )
 
         # Process outputs
@@ -462,6 +477,7 @@ def maybe_trim_cache_padding(self, current_inputs: ContinuousBatchInput):
         if trim_start < self.min_trim_length:
             return current_inputs
 
+        logger.debug(f"Trimming cache from left by {trim_start} tokens.")
         trimmed_attention_mask = attention_mask[:, trim_start:]
         current_inputs.attention_mask = trimmed_attention_mask
 
diff --git a/surya/settings.py b/surya/settings.py
@@ -22,6 +22,7 @@ class Settings(BaseSettings):
         10  # Number of workers for parallel model downloads
     )
     MODEL_CACHE_DIR: str = str(Path(user_cache_dir("datalab")) / "models")
+    LOGLEVEL: str = "INFO"  # Logging level
 
     # Paths
     DATA_DIR: str = "data"

Original file line number	Diff line number	Diff line change
`@@ -85,7 +85,8 @@ def download_directory(remote_path: str, local_dir: str):`
`85`	`85`	`manifest = json.load(f)`
`86`	`86`
`87`	`87`	`pbar = tqdm(`
`88`		`- desc=f"Downloading {model_name} model...", total=len(manifest["files"])`
	`88`	`+ desc=f"Downloading {model_name} model to {local_dir}",`
	`89`	`+ total=len(manifest["files"]),`
`89`	`90`	`)`
`90`	`91`
`91`	`92`	`with ThreadPoolExecutor(`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ class Settings(BaseSettings):`
`22`	`22`	`10 # Number of workers for parallel model downloads`
`23`	`23`	`)`
`24`	`24`	`MODEL_CACHE_DIR: str = str(Path(user_cache_dir("datalab")) / "models")`
	`25`	`+ LOGLEVEL: str = "INFO" # Logging level`
`25`	`26`
`26`	`27`	`# Paths`
`27`	`28`	`DATA_DIR: str = "data"`