
Commit 442069e

Merge pull request #379 from VikParuchuri/dev
Dev
2 parents 8a63dfc + 16f2e35 commit 442069e

11 files changed, +256 -32 lines changed


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.14.2"
+version = "0.14.3"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"

surya/common/adetr/decoder.py

Lines changed: 20 additions & 0 deletions
@@ -193,6 +193,12 @@ def forward(
         attn_output = self.o_proj(attn_output)
         return attn_output

+    def _clear_cache(self):
+        if self.value_states is not None:
+            del self.value_states
+        if self.key_states is not None:
+            del self.key_states
+
     def _setup_cache(self, batch_size, device, dtype=None):
         # Setup initial caches
         self.value_states = None
@@ -297,6 +303,12 @@ def _setup_cache(self, batch_size, device, dtype=None):
         self.value_states = torch.zeros(cache_shape, dtype=dtype, device=device)
         self.key_states = torch.zeros(cache_shape, dtype=dtype, device=device)

+    def _clear_cache(self):
+        if self.value_states is not None:
+            del self.value_states
+        if self.key_states is not None:
+            del self.key_states
+
     def _update_static_cache(self, key_states, value_states, **cache_kwargs):
         cache_position = cache_kwargs.get("cache_position")
         k_out, v_out = self.key_states.to(key_states.device), self.value_states.to(value_states.device)
@@ -479,6 +491,14 @@ def _setup_cache(self, config, batch, device, dtype):
             if layer.cross_attn_block:
                 layer.cross_attn_block._setup_cache(batch, device, dtype)

+    def _clear_cache(self):
+        layers = getattr(self, "model", self).layers
+        for layer in layers:
+            if layer.temporal_block:
+                layer.temporal_block._clear_cache()
+            if layer.cross_attn_block:
+                layer.cross_attn_block._clear_cache()
+
     def reset_cache(self, batch, device, dtype):
         pass
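The new _clear_cache hooks let callers drop the decoder's key/value cache tensors between batches. A minimal sketch of how this is wired up elsewhere in this commit (see surya/layout/__init__.py below), with predictor as a stand-in name for any predictor that holds this decoder:

import torch

def free_decoder_memory(predictor) -> None:
    # Drop the per-layer key/value cache tensors held by the ADETR decoder...
    predictor.model.decoder.model._clear_cache()
    # ...then release cached CUDA blocks back to the allocator, if a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()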

surya/common/surya/decoder/__init__.py

Lines changed: 1 addition & 3 deletions
@@ -156,6 +156,7 @@ def __init__(self, config: SuryaDecoderConfig, layer_idx: int):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=False
         )
+        self.merged_kv = False

     def forward(
         self,
@@ -178,9 +179,6 @@ def forward(
             query_states, key_states, cos, sin
         )

-        # IMPORTANT: Do not use causal mask for prefill; Matches training
-        # This is required for flash attn, which doesn't support a 4D mask as input
-        # The `is_causal` argument is ignored by SDPA since we pass a 4D attention mask
         is_prefill = all(
             (
                 input_shape[1] > 1,

surya/common/surya/encoder/__init__.py

Lines changed: 105 additions & 17 deletions
@@ -270,6 +270,100 @@ def __init__(self, dim: int, num_heads: int = 16) -> None:
         self.qkv = nn.Linear(dim, dim * 3, bias=True)
         self.proj = nn.Linear(dim, dim)

+    def unpack_qkv_with_mask(self, q, k, v, cu_seqlens):
+        """
+        Unpacks q, k, v sequences into batch-major form and constructs an additive attention mask.
+
+        Args:
+            q, k, v: Tensors of shape (total_seq_len, num_heads, head_dim)
+            cu_seqlens: Tensor of shape (batch_size + 1,) with cumulative sequence lengths
+
+        Returns:
+            batched_q: Tensor of shape (batch_size, max_seq_len, num_heads, head_dim)
+            batched_k: Tensor of shape (batch_size, max_seq_len, num_heads, head_dim)
+            batched_v: Tensor of shape (batch_size, max_seq_len, num_heads, head_dim)
+            attention_mask: Tensor of shape (batch_size, 1, max_seq_len, max_seq_len)
+                with 0 for valid tokens and -inf for padding (for additive attention)
+        """
+        device = q.device
+        dtype = q.dtype
+
+        batch_size = cu_seqlens.shape[0] - 1
+        num_heads = q.shape[1]
+        head_dim = q.shape[2]
+
+        seq_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seq_len = seq_lengths.max().item()
+
+        batch_indices = []
+        position_indices = []
+
+        for i, seq_len in enumerate(seq_lengths):
+            batch_indices.extend([i] * seq_len)
+            position_indices.extend(list(range(seq_len)))
+
+        batch_indices = torch.tensor(batch_indices, device=device)
+        position_indices = torch.tensor(position_indices, device=device)
+
+        batched_q = torch.zeros((batch_size, max_seq_len, num_heads, head_dim), device=device, dtype=dtype)
+        batched_k = torch.zeros_like(batched_q)
+        batched_v = torch.zeros_like(batched_q)
+
+        # Create additive attention mask: shape (batch_size, 1, max_seq_len, max_seq_len)
+        # Each batch has a (max_seq_len, max_seq_len) matrix:
+        # - Rows = queries, Columns = keys
+        # - If query or key is padding, set to -inf
+        attention_mask = torch.full(
+            (batch_size, max_seq_len, max_seq_len),
+            fill_value=float('-inf'),
+            device=device,
+            dtype=dtype
+        )
+        for b in range(batch_size):
+            valid_len = seq_lengths[b].item()
+            attention_mask[b, :valid_len, :valid_len] = 0  # Unmasked
+
+        attention_mask = attention_mask.unsqueeze(1)  # (batch_size, 1, max_seq_len, max_seq_len)
+
+        batched_q[batch_indices, position_indices] = q
+        batched_k[batch_indices, position_indices] = k
+        batched_v[batch_indices, position_indices] = v
+
+        return batched_q, batched_k, batched_v, attention_mask
+
+    def repack_hidden_states(self, batched_output, cu_seqlens):
+        """
+        Reverses the unpacking operation using indexing to convert batched outputs
+        back to a flat tensor of shape (total_seq_len, hidden_dim).
+
+        Args:
+            batched_output: Tensor of shape (batch_size, max_seq_len, hidden_dim)
+            cu_seqlens: Tensor of shape (batch_size + 1,) with cumulative sequence lengths
+
+        Returns:
+            packed_output: Tensor of shape (total_seq_len, hidden_dim)
+        """
+        device = batched_output.device
+        dtype = batched_output.dtype
+
+        batch_size, max_seq_len, hidden_dim = batched_output.shape
+        seq_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+        total_seq_len = seq_lengths.sum().item()
+
+        batch_indices = []
+        position_indices = []
+
+        for i, seq_len in enumerate(seq_lengths):
+            batch_indices.extend([i] * seq_len)
+            position_indices.extend(list(range(seq_len)))
+
+        batch_indices = torch.tensor(batch_indices, device=device)
+        position_indices = torch.tensor(position_indices, device=device)
+
+        packed_output = batched_output[batch_indices, position_indices]
+
+        return packed_output  # Shape: (total_seq_len, hidden_dim)
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -298,28 +392,22 @@ def forward(
         cos, sin = position_embeddings
         q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)

-        attention_mask = torch.zeros(
-            [1, seq_length, seq_length], device=q.device, dtype=torch.bool
-        )
-        for i in range(1, len(cu_seqlens)):
-            attention_mask[
-                ...,
-                cu_seqlens[i - 1] : cu_seqlens[i],
-                cu_seqlens[i - 1] : cu_seqlens[i],
-            ] = True
-        q = q.transpose(0, 1)
-        k = k.transpose(0, 1)
-        v = v.transpose(0, 1)
+        q, k, v, attention_mask = self.unpack_qkv_with_mask(q, k, v, cu_seqlens)
+        batch_size, max_seqlen = q.shape[:2]
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
         attn_output = F.scaled_dot_product_attention(
-            q.unsqueeze(0),
-            k.unsqueeze(0),
-            v.unsqueeze(0),
+            q,
+            k,
+            v,
             attention_mask,
             dropout_p=0.0,
         )
-        attn_output = attn_output.squeeze(0).transpose(0, 1)
-        attn_output = attn_output.reshape(seq_length, -1)
+        attn_output = attn_output.permute(0, 2, 1, 3).reshape(batch_size, max_seqlen, -1)  # Bring back to (batch_size, max_seqlen, hidden_dim)
         attn_output = self.proj(attn_output)
+        attn_output = self.repack_hidden_states(attn_output, cu_seqlens)

         return attn_output
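A self-contained sketch of the pack/unpack round trip these helpers implement, using dummy tensors; the variable names below are illustrative, and only the cu_seqlens convention is taken from the diff.

import torch

total_len, num_heads, head_dim = 7, 2, 4
x = torch.randn(total_len, num_heads, head_dim)  # packed (total_seq_len, num_heads, head_dim)
cu_seqlens = torch.tensor([0, 3, 7])             # two sequences of lengths 3 and 4

seq_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
max_len = max(seq_lengths)
batched = torch.zeros(len(seq_lengths), max_len, num_heads, head_dim)
mask = torch.full((len(seq_lengths), 1, max_len, max_len), float("-inf"))
for b, n in enumerate(seq_lengths):
    batched[b, :n] = x[cu_seqlens[b]:cu_seqlens[b + 1]]  # scatter into the padded batch
    mask[b, 0, :n, :n] = 0.0                             # valid query/key pairs stay unmasked

# Repacking drops the padded rows and restores the original flat layout.
repacked = torch.cat([batched[b, :n] for b, n in enumerate(seq_lengths)], dim=0)
assert torch.equal(repacked, x)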

surya/common/util.py

Lines changed: 34 additions & 4 deletions
@@ -1,5 +1,6 @@
 import copy
 from typing import List
+import torch

 from surya.common.polygon import PolygonBox
 from surya.settings import settings
@@ -22,7 +23,12 @@ def clean_boxes(boxes: List[PolygonBox]) -> List[PolygonBox]:
             other_box = other_box_obj.bbox
             if box == other_box:
                 continue
-            if box[0] >= other_box[0] and box[1] >= other_box[1] and box[2] <= other_box[2] and box[3] <= other_box[3]:
+            if (
+                box[0] >= other_box[0]
+                and box[1] >= other_box[1]
+                and box[2] <= other_box[2]
+                and box[3] <= other_box[3]
+            ):
                 contained = True
                 break
         if not contained:
@@ -45,18 +51,42 @@ def rescale_bbox(bbox, processor_size, image_size):
     return new_bbox


-def expand_bbox(bbox, expansion_factor=.01):
+def expand_bbox(bbox, expansion_factor=0.01):
     expansion_low = 1 - expansion_factor
     expansion_high = 1 + expansion_factor
     return [
         bbox[0] * expansion_low,
         bbox[1] * expansion_low,
         bbox[2] * expansion_high,
-        bbox[3] * expansion_high
+        bbox[3] * expansion_high,
     ]


-if settings.TORCH_DEVICE_MODEL == 'xla':
+def is_flash_attn_2_supported(device: str | torch.device) -> bool:
+    if not torch.cuda.is_available():
+        return False
+
+    if "cuda" not in str(device):
+        return False
+
+    # Check CUDA version >= 12.0
+    cuda_version_str = torch.version.cuda
+    if cuda_version_str is None:
+        return False
+    cuda_version = tuple(map(int, cuda_version_str.split(".")))
+    if cuda_version < (12, 0):
+        return False
+
+    # Check GPU compute capability (Ampere, Ada, Hopper GPUs)
+    major, minor = torch.cuda.get_device_capability()
+    compute_capability = major + minor / 10
+    if compute_capability < 8.0:
+        return False
+
+    return True
+
+
+if settings.TORCH_DEVICE_MODEL == "xla":
     import torch_xla.core.xla_model as xm
 else:
     xm = None
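A minimal usage sketch for the new helper, assuming it is imported from surya.common.util; the attn_implementation switch shown here is illustrative, not code from this diff.

import torch
from surya.common.util import is_flash_attn_2_supported

device = "cuda" if torch.cuda.is_available() else "cpu"
# Flash Attention 2 needs CUDA 12+ and compute capability >= 8.0 (Ampere or newer),
# so fall back to PyTorch SDPA everywhere else.
attn_implementation = "flash_attention_2" if is_flash_attn_2_supported(device) else "sdpa"
print(f"device={device}, attention={attn_implementation}")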

surya/detection/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -151,3 +151,5 @@ def batch_detection(
             preds[idx] = heatmaps

         yield preds, [orig_sizes[j] for j in batch_image_idxs]
+
+    torch.cuda.empty_cache()

surya/layout/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -219,5 +219,8 @@ def batch_layout_detection(
                 batch_results = slicer.join(batch_results, tile_positions)
             results.extend(batch_results)

+        self.model.decoder.model._clear_cache()
+        torch.cuda.empty_cache()
+
         assert len(results) == len(images)
         return results

surya/recognition/__init__.py

Lines changed: 32 additions & 4 deletions
@@ -33,6 +33,7 @@
     words_from_chars,
     detect_repeat_token,
     prediction_to_polygon_batch,
+    unwrap_math,
 )
 from surya.recognition.schema import TextLine, OCRResult, TextChar
 from surya.common.surya.schema import TaskNames
@@ -75,7 +76,7 @@ class RecognitionPrompt:
 class RecognitionPredictor(BasePredictor):
     model_loader_cls = RecognitionModelLoader
     batch_size = settings.RECOGNITION_BATCH_SIZE
-    torch_dtype = settings.MODEL_DTYPE_BFLOAT
+    torch_dtype = None  # No default, loader picks the dtype based on device properties - bf16/fp16
     default_batch_sizes = {"cpu": 32, "mps": 64, "cuda": 256, "xla": 128}
     encoder_chunk_size: int = 4096
     encoder_chunk_sizes = {"cpu": 4096, "mps": 4096, "cuda": 32768, "xla": 32768}
@@ -85,7 +86,7 @@ class RecognitionPredictor(BasePredictor):
         TaskNames.ocr_with_boxes: {
             "needs_bboxes": True,
             "img_size": (1024, 256),  # 370 max tokens
-            "max_tokens": 224,
+            "max_tokens": 256,
         },
         TaskNames.ocr_without_boxes: {
             "needs_bboxes": False,
@@ -272,6 +273,10 @@ def prepare_input(

         # Task input is the same for all tasks for now
         text = text or ""
+
+        # Remove input text that exceeds max generation tokens (likely invalid)
+        if len(text) > self.tasks[task_name]["max_tokens"]:
+            text = ""
         inputs = [
             {"type": "image", "image": image, "rotated": False},
             {"type": "text", "text": text.strip(), "math": math_mode},
@@ -588,11 +593,20 @@ def prediction_loop(
             current_inputs = self.maybe_trim_cache_padding(current_inputs)
             mark_step()
         pbar.close()
+
+        del self.kv_cache
+        self.kv_cache = None
+        torch.cuda.empty_cache()

         return predicted_tokens, batch_bboxes, scores

     def get_bboxes_text(
-        self, flat: dict, predicted_tokens: list, scores: list, predicted_polygons: list
+        self,
+        flat: dict,
+        predicted_tokens: list,
+        scores: list,
+        predicted_polygons: list,
+        drop_repeated_text: bool = False,
     ) -> list:
         char_predictions = []
         needs_boxes = [
@@ -614,10 +628,23 @@ def get_bboxes_text(
                 needs_boxes,
             )
         ):
+            blank_bbox = [[0, 0], [0, 1], [1, 1], [1, 0]]
             if self.processor.no_output_token in image_tokens:
                 char_predictions.append(None)
                 continue

+            # If the image is very out of distribution, we can get nonsense repeats, and we may need to drop the text entirely
+            if drop_repeated_text and detect_repeat_token(image_tokens):
+                char_predictions.append(
+                    TextChar(
+                        text="",
+                        polygon=blank_bbox,
+                        confidence=0,
+                        bbox_valid=False,
+                    )
+                )
+                continue
+
             image_polygons = image_polygons[: len(image_tokens)].cpu().numpy().tolist()

             detokenize_sequences = []
@@ -681,7 +708,6 @@ def _add_detokenize_sequence(
         img_chars = []
         for sequence in detokenize_sequences:
             token_ids, seq_score, bboxes, token_type = sequence
-            blank_bbox = [[0, 0], [0, 1], [1, 1], [1, 0]]
             if token_type == "ocr":
                 text = self.processor.ocr_tokenizer.decode(
                     token_ids, task=TaskNames.ocr_with_boxes
@@ -750,6 +776,7 @@ def __call__(
         sort_lines: bool = False,
         math_mode: bool = True,
         return_words: bool = False,
+        drop_repeated_text: bool = False,
     ) -> List[OCRResult]:
         allowed_tasks = self.tasks.keys()
         if task_names is None:
@@ -874,6 +901,7 @@ def __call__(
                         text_line, self.processor.ocr_tokenizer.special_tokens
                     )
                     text = "".join([char.text for char in text_line])
+                    text = unwrap_math(text)
                     lines.append(
                         TextLine(
                             text=text,
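A hypothetical usage sketch for the new drop_repeated_text flag on the recognition predictor; the constructor call, image handling, and result fields below are illustrative rather than taken from this diff, and depending on the task a detection predictor or precomputed polygons may also need to be passed.

from PIL import Image
from surya.recognition import RecognitionPredictor

predictor = RecognitionPredictor()  # assumed default construction; check the repo docs
image = Image.open("page.png")      # hypothetical input image

# drop_repeated_text=True discards lines whose tokens are detected as nonsense repeats,
# which can happen on images that are far out of distribution.
results = predictor([image], drop_repeated_text=True)
for line in results[0].text_lines:
    print(line.text)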
