Skip to content

Commit 81b964c

Browse files
authored
Concatenate last batches for batched inference (#200)
* Concatenate last two batches for batched inference * Add test case
1 parent 0eac868 commit 81b964c

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

cebra/solver/base.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,19 @@ def __getitem__(self, idx):
231231
index_dataloader = DataLoader(index_dataset, batch_size=batch_size)
232232

233233
output = []
234-
for index_batch in index_dataloader:
234+
for batch_idx, index_batch in enumerate(index_dataloader):
235+
# NOTE(celia): This is to prevent that adding the offset to the
236+
# penultimate batch for larger offsets makes the batch_end_idx larger
237+
# than the input length, while we also don't want to drop the last
238+
# samples that do not fit in a complete batch.
239+
if batch_idx == (len(index_dataloader) - 2):
240+
# penultimate batch, last complete batch
241+
last_batch = index_batch
242+
continue
243+
if batch_idx == (len(index_dataloader) - 1):
244+
# last batch, incomplete
245+
index_batch = torch.cat((last_batch, index_batch), dim=0)
246+
235247
batch_start_idx, batch_end_idx = index_batch[0], index_batch[-1] + 1
236248
batched_data = _get_batch(inputs=inputs,
237249
offset=offset,

tests/test_sklearn.py

+17
Original file line numberDiff line numberDiff line change
@@ -1506,3 +1506,20 @@ def test_new_transform(model_architecture, device):
15061506
embedding2 = cebra_model.transform_deprecated(X, session_id=2)
15071507
assert np.allclose(embedding1, embedding2, rtol=1e-5,
15081508
atol=1e-8), "Arrays are not close enough"
1509+
1510+
1511+
def test_last_incomplete_batch_smaller_than_offset():
1512+
"""
1513+
When offset of the model is larger than the remaining samples in the
1514+
last batch, an error could happen. We merge the penultimate
1515+
and last batches together to avoid this.
1516+
"""
1517+
train = cebra.data.TensorDataset(neural=np.random.rand(20111, 100),
1518+
continuous=np.random.rand(20111, 2))
1519+
1520+
model = cebra.CEBRA(max_iterations=2,
1521+
model_architecture="offset36-model-more-dropout",
1522+
device="cpu")
1523+
model.fit(train.neural, train.continuous)
1524+
1525+
_ = model.transform(train.neural, batch_size=300)

0 commit comments

Comments
 (0)