NVIDIA · pstjohn · Aug 2, 2024 · Aug 2, 2024 · pstjohn · Aug 2, 2024
diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/data/singlecell/dataset.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/data/singlecell/dataset.py
@@ -300,18 +300,46 @@ def process_item(
         token_ids, max_len, tokenizer.token_to_id(tokenizer.pad_token), sample=False
     )
 
+    token_ids, mask, attention_mask, labels = apply_masking(
+        token_ids,
+        mask_prob,
+        mask_token_prob,
+        random_token_prob,
+        prepend_cls_token,
+        tokenizer.pad_id,
+        tokenizer.token_to_id(tokenizer.cls_token),
+        tokenizer.token_to_id(tokenizer.mask_token),
+        len(tokenizer.vocab) - 5,
+    )
+
+    # NeMo megatron assumes this return structure.
+    item = {
+        "text": token_ids.astype(np.int64),
+        "types": np.zeros_like(token_ids).astype(np.int64),
+        "attention_mask": attention_mask.astype(np.int64),
+        "labels": labels.astype(np.int64),
+        "loss_mask": mask,
+        "is_random": np.zeros_like(token_ids).astype(np.int64),
+    }
+
+    return item
+
+
+def apply_masking(
+    token_ids, mask_prob, mask_token_prob, random_token_prob, prepend_cls_token, pad_id, cls_id, mask_id, num_tokens
+):
     mask = None
     mask_tokens_positions = None
     random_tokens_positions = None
 
     # - masked tokens
     if mask_prob > 0.0:
         probs = np.full(token_ids.shape[0], mask_prob)
-        probs[token_ids == tokenizer.token_to_id(tokenizer.pad_token)] = 0.0
+        probs[token_ids == pad_id] = 0.0
         mask = np.random.binomial(1, probs).astype(bool)
         mask_tokens_positions = mask & np.random.binomial(1, mask_token_prob, mask.shape).astype(bool)
-        random_tokens_positions = (
-            mask & np.random.binomial(1, random_token_prob, mask.shape).astype(bool) & (~mask_tokens_positions)
+        random_tokens_positions = (mask & np.random.binomial(1, random_token_prob, mask.shape).astype(bool)) & (
+            ~mask_tokens_positions
         )
         # - ensure [CLS] token is masked from the loss. Note that we're dealing with 1d arrays so flattening isn't a problem here.
         if prepend_cls_token:
@@ -321,8 +349,8 @@ def process_item(
 
     # - add [CLS] token, note that token_ids is a 1d array so flattening isn't a problem here.
     if prepend_cls_token:
-        token_ids = np.insert(token_ids, 0, tokenizer.token_to_id(tokenizer.cls_token))
-    attention_mask = token_ids != tokenizer.token_to_id(tokenizer.pad_token)
+        token_ids = np.insert(token_ids, 0, cls_id)
+    attention_mask = token_ids != pad_id
 
     labels = np.ones(len(token_ids)) * -1
 
@@ -339,19 +367,8 @@ def process_item(
     if random_tokens_positions is None:
         random_tokens_positions = np.zeros_like(mask)
     # identity_tokens = mask & (~mask_tokens_positions) & (~random_tokens_positions), not needed because
-    token_ids[mask_tokens_positions] = tokenizer.token_to_id(tokenizer.mask_token)
+    token_ids[mask_tokens_positions] = mask_id
     # There are 5 special tokens in the tokenizer, so we start from 5. TODO make this a parameter of the tokenizer.
     if random_tokens_positions.sum() > 0:
-        token_ids[random_tokens_positions] = np.random.randint(5, len(tokenizer.vocab), random_tokens_positions.sum())
-
-    # NeMo megatron assumes this return structure.
-    item = {
-        "text": token_ids.astype(np.int64),
-        "types": np.zeros_like(token_ids).astype(np.int64),
-        "attention_mask": attention_mask.astype(np.int64),
-        "labels": labels.astype(np.int64),
-        "loss_mask": mask,
-        "is_random": np.zeros_like(token_ids).astype(np.int64),
-    }
-
-    return item
+        token_ids[random_tokens_positions] = np.random.randint(5, num_tokens + 5, random_tokens_positions.sum())
+    return token_ids, mask, attention_mask, labels
diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_dataset.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_dataset.py
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pytest
+
+from bionemo.geneformer.data.singlecell.dataset import apply_masking
+
+
+def test_masking_gives_expected_ratios():
+    token_ids = np.ones(100_000, dtype=np.int64)
+
+    masked_token_ids, loss_mask, _, _ = apply_masking(
+        token_ids,
+        mask_prob=0.5,
+        mask_token_prob=0.25,
+        random_token_prob=0.12,
+        prepend_cls_token=True,
+        pad_id=0,
+        cls_id=5,
+        mask_id=2,
+        num_tokens=2,
+    )
+
+    assert len(masked_token_ids) == 100_001
+    masked_token_ids = masked_token_ids[1:]
+
+    # The fraction of positions selected for the loss should match mask_prob (0.5).
+    assert pytest.approx(loss_mask.mean(), abs=0.01) == 0.5
+
+    # Positions overwritten with mask_id (2) should make up mask_prob * mask_token_prob of the sequence.
+    assert pytest.approx((masked_token_ids == 2).mean(), abs=0.01) == 0.5 * 0.25
+
+    # Random replacements are drawn from ids 5 and 6 (num_tokens=2); check their share of the sequence.
+    assert pytest.approx(((masked_token_ids == 5) | (masked_token_ids == 6)).mean(), abs=0.01) == 0.5 * 0.12
+
+    # Among loss-masked positions, the share still holding the original id (1) should be the remainder.
+    assert pytest.approx((masked_token_ids[loss_mask] == 1).mean(), abs=0.01) == 1.0 - (0.25 + 0.12)