
Commit f18adf4

Add CLI interface for ESM2 checkpoint conversion
Signed-off-by: Peter St. John <[email protected]>
1 parent 2efeba6 commit f18adf4

File tree

3 files changed (+106 / -31 lines)


sub-packages/bionemo-esm2/pyproject.toml

Lines changed: 5 additions & 6 deletions
@@ -18,21 +18,20 @@ dependencies = [
 ]

 [project.optional-dependencies]
-test = [
-    'bionemo-testing'
-]
+test = ['bionemo-testing']
 te = [
     # TE & Apex need to be installed after PyTorch, NVCC, and CUDA.
     # TODO(@pstjohn, @cspades): Figure out how to do this without post-installation.
-    'transformer_engine[pytorch]'
+    'transformer_engine[pytorch]',
 ]

 [project.scripts]
-bionemo-esm2-train= "bionemo.esm2.run.main:main"
-bionemo-esm2-recipe= "bionemo.esm2.run.recipes:main"
+bionemo-esm2-train = "bionemo.esm2.run.main:main"
+bionemo-esm2-recipe = "bionemo.esm2.run.recipes:main"
 infer_esm2 = "bionemo.esm2.scripts.infer_esm2:infer_esm2_entrypoint"
 train_esm2 = "bionemo.esm2.scripts.train_esm2:train_esm2_entrypoint"
 finetune_esm2 = "bionemo.esm2.scripts.finetune_esm2:finetune_esm2_entrypoint"
+convert_esm2 = "bionemo.esm2.model.convert:app"

 # Make sure that the tokenizer files are included along with the python files during installation.
 [tool.setuptools.package-data]
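
The new convert_esm2 entry point exposes the typer app defined in bionemo.esm2.model.convert as an installed console script, alongside the existing train/infer/finetune commands. A minimal sketch of what the generated script amounts to (illustrative only, not part of the commit):

# Rough equivalent of the convert_esm2 console script: import the typer app
# and let it parse the command-line arguments.
from bionemo.esm2.model.convert import app

if __name__ == "__main__":
    app()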

sub-packages/bionemo-esm2/src/bionemo/esm2/model/convert.py

Lines changed: 43 additions & 13 deletions
@@ -17,6 +17,7 @@
 from pathlib import Path

 import torch
+import typer
 from nemo.lightning import io, teardown
 from nemo.lightning.pytorch.utils import dtype_from_hf
 from transformers import AutoConfig as HFAutoConfig
@@ -122,19 +123,20 @@ def init(self, dtype: torch.dtype = torch.bfloat16) -> EsmForMaskedLM:

     def apply(self, output_path: Path) -> Path:
         """Applies the transformation."""
-        nemo_config = ESM2Config(
-            initial_ckpt_path=str(self),
-            include_embeddings=True,
-            include_hiddens=True,
-            params_dtype=torch.bfloat16,
-            autocast_dtype=torch.bfloat16,
-            bf16=True,
-            fp16=False,
+        from megatron.core.dist_checkpointing.validation import StrictHandling
+        from nemo.lightning import MegatronStrategy, Trainer
+
+        cpu = not torch.distributed.is_initialized()
+        trainer = Trainer(
+            devices=1,
+            accelerator="cpu" if cpu else "gpu",
+            strategy=MegatronStrategy(
+                ddp="pytorch", setup_optimizers=False, ckpt_load_strictness=StrictHandling.LOG_UNEXPECTED
+            ),
         )
+        source, _ = self.nemo_load(self, trainer=trainer, cpu=cpu)

-        source = nemo_config.configure_model(self.tokenizer)
-
-        target = self.init(torch.bfloat16)
+        target = self.init(source.dtype)
         target = self.convert_state(source, target)

         target = target.cpu()
@@ -169,8 +171,6 @@ def convert_state(self, nemo_module, target):
             "lm_head.layer_norm.bias": "lm_head.layer_norm.bias",
         }

-        nemo_module.lm_head.to(torch.bfloat16)
-
         return io.apply_transforms(
             nemo_module,
             target,
@@ -340,3 +340,33 @@ def _import_qkv_bias(ctx: io.TransformCTX, query, key, value):
     concat_biases = concat_biases.transpose(0, 1).contiguous()
     concat_biases = concat_biases.view(*input_shape)
     return concat_biases
+
+
+app = typer.Typer()
+
+
+@app.command()
+def convert_nemo_to_hf(nemo_path: str, output_path: str):
+    """Convert a NeMo ESM-2 checkpoint to a HuggingFace checkpoint.
+
+    Args:
+        nemo_path: Path to the NeMo checkpoint.
+        output_path: Path to the output HuggingFace checkpoint.
+    """
+    io.export_ckpt(Path(nemo_path), "hf", Path(output_path))
+
+
+@app.command()
+def convert_hf_to_nemo(hf_tag_or_path: str, output_path: str):
+    """Convert a HuggingFace ESM-2 checkpoint to a NeMo ESM-2 checkpoint.
+
+    Args:
+        hf_tag_or_path: Tag or path to the HuggingFace checkpoint.
+        output_path: Path to the output NeMo checkpoint.
+    """
+    module = biobert_lightning_module(config=ESM2Config(), post_process=True)
+    io.import_ckpt(module, f"hf://{hf_tag_or_path}", Path(output_path))
+
+
+if __name__ == "__main__":
+    app()
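
For reference, the two commands registered on the typer app can also be called directly as plain Python functions. A minimal round-trip sketch follows, assuming an installed bionemo-esm2 package; the model tag matches the one used in the tests, and the output directories are placeholders, not part of the commit:

# Illustrative sketch only: programmatic use of the new conversion helpers.
from bionemo.esm2.model.convert import convert_hf_to_nemo, convert_nemo_to_hf

# Import a HuggingFace ESM-2 checkpoint into NeMo format (placeholder output path)...
convert_hf_to_nemo("facebook/esm2_t6_8M_UR50D", "/tmp/esm2_8m_nemo")

# ...then export that NeMo checkpoint back to HuggingFace format (placeholder output path).
convert_nemo_to_hf("/tmp/esm2_8m_nemo", "/tmp/esm2_8m_hf")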

sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_convert.py

Lines changed: 58 additions & 12 deletions
@@ -18,33 +18,41 @@
 import torch
 from nemo.lightning import io
 from transformers import AutoModelForMaskedLM
+from typer.testing import CliRunner

 from bionemo.core.data.load import load
-from bionemo.esm2.model.convert import HFESM2Exporter, HFESM2Importer  # noqa: F401
+from bionemo.esm2.model.convert import (
+    HFESM2Importer,  # noqa: F401
+    app,
+)
 from bionemo.esm2.model.model import ESM2Config
 from bionemo.esm2.testing.compare import assert_esm2_equivalence
 from bionemo.llm.model.biobert.lightning import biobert_lightning_module
 from bionemo.testing import megatron_parallel_state_utils


-# pytestmark = pytest.mark.xfail(
-#     reason="These tests are failing due to a bug in nemo global state when run in the same process as previous "
-#     "checkpoint save/load scripts."
-# )
+def test_nemo2_conversion_equivalent_8m(tmp_path):
+    model_tag = "facebook/esm2_t6_8M_UR50D"
+    module = biobert_lightning_module(config=ESM2Config())
+    io.import_ckpt(module, f"hf://{model_tag}", tmp_path / "nemo_checkpoint")
+    with megatron_parallel_state_utils.distributed_model_parallel_state():
+        assert_esm2_equivalence(tmp_path / "nemo_checkpoint", model_tag)


-def test_nemo2_conversion_equivalent_8m(tmp_path):
+def test_nemo2_conversion_equivalent_8m_with_local_path(tmp_path):
     model_tag = "facebook/esm2_t6_8M_UR50D"
+    hf_model = AutoModelForMaskedLM.from_pretrained(model_tag)
+    hf_model.save_pretrained(tmp_path / "hf_checkpoint")
+
     module = biobert_lightning_module(config=ESM2Config(), post_process=True)
-    io.import_ckpt(module, f"hf://{model_tag}", tmp_path / "nemo_checkpoint")
+    io.import_ckpt(module, f"hf://{tmp_path / 'hf_checkpoint'}", tmp_path / "nemo_checkpoint")
     with megatron_parallel_state_utils.distributed_model_parallel_state():
         assert_esm2_equivalence(tmp_path / "nemo_checkpoint", model_tag)


 def test_nemo2_export_8m_weights_equivalent(tmp_path):
     ckpt_path = load("esm2/8m:2.0")
-    with megatron_parallel_state_utils.distributed_model_parallel_state():
-        output_path = io.export_ckpt(ckpt_path, "hf", tmp_path / "hf_checkpoint")
+    output_path = io.export_ckpt(ckpt_path, "hf", tmp_path / "hf_checkpoint")

     hf_model_from_nemo = AutoModelForMaskedLM.from_pretrained(output_path)
     hf_model_from_hf = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D")
@@ -56,19 +64,25 @@ def test_nemo2_export_8m_weights_equivalent(tmp_path):
         torch.testing.assert_close(
             hf_model_from_nemo.state_dict()[key],
             hf_model_from_hf.state_dict()[key],
-            atol=1e-2,
-            rtol=1e-2,
+            atol=1e-4,
+            rtol=1e-4,
             msg=lambda msg: f"{key}: {msg}",
         )


 def test_nemo2_export_golden_values(tmp_path):
     ckpt_path = load("esm2/8m:2.0")
+    output_path = io.export_ckpt(ckpt_path, "hf", tmp_path / "hf_checkpoint")
     with megatron_parallel_state_utils.distributed_model_parallel_state():
-        output_path = io.export_ckpt(ckpt_path, "hf", tmp_path / "hf_checkpoint")
         assert_esm2_equivalence(ckpt_path, output_path, precision="bf16")


+def test_nemo2_export_on_gpu(tmp_path):
+    ckpt_path = load("esm2/8m:2.0")
+    with megatron_parallel_state_utils.distributed_model_parallel_state():
+        io.export_ckpt(ckpt_path, "hf", tmp_path / "hf_checkpoint")
+
+
 def test_nemo2_conversion_equivalent_8m_bf16(tmp_path):
     model_tag = "facebook/esm2_t6_8M_UR50D"
     module = biobert_lightning_module(config=ESM2Config())
@@ -84,3 +98,35 @@ def test_nemo2_conversion_equivalent_650m(tmp_path):
     io.import_ckpt(module, f"hf://{model_tag}", tmp_path / "nemo_checkpoint")
     with megatron_parallel_state_utils.distributed_model_parallel_state():
         assert_esm2_equivalence(tmp_path / "nemo_checkpoint", model_tag, atol=1e-4, rtol=1e-4)
+
+
+def test_cli_nemo2_conversion_equivalent_8m(tmp_path):
+    """Test that the CLI conversion functions maintain model equivalence."""
+    model_tag = "facebook/esm2_t6_8M_UR50D"
+    runner = CliRunner()
+
+    # First convert HF to NeMo
+    nemo_path = tmp_path / "nemo_checkpoint"
+    result = runner.invoke(app, ["convert-hf-to-nemo", model_tag, str(nemo_path)])
+    assert result.exit_code == 0, f"CLI command failed: {result.output}"
+
+    # Then convert back to HF
+    hf_path = tmp_path / "hf_checkpoint"
+    result = runner.invoke(app, ["convert-nemo-to-hf", str(nemo_path), str(hf_path)])
+    assert result.exit_code == 0, f"CLI command failed: {result.output}"
+
+    hf_model_from_nemo = AutoModelForMaskedLM.from_pretrained(model_tag)
+    hf_model_from_hf = AutoModelForMaskedLM.from_pretrained(hf_path)
+
+    # These aren't initialized, so they're going to be different.
+    del hf_model_from_nemo.esm.contact_head
+    del hf_model_from_hf.esm.contact_head
+
+    for key in hf_model_from_nemo.state_dict().keys():
+        torch.testing.assert_close(
+            hf_model_from_nemo.state_dict()[key],
+            hf_model_from_hf.state_dict()[key],
+            atol=1e-4,
+            rtol=1e-4,
+            msg=lambda msg: f"{key}: {msg}",
+        )
