Commit 8dca199

Added an MLP module.
Allow the number of hidden MLP layers in the Scalar OutputModel to be configured from the YAML input.
1 parent 8b47246 commit 8dca199

4 files changed (+106, -33 lines)

torchmdnet/models/model.py (+1)
@@ -127,6 +127,7 @@ def create_model(args, prior_model=None, mean=None, std=None):
         activation=args["activation"],
         reduce_op=args["reduce_op"],
         dtype=dtype,
+        num_layers=args.get("output_mlp_num_layers", 0),
     )

     # combine representation and output network
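
Because create_model reads the option with args.get("output_mlp_num_layers", 0), configurations that do not define the key fall back to 0 extra hidden layers, which reproduces the previous Linear -> activation -> Linear output head.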

torchmdnet/models/output_modules.py (+53, -24)
@@ -6,7 +6,12 @@
 from typing import Optional
 import torch
 from torch import nn
-from torchmdnet.models.utils import act_class_mapping, GatedEquivariantBlock, scatter
+from torchmdnet.models.utils import (
+    act_class_mapping,
+    GatedEquivariantBlock,
+    scatter,
+    MLP,
+)
 from torchmdnet.utils import atomic_masses
 from torchmdnet.extensions import is_current_stream_capturing
 from warnings import warn
@@ -60,24 +65,23 @@ def __init__(
         allow_prior_model=True,
         reduce_op="sum",
         dtype=torch.float,
+        **kwargs
     ):
         super(Scalar, self).__init__(
             allow_prior_model=allow_prior_model, reduce_op=reduce_op
         )
-        act_class = act_class_mapping[activation]
-        self.output_network = nn.Sequential(
-            nn.Linear(hidden_channels, hidden_channels // 2, dtype=dtype),
-            act_class(),
-            nn.Linear(hidden_channels // 2, 1, dtype=dtype),
+        self.output_network = MLP(
+            in_channels=hidden_channels,
+            out_channels=1,
+            hidden_channels=hidden_channels // 2,
+            activation=activation,
+            num_layers=kwargs.get("num_layers", 0),
+            dtype=dtype,
         )
-
         self.reset_parameters()

     def reset_parameters(self):
-        nn.init.xavier_uniform_(self.output_network[0].weight)
-        self.output_network[0].bias.data.fill_(0)
-        nn.init.xavier_uniform_(self.output_network[2].weight)
-        self.output_network[2].bias.data.fill_(0)
+        self.output_network.reset_parameters()

     def pre_reduce(self, x, v: Optional[torch.Tensor], z, pos, batch):
         return self.output_network(x)
@@ -91,10 +95,13 @@ def __init__(
         allow_prior_model=True,
         reduce_op="sum",
         dtype=torch.float,
+        **kwargs
     ):
         super(EquivariantScalar, self).__init__(
             allow_prior_model=allow_prior_model, reduce_op=reduce_op
         )
+        if kwargs.get("num_layers", 0) > 0:
+            warn("num_layers is not used in EquivariantScalar")
         self.output_network = nn.ModuleList(
             [
                 GatedEquivariantBlock(
@@ -125,14 +132,20 @@ def pre_reduce(self, x, v, z, pos, batch):

 class DipoleMoment(Scalar):
     def __init__(
-        self, hidden_channels, activation="silu", reduce_op="sum", dtype=torch.float
+        self,
+        hidden_channels,
+        activation="silu",
+        reduce_op="sum",
+        dtype=torch.float,
+        **kwargs
     ):
         super(DipoleMoment, self).__init__(
             hidden_channels,
             activation,
             allow_prior_model=False,
             reduce_op=reduce_op,
             dtype=dtype,
+            **kwargs
         )
         atomic_mass = torch.from_numpy(atomic_masses).to(dtype)
         self.register_buffer("atomic_mass", atomic_mass)
@@ -152,14 +165,20 @@ def post_reduce(self, x):

 class EquivariantDipoleMoment(EquivariantScalar):
     def __init__(
-        self, hidden_channels, activation="silu", reduce_op="sum", dtype=torch.float
+        self,
+        hidden_channels,
+        activation="silu",
+        reduce_op="sum",
+        dtype=torch.float,
+        **kwargs
     ):
         super(EquivariantDipoleMoment, self).__init__(
             hidden_channels,
             activation,
             allow_prior_model=False,
             reduce_op=reduce_op,
             dtype=dtype,
+            **kwargs
         )
         atomic_mass = torch.from_numpy(atomic_masses).to(dtype)
         self.register_buffer("atomic_mass", atomic_mass)
@@ -180,27 +199,31 @@ def post_reduce(self, x):

 class ElectronicSpatialExtent(OutputModel):
     def __init__(
-        self, hidden_channels, activation="silu", reduce_op="sum", dtype=torch.float
+        self,
+        hidden_channels,
+        activation="silu",
+        reduce_op="sum",
+        dtype=torch.float,
+        **kwargs
     ):
         super(ElectronicSpatialExtent, self).__init__(
             allow_prior_model=False, reduce_op=reduce_op
         )
-        act_class = act_class_mapping[activation]
-        self.output_network = nn.Sequential(
-            nn.Linear(hidden_channels, hidden_channels // 2, dtype=dtype),
-            act_class(),
-            nn.Linear(hidden_channels // 2, 1, dtype=dtype),
+        self.output_network = MLP(
+            in_channels=hidden_channels,
+            out_channels=1,
+            hidden_channels=hidden_channels // 2,
+            activation=activation,
+            num_layers=kwargs.get("num_layers", 0),
+            dtype=dtype,
         )
         atomic_mass = torch.from_numpy(atomic_masses).to(dtype)
         self.register_buffer("atomic_mass", atomic_mass)

         self.reset_parameters()

     def reset_parameters(self):
-        nn.init.xavier_uniform_(self.output_network[0].weight)
-        self.output_network[0].bias.data.fill_(0)
-        nn.init.xavier_uniform_(self.output_network[2].weight)
-        self.output_network[2].bias.data.fill_(0)
+        self.output_network.reset_parameters()

     def pre_reduce(self, x, v: Optional[torch.Tensor], z, pos, batch):
         x = self.output_network(x)
@@ -219,14 +242,20 @@ class EquivariantElectronicSpatialExtent(ElectronicSpatialExtent):

 class EquivariantVectorOutput(EquivariantScalar):
     def __init__(
-        self, hidden_channels, activation="silu", reduce_op="sum", dtype=torch.float
+        self,
+        hidden_channels,
+        activation="silu",
+        reduce_op="sum",
+        dtype=torch.float,
+        **kwargs
     ):
         super(EquivariantVectorOutput, self).__init__(
             hidden_channels,
             activation,
             allow_prior_model=False,
             reduce_op="sum",
             dtype=dtype,
+            **kwargs
         )

     def pre_reduce(self, x, v, z, pos, batch):
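
A minimal usage sketch (not part of this commit) showing how the new keyword reaches the Scalar head; the 128-dimensional feature size and the dummy tensors are illustrative only:

import torch
from torchmdnet.models.output_modules import Scalar

# One extra hidden layer in the output MLP; num_layers travels through **kwargs.
head = Scalar(hidden_channels=128, activation="silu", num_layers=1)

x = torch.randn(10, 128)  # per-atom features for 10 atoms
y = head.pre_reduce(x, None, None, None, None)  # pre_reduce just applies the MLP to x
print(y.shape)  # torch.Size([10, 1])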

torchmdnet/models/utils.py (+51, -9)
@@ -434,6 +434,49 @@ def forward(self, distances: Tensor) -> Tensor:
         return cutoffs


+class MLP(nn.Module):
+    """A simple multi-layer perceptron with a given number of layers and hidden channels.
+
+    Args:
+        in_channels (int): Number of input features.
+        out_channels (int): Number of output features.
+        hidden_channels (int): Number of hidden features.
+        activation (str): Activation function to use.
+        num_layers (int, optional): Number of layers. Defaults to 0.
+        dtype (torch.dtype, optional): Data type to use. Defaults to torch.float32.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        hidden_channels,
+        activation,
+        num_layers=0,
+        dtype=torch.float32,
+    ):
+        super(MLP, self).__init__()
+        act_class = act_class_mapping[activation]
+        self.act = act_class()
+        self.layers = nn.Sequential()
+        self.layers.append(nn.Linear(in_channels, hidden_channels, dtype=dtype))
+        self.layers.append(self.act)
+        for _ in range(num_layers):
+            self.layers.append(nn.Linear(hidden_channels, hidden_channels, dtype=dtype))
+            self.layers.append(self.act)
+        self.layers.append(nn.Linear(hidden_channels, out_channels, dtype=dtype))
+
+    def reset_parameters(self):
+        for layer in self.layers:
+            if isinstance(layer, nn.Linear):
+                nn.init.xavier_uniform_(layer.weight)
+                layer.bias.data.fill_(0)
+
+    def forward(self, x):
+        x = self.layers(x)
+        return x
+
+
 class GatedEquivariantBlock(nn.Module):
     """Gated Equivariant Block as defined in Schütt et al. (2021):
     Equivariant message passing for the prediction of tensorial properties and molecular spectra
@@ -462,21 +505,20 @@ def __init__(
         )

         act_class = act_class_mapping[activation]
-        self.update_net = nn.Sequential(
-            nn.Linear(hidden_channels * 2, intermediate_channels, dtype=dtype),
-            act_class(),
-            nn.Linear(intermediate_channels, out_channels * 2, dtype=dtype),
+        self.update_net = MLP(
+            in_channels=hidden_channels * 2,
+            out_channels=out_channels * 2,
+            hidden_channels=intermediate_channels,
+            activation=activation,
+            num_layers=0,
+            dtype=dtype,
         )
-
         self.act = act_class() if scalar_activation else None

     def reset_parameters(self):
         nn.init.xavier_uniform_(self.vec1_proj.weight)
         nn.init.xavier_uniform_(self.vec2_proj.weight)
-        nn.init.xavier_uniform_(self.update_net[0].weight)
-        self.update_net[0].bias.data.fill_(0)
-        nn.init.xavier_uniform_(self.update_net[2].weight)
-        self.update_net[2].bias.data.fill_(0)
+        self.update_net.reset_parameters()

     def forward(self, x, v):
         vec1_buffer = self.vec1_proj(v)
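
The num_layers argument counts the extra hidden Linear/activation pairs, so num_layers=0 keeps the two-linear-layer shape of the old output heads. A quick sketch (not part of the diff) of the new module in isolation:

import torch
from torchmdnet.models.utils import MLP

# Linear(64, 32) -> SiLU -> Linear(32, 32) -> SiLU -> Linear(32, 32) -> SiLU -> Linear(32, 1)
mlp = MLP(in_channels=64, out_channels=1, hidden_channels=32, activation="silu", num_layers=2)
out = mlp(torch.randn(5, 64))
print(out.shape)  # torch.Size([5, 1])

Note that MLP reuses a single activation module instance between layers, which is harmless for stateless activations such as SiLU.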

torchmdnet/scripts/train.py (+1)
@@ -74,6 +74,7 @@ def get_argparse():
     # model architecture
     parser.add_argument('--model', type=str, default='graph-network', choices=models.__all_models__, help='Which model to train')
     parser.add_argument('--output-model', type=str, default='Scalar', choices=output_modules.__all__, help='The type of output model')
+    parser.add_argument('--output-mlp-num-layers', type=int, default=0, help='If the output model uses an MLP this will be the number of inner layers.')
     parser.add_argument('--prior-model', type=str, default=None, help='Which prior model to use. It can be a string, a dict if you want to add arguments for it or a dicts to add more than one prior. e.g. {"Atomref": {"max_z":100}, "Coulomb":{"max_num_neighs"=100, "lower_switch_distance"=4, "upper_switch_distance"=8}', action="extend", nargs="*")

     # architectural args
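
With this flag the extra depth can be requested on the command line (--output-mlp-num-layers 2) or, as the commit message notes, from the YAML input as output_mlp_num_layers: 2, which is the key that create_model looks up.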
