feat: Add version validation if model fails to load (#194)

HCookie · web-flow · commit bfd890f3f656 · 2025-04-24T11:49:58.000+01:00
diff --git a/src/anemoi/inference/checkpoint.py b/src/anemoi/inference/checkpoint.py
@@ -258,24 +258,25 @@ def validate_environment(
         self,
         *,
         all_packages: bool = False,
-        on_difference: Literal["warn", "error", "ignore"] = "warn",
-        exempt_packages: Optional[List[str]] = None,
-    ) -> bool:
+        on_difference: Literal["warn", "error", "ignore", "return"] = "warn",
+        exempt_packages: Optional[list[str]] = None,
+    ) -> Union[bool, str]:
         """Validate the environment.
 
         Parameters
         ----------
         all_packages : bool, optional
-            Whether to validate all packages, by default False.
-        on_difference : str, optional
-            Action to take on difference, by default "warn".
-        exempt_packages : Optional[List[str]], optional
-            List of packages to exempt, by default None.
+            Check all packages in the environment (True) or just anemoi's (False), by default False.
+        on_difference : Literal['warn', 'error', 'ignore', 'return'], optional
+            What to do on difference, by default "warn"
+        exempt_packages : list[str], optional
+            List of packages to exempt from the check, by default EXEMPT_PACKAGES
 
         Returns
         -------
-        bool
-            True if the environment is valid, False otherwise.
+        Union[bool, str]
+            boolean if `on_difference` is not 'return', otherwise formatted text of the differences
+            True if environment is valid, False otherwise
         """
         return self._metadata.validate_environment(
             all_packages=all_packages, on_difference=on_difference, exempt_packages=exempt_packages
diff --git a/src/anemoi/inference/metadata.py b/src/anemoi/inference/metadata.py
@@ -631,23 +631,24 @@ def validate_environment(
         self,
         *,
         all_packages: bool = False,
-        on_difference: Literal["warn", "error", "ignore"] = "warn",
+        on_difference: Literal["warn", "error", "ignore", "return"] = "warn",
         exempt_packages: Optional[list[str]] = None,
-    ) -> bool:
+    ) -> Union[bool, str]:
         """Validate environment of the checkpoint against the current environment.
 
         Parameters
         ----------
         all_packages : bool, optional
-            Check all packages in environment or just `anemoi`'s, by default False
-        on_difference : Literal['warn', 'error', 'ignore'], optional
+            Check all packages in the environment (True) or just anemoi's (False), by default False.
+        on_difference : Literal['warn', 'error', 'ignore', 'return'], optional
             What to do on difference, by default "warn"
         exempt_packages : list[str], optional
             List of packages to exempt from the check, by default EXEMPT_PACKAGES
 
         Returns
         -------
-        bool
+        Union[bool, str]
+            boolean if `on_difference` is not 'return', otherwise formatted text of the differences
             True if environment is valid, False otherwise
 
         Raises
diff --git a/src/anemoi/inference/provenance.py b/src/anemoi/inference/provenance.py
@@ -14,6 +14,8 @@
 from typing import List
 from typing import Literal
 from typing import Optional
+from typing import Union
+from typing import overload
 
 from anemoi.utils.provenance import gather_provenance_info
 from packaging.version import Version
@@ -25,8 +27,10 @@
 # Complete package name to be exempt
 EXEMPT_PACKAGES = [
     "anemoi.training",
+    "anemoi.inference",
     "hydra",
     "hydra_plugins",
+    "hydra_plugins.anemoi_searchpath",
     "lightning",
     "pytorch_lightning",
     "lightning_fabric",
@@ -41,13 +45,33 @@
 LOG = logging.getLogger(__name__)
 
 
+@overload
 def validate_environment(
     metadata: "Metadata",
     *,
     all_packages: bool = False,
     on_difference: Literal["warn", "error", "ignore"] = "warn",
-    exempt_packages: Optional[list[str]] = None,
-) -> bool:
+    exempt_packages: Optional[List[str]] = None,
+) -> bool: ...
+
+
+@overload
+def validate_environment(
+    metadata: "Metadata",
+    *,
+    all_packages: bool = False,
+    on_difference: Literal["return"],  # no default: omitting it resolves to the bool overload above
+    exempt_packages: Optional[List[str]] = None,
+) -> str: ...
+
+
+def validate_environment(
+    metadata: "Metadata",
+    *,
+    all_packages: bool = False,
+    on_difference: Literal["warn", "error", "ignore", "return"] = "warn",
+    exempt_packages: Optional[List[str]] = None,
+) -> Union[bool, str]:
     """Validate environment of the checkpoint against the current environment.
 
     Parameters
@@ -58,12 +82,13 @@ def validate_environment(
         Check all packages in environment or just `anemoi`'s, by default False
     on_difference : Literal['warn', 'error', 'ignore'], optional
         What to do on difference, by default "warn"
-    exempt_packages : list[str], optional
+    exempt_packages : List[str], optional
         List of packages to exempt from the check, by default EXEMPT_PACKAGES
 
     Returns
     -------
-    bool
+    Union[bool, str]
+        boolean if `on_difference` is not 'return', otherwise formatted text of the differences
         True if environment is valid, False otherwise
 
     Raises
@@ -105,6 +130,7 @@ def validate_environment(
     for module in train_environment["module_versions"].keys():
         inference_module_name = module  # Due to package name differences between retrieval methods this may change
 
+        train_module_version_str = train_environment["module_versions"][module]
         if not all_packages and "anemoi" not in module:
             continue
         elif module in exempt_packages or module.split(".")[0] in EXEMPT_NAMESPACES:
@@ -122,7 +148,9 @@ def validate_environment(
                     continue
                 except (ModuleNotFoundError, ImportError):
                     pass
-                invalid_messages["missing"].append(f"Missing module in inference environment: {module}")
+                invalid_messages["missing"].append(
+                    f"Missing module in inference environment: {module}=={train_module_version_str}"
+                )
                 continue
 
         train_environment_version = Version(train_environment["module_versions"][module])
@@ -142,6 +170,9 @@ def validate_environment(
         if file_record["modified_files"] == 0 and file_record["untracked_files"] == 0:
             continue
 
+        if git_record in exempt_packages:
+            continue
+
         if git_record not in inference_environment["git_versions"]:
             invalid_messages["uncommitted"].append(
                 f"Training environment contained uncommitted change missing in inference environment: {git_record}"
@@ -159,6 +190,9 @@ def validate_environment(
         if file_record["modified_files"] == 0 and file_record["untracked_files"] == 0:
             continue
 
+        if git_record in exempt_packages:
+            continue
+
         if git_record not in train_environment["git_versions"]:
             invalid_messages["uncommitted"].append(
                 f"Inference environment contains uncommited changes missing in training: {git_record}"
@@ -174,6 +208,8 @@ def validate_environment(
             raise RuntimeError(text)
         elif on_difference == "ignore":
             pass
+        elif on_difference == "return":
+            return text
         else:
             raise ValueError(f"Invalid value for `on_difference`: {on_difference}")
         return False
diff --git a/src/anemoi/inference/runner.py b/src/anemoi/inference/runner.py
@@ -448,7 +448,12 @@ def model(self) -> torch.nn.Module:
             The loaded model.
         """
         with Timer(f"Loading {self.checkpoint}"):
-            model = torch.load(self.checkpoint.path, map_location=self.device, weights_only=False).to(self.device)
+            try:
+                model = torch.load(self.checkpoint.path, map_location=self.device, weights_only=False).to(self.device)
+            except Exception as e:  # Wildcard exception to catch all errors
+                validation_result = self.checkpoint.validate_environment(on_difference="return")
+                error_msg = f"Error loading model - {validation_result}"
+                raise RuntimeError(error_msg) from e
             # model.set_inference_options(**self.inference_options)
             assert getattr(model, "runner", None) is None, model.runner
             model.runner = self