Skip to content

Commit 6cfa021

Browse files
fix: Update Profiler (#160)
* profiler hotfix: fix a bug where only the input state preprocessing was being profiled. Removed memory_timeline.html because it's slow to generate and the memory pickle shows the same info, but better. Change where the profile output is written to, so new runs don't overwrite old ones. Add more log output to explain how to view profiler output. Remove saving the stack trace from the PyTorch profiler; otherwise the trace files couldn't be opened. * [pre-commit.ci] auto fixes from pre-commit.com hooks. For more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 78571a0 commit 6cfa021

File tree

2 files changed

+12
-10
lines changed

2 files changed

+12
-10
lines changed

src/anemoi/inference/profiler.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010

1111
import logging
12+
import socket
13+
import time
1214
from contextlib import contextmanager
1315

1416
import torch
@@ -47,7 +49,7 @@ def ProfilingRunner(use_profiler: bool) -> None:
4749
Whether to profile the wrapped code (True) or not (False).
4850
4951
"""
50-
dirname = "profiling-output"
52+
dirname = f"profiling-output/{socket.gethostname()}-{int(time.time())}"
5153
if use_profiler:
5254
torch.cuda.memory._record_memory_history(max_entries=100000)
5355
activities = [torch.profiler.ProfilerActivity.CPU]
@@ -56,7 +58,6 @@ def ProfilingRunner(use_profiler: bool) -> None:
5658
with torch.profiler.profile(
5759
profile_memory=True,
5860
record_shapes=True,
59-
with_stack=True,
6061
activities=activities,
6162
with_flops=True,
6263
on_trace_ready=torch.profiler.tensorboard_trace_handler(dirname),
@@ -75,7 +76,8 @@ def ProfilingRunner(use_profiler: bool) -> None:
7576
f"Top {row_limit} kernels by runtime on CUDA:\n {prof.key_averages().table(sort_by='self_cuda_time_total', row_limit=row_limit)}"
7677
)
7778
LOG.info("Memory summary \n%s", torch.cuda.memory_summary())
78-
if torch.cuda.is_available():
79-
prof.export_memory_timeline(f"{dirname}/memory_timeline.html", device="cuda:0")
79+
LOG.info(
80+
f"Memory snapshot and trace file stored to '{dirname}'. To view the memory snapshot, upload the pickle file to 'https://pytorch.org/memory_viz'. To view the trace file, see 'https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#use-tensorboard-to-view-results-and-analyze-model-performance'"
81+
)
8082
else:
8183
yield

src/anemoi/inference/runner.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -138,12 +138,12 @@ def run(self, *, input_state, lead_time):
138138
with ProfilingLabel("Prepare input tensor", self.use_profiler):
139139
input_tensor = self.prepare_input_tensor(input_state)
140140

141-
try:
142-
yield from self.forecast(lead_time, input_tensor, input_state)
143-
except (TypeError, ModuleNotFoundError, AttributeError):
144-
if self.report_error:
145-
self.checkpoint.report_error()
146-
raise
141+
try:
142+
yield from self.forecast(lead_time, input_tensor, input_state)
143+
except (TypeError, ModuleNotFoundError, AttributeError):
144+
if self.report_error:
145+
self.checkpoint.report_error()
146+
raise
147147

148148
def add_initial_forcings_to_input_state(self, input_state):
149149
# Should that already be a list of dates

0 commit comments

Comments
 (0)