
Commit 8feeede

fix stats per episodes and aggregate stats and casting to tensor
1 parent 0b29fc3 commit 8feeede

5 files changed: +120, -78 lines

lerobot/common/datasets/compute_stats.py (16 additions, 19 deletions)
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
-import torch
 
 from lerobot.common.datasets.utils import load_image_as_numpy
 

@@ -31,7 +30,7 @@ def compute_episode_stats(episode_buffer: dict, features: dict, num_image_sample
             "max": np.max(data, axis=axes_to_reduce),
             "mean": np.mean(data, axis=axes_to_reduce),
             "std": np.std(data, axis=axes_to_reduce),
-            "count": data.shape[0],
+            "count": np.array([data.shape[0]]),
         }
     return stats
 

@@ -71,11 +70,11 @@ def compute_image_stats(image_paths: list[str], num_samples: int | None = None)
         "max": np.max(images, axis=axes_to_reduce, keepdims=True),
         "mean": np.mean(images, axis=axes_to_reduce, keepdims=True),
         "std": np.std(images, axis=axes_to_reduce, keepdims=True),
-        "count": len(images),
     }
     for key in image_stats:  # squeeze batch dim
         image_stats[key] = np.squeeze(image_stats[key], axis=0)
 
+    image_stats["count"] = np.array([len(images)])
     return image_stats
 
 

@@ -95,15 +94,15 @@ def _assert_type_and_shape(stats_list):
         for i in range(len(stats_list)):
             for fkey in stats_list[i]:
                 for k, v in stats_list[i][fkey].items():
-                    if not isinstance(v, torch.Tensor):
+                    if not isinstance(v, np.ndarray):
                         raise ValueError(
-                            f"Stats must be compared of torch tensors, but is {type(v)} instead."
+                            f"Stats must be composed of numpy array, but key '{k}' of feature '{fkey}' is of type '{type(v)}' instead."
                         )
                     if v.ndim == 0:
                         raise ValueError("Number of dimensions must be at least 1, and is 0 instead.")
-                    if k == "count" and v.shape != torch.Size([1]):
+                    if k == "count" and v.shape != (1,):
                         raise ValueError(f"Shape of 'count' must be (1), but is {v.shape} instead.")
-                    if "image" in k and v.shape != torch.Size([3, 1, 1]):
+                    if "image" in k and v.shape != (3, 1, 1):
                         raise ValueError(f"Shape of '{k}' must be (3,1,1), but is {v.shape} instead.")
 
     _assert_type_and_shape(stats_list)
@@ -116,35 +115,33 @@ def _assert_type_and_shape(stats_list):
         stats_with_key = [stats[key] for stats in stats_list if key in stats]
 
         # Aggregate 'min' and 'max' using np.minimum and np.maximum
-        min_, argmin_ = torch.min(torch.stack([s["min"] for s in stats_with_key]), dim=0)
-        max_, argmax_ = torch.max(torch.stack([s["max"] for s in stats_with_key]), dim=0)
-        aggregated_stats[key]["min"] = min_
-        aggregated_stats[key]["max"] = max_
+        aggregated_stats[key]["min"] = np.min(np.stack([s["min"] for s in stats_with_key]), axis=0)
+        aggregated_stats[key]["max"] = np.max(np.stack([s["max"] for s in stats_with_key]), axis=0)
 
         # Extract means, variances (std^2), and counts
-        means = torch.stack([s["mean"] for s in stats_with_key])
-        variances = torch.stack([s["std"] ** 2 for s in stats_with_key])
-        counts = torch.stack([s["count"] for s in stats_with_key])
+        means = np.stack([s["mean"] for s in stats_with_key])
+        variances = np.stack([s["std"] ** 2 for s in stats_with_key])
+        counts = np.stack([s["count"] for s in stats_with_key])
 
         # Compute total counts
-        total_count = counts.sum(dim=0)
+        total_count = counts.sum(axis=0)
 
         # Prepare weighted mean by matching number of dimensions
         while counts.ndim < means.ndim:
-            counts = counts.unsqueeze(-1)
+            counts = np.expand_dims(counts, axis=-1)
 
         # Compute the weighted mean
         weighted_means = means * counts
-        total_mean = weighted_means.sum(dim=0) / total_count
+        total_mean = weighted_means.sum(axis=0) / total_count
 
         # Compute the variance using the parallel algorithm
         delta_means = means - total_mean
         weighted_variances = (variances + delta_means**2) * counts
-        total_variance = weighted_variances.sum(dim=0) / total_count
+        total_variance = weighted_variances.sum(axis=0) / total_count
 
         # Store the aggregated stats
         aggregated_stats[key]["mean"] = total_mean
-        aggregated_stats[key]["std"] = torch.sqrt(total_variance)
+        aggregated_stats[key]["std"] = np.sqrt(total_variance)
         aggregated_stats[key]["count"] = total_count
 
     return aggregated_stats
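
For reference, the "parallel algorithm" in the hunk above is the standard weighted-mean / parallel-variance combination (as in Chan et al.). A minimal numpy sketch, independent of the repo, checking that combining per-episode stats this way reproduces the stats of the concatenated episodes:

import numpy as np

# Two synthetic "episodes" of a 6-dim feature.
rng = np.random.default_rng(0)
a = rng.random((100, 6))
b = rng.random((50, 6))

# Per-episode stats, laid out like compute_episode_stats above.
stats_list = [
    {"mean": x.mean(axis=0), "std": x.std(axis=0), "count": np.array([len(x)])}
    for x in (a, b)
]

# Combine with the same formulas as aggregate_stats; the (2, 1) counts
# broadcast against the (2, 6) means, so no expand_dims is needed here.
means = np.stack([s["mean"] for s in stats_list])
variances = np.stack([s["std"] ** 2 for s in stats_list])
counts = np.stack([s["count"] for s in stats_list])
total_count = counts.sum(axis=0)
total_mean = (means * counts).sum(axis=0) / total_count
delta_means = means - total_mean
total_variance = ((variances + delta_means**2) * counts).sum(axis=0) / total_count

# The aggregate matches the stats of the concatenated data.
full = np.concatenate([a, b])
assert np.allclose(total_mean, full.mean(axis=0))
assert np.allclose(np.sqrt(total_variance), full.std(axis=0))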

lerobot/common/datasets/lerobot_dataset.py (28 additions, 30 deletions)
@@ -34,12 +34,7 @@
 from lerobot.common.datasets.utils import (
     DEFAULT_FEATURES,
     DEFAULT_IMAGE_PATH,
-    EPISODES_PATH,
-    EPISODES_STATS_PATH,
     INFO_PATH,
-    STATS_PATH,
-    TASKS_PATH,
-    append_jsonlines,
     backward_compatible_episodes_stats,
     check_delta_timestamps,
     check_timestamps_sync,
@@ -58,9 +53,13 @@
     load_info,
     load_stats,
     load_tasks,
-    serialize_dict,
+    write_episode,
+    write_episode_stats,
+    write_info,
     write_json,
     write_parquet,
+    write_stats,
+    write_task,
 )
 from lerobot.common.datasets.video_utils import (
     VideoFrame,
@@ -101,7 +100,7 @@ def __init__(
                 "'episodes_stats.jsonl' not found. Use global dataset stats for each episode instead.",
                 stacklevel=1,
             )
-            self.episodes_stats = backward_compatible_episodes_stats(self.stats, self.episodes.keys())
+            self.episodes_stats = backward_compatible_episodes_stats(self.stats, self.episodes)
 
     def pull_from_repo(
         self,
@@ -241,30 +240,26 @@ def save_episode(
         if task_index not in self.tasks:
             self.info["total_tasks"] += 1
             self.tasks[task_index] = task
-            task_dict = {
-                "task_index": task_index,
-                "task": task,
-            }
-            append_jsonlines(task_dict, self.root / TASKS_PATH)
+            write_task(task_index, task, self.root)
 
         chunk = self.get_episode_chunk(episode_index)
         if chunk >= self.total_chunks:
             self.info["total_chunks"] += 1
 
         self.info["splits"] = {"train": f"0:{self.info['total_episodes']}"}
         self.info["total_videos"] += len(self.video_keys)
-        write_json(self.info, self.root / INFO_PATH)
+        write_info(self.info, self.root)
 
         episode_dict = {
             "episode_index": episode_index,
             "tasks": [task],
             "length": episode_length,
         }
-        self.episodes.append(episode_dict)
-        append_jsonlines(episode_dict, self.root / EPISODES_PATH)
+        self.episodes[episode_index] = episode_dict
+        write_episode(episode_dict, self.root)
 
-        self.episodes_stats.append(episode_stats)
-        append_jsonlines(episode_stats, self.root / EPISODES_STATS_PATH)
+        self.episodes_stats[episode_index] = episode_stats
+        write_episode_stats(episode_index, episode_stats, self.root)
 
     def write_video_info(self) -> None:
         """
@@ -323,7 +318,7 @@ def create(
         # TODO(aliberts, rcadene): implement sanity check for features
         features = {**features, **DEFAULT_FEATURES}
 
-        obj.tasks, obj.stats, obj.episodes, obj.episodes_stats = {}, {}, [], []
+        obj.tasks, obj.stats, obj.episodes, obj.episodes_stats = {}, {}, {}, {}
         obj.info = create_empty_dataset_info(CODEBASE_VERSION, fps, robot_type, features, use_videos)
         if len(obj.video_keys) > 0 and not use_videos:
             raise ValueError()
@@ -664,8 +659,7 @@ def __getitem__(self, idx) -> dict:
 
         query_indices = None
         if self.delta_indices is not None:
-            current_ep_idx = self.episodes.index(ep_idx) if self.episodes is not None else ep_idx
-            query_indices, padding = self._get_query_indices(idx, current_ep_idx)
+            query_indices, padding = self._get_query_indices(idx, ep_idx)
             query_result = self._query_hf_dataset(query_indices)
             item = {**item, **padding}
             for key, val in query_result.items():
@@ -807,18 +801,20 @@ def _prepare_episode_buffer(self, episode_buffer: dict, task: str):
             raise ValueError()
 
         for key, ft in self.features.items():
+            # We add an extra dimension to index, frame_index, timestamp, episode_index, task_index
+            # to fit the shape `(1,)` defined in `self.features`
             if key == "index":
                 episode_buffer[key] = np.arange(
                     self.meta.total_frames, self.meta.total_frames + episode_length
-                )
+                )[:, np.newaxis]
+            elif key == "frame_index" or key == "timestamp":
+                episode_buffer[key] = np.array(episode_buffer[key])[:, np.newaxis]
             elif key == "episode_index":
-                episode_buffer[key] = np.full((episode_length,), episode_index)
+                episode_buffer[key] = np.full((episode_length, 1), episode_index)
             elif key == "task_index":
-                episode_buffer[key] = np.full((episode_length,), task_index)
+                episode_buffer[key] = np.full((episode_length, 1), task_index)
             elif ft["dtype"] in ["image", "video"]:
                 continue
-            elif len(ft["shape"]) == 1 and ft["shape"][0] == 1:
-                episode_buffer[key] = np.array(episode_buffer[key], dtype=ft["dtype"])
             elif len(ft["shape"]) == 1 and ft["shape"][0] > 1:
                 episode_buffer[key] = np.stack(episode_buffer[key])
             else:
@@ -828,7 +824,7 @@ def _prepare_episode_buffer(self, episode_buffer: dict, task: str):
 
     def _compute_episode_stats(self, episode_buffer: dict):
         ep_stats = compute_episode_stats(episode_buffer, self.features)
-        return serialize_dict(ep_stats)
+        return ep_stats
 
     def _save_episode_table(self, episode_buffer: dict, episode_index: int) -> None:
         episode_dict = {key: episode_buffer[key] for key in self.hf_features}
@@ -926,9 +922,8 @@ def consolidate(self, run_compute_stats: bool = True, keep_image_files: bool = F
 
         if run_compute_stats:
             self.stop_image_writer()
-            self.meta.stats = aggregate_stats(self.meta.episodes_stats)
-            serialized_stats = serialize_dict(self.meta.stats)
-            write_json(serialized_stats, self.root / STATS_PATH)
+            self.meta.stats = aggregate_stats(list(self.meta.episodes_stats.values()))
+            write_stats(self.meta.stats, self.root)
             self.consolidated = True
         else:
             logging.warning(
@@ -1051,7 +1046,10 @@ def __init__(
 
         self.image_transforms = image_transforms
         self.delta_timestamps = delta_timestamps
-        self.stats = aggregate_stats(self._datasets)
+        # TODO(rcadene, aliberts): We should not perform this aggregation for datasets
+        # with multiple robots of different ranges. Instead we should have one normalization
+        # per robot.
+        self.stats = aggregate_stats([dataset.meta.stats for dataset in self._datasets])
 
     @property
     def repo_id_to_index(self):
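
The _prepare_episode_buffer changes above give each scalar per-frame feature an explicit trailing dimension, so its array shape matches the `(1,)` feature shape declared in `self.features`. A small standalone sketch of the resulting shapes (all values hypothetical):

import numpy as np

episode_length, total_frames, episode_index, task_index = 3, 10, 7, 0

index = np.arange(total_frames, total_frames + episode_length)[:, np.newaxis]
frame_index = np.array([0, 1, 2])[:, np.newaxis]
episode_index_col = np.full((episode_length, 1), episode_index)
task_index_col = np.full((episode_length, 1), task_index)

# Each column is (episode_length, 1) rather than (episode_length,).
assert index.shape == frame_index.shape == episode_index_col.shape == (3, 1)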

lerobot/common/datasets/utils.py (39 additions, 7 deletions)
@@ -163,41 +163,73 @@ def append_jsonlines(data: dict, fpath: Path) -> None:
         writer.write(data)
 
 
+def write_info(info: dict, local_dir: Path):
+    write_json(info, local_dir / INFO_PATH)
+
+
 def load_info(local_dir: Path) -> dict:
     info = load_json(local_dir / INFO_PATH)
     for ft in info["features"].values():
         ft["shape"] = tuple(ft["shape"])
     return info
 
 
+def write_stats(stats: dict, local_dir: Path):
+    serialized_stats = serialize_dict(stats)
+    write_json(serialized_stats, local_dir / STATS_PATH)
+
+
+def cast_stats_to_numpy(stats):
+    stats = {key: np.array(value) for key, value in flatten_dict(stats).items()}
+    return unflatten_dict(stats)
+
+
 def load_stats(local_dir: Path) -> dict:
     if not (local_dir / STATS_PATH).exists():
         return None
     stats = load_json(local_dir / STATS_PATH)
-    stats = {key: np.array(value) for key, value in flatten_dict(stats).items()}
-    return unflatten_dict(stats)
+    return cast_stats_to_numpy(stats)
+
+
+def write_task(task_index: int, task: dict, local_dir: Path):
+    task_dict = {
+        "task_index": task_index,
+        "task": task,
+    }
+    append_jsonlines(task_dict, local_dir / TASKS_PATH)
 
 
 def load_tasks(local_dir: Path) -> dict:
     tasks = load_jsonlines(local_dir / TASKS_PATH)
     return {item["task_index"]: item["task"] for item in sorted(tasks, key=lambda x: x["task_index"])}
 
 
+def write_episode(episode: dict, local_dir: Path):
+    append_jsonlines(episode, local_dir / EPISODES_PATH)
+
+
 def load_episodes(local_dir: Path) -> dict:
     episodes = load_jsonlines(local_dir / EPISODES_PATH)
     return {item["episode_index"]: item for item in sorted(episodes, key=lambda x: x["episode_index"])}
 
 
+def write_episode_stats(episode_index: int, episode_stats: dict, local_dir: Path):
+    # We wrap episode_stats in a dictionary since `episode_stats["episode_index"]`
+    # is a dictionary of stats and not an integer.
+    episode_stats = {"episode_index": episode_index, "stats": serialize_dict(episode_stats)}
+    append_jsonlines(episode_stats, local_dir / EPISODES_STATS_PATH)
+
+
 def load_episodes_stats(local_dir: Path) -> dict:
-    episodes_tasks = load_jsonlines(local_dir / EPISODES_STATS_PATH)
+    episodes_stats = load_jsonlines(local_dir / EPISODES_STATS_PATH)
     return {
-        item["episode_index"]: item["stats"]
-        for item in sorted(episodes_tasks, key=lambda x: x["episode_index"])
+        item["episode_index"]: cast_stats_to_numpy(item["stats"])
+        for item in sorted(episodes_stats, key=lambda x: x["episode_index"])
     }
 
 
 def backward_compatible_episodes_stats(stats, episodes: list[int]):
-    return {ep_idx: {"episode_index": ep_idx, "stats": stats} for ep_idx in episodes}
+    return {ep_idx: stats for ep_idx in episodes}
 
 
 def load_image_as_numpy(fpath: str | Path, dtype="float32", channel_first: bool = True) -> np.ndarray:
@@ -381,7 +413,7 @@ def create_empty_dataset_info(
 
 
 def get_episode_data_index(
-    episode_dicts: list[dict], episodes: list[int] | None = None
+    episode_dicts: dict[dict], episodes: list[int] | None = None
 ) -> dict[str, torch.Tensor]:
     episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in episode_dicts.items()}
     if episodes is not None:
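
Taken together, the new writer helpers make per-episode stats round-trip through `episodes_stats.jsonl`: arrays are serialized on write (via serialize_dict) and cast back to numpy on load. A minimal sketch, assuming the helpers above are importable and `root` is a hypothetical scratch directory:

import numpy as np
from pathlib import Path

from lerobot.common.datasets.utils import load_episodes_stats, write_episode_stats

root = Path("/tmp/scratch_dataset")  # hypothetical location
episode_stats = {
    "action": {
        "min": np.zeros(6),
        "max": np.ones(6),
        "mean": np.full(6, 0.5),
        "std": np.full(6, 0.1),
        "count": np.array([50]),
    }
}
write_episode_stats(0, episode_stats, root)  # appends one serialized record

loaded = load_episodes_stats(root)
assert isinstance(loaded[0]["action"]["mean"], np.ndarray)  # cast_stats_to_numpy on load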

lerobot/common/policies/normalize.py (23 additions, 11 deletions)
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy as np
 import torch
 from torch import Tensor, nn
 
@@ -22,7 +23,7 @@
 def create_stats_buffers(
     features: dict[str, PolicyFeature],
     norm_map: dict[str, NormalizationMode],
-    stats: dict[str, dict[str, Tensor]] | None = None,
+    stats: dict[str, dict[str, torch.Tensor]] | None = None,
 ) -> dict[str, dict[str, nn.ParameterDict]]:
     """
     Create buffers per modality (e.g. "observation.image", "action") containing their mean, std, min, max
@@ -78,16 +79,27 @@ def create_stats_buffers(
         )
 
         if stats:
-            # Note: The clone is needed to make sure that the logic in save_pretrained doesn't see duplicated
-            # tensors anywhere (for example, when we use the same stats for normalization and
-            # unnormalization). See the logic here
-            # https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/py_src/safetensors/torch.py#L97.
-            if norm_mode is NormalizationMode.MEAN_STD:
-                buffer["mean"].data = stats[key]["mean"].clone()
-                buffer["std"].data = stats[key]["std"].clone()
-            elif norm_mode is NormalizationMode.MIN_MAX:
-                buffer["min"].data = stats[key]["min"].clone()
-                buffer["max"].data = stats[key]["max"].clone()
+            if isinstance(stats[key]["mean"], np.ndarray):
+                if norm_mode is NormalizationMode.MEAN_STD:
+                    buffer["mean"].data = torch.from_numpy(stats[key]["mean"]).to(dtype=torch.float32)
+                    buffer["std"].data = torch.from_numpy(stats[key]["std"]).to(dtype=torch.float32)
+                elif norm_mode is NormalizationMode.MIN_MAX:
+                    buffer["min"].data = torch.from_numpy(stats[key]["min"]).to(dtype=torch.float32)
+                    buffer["max"].data = torch.from_numpy(stats[key]["max"]).to(dtype=torch.float32)
+            elif isinstance(stats[key]["mean"], torch.Tensor):
+                # Note: The clone is needed to make sure that the logic in save_pretrained doesn't see duplicated
+                # tensors anywhere (for example, when we use the same stats for normalization and
+                # unnormalization). See the logic here
+                # https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/py_src/safetensors/torch.py#L97.
+                if norm_mode is NormalizationMode.MEAN_STD:
+                    buffer["mean"].data = stats[key]["mean"].clone().to(dtype=torch.float32)
+                    buffer["std"].data = stats[key]["std"].clone().to(dtype=torch.float32)
+                elif norm_mode is NormalizationMode.MIN_MAX:
+                    buffer["min"].data = stats[key]["min"].clone().to(dtype=torch.float32)
+                    buffer["max"].data = stats[key]["max"].clone().to(dtype=torch.float32)
+            else:
+                type_ = type(stats[key]["mean"])
+                raise ValueError(f"np.ndarray or torch.Tensor expected, but type is '{type_}' instead.")
 
         stats_buffers[key] = buffer
     return stats_buffers
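
The explicit float32 cast in both branches matters: np.array on Python floats yields float64, and stats loaded from JSON would otherwise propagate float64 into buffers created as float32 parameters. A minimal sketch of the conversion used in the numpy branch:

import numpy as np
import torch

mean_np = np.array([0.1, 0.2, 0.3])  # float64 by default
mean_t = torch.from_numpy(mean_np).to(dtype=torch.float32)
assert mean_t.dtype == torch.float32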
