From 6acd0538d656f507b59be4dd15276ff0ec320c6d Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Sun, 27 Apr 2025 17:09:49 +0530 Subject: [PATCH 1/6] Added functions to support IO for Parquet files. Added test_parquet_io.py. --- movement/io/load_poses.py | 233 +++++++++++++++++++++++++++++++++----- movement/io/save_poses.py | 116 ++++++++++++++++++- tests/test_parquet_io.py | 219 +++++++++++++++++++++++++++++++++++ 3 files changed, 538 insertions(+), 30 deletions(-) create mode 100644 tests/test_parquet_io.py diff --git a/movement/io/load_poses.py b/movement/io/load_poses.py index fd38055fe..36e6c5377 100644 --- a/movement/io/load_poses.py +++ b/movement/io/load_poses.py @@ -40,7 +40,7 @@ def from_numpy( Array of shape (n_frames, n_keypoints, n_individuals) containing the point-wise confidence scores. It will be converted to a :class:`xarray.DataArray` object named "confidence". - If None (default), the scores will be set to an array of NaNs. + If None (default), no confidence data variable is included. individual_names : list of str, optional List of unique names for the individuals in the video. If None (default), the individuals will be named "individual_0", @@ -59,8 +59,8 @@ def from_numpy( Returns ------- xarray.Dataset - ``movement`` dataset containing the pose tracks, confidence scores, - and associated metadata. + ``movement`` dataset containing the pose tracks, confidence scores + (if provided), and associated metadata. Examples -------- @@ -94,7 +94,11 @@ def from_numpy( def from_file( file_path: Path | str, source_software: Literal[ - "DeepLabCut", "SLEAP", "LightningPose", "Anipose" + "DeepLabCut", + "SLEAP", + "LightningPose", + "Anipose", + "animovement", ], fps: float | None = None, **kwargs, @@ -106,10 +110,17 @@ def from_file( file_path : pathlib.Path or str Path to the file containing predicted poses. The file format must be among those supported by the ``from_dlc_file()``, - ``from_slp_file()`` or ``from_lp_file()`` functions. One of these - these functions will be called internally, based on + ``from_slp_file()``, ``from_lp_file()``, ``from_anipose_file()``, + or ``from_animovement_file()`` functions. One of these + functions will be called internally, based on the value of ``source_software``. - source_software : "DeepLabCut", "SLEAP", "LightningPose", or "Anipose" + source_software : Literal[ + "DeepLabCut", + "SLEAP", + "LightningPose", + "Anipose", + "animovement", + ] The source software of the file. fps : float, optional The number of frames per second in the video. 
If None (default), @@ -130,6 +141,7 @@ def from_file( movement.io.load_poses.from_sleap_file movement.io.load_poses.from_lp_file movement.io.load_poses.from_anipose_file + movement.io.load_poses.from_animovement_file Examples -------- @@ -147,6 +159,8 @@ def from_file( return from_lp_file(file_path, fps) elif source_software == "Anipose": return from_anipose_file(file_path, fps, **kwargs) + elif source_software == "animovement": + return from_animovement_file(file_path, fps) else: raise logger.error( ValueError(f"Unsupported source software: {source_software}") @@ -289,6 +303,7 @@ def from_sleap_file( # Add metadata as attrs ds.attrs["source_file"] = file.path.as_posix() logger.info(f"Loaded pose tracks from {file.path}:\n{ds}") + logger.info(ds) return ds @@ -506,18 +521,26 @@ def _ds_from_sleap_labels_file( file = ValidHDF5(file_path, expected_datasets=["pred_points", "metadata"]) labels = read_labels(file.path.as_posix()) tracks_with_scores = _sleap_labels_to_numpy(labels) - individual_names = [track.name for track in labels.tracks] or None - if individual_names is None: + + individual_names: list[str] = ( + [track.name for track in labels.tracks] + if labels.tracks + else ["individual_0"] # Fixed: Ensure list[str] + ) + if not labels.tracks: logger.warning( f"Could not find SLEAP Track in {file.path}. " "Assuming single-individual dataset and assigning " "default individual name." ) + + keypoint_names: list[str] = [kp.name for kp in labels.skeletons[0].nodes] + return from_numpy( position_array=tracks_with_scores[:, :-1, :, :], confidence_array=tracks_with_scores[:, -1, :, :], - individual_names=individual_names, - keypoint_names=[kp.name for kp in labels.skeletons[0].nodes], + individual_names=individual_names, # Already a list[str] + keypoint_names=keypoint_names, # Already a list[str] fps=fps, source_software="SLEAP", ) @@ -559,15 +582,19 @@ def _sleap_labels_to_numpy(labels: Labels) -> np.ndarray: lfs = [lf for lf in labels.labeled_frames if lf.video == labels.videos[0]] # Figure out frame index range frame_idxs = [lf.frame_idx for lf in lfs] - first_frame = min(0, min(frame_idxs)) - last_frame = max(0, max(frame_idxs)) + first_frame = min(0, min(frame_idxs)) if frame_idxs else 0 + last_frame = max(0, max(frame_idxs)) if frame_idxs else 0 n_tracks = len(labels.tracks) or 1 # If no tracks, assume 1 individual individuals = labels.tracks or [None] skeleton = labels.skeletons[-1] # Assume project only uses last skeleton n_nodes = len(skeleton.nodes) n_frames = int(last_frame - first_frame + 1) - tracks = np.full((n_frames, 3, n_nodes, n_tracks), np.nan, dtype="float32") + + # Explicitly type the tracks array + tracks: np.ndarray = np.full( + (n_frames, 3, n_nodes, n_tracks), np.nan, dtype=np.float32 + ) for lf in lfs: i = int(lf.frame_idx - first_frame) @@ -583,12 +610,12 @@ def _sleap_labels_to_numpy(labels: Labels) -> np.ndarray: # Use user-labelled instance if available if user_track_instances: inst = user_track_instances[-1] - tracks[i, ..., j] = np.hstack( + tracks[i, 0:3, :, j] = np.hstack( (inst.numpy(), np.full((n_nodes, 1), np.nan)) ).T elif predicted_track_instances: inst = predicted_track_instances[-1] - tracks[i, ..., j] = inst.numpy(scores=True).T + tracks[i, 0:3, :, j] = inst.numpy(scores=True).T return tracks @@ -670,8 +697,8 @@ def _ds_from_valid_data(data: ValidPosesDataset) -> xr.Dataset: Returns ------- xarray.Dataset - ``movement`` dataset containing the pose tracks, confidence scores, - and associated metadata. 
+ ``movement`` dataset containing the pose tracks, confidence scores + (if provided), and associated metadata. """ n_frames = data.position_array.shape[0] @@ -693,14 +720,18 @@ def _ds_from_valid_data(data: ValidPosesDataset) -> xr.Dataset: dataset_attrs["time_unit"] = time_unit DIM_NAMES = ValidPosesDataset.DIM_NAMES + # Initialize data_vars dictionary with position + data_vars = { + "position": xr.DataArray(data.position_array, dims=DIM_NAMES), + } + # Add confidence only if confidence_array is provided + if data.confidence_array is not None: + data_vars["confidence"] = xr.DataArray( + data.confidence_array, dims=DIM_NAMES[:1] + DIM_NAMES[2:] + ) # Convert data to an xarray.Dataset return xr.Dataset( - data_vars={ - "position": xr.DataArray(data.position_array, dims=DIM_NAMES), - "confidence": xr.DataArray( - data.confidence_array, dims=DIM_NAMES[:1] + DIM_NAMES[2:] - ), - }, + data_vars=data_vars, coords={ DIM_NAMES[0]: time_coords, DIM_NAMES[1]: ["x", "y", "z"][:n_space], @@ -734,7 +765,6 @@ def from_anipose_style_df( ``movement`` dataset containing the pose tracks, confidence scores, and associated metadata. - Notes ----- Reshape dataframe with columns keypoint1_x, keypoint1_y, keypoint1_z, @@ -744,7 +774,7 @@ def from_anipose_style_df( with dimensions time, keypoints, individuals. """ - keypoint_names = sorted( + keypoint_names: list[str] = sorted( list( set( [ @@ -769,7 +799,7 @@ def from_anipose_style_df( position_array[:, j, i, 0] = df[f"{kp}_{coord}"] confidence_array[:, i, 0] = df[f"{kp}_score"] - individual_names = [individual_name] + individual_names: list[str] = [individual_name] return from_numpy( position_array=position_array, @@ -822,3 +852,152 @@ def from_anipose_file( return from_anipose_style_df( anipose_df, fps=fps, individual_name=individual_name ) + + +def from_tidy_df( + df: pd.DataFrame, + fps: float | None = None, + source_software: str = "animovement", +) -> xr.Dataset: + """Create a ``movement`` poses dataset from a tidy DataFrame. + + Parameters + ---------- + df : pandas.DataFrame + Tidy DataFrame containing pose tracks and confidence scores. + Expected columns: 'frame', 'track_id', 'keypoint', 'x', 'y', + and optionally 'confidence'. + fps : float, optional + The number of frames per second in the video. If None (default), + the ``time`` coordinates will be in frame numbers. + source_software : str, optional + Name of the pose estimation software or package from which the + data originate. Defaults to "animovement". + + Returns + ------- + xarray.Dataset + ``movement`` dataset containing the pose tracks, confidence scores, + and associated metadata. + + Notes + ----- + The DataFrame must have at least the following columns: + - 'frame': integer, the frame number (time index) + - 'track_id': string or integer, the individual ID + - 'keypoint': string, the keypoint name + - 'x': float, x-coordinate + - 'y': float, y-coordinate + - 'confidence': float, optional, point-wise confidence scores + + Examples + -------- + >>> import pandas as pd + >>> from movement.io import load_poses + >>> df = pd.DataFrame( + ... { + ... "frame": [0, 0, 1, 1], + ... "track_id": ["ind1", "ind1", "ind1", "ind1"], + ... "keypoint": ["nose", "tail", "nose", "tail"], + ... "x": [100.0, 150.0, 101.0, 151.0], + ... "y": [200.0, 250.0, 201.0, 251.0], + ... "confidence": [0.9, 0.8, 0.85, 0.75], + ... } + ... 
) + >>> ds = load_poses.from_tidy_df(df, fps=30) + + """ + # Validate DataFrame columns + required_columns = {"frame", "track_id", "keypoint", "x", "y"} + if not required_columns.issubset(df.columns): + missing = required_columns - set(df.columns) + raise ValueError(f"DataFrame missing required columns: {missing}") + + # Ensure correct data types + df = df.astype( + { + "frame": int, + "track_id": str, + "keypoint": str, + "x": float, + "y": float, + } + ) + + # Get unique values for coordinates + time = np.sort(df["frame"].unique()) + individuals = df["track_id"].unique() + keypoints = df["keypoint"].unique() + n_frames = len(time) + n_individuals = len(individuals) + n_keypoints = len(keypoints) + + # Initialize position and confidence arrays + position_array = np.full( + (n_frames, 2, n_keypoints, n_individuals), np.nan, dtype=float + ) + confidence_array = ( + np.full((n_frames, n_keypoints, n_individuals), np.nan, dtype=float) + if "confidence" in df.columns + else None + ) + + # Pivot data to fill arrays + for _idx, row in df.iterrows(): + t_idx = np.where(time == row["frame"])[0][0] + i_idx = np.where(individuals == row["track_id"])[0][0] + k_idx = np.where(keypoints == row["keypoint"])[0][0] + position_array[t_idx, 0, k_idx, i_idx] = row["x"] + position_array[t_idx, 1, k_idx, i_idx] = row["y"] + if "confidence" in df.columns: + confidence_array[t_idx, k_idx, i_idx] = row["confidence"] + + return from_numpy( + position_array=position_array, + confidence_array=confidence_array, + individual_names=individuals.tolist(), + keypoint_names=keypoints.tolist(), + fps=fps, + source_software=source_software, + ) + + +def from_animovement_file( + file_path: Path | str, + fps: float | None = None, +) -> xr.Dataset: + """Create a ``movement`` poses dataset from an animovement Parquet file. + + Parameters + ---------- + file_path : pathlib.Path or str + Path to the Parquet file containing pose tracks in tidy format. + fps : float, optional + The number of frames per second in the video. If None (default), + the ``time`` coordinates will be in frame numbers. + + Returns + ------- + xarray.Dataset + ``movement`` dataset containing the pose tracks, confidence scores, + and associated metadata. + + Examples + -------- + >>> from movement.io import load_poses + >>> ds = load_poses.from_animovement_file("path/to/file.parquet", fps=30) + + """ + file = ValidFile( + file_path, + expected_permission="r", + expected_suffix=[".parquet"], + ) + # Load Parquet file into DataFrame + df = pd.read_parquet(file.path) + # Convert to xarray Dataset + ds = from_tidy_df(df, fps=fps, source_software="animovement") + # Add metadata + ds.attrs["source_file"] = file.path.as_posix() + logger.info(f"Loaded pose tracks from {file.path}:\n{ds}") + return ds diff --git a/movement/io/save_poses.py b/movement/io/save_poses.py index e65bd481e..b5ea96131 100644 --- a/movement/io/save_poses.py +++ b/movement/io/save_poses.py @@ -232,8 +232,8 @@ def to_lp_file( Parameters ---------- ds : xarray.Dataset - ``movement`` dataset containing pose tracks, confidence scores, - and associated metadata. + ``movement`` dataset containing pose tracks, coordinates, + confidence scores, and associated metadata. file_path : pathlib.Path or str Path to the file to save the poses to. File extension must be .csv. @@ -241,7 +241,7 @@ def to_lp_file( ----- LightningPose saves pose estimation outputs as .csv files, using the same format as single-animal DeepLabCut projects. 
Therefore, under the hood, - this function calls :func:`movement.io.save_poses.to_dlc_file` + this function calls :func:`to_dlc_file` with ``split_individuals=True``. This setting means that each individual is saved to a separate file, with the individual's name appended to the file path, just before the file extension, @@ -361,6 +361,116 @@ def to_sleap_analysis_file(ds: xr.Dataset, file_path: str | Path) -> None: logger.info(f"Saved poses dataset to {file.path}.") +def to_tidy_df(ds: xr.Dataset) -> pd.DataFrame: + """Convert a ``movement`` dataset to a tidy pandas DataFrame. + + Parameters + ---------- + ds : xarray.Dataset + ``movement`` dataset containing pose tracks, confidence scores, + and associated metadata. + + Returns + ------- + pandas.DataFrame + Tidy DataFrame with columns: 'frame', 'track_id', 'keypoint', 'x', 'y', + and 'confidence' (if available in the dataset). + + Notes + ----- + The output DataFrame is in a tidy format where each row represents a + single observation (one keypoint for one individual at one frame). + The columns are: + - 'frame': integer, the frame number (time index) + - 'track_id': string, the individual ID + - 'keypoint': string, the keypoint name + - 'x': float, x-coordinate + - 'y': float, y-coordinate + - 'confidence': float, point-wise confidence scores (if present) + + Examples + -------- + >>> from movement.io import save_poses, load_poses + >>> ds = load_poses.from_sleap_file("path/to/file_sleap.analysis.h5") + >>> df = save_poses.to_tidy_df(ds) + + """ + _validate_dataset(ds) + + # Compute frame indices + fps = getattr(ds, "fps", None) + if fps is not None: + frame_idxs = np.rint(ds.time.values * fps).astype(int) + else: + frame_idxs = ds.time.values.astype(int) + + # Stack data to create tidy format + position = ( + ds["position"] + .stack(obs=["time", "individuals", "keypoints"]) + .transpose("obs", "space") + ) + + # Create frame indices for each observation + time_indices = position.indexes["obs"].get_level_values("time") + frame_values = frame_idxs[np.searchsorted(ds.time.values, time_indices)] + + # Create DataFrame + df = pd.DataFrame( + { + "frame": frame_values, + "track_id": position.individuals.values, + "keypoint": position.keypoints.values, + "x": position.sel(space="x").values, + "y": position.sel(space="y").values, + } + ) + + # Add confidence only if present and not all NaN + if "confidence" in ds and not ds["confidence"].isnull().all(): + confidence = ds["confidence"].stack( + obs=["time", "individuals", "keypoints"] + ) + df["confidence"] = confidence.values + + logger.info("Converted poses dataset to tidy DataFrame.") + return df.reset_index(drop=True) + + +def to_animovement_file(ds: xr.Dataset, file_path: str | Path) -> None: + """Save a ``movement`` dataset to an animovement Parquet file. + + Parameters + ---------- + ds : xarray.Dataset + ``movement`` dataset containing pose tracks, confidence scores, + and associated metadata. + file_path : pathlib.Path or str + Path to the file to save the poses to. File extension must be .parquet. + + Notes + ----- + The dataset is first converted to a tidy DataFrame using + :func:`to_tidy_df`, then saved as a Parquet file using pandas' + `to_parquet` method. 
+ + Examples + -------- + >>> from movement.io import save_poses, load_poses + >>> ds = load_poses.from_sleap_file("path/to/file_sleap.analysis.h5") + >>> save_poses.to_animovement_file(ds, "path/to/file.parquet") + + """ + file = _validate_file_path(file_path, expected_suffix=[".parquet"]) + _validate_dataset(ds) + + # Convert to tidy DataFrame + df = to_tidy_df(ds) + # Save to Parquet + df.to_parquet(file.path, index=False) + logger.info(f"Saved poses dataset to {file.path}.") + + def _remove_unoccupied_tracks(ds: xr.Dataset): """Remove tracks that are completely unoccupied from the dataset. diff --git a/tests/test_parquet_io.py b/tests/test_parquet_io.py new file mode 100644 index 000000000..4e867fd9c --- /dev/null +++ b/tests/test_parquet_io.py @@ -0,0 +1,219 @@ +"""Integration tests for Parquet I/O in movement.io.""" + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from movement.io import load_poses, save_poses + + +@pytest.fixture +def sample_dataset(): + """Create a sample xarray Dataset for testing.""" + position_array = np.random.rand( + 10, 2, 3, 2 + ) # 10 frames, 2D, 3 keypoints, 2 individuals + confidence_array = np.ones((10, 3, 2)) * 0.9 + return load_poses.from_numpy( + position_array=position_array, + confidence_array=confidence_array, + individual_names=["ind1", "ind2"], + keypoint_names=["nose", "tail", "spine"], + fps=30, + source_software="test", + ) + + +@pytest.fixture +def sample_tidy_df(): + """Create a sample tidy DataFrame for testing.""" + data = { + "frame": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + "track_id": ["ind1", "ind1", "ind1", "ind2", "ind2", "ind2"] * 2, + "keypoint": ["nose", "tail", "spine"] * 4, + "x": np.random.rand(12), + "y": np.random.rand(12), + "confidence": np.ones(12) * 0.9, + } + return pd.DataFrame(data) + + +def test_to_tidy_df(sample_dataset): + """Test conversion of xarray Dataset to tidy DataFrame.""" + df = save_poses.to_tidy_df(sample_dataset) + + # Check columns + expected_columns = { + "frame", + "track_id", + "keypoint", + "x", + "y", + "confidence", + } + assert set(df.columns) == expected_columns, ( + "Unexpected columns in tidy DataFrame" + ) + + # Check data types + assert df["frame"].dtype == int, "Frame column should be integer" + assert df["track_id"].dtype == object, ( + "Track_id column should be string/object" + ) + assert df["keypoint"].dtype == object, ( + "Keypoint column should be string/object" + ) + assert df["x"].dtype == float, "X column should be float" + assert df["y"].dtype == float, "Y column should be float" + assert df["confidence"].dtype == float, "Confidence column should be float" + + # Check shape + expected_rows = ( + sample_dataset.sizes["time"] + * sample_dataset.sizes["individuals"] + * sample_dataset.sizes["keypoints"] + ) + assert len(df) == expected_rows, ( + f"Expected {expected_rows} rows, got {len(df)}" + ) + + +def test_from_tidy_df(sample_tidy_df): + """Test conversion of tidy DataFrame to xarray Dataset.""" + ds = load_poses.from_tidy_df(sample_tidy_df, fps=30) + + # Check dataset structure + assert isinstance(ds, xr.Dataset), "Output should be an xarray Dataset" + assert set(ds.dims) == {"time", "space", "keypoints", "individuals"}, ( + "Unexpected dimensions" + ) + assert set(ds.data_vars) == {"position", "confidence"}, ( + "Unexpected data variables" + ) + + # Check coordinates + assert ds.sizes["time"] == 2, "Expected 2 frames" + assert ds.sizes["individuals"] == 2, "Expected 2 individuals" + assert ds.sizes["keypoints"] == 3, "Expected 3 keypoints" + 
assert ds.sizes["space"] == 2, "Expected 2D space" + + +def test_round_trip_dataframe(sample_dataset): + """Test round-trip conversion: Dataset -> tidy DataFrame -> Dataset.""" + df = save_poses.to_tidy_df(sample_dataset) + ds_roundtrip = load_poses.from_tidy_df( + df, fps=sample_dataset.attrs.get("fps") + ) + + # Compare datasets + xr.testing.assert_allclose( + ds_roundtrip["position"], sample_dataset["position"] + ) + xr.testing.assert_allclose( + ds_roundtrip["confidence"], sample_dataset["confidence"] + ) + assert ds_roundtrip.attrs["fps"] == sample_dataset.attrs["fps"], ( + "FPS metadata mismatch" + ) + assert set(ds_roundtrip.coords["individuals"].values) == set( + sample_dataset.coords["individuals"].values + ) + assert set(ds_roundtrip.coords["keypoints"].values) == set( + sample_dataset.coords["keypoints"].values + ) + + +def test_round_trip_parquet(sample_dataset, tmp_path): + """Test round-trip conversion: Dataset -> Parquet -> Dataset.""" + file_path = tmp_path / "test.parquet" + save_poses.to_animovement_file(sample_dataset, file_path) + ds_roundtrip = load_poses.from_animovement_file( + file_path, fps=sample_dataset.attrs.get("fps") + ) + + # Compare datasets + xr.testing.assert_allclose( + ds_roundtrip["position"], sample_dataset["position"] + ) + xr.testing.assert_allclose( + ds_roundtrip["confidence"], sample_dataset["confidence"] + ) + assert ds_roundtrip.attrs["fps"] == sample_dataset.attrs["fps"], ( + "FPS metadata mismatch" + ) + assert set(ds_roundtrip.coords["individuals"].values) == set( + sample_dataset.coords["individuals"].values + ) + assert set(ds_roundtrip.coords["keypoints"].values) == set( + sample_dataset.coords["keypoints"].values + ) + + +def test_to_tidy_df_no_confidence(): + """Test to_tidy_df with a dataset lacking confidence scores.""" + position_array = np.random.rand(5, 2, 2, 1) + ds = load_poses.from_numpy( + position_array=position_array, + individual_names=["ind1"], + keypoint_names=["nose", "tail"], + fps=25, + ) + df = save_poses.to_tidy_df(ds) + + # Check columns (no confidence) + expected_columns = {"frame", "track_id", "keypoint", "x", "y"} + assert set(df.columns) == expected_columns, ( + "Unexpected columns in tidy DataFrame" + ) + assert len(df) == 5 * 1 * 2, "Incorrect number of rows" + + +def test_from_tidy_df_missing_columns(sample_tidy_df): + """Test from_tidy_df with missing required columns.""" + invalid_df = sample_tidy_df.drop(columns=["x"]) + with pytest.raises( + ValueError, match="DataFrame missing required columns: {'x'}" + ): + load_poses.from_tidy_df(invalid_df) + + +def test_from_animovement_file_invalid_extension(tmp_path): + """Test from_animovement_file with incorrect file extension.""" + invalid_file = tmp_path / "test.csv" + invalid_file.write_text("dummy") + with pytest.raises( + ValueError, + match=r"Expected file with suffix\(es\) \['.parquet'\] " + r"but got suffix .csv", + ): + load_poses.from_animovement_file(invalid_file) + + +def test_to_animovement_file_invalid_extension(tmp_path, sample_dataset): + """Test to_animovement_file with incorrect file extension.""" + invalid_file = tmp_path / "test.csv" + with pytest.raises( + ValueError, + match=r"Expected file with suffix\(es\) \['.parquet'\] " + r"but got suffix .csv", + ): + save_poses.to_animovement_file(sample_dataset, invalid_file) + + +def test_empty_dataset(): + """Test handling of empty dataset.""" + empty_ds = load_poses.from_numpy( + position_array=np.empty((0, 2, 0, 0)), + confidence_array=np.empty((0, 0, 0)), + individual_names=[], + 
keypoint_names=[], + fps=30, + ) + df = save_poses.to_tidy_df(empty_ds) + assert df.empty, "Tidy DataFrame should be empty for empty dataset" + + ds_roundtrip = load_poses.from_tidy_df(df, fps=30) + assert ds_roundtrip.sizes["time"] == 0, ( + "Round-trip dataset should have zero frames" + ) From c2e396c2abcb4ebca651ee7b80a9fc8feef91d16 Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Sun, 27 Apr 2025 21:08:04 +0530 Subject: [PATCH 2/6] Fixed failing mypy errors. --- movement/io/load_poses.py | 80 ++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/movement/io/load_poses.py b/movement/io/load_poses.py index 36e6c5377..8fc3954fa 100644 --- a/movement/io/load_poses.py +++ b/movement/io/load_poses.py @@ -1,7 +1,7 @@ """Load pose tracking data from various frameworks into ``movement``.""" from pathlib import Path -from typing import Literal +from typing import Any, Literal, cast import h5py import numpy as np @@ -525,7 +525,7 @@ def _ds_from_sleap_labels_file( individual_names: list[str] = ( [track.name for track in labels.tracks] if labels.tracks - else ["individual_0"] # Fixed: Ensure list[str] + else ["individual_0"] ) if not labels.tracks: logger.warning( @@ -536,11 +536,15 @@ def _ds_from_sleap_labels_file( keypoint_names: list[str] = [kp.name for kp in labels.skeletons[0].nodes] + # Explicit type assertions for mypy + individual_names = cast(list[str], individual_names) + keypoint_names = cast(list[str], keypoint_names) + return from_numpy( position_array=tracks_with_scores[:, :-1, :, :], confidence_array=tracks_with_scores[:, -1, :, :], - individual_names=individual_names, # Already a list[str] - keypoint_names=keypoint_names, # Already a list[str] + individual_names=individual_names, + keypoint_names=keypoint_names, fps=fps, source_software="SLEAP", ) @@ -591,7 +595,7 @@ def _sleap_labels_to_numpy(labels: Labels) -> np.ndarray: n_nodes = len(skeleton.nodes) n_frames = int(last_frame - first_frame + 1) - # Explicitly type the tracks array + # Initialize tracks array with explicit type tracks: np.ndarray = np.full( (n_frames, 3, n_nodes, n_tracks), np.nan, dtype=np.float32 ) @@ -610,12 +614,18 @@ def _sleap_labels_to_numpy(labels: Labels) -> np.ndarray: # Use user-labelled instance if available if user_track_instances: inst = user_track_instances[-1] - tracks[i, 0:3, :, j] = np.hstack( - (inst.numpy(), np.full((n_nodes, 1), np.nan)) - ).T + points = inst.numpy() + for k in range(n_nodes): + tracks[i, 0, k, j] = points[k, 0] # x-coordinate + tracks[i, 1, k, j] = points[k, 1] # y-coordinate + tracks[i, 2, k, j] = np.nan # No scores for user instances elif predicted_track_instances: inst = predicted_track_instances[-1] - tracks[i, 0:3, :, j] = inst.numpy(scores=True).T + points = inst.numpy(scores=True) + for k in range(n_nodes): + tracks[i, 0, k, j] = points[k, 0] # x-coordinate + tracks[i, 1, k, j] = points[k, 1] # y-coordinate + tracks[i, 2, k, j] = points[k, 2] # confidence score return tracks @@ -747,41 +757,13 @@ def from_anipose_style_df( fps: float | None = None, individual_name: str = "individual_0", ) -> xr.Dataset: - """Create a ``movement`` poses dataset from an Anipose 3D dataframe. - - Parameters - ---------- - df : pd.DataFrame - Anipose triangulation dataframe - fps : float, optional - The number of frames per second in the video. If None (default), - the ``time`` coordinates will be in frame units. 
-    individual_name : str, optional
-        Name of the individual, by default "individual_0"
-
-    Returns
-    -------
-    xarray.Dataset
-        ``movement`` dataset containing the pose tracks, confidence scores,
-        and associated metadata.
-
-    Notes
-    -----
-    Reshape dataframe with columns keypoint1_x, keypoint1_y, keypoint1_z,
-    keypoint1_score,keypoint2_x, keypoint2_y, keypoint2_z,
-    keypoint2_score...to array of positions with dimensions
-    time, space, keypoints, individuals, and array of confidence (from scores)
-    with dimensions time, keypoints, individuals.
-
-    """
+    """Create a ``movement`` poses dataset from an Anipose 3D dataframe."""
     keypoint_names: list[str] = sorted(
         list(
             set(
-                [
-                    col.rsplit("_", 1)[0]
-                    for col in df.columns
-                    if any(col.endswith(f"_{s}") for s in ["x", "y", "z"])
-                ]
+                col.rsplit("_", 1)[0]
+                for col in df.columns
+                if any(col.endswith(f"_{s}") for s in ["x", "y", "z"])
             )
         )
     )
@@ -925,9 +907,9 @@ def from_tidy_df(
     )
 
     # Get unique values for coordinates
-    time = np.sort(df["frame"].unique())
-    individuals = df["track_id"].unique()
-    keypoints = df["keypoint"].unique()
+    time: np.ndarray[Any, np.dtype[np.int_]] = np.sort(df["frame"].unique())
+    individuals: np.ndarray[Any, np.dtype[np.str_]] = df["track_id"].unique()
+    keypoints: np.ndarray[Any, np.dtype[np.str_]] = df["keypoint"].unique()
     n_frames = len(time)
     n_individuals = len(individuals)
     n_keypoints = len(keypoints)
@@ -949,14 +931,18 @@ def from_tidy_df(
         k_idx = np.where(keypoints == row["keypoint"])[0][0]
         position_array[t_idx, 0, k_idx, i_idx] = row["x"]
         position_array[t_idx, 1, k_idx, i_idx] = row["y"]
-        if "confidence" in df.columns:
+        if confidence_array is not None and "confidence" in row:
             confidence_array[t_idx, k_idx, i_idx] = row["confidence"]
 
+    # Explicitly convert to lists to ensure mypy recognizes list[str]
+    individual_names: list[str] = list(individuals)
+    keypoint_names: list[str] = list(keypoints)
+
     return from_numpy(
         position_array=position_array,
         confidence_array=confidence_array,
-        individual_names=individuals.tolist(),
-        keypoint_names=keypoints.tolist(),
+        individual_names=individual_names,
+        keypoint_names=keypoint_names,
        fps=fps,
        source_software=source_software,
    )

From f2ee3588e56fd312b56dac9e0746b1583e1cf9d5 Mon Sep 17 00:00:00 2001
From: ShigrafS <140247389+ShigrafS@users.noreply.github.com>
Date: Sun, 27 Apr 2025 22:35:28 +0530
Subject: [PATCH 3/6] Replaced np.where with np.nonzero in load_poses.py.
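
NumPy's documentation notes that single-argument ``np.where(condition)`` is
shorthand for ``np.asarray(condition).nonzero()`` and recommends calling
``nonzero`` directly. A minimal sketch of the index-lookup pattern used in
``from_tidy_df`` (values are illustrative only):

    import numpy as np

    time = np.array([0, 1, 2])  # sorted unique frame numbers
    # np.nonzero returns a tuple holding one index array per dimension;
    # [0][0] takes the first (and only) match along the single axis.
    t_idx = np.nonzero(time == 1)[0][0]
    assert t_idx == 1
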
---
 movement/io/load_poses.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/movement/io/load_poses.py b/movement/io/load_poses.py
index 8fc3954fa..8eb83b35a 100644
--- a/movement/io/load_poses.py
+++ b/movement/io/load_poses.py
@@ -926,9 +926,9 @@ def from_tidy_df(
 
     # Pivot data to fill arrays
     for _idx, row in df.iterrows():
-        t_idx = np.where(time == row["frame"])[0][0]
-        i_idx = np.where(individuals == row["track_id"])[0][0]
-        k_idx = np.where(keypoints == row["keypoint"])[0][0]
+        t_idx = np.nonzero(time == row["frame"])[0][0]
+        i_idx = np.nonzero(individuals == row["track_id"])[0][0]
+        k_idx = np.nonzero(keypoints == row["keypoint"])[0][0]
         position_array[t_idx, 0, k_idx, i_idx] = row["x"]
         position_array[t_idx, 1, k_idx, i_idx] = row["y"]
         if confidence_array is not None and "confidence" in row:

From 4513df60eac5efe45de9925e5b4a3d5f6b2609d1 Mon Sep 17 00:00:00 2001
From: ShigrafS <140247389+ShigrafS@users.noreply.github.com>
Date: Sun, 27 Apr 2025 22:41:17 +0530
Subject: [PATCH 4/6] Replaced np.random.rand with numpy.random.default_rng in
 test_parquet_io.py

---
 tests/test_parquet_io.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_parquet_io.py b/tests/test_parquet_io.py
index 4e867fd9c..34eb91a41 100644
--- a/tests/test_parquet_io.py
+++ b/tests/test_parquet_io.py
@@ -11,7 +11,7 @@
 @pytest.fixture
 def sample_dataset():
     """Create a sample xarray Dataset for testing."""
-    position_array = np.random.rand(
+    position_array = np.random.default_rng(
         10, 2, 3, 2
     )  # 10 frames, 2D, 3 keypoints, 2 individuals
     confidence_array = np.ones((10, 3, 2)) * 0.9
@@ -32,8 +32,8 @@ def sample_tidy_df():
         "frame": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         "track_id": ["ind1", "ind1", "ind1", "ind2", "ind2", "ind2"] * 2,
         "keypoint": ["nose", "tail", "spine"] * 4,
-        "x": np.random.rand(12),
-        "y": np.random.rand(12),
+        "x": np.random.default_rng(12),
+        "y": np.random.default_rng(12),
         "confidence": np.ones(12) * 0.9,
     }
     return pd.DataFrame(data)
@@ -152,7 +152,7 @@ def test_round_trip_parquet(sample_dataset, tmp_path):
 
 def test_to_tidy_df_no_confidence():
     """Test to_tidy_df with a dataset lacking confidence scores."""
-    position_array = np.random.rand(5, 2, 2, 1)
+    position_array = np.random.default_rng(5, 2, 2, 1)
     ds = load_poses.from_numpy(
         position_array=position_array,
         individual_names=["ind1"],

From b3558e640147a94c0c70a8905032ecd193906b41 Mon Sep 17 00:00:00 2001
From: ShigrafS
Date: Sun, 27 Apr 2025 23:08:03 +0530
Subject: [PATCH 5/6] Updated pyproject.toml to include pyarrow. Fixed
 incorrect numpy.random.default_rng usage from the previous commit.
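
The previous commit passed array shapes directly to
``np.random.default_rng``, whose positional argument is a seed, not a
shape. The tests now seed a ``Generator`` once and draw samples from it.
A minimal sketch of the corrected pattern:

    import numpy as np

    rng = np.random.default_rng(seed=10)  # argument is the seed
    position_array = rng.random((10, 2, 3, 2))  # shape goes to .random()
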
--- pyproject.toml | 1 + tests/test_parquet_io.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1692131e7..b91c30e39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "numpy", "pandas", "h5py", + "pyarrow", "attrs", "pooch", "tqdm", diff --git a/tests/test_parquet_io.py b/tests/test_parquet_io.py index 34eb91a41..0bee9c3c3 100644 --- a/tests/test_parquet_io.py +++ b/tests/test_parquet_io.py @@ -11,8 +11,9 @@ @pytest.fixture def sample_dataset(): """Create a sample xarray Dataset for testing.""" - position_array = np.random.default_rng( - 10, 2, 3, 2 + rng = np.random.default_rng(seed=10) + position_array = rng.random( + (10, 2, 3, 2) ) # 10 frames, 2D, 3 keypoints, 2 individuals confidence_array = np.ones((10, 3, 2)) * 0.9 return load_poses.from_numpy( @@ -28,12 +29,13 @@ def sample_dataset(): @pytest.fixture def sample_tidy_df(): """Create a sample tidy DataFrame for testing.""" + rng = np.random.default_rng(seed=12) data = { "frame": [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], "track_id": ["ind1", "ind1", "ind1", "ind2", "ind2", "ind2"] * 2, "keypoint": ["nose", "tail", "spine"] * 4, - "x": np.random.default_rng(12), - "y": np.random.default_rng(12), + "x": rng.random(12), + "y": rng.random(12), "confidence": np.ones(12) * 0.9, } return pd.DataFrame(data) @@ -152,7 +154,8 @@ def test_round_trip_parquet(sample_dataset, tmp_path): def test_to_tidy_df_no_confidence(): """Test to_tidy_df with a dataset lacking confidence scores.""" - position_array = np.random.default_rng(5, 2, 2, 1) + rng = np.random.default_rng(seed=5) + position_array = rng.random((5, 2, 2, 1)) ds = load_poses.from_numpy( position_array=position_array, individual_names=["ind1"], From a8c6c6c9835c9a469451cdd8e379e55a51f44349 Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Mon, 28 Apr 2025 17:17:18 +0530 Subject: [PATCH 6/6] Added tests to increase coverage to 100%. 
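
The added tests cover the frame-index branches of ``to_tidy_df``: with
``fps`` set, float time coordinates are mapped back to integer frame
numbers by rounding ``time * fps``; with ``fps=None``, integer time values
are used as frame numbers directly. A sketch of the mapping asserted by
``test_to_tidy_df_float_time``:

    import numpy as np

    fps = 10
    time = np.array([1.5, 2.5, 3.5, 4.5, 5.5])  # time in seconds
    frames = np.rint(time * fps).astype(int)  # what to_tidy_df computes
    assert frames.tolist() == [15, 25, 35, 45, 55]
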
--- tests/test_parquet_io.py | 112 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/tests/test_parquet_io.py b/tests/test_parquet_io.py index 0bee9c3c3..003105a7f 100644 --- a/tests/test_parquet_io.py +++ b/tests/test_parquet_io.py @@ -80,6 +80,16 @@ def test_to_tidy_df(sample_dataset): f"Expected {expected_rows} rows, got {len(df)}" ) + # Verify frame indices are integers starting from 0 + expected_frames = np.repeat( + np.arange(sample_dataset.sizes["time"]), + sample_dataset.sizes["individuals"] + * sample_dataset.sizes["keypoints"], + ) + assert np.array_equal(df["frame"].values, expected_frames), ( + "Frame indices should be integers starting from 0" + ) + def test_from_tidy_df(sample_tidy_df): """Test conversion of tidy DataFrame to xarray Dataset.""" @@ -220,3 +230,105 @@ def test_empty_dataset(): assert ds_roundtrip.sizes["time"] == 0, ( "Round-trip dataset should have zero frames" ) + + +def test_from_file_animovement(sample_dataset, tmp_path): + """Test from_file with source_software='animovement'.""" + file_path = tmp_path / "test.parquet" + save_poses.to_animovement_file(sample_dataset, file_path) + ds = load_poses.from_file( + file_path, + source_software="animovement", + fps=sample_dataset.attrs.get("fps"), + ) + + # Verify the loaded dataset + xr.testing.assert_allclose(ds["position"], sample_dataset["position"]) + xr.testing.assert_allclose(ds["confidence"], sample_dataset["confidence"]) + assert ds.attrs["fps"] == sample_dataset.attrs["fps"], ( + "FPS metadata mismatch" + ) + assert set(ds.coords["individuals"].values) == set( + sample_dataset.coords["individuals"].values + ) + assert set(ds.coords["keypoints"].values) == set( + sample_dataset.coords["keypoints"].values + ) + + +def test_to_tidy_df_float_time(): + """Test to_tidy_df with non-integer float time coordinates.""" + rng = np.random.default_rng(seed=15) + position_array = rng.random( + (5, 2, 2, 1) + ) # 5 frames, 2D, 2 keypoints, 1 individual + ds = load_poses.from_numpy( + position_array=position_array, + confidence_array=np.ones((5, 2, 1)) * 0.8, + individual_names=["ind1"], + keypoint_names=["nose", "tail"], + fps=10, + source_software="test", + ) + # Explicitly set time coordinates to non-integer floats + ds = ds.assign_coords(time=np.array([1.5, 2.5, 3.5, 4.5, 5.5])) + df = save_poses.to_tidy_df(ds) + + # Check columns + expected_columns = { + "frame", + "track_id", + "keypoint", + "x", + "y", + "confidence", + } + assert set(df.columns) == expected_columns, ( + "Unexpected columns in tidy DataFrame" + ) + assert df["frame"].dtype == int, "Frame column should be integer" + assert len(df) == 5 * 1 * 2, "Incorrect number of rows" + # Verify frame indices match scaled time values + expected_frames = np.repeat(np.array([15, 25, 35, 45, 55]), 2) + assert np.array_equal(df["frame"].values, expected_frames), ( + "Frame indices should match scaled time values" + ) + + +def test_to_tidy_df_no_fps(): + """Test to_tidy_df with fps=None and integer time coordinates.""" + rng = np.random.default_rng(seed=15) + position_array = rng.random( + (5, 2, 2, 1) + ) # 5 frames, 2D, 2 keypoints, 1 individual + ds = load_poses.from_numpy( + position_array=position_array, + confidence_array=np.ones((5, 2, 1)) * 0.8, + individual_names=["ind1"], + keypoint_names=["nose", "tail"], + fps=None, + source_software="test", + ) + # Explicitly set time coordinates to integers + ds = ds.assign_coords(time=np.array([0, 1, 2, 3, 4])) + df = save_poses.to_tidy_df(ds) + + # Check columns + expected_columns 
= { + "frame", + "track_id", + "keypoint", + "x", + "y", + "confidence", + } + assert set(df.columns) == expected_columns, ( + "Unexpected columns in tidy DataFrame" + ) + assert df["frame"].dtype == int, "Frame column should be integer" + assert len(df) == 5 * 1 * 2, "Incorrect number of rows" + # Verify frame indices match time values + expected_frames = np.repeat(np.array([0, 1, 2, 3, 4]), 2) + assert np.array_equal(df["frame"].values, expected_frames), ( + "Frame indices should match time values" + )
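
Taken together, the series adds a tidy-Parquet round trip to
``movement.io``. A minimal usage sketch, assuming this branch is installed
(the file path is illustrative):

    import numpy as np
    from movement.io import load_poses, save_poses

    rng = np.random.default_rng(seed=0)
    ds = load_poses.from_numpy(
        position_array=rng.random((10, 2, 3, 2)),  # time, space, keypoints, individuals
        confidence_array=np.full((10, 3, 2), 0.9),
        individual_names=["ind1", "ind2"],
        keypoint_names=["nose", "tail", "spine"],
        fps=30,
    )
    save_poses.to_animovement_file(ds, "poses.parquet")  # tidy Parquet on disk
    ds_back = load_poses.from_animovement_file("poses.parquet", fps=30)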