Improve signature calculation efficiency (#27)

niklasmueboe · web-flow · commit 372b89b492a4 · 2025-02-12T23:05:33.000+01:00
* more efficient but less flexible signature calculation (always uses the mean; `agg_method` is removed)

* don't sort when grouping

* improve typing in signatures

* don't normalize signatures (they are no longer divided by their per-cell-type sum)
diff --git a/sainsc/utils/_signatures.py b/sainsc/utils/_signatures.py
@@ -1,21 +1,20 @@
-from collections.abc import Callable
+from collections.abc import Hashable
 
 import anndata as ad
+import numpy as np
 import pandas as pd
+from numpy.typing import DTypeLike
 
 
 def celltype_signatures(
     adata: ad.AnnData,
     *,
     celltype_col: str = "leiden",
     layer: str | None = None,
-    agg_method: str | Callable = "mean",
+    dtype: DTypeLike = np.float32,
 ) -> pd.DataFrame:
     """
-    Calculate gene expression signatures per 'celltype'.
-
-    Note, that this will make a dense copy of `adata.X` or the selected `layer`,
-    therefore potentially leading to large memory usage.
+    Calculate gene expression signatures per 'cell type'.
 
     Parameters
     ----------
@@ -24,25 +23,24 @@ def celltype_signatures(
         Name of column in :py:attr:`anndata.AnnData.obs` containing cell-type
         information.
     layer : str, optional
-        Which layer to use for aggregation. If `None`, `adata.X` is used.
-    agg_method : str or collections.abc.Callable, optional
-        Function to aggregate gene expression per cluster used by
-        :py:meth:`pandas.DataFrame.agg`.
+        Which :py:attr:`anndata.AnnData.layers` to use for aggregation. If `None`,
+        :py:attr:`anndata.AnnData.X` is used.
+    dtype : numpy.typing.DTypeLike, optional
+        Data type to use for the signatures.
 
     Returns
     -------
     pandas.DataFrame
-        :py:class:`pandas.DataFrame` of gene expression aggregated per 'celltype'.
+        :py:class:`pandas.DataFrame` of gene expression aggregated per 'cell type'.
     """
-    signatures = (
-        adata.to_df(layer=layer)
-        .merge(adata.obs[celltype_col], left_index=True, right_index=True)
-        .groupby(celltype_col, observed=True, sort=False)
-        .agg(agg_method)
-        .transpose()
-        .rename_axis(adata.var_names.name)
-    )
+    X = adata.X if layer is None else adata.layers[layer]
+    grouping = adata.obs.groupby(celltype_col, observed=True, sort=False).indices
 
-    signatures /= signatures.sum(axis=0)
+    signatures: dict[Hashable, np.ndarray] = {}
+    for name, indices in grouping.items():
+        mean_X_group = X[indices].mean(axis=0, dtype=dtype)
+        signatures[name] = (
+            mean_X_group.A1 if isinstance(mean_X_group, np.matrix) else mean_X_group
+        )
 
-    return signatures
+    return pd.DataFrame(signatures, index=adata.var_names)