Added numpy random generator

nglm · nglm · commit 9b59a9019d1f · 2024-10-14T18:53:05.000+02:00
diff --git a/docs/releases/PyCVI-0.1.5.md b/docs/releases/PyCVI-0.1.5.md
@@ -0,0 +1,25 @@
+# PyCVI 0.1.5 release notes
+
+YYYY/MM/DD
+
+_summary-XXX_
+
+## Python versions
+
+This version supports Python versions 3.8 to 3.11.
+
+## New Features
+
+- `pycvi.compute_scores.compute_all_scores`, the CVIs, and their corresponding functions can now use a specific numpy [Random Generator](https://numpy.org/doc/stable/reference/random/generator.html) for reproducibility purposes.
+
+## Changes
+
+- `pycvi.cvi.select` now raises a `SelectionError` if no clustering could be selected (previously, `None` was returned).
+
+## Fixes
+
+- Fix the Davies-Bouldin index so that the distance between a centroid and itself is ignored when computing the index.
+
+## Contributors
+
+- Natacha Galmiche (@nglm)
diff --git a/pycvi/cluster.py b/pycvi/cluster.py
@@ -81,6 +81,7 @@ def generate_uniform(
     data: np.ndarray,
     zero_type: str = "bounds",
     N_zero: int = 10,
+    rng = np.random.default_rng(611),
 ) -> List[np.ndarray]:
     """
     Generate `N_zero` samples from a uniform distribution based on data.
@@ -104,6 +105,9 @@ def generate_uniform(
 
     N_zero : int, optional
         Number of uniform distributions sampled, by default 10
+    rng : A numpy Random Generator, optional
+        The numpy random generator to use to sample from the uniform
+        distribution, by default np.random.default_rng(611)
 
     Returns
     -------
@@ -135,7 +139,7 @@ def generate_uniform(
     # Generate N_zero samples from a uniform distribution with shape
     # the same shape as data
     l_data0 = [
-        np.random.uniform(low=mins, high=maxs, size=data.shape)
+        rng.uniform(low=mins, high=maxs, size=data.shape)
         for _ in range(N_zero)
     ]
     return l_data0
diff --git a/pycvi/compute_scores.py b/pycvi/compute_scores.py
@@ -368,6 +368,7 @@ def compute_all_scores(
     time_window: int = None,
     N_zero: int = 10,
     zero_type: str = "bounds",
+    rng = np.random.default_rng(611),
     cvi_kwargs: dict = {},
     return_list: bool = False,
 ) -> Union[List[List[Dict[int, float]]], List[Dict[int, float]], Dict[int, float]]:
@@ -426,6 +427,9 @@ def compute_all_scores(
           has the same variance and mean as the original data.
         - `"bounds"`: the uniform distribution is defined such that it
           has the same bounds as the original data.
+    rng : A numpy Random Generator, optional
+        The numpy random generator to use to sample from the uniform
+        distribution, by default np.random.default_rng(611)
     cvi_kwargs : dict, optional
         Specific kwargs to give to the CVI, by default {}
     return_list: bool, optional
@@ -455,7 +459,9 @@ def compute_all_scores(
     # --------------------------------------------------------------
 
     data_copy = set_data_shape(data)
-    l_data0 = generate_uniform(data_copy, zero_type=zero_type, N_zero=N_zero)
+    l_data0 = generate_uniform(
+        data_copy, zero_type=zero_type, N_zero=N_zero, rng=rng
+    )
     (N, T, d) = data_copy.shape
     if scaler is not None:
         scaler.fit(data_copy.reshape(N*T, d))
diff --git a/pycvi/cvi.py b/pycvi/cvi.py
@@ -114,6 +114,9 @@ class CVI():
         example used in the Hartigan index where we don't use
         :math:`k=0` as a reference score if :math:`k=0` is more
         relevant than :math:`k=1`, by default False
+    rng : A numpy Random Generator, optional
+        The numpy random generator to use when sampling from random
+        distributions, by default np.random.default_rng(611)
 
     Raises
     ------
@@ -134,6 +137,7 @@ def __init__(
         criterion_function: callable = None,
         k_condition: callable = None,
         ignore0: bool = False,
+        rng = np.random.default_rng(611),
     ) -> None:
         self.function = cvi_function
         self.criterion_function = criterion_function
@@ -150,6 +154,7 @@ def __init__(
         self.ignore0 = ignore0
         self.N = None
         self.d = None
+        self.rng = rng
 
     def __call__(
         self,
@@ -895,6 +900,7 @@ def get_cvi_kwargs(
         """
         cvi_kw = {}
         cvi_kw["k"] = n_clusters
+        cvi_kw["rng"] = self.rng
         if n_clusters < len(X_clus):
             cvi_kw["clusters_next"] = clusterings_t.get(n_clusters+1, None)
         if n_clusters == 0:
@@ -996,6 +1002,7 @@ def get_cvi_kwargs(
         """
         cvi_kw = {}
         cvi_kw["k"] = n_clusters
+        cvi_kw["rng"] = self.rng
         if n_clusters == 0:
             cvi_kw["X1"] = X_clus
         if "zero_type" not in cvi_kw or cvi_kw["zero_type"] == None:
@@ -1127,6 +1134,7 @@ def get_cvi_kwargs(
         """
         cvi_kw = {"B" : 10}
         cvi_kw["k"] = n_clusters
+        cvi_kw["rng"] = self.rng
         if self.cvi_type == "original":
             cvi_kw["return_s"] = True
         cvi_kw.update(cvi_kwargs)
diff --git a/pycvi/cvi_func.py b/pycvi/cvi_func.py
@@ -213,6 +213,7 @@ def gap_statistic(
     k: int = None,
     B: int = 10,
     zero_type: str = "variance",
+    rng = np.random.default_rng(611),
     return_s: bool = False,
 ) -> Union[float, Tuple[float, float]]:
     """
@@ -236,6 +237,9 @@ def gap_statistic(
         has the same bounds as the original data.
 
     :type zero_type: str, optional
+    :param rng: The numpy random generator to use to sample from the
+      uniform distribution, by default np.random.default_rng(611)
+    :type rng: A numpy Random Generator, optional
     :param return_s: Should s be returned as well?
     :type return_s: bool, optional
     :return: The gap statistics
@@ -249,7 +253,8 @@ def gap_statistic(
 
         # Generate B random datasets with the same shape as the input data
         # and the same parameters
-        random_datasets = generate_uniform(X, zero_type=zero_type, N_zero=B)
+        random_datasets = generate_uniform(
+            X, zero_type=zero_type, N_zero=B, rng=rng)
 
         # Compute the log of the within-cluster dispersion for each random dataset
         wcss_rand = []
@@ -327,6 +332,7 @@ def hartigan(
     k:int = None,
     clusters_next: List[List[int]] = None,
     X1: np.ndarray = None,
+    rng = np.random.default_rng(611),
 ) -> float:
     """
     Compute the Hartigan index for a given clustering.
@@ -343,6 +349,9 @@ def hartigan(
         then the values of all datapoints when sampled from a uniform
         distribution.
     :type X1: np.ndarray, shape: (N, d*w_t) or (N, w_t, d)
+    :param rng: The numpy random generator to use to sample from the
+        uniform distribution, by default np.random.default_rng(611)
+    :type rng: A numpy Random Generator, optional
     :return: The Hartigan index
     :rtype: float
     """
@@ -359,7 +368,7 @@ def hartigan(
     elif k == 0:
         # X0 shape: (N, d*w_t) or (N, w_t, d)
         if X1 is None:
-            l_X0 = generate_uniform(X, zero_type="bounds", N_zero=1)
+            l_X0 = generate_uniform(X, zero_type="bounds", N_zero=1, rng=rng)
             X1 = X
         else:
             l_X0 = [X]
@@ -442,6 +451,7 @@ def CH(
     k: int = None,
     X1: np.ndarray = None,
     zero_type: str = "variance",
+    rng = np.random.default_rng(611),
     dist_kwargs: dict = {},
 ) -> float:
     """
@@ -467,6 +477,9 @@ def CH(
         has the same bounds as the original data.
 
     :type zero_type: str, optional
+    :param rng: The numpy random generator to use to sample from the
+        uniform distribution, by default np.random.default_rng(611)
+    :type rng: A numpy Random Generator, optional
     :param dist_kwargs: kwargs for the distance function, defaults to {}
     :type dist_kwargs: dict, optional
     :return: The CH index
@@ -484,7 +497,7 @@ def CH(
 
         # X0 shape: (N, d*w_t) or (N, w_t, d)
         if X1 is None:
-            X0 = generate_uniform(X, zero_type=zero_type, N_zero=1)[0]
+            X0 = generate_uniform(X, zero_type=zero_type, N_zero=1, rng=rng)[0]
             X1 = X
         else:
             X0 = X