@@ -73,7 +73,8 @@ def chatterjee_xi(x: np.ndarray, y: np.ndarray) -> float:
73
73
underlying distributions of the variable.
74
74
75
75
It ranges from 0 (variables are completely independent) to 1
76
- (one is a measurable function of the other).
76
+ (one is a measurable function of the other). But a lot of the times the maximum
77
+ value of the coefficient is lower than 1.
77
78
78
79
This implementation does not break ties at random, instead
79
80
it break ties depending on order. This makes it dependent on
@@ -212,10 +213,10 @@ def concordance_rate(x: np.ndarray, y: np.ndarray) -> float:
212
213
sem_y = np .std (y , ddof = 0 ) / n ** 0.5
213
214
return float (
214
215
(
215
- np .sum ((x > mean_x + sem_x ) & (y > mean_y + sem_y ))
216
- + np .sum ((x < mean_x - sem_x ) & (y > mean_y + sem_y ))
217
- - np .sum ((x < mean_x - sem_x ) & (y < mean_y - sem_y ))
218
- - np .sum ((x > mean_x + sem_x ) & (y < mean_y - sem_y ))
216
+ np .sum ((x >= mean_x + sem_x ) & (y >= mean_y + sem_y ))
217
+ - np .sum ((x <= mean_x - sem_x ) & (y >= mean_y + sem_y ))
218
+ + np .sum ((x <= mean_x - sem_x ) & (y <= mean_y - sem_y ))
219
+ - np .sum ((x >= mean_x + sem_x ) & (y <= mean_y - sem_y ))
219
220
)
220
221
/ n
221
222
)
@@ -228,7 +229,8 @@ def symmetric_chatterjee_xi(x: np.ndarray, y: np.ndarray) -> float:
228
229
underlying distributions of the variable.
229
230
230
231
It ranges from 0 (variables are completely independent) to 1
231
- (one is a measurable function of the other).
232
+ (one is a measurable function of the other). But a lot of the times the maximum
233
+ value of the coefficient is lower than 1.
232
234
233
235
This implementation does not break ties at random, instead
234
236
it break ties depending on order. This makes it dependent on
@@ -311,28 +313,27 @@ def zhang_i(x: np.ndarray, y: np.ndarray) -> float:
311
313
312
314
References
313
315
----------
314
- Zhang, Q. (2023).
315
- On relationships between Chatterjee's and Spearman's correlation coefficients.
316
- arXiv preprint arXiv:2302.10131.
317
-
318
- Notes
319
- -----
320
- This measure is assymetric: (x, y) != (y, x).
316
+ Zhang, Q. (2025).
317
+ On the extensions of the Chatterjee-Spearman test.
318
+ Journal of Nonparametric Statistics, 1-30.
321
319
322
320
See Also
323
321
--------
324
322
scipy.stats.spearmanr - Spearman R coefficient.
325
- obscure_stats.associaton.chatterjee_xi - Chatterjee Xi coefficient.
323
+ obscure_stats.associaton.symmetric_chatterjee_xi - Chatterjee Xi coefficient.
326
324
"""
327
325
if _check_arrays (x , y ):
328
326
return np .nan
329
327
x , y = _prep_arrays (x , y )
330
328
if _check_arrays (x , y ):
331
329
return np .nan
332
330
return float (
333
- max (
334
- abs (stats .spearmanr (x , y , nan_policy = "omit" )[0 ]),
335
- 2.5 ** 0.5 * chatterjee_xi (x , y ),
331
+ min (
332
+ 1.0 ,
333
+ max (
334
+ abs (stats .spearmanr (x , y , nan_policy = "omit" )[0 ]),
335
+ 2.5 ** 0.5 * symmetric_chatterjee_xi (x , y ),
336
+ ),
336
337
)
337
338
)
338
339
@@ -596,13 +597,11 @@ def tukey_correlation(x: np.ndarray, y: np.ndarray) -> float:
596
597
s_y = gini_mean_difference (y )
597
598
x_norm = x / s_x
598
599
y_norm = y / s_y
599
- return float (
600
- 0.25
601
- * (
602
- gini_mean_difference (x_norm + y_norm ) ** 2
603
- - gini_mean_difference (x_norm - y_norm ) ** 2
604
- )
600
+ coef = 0.25 * (
601
+ gini_mean_difference (x_norm + y_norm ) ** 2
602
+ - gini_mean_difference (x_norm - y_norm ) ** 2
605
603
)
604
+ return float (max (min (coef , 1.0 ), - 1.0 ))
606
605
607
606
608
607
def gaussain_rank_correlation (x : np .ndarray , y : np .ndarray ) -> float :
@@ -640,10 +639,10 @@ def gaussain_rank_correlation(x: np.ndarray, y: np.ndarray) -> float:
640
639
norm_factor = 1 / (n + 1 )
641
640
x_ranks_norm = (np .argsort (x ) + 1 ) * norm_factor
642
641
y_ranks_norm = (np .argsort (y ) + 1 ) * norm_factor
643
- return float (
644
- np .sum (stats .norm .ppf (x_ranks_norm ) * stats .norm .ppf (y_ranks_norm ))
645
- / np .sum (stats .norm .ppf (np .arange (1 , n + 1 ) * norm_factor ) ** 2 )
642
+ coef = np .sum (stats .norm .ppf (x_ranks_norm ) * stats .norm .ppf (y_ranks_norm )) / np .sum (
643
+ stats .norm .ppf (np .arange (1 , n + 1 ) * norm_factor ) ** 2
646
644
)
645
+ return float ((coef - 0.5 ) * 2 )
647
646
648
647
649
648
def quantile_correlation (x : np .ndarray , y : np .ndarray , q : float = 0.5 ) -> float :
@@ -689,3 +688,82 @@ def quantile_correlation(x: np.ndarray, y: np.ndarray, q: float = 0.5) -> float:
689
688
np .mean ((q - (y < np .quantile (y , q = q ))) * (x - np .mean (x )))
690
689
/ (((q - q ** 2 ) * np .var (x )) ** 0.5 )
691
690
)
691
+
692
+
693
+ def normalized_chatterjee_xi (x : np .ndarray , y : np .ndarray ) -> float :
694
+ """Calculate normalizd Xi correlation coefficient.
695
+
696
+ Another variation of rank correlation which does not make any assumptions about
697
+ underlying distributions of the variable.
698
+
699
+ It ranges from 0 (variables are completely independent) to 1
700
+ (one is a measurable function of the other). This variant normalizes Chatterjee Xi,
701
+ so it's maximum will always be 1.0.
702
+
703
+ This implementation does not break ties at random, instead
704
+ it break ties depending on order. This makes it dependent on
705
+ data sorting, which could be useful in application like time
706
+ series.
707
+
708
+ The arrays will be flatten before any calculations.
709
+
710
+ Parameters
711
+ ----------
712
+ x : array_like
713
+ Input array.
714
+ y : array_like
715
+ Input array.
716
+
717
+ Returns
718
+ -------
719
+ nxi : float.
720
+ The value of the normalized xi correlation coefficient.
721
+
722
+ References
723
+ ----------
724
+ Dalitz, C.; Arning, J.; Goebbels, S. (2024).
725
+ A Simple Bias Reduction for Chatterjee's Correlation.
726
+ J Stat Theory Pract 18, 51.
727
+
728
+ Notes
729
+ -----
730
+ This measure is assymetric: (x, y) != (y, x).
731
+ """
732
+ if _check_arrays (x , y ):
733
+ return np .nan
734
+ x , y = _prep_arrays (x , y )
735
+ if _check_arrays (x , y ):
736
+ return np .nan
737
+ n = len (x )
738
+ # y ~ f(x)
739
+ y_forward_ordered = y [np .argsort (x )]
740
+ _ , y_unique_indexes , y_counts = np .unique (
741
+ y_forward_ordered , return_inverse = True , return_counts = True
742
+ )
743
+ right_xy = np .cumsum (y_counts )[y_unique_indexes ]
744
+ left_xy = np .cumsum (y_counts [::- 1 ])[len (y_counts ) - y_unique_indexes - 1 ]
745
+ # y ~ f(y)
746
+ y_ordered = y [np .argsort (y )]
747
+ _ , y_unique_indexes , y_counts = np .unique (
748
+ y_ordered , return_inverse = True , return_counts = True
749
+ )
750
+ right_yy = np .cumsum (y_counts )[y_unique_indexes ]
751
+ left_yy = np .cumsum (y_counts [::- 1 ])[len (y_counts ) - y_unique_indexes - 1 ]
752
+ # divide one by another
753
+ return float (
754
+ max (
755
+ - 1 ,
756
+ (
757
+ 1
758
+ - 0.5
759
+ * np .sum (np .abs (np .diff (right_xy )))
760
+ / np .mean (left_xy * (n - left_xy ))
761
+ )
762
+ / (
763
+ 1
764
+ - 0.5
765
+ * np .sum (np .abs (np .diff (right_yy )))
766
+ / np .mean (left_yy * (n - left_yy )),
767
+ ),
768
+ )
769
+ )
0 commit comments