Skip to content

Commit ef4c308

Browse files
bevhannoBever
andauthored
Adding chi-square distance method for categorical variables (#444)
* add chi-square method for categorical distance tests * add chi-square method for categorical distance tests * update comments * refactor CategoricalDistanceMethod trait, add constants * add documentation for constants * add alpha parameter, tests and comments Co-authored-by: Bever <[email protected]> Co-authored-by: bevhanno <[email protected]>
1 parent 38bac18 commit ef4c308

File tree

3 files changed

+333
-25
lines changed

3 files changed

+333
-25
lines changed

pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,12 @@
9494
<version>${spark.version}</version>
9595
</dependency>
9696

97+
<dependency>
98+
<groupId>org.apache.spark</groupId>
99+
<artifactId>spark-mllib_${scala.major.version}</artifactId>
100+
<version>${spark.version}</version>
101+
</dependency>
102+
97103
<dependency>
98104
<groupId>org.scalanlp</groupId>
99105
<artifactId>breeze_${scala.major.version}</artifactId>

src/main/scala/com/amazon/deequ/analyzers/Distance.scala

Lines changed: 238 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,47 @@
1515
*/
1616

1717
package com.amazon.deequ.analyzers
18+
import org.apache.spark.SparkContext
19+
import org.apache.spark.mllib.linalg._
20+
import org.apache.spark.mllib.regression.LabeledPoint
21+
import org.apache.spark.mllib.stat.Statistics
22+
import org.apache.spark.mllib.stat.Statistics._
23+
import org.apache.spark.mllib.stat.test.ChiSqTestResult
24+
25+
26+
27+
1828

1929
object Distance {
2030

21-
/** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
31+
// Chi-square constants
32+
// at least two distinct categories are required to run the chi-square test for a categorical variable
33+
private val chisquareMinDimension: Int = 2
34+
35+
//for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
36+
private val defaultAbsThresholdYates: Integer = 5
37+
private val defaultPercThresholdYates: Double = 0.2
38+
39+
// for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
40+
private val defaultAbsThresholdCochran: Integer = 10
41+
42+
// Default c(alpha) value corresponding to an alpha value of 0.003, Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms), 3rd Edition, Addison Wesley, Reading Mass, 1998.
43+
private val defaultCAlpha : Double = 1.8
44+
45+
trait CategoricalDistanceMethod
46+
case class LInfinityMethod(alpha: Option[Double] = None) extends CategoricalDistanceMethod
47+
case class ChisquareMethod(
48+
absThresholdYates: Integer = defaultAbsThresholdYates,
49+
percThresholdYates: Double = defaultPercThresholdYates,
50+
absThresholdCochran: Integer = defaultAbsThresholdCochran)
51+
extends CategoricalDistanceMethod
52+
53+
/** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
2254
def numericalDistance(
2355
sample1: QuantileNonSample[Double],
2456
sample2: QuantileNonSample[Double],
25-
correctForLowNumberOfSamples: Boolean = false)
57+
correctForLowNumberOfSamples: Boolean = false,
58+
alpha: Option[Double] = None)
2659
: Double = {
2760
val rankMap1 = sample1.getRankMap()
2861
val rankMap2 = sample2.getRankMap()
@@ -37,50 +70,230 @@ object Distance {
3770
val cdfDiff = Math.abs(cdf1 - cdf2)
3871
linfSimple = Math.max(linfSimple, cdfDiff)
3972
}
40-
selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples)
73+
selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples, alpha)
4174
}
4275

43-
/** Calculate distance of categorical profiles based on L-Infinity Distance */
44-
def categoricalDistance(
45-
sample1: scala.collection.mutable.Map[String, Long],
46-
sample2: scala.collection.mutable.Map[String, Long],
47-
correctForLowNumberOfSamples: Boolean = false)
48-
: Double = {
76+
/** Calculate distance of categorical profiles based on different distance methods
77+
*
78+
* Thresholds for chi-square method:
79+
* - for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
80+
* - for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
81+
*
82+
* @param sample1 the mapping between categories(keys) and counts(values) of the observed sample
83+
* @param sample2 the mapping between categories(keys) and counts(values) of the expected baseline
84+
* @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
85+
* @param method Method to use: LInfinity or Chisquare
86+
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
87+
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
88+
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
89+
* @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
90+
*/
91+
def categoricalDistance(
92+
sample1: scala.collection.mutable.Map[String, Long],
93+
sample2: scala.collection.mutable.Map[String, Long],
94+
correctForLowNumberOfSamples: Boolean = false,
95+
method: CategoricalDistanceMethod = LInfinityMethod())
96+
: Double = {
97+
method match {
98+
case LInfinityMethod(alpha) => categoricalLInfinityDistance(sample1, sample2, correctForLowNumberOfSamples, alpha)
99+
case ChisquareMethod(absThresholdYates, percThresholdYates, absThresholdCochran)
100+
=> categoricalChiSquareTest(
101+
sample1,
102+
sample2,
103+
correctForLowNumberOfSamples,
104+
absThresholdYates,
105+
percThresholdYates,
106+
absThresholdCochran )
107+
}
108+
}
109+
110+
/** Calculate distance of categorical profiles based on Chisquare test or stats
111+
*
112+
* for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
113+
* for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
114+
*
115+
* @param sample the mapping between categories(keys) and counts(values) of the observed sample
116+
* @param expected the mapping between categories(keys) and counts(values) of the expected baseline
117+
* @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
118+
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
119+
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
120+
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
121+
* @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
122+
*
123+
*/
124+
private[this] def categoricalChiSquareTest(
125+
sample: scala.collection.mutable.Map[String, Long],
126+
expected: scala.collection.mutable.Map[String, Long],
127+
correctForLowNumberOfSamples: Boolean = false,
128+
absThresholdYates : Integer = defaultAbsThresholdYates ,
129+
percThresholdYates : Double = defaultPercThresholdYates,
130+
absThresholdCochran : Integer = defaultAbsThresholdCochran,
131+
normalizeExpected : Boolean = true)
132+
: Double = {
133+
134+
val sampleSum: Double = sample.filter(e => expected.contains(e._1)).map((e => e._2)).sum
135+
val expectedSum: Double = expected.map(e => e._2).sum
136+
137+
// Normalize the expected input, normalization is required to conduct the chi-square test
138+
// While normalization is already included in the mllib chi-square test, we perform normalization manually to execute proper regrouping
139+
// https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest:org.apache.spark.mllib.stat.test.ChiSqTestResult
140+
val expectedNorm: scala.collection.mutable.Map[String, Double] = expected.map(e => (e._1, (e._2 / expectedSum * sampleSum)))
141+
142+
// Call the function that regroups categories if necessary depending on thresholds
143+
val (regroupedSample, regroupedExpected) = regroupCategories(sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)
49144

50-
var n = 0.0
51-
var m = 0.0
52-
sample1.keySet.foreach { key =>
53-
n += sample1(key)
145+
// If less than 2 categories remain we cannot conduct the test
146+
if (regroupedSample.keySet.size < chisquareMinDimension) {
147+
Double.NaN
148+
} else {
149+
// run chi-square test and return statistics or p-value
150+
val result = chiSquareTest(regroupedSample, regroupedExpected)
151+
if (correctForLowNumberOfSamples) {
152+
result.statistic
153+
} else {
154+
result.pValue
54155
}
55-
sample2.keySet.foreach { key =>
56-
m += sample2(key)
156+
}
157+
}
158+
159+
/** Regroup categories with elements below threshold, required for chi-square test
160+
*
161+
* for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
162+
* for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
163+
*
164+
* @param sample the mapping between categories(keys) and counts(values) of the observed sample
165+
* @param expected the mapping between categories(keys) and counts(values) of the expected baseline
166+
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
167+
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
168+
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
169+
* @return (sample, expected) returns the two regrouped mappings
170+
*
171+
*/
172+
private[this] def regroupCategories(
173+
sample: scala.collection.mutable.Map[String, Double],
174+
expected: scala.collection.mutable.Map[String, Double],
175+
absThresholdYates: Integer = defaultAbsThresholdYates,
176+
percThresholdYates: Double = defaultPercThresholdYates,
177+
absThresholdCochran: Integer = defaultAbsThresholdCochran)
178+
: (scala.collection.mutable.Map[String, Double], scala.collection.mutable.Map[String, Double]) = {
179+
180+
// If number of categories is below the minimum return original mappings
181+
if (expected.keySet.size < chisquareMinDimension) {
182+
(sample, expected)
183+
} else {
184+
// Determine thresholds depending on dimensions of mapping (2x2 tables use Cochran, all other tables Yates thresholds)
185+
var absThresholdPerColumn : Integer = absThresholdCochran
186+
var maxNbColumnsBelowThreshold: Integer = 0
187+
if (expected.keySet.size > chisquareMinDimension) {
188+
absThresholdPerColumn = absThresholdYates
189+
maxNbColumnsBelowThreshold = (percThresholdYates * expected.keySet.size).toInt
57190
}
58-
val combinedKeys = sample1.keySet.union(sample2.keySet)
59-
var linfSimple = 0.0
191+
// Count number of categories below threshold
192+
val nbExpectedColumnsBelowThreshold = expected.filter(e => e._2 < absThresholdPerColumn).keySet.size
60193

61-
combinedKeys.foreach { key =>
62-
val cdf1 = sample1.getOrElse(key, 0L) / n
63-
val cdf2 = sample2.getOrElse(key, 0L) / m
64-
val cdfDiff = Math.abs(cdf1 - cdf2)
65-
linfSimple = Math.max(linfSimple, cdfDiff)
194+
// If the number of categories below threshold exceeds the authorized maximum, small categories are regrouped until valid
195+
if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold){
196+
197+
// Identified key that holds minimum value
198+
val expectedMin: (String, Double) = expected.minBy(e => e._2)
199+
val sampleMinValue : Double = sample.getOrElse(expectedMin._1, 0)
200+
201+
// Remove smallest category
202+
expected.remove(expectedMin._1)
203+
sample.remove(expectedMin._1)
204+
205+
// Add value of smallest category to second smallest category
206+
val expectedSecondMin = expected.minBy(e => e._2)
207+
val sampleSecondMinValue : Double = sample.getOrElse(expectedSecondMin._1, 0)
208+
209+
expected.update(expectedSecondMin._1, expectedSecondMin._2 + expectedMin._2 )
210+
sample.update(expectedSecondMin._1, sampleMinValue + sampleSecondMinValue )
211+
212+
// Recursively call function until mappings are valid
213+
regroupCategories(sample, expected, absThresholdYates, percThresholdYates, absThresholdCochran)
214+
} else {
215+
// In case the mappings are valid the original mappings are returned
216+
(sample, expected)
66217
}
67-
selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples)
68218
}
219+
}
220+
221+
222+
/** Runs chi-square test on two mappings
223+
*
224+
* @param sample the mapping between categories(keys) and counts(values) of the observed sample
225+
* @param expected the mapping between categories(keys) and counts(values) of the expected baseline
226+
* @return ChiSqTestResult returns the chi-square test result object (contains both statistics and p-value)
227+
*
228+
*/
229+
private[this] def chiSquareTest(
230+
sample: scala.collection.mutable.Map[String, Double],
231+
expected: scala.collection.mutable.Map[String, Double])
232+
: ChiSqTestResult = {
233+
234+
var sampleArray = Array[Double]()
235+
var expectedArray = Array[Double]()
236+
237+
expected.keySet.foreach { key =>
238+
val cdf1: Double = sample.getOrElse(key, 0.0)
239+
val cdf2: Double = expected(key)
240+
sampleArray = sampleArray :+ cdf1
241+
expectedArray = expectedArray :+ cdf2
242+
}
243+
244+
val vecSample: Vector = Vectors.dense(sampleArray)
245+
val vecExpected: Vector = Vectors.dense(expectedArray)
246+
247+
Statistics.chiSqTest(vecSample, vecExpected)
248+
}
249+
250+
/** Calculate distance of categorical profiles based on L-Infinity Distance */
251+
private[this] def categoricalLInfinityDistance(
252+
sample1: scala.collection.mutable.Map[String, Long],
253+
sample2: scala.collection.mutable.Map[String, Long],
254+
correctForLowNumberOfSamples: Boolean = false,
255+
alpha: Option[Double])
256+
: Double = {
257+
var n = 0.0
258+
var m = 0.0
259+
sample1.keySet.foreach { key =>
260+
n += sample1(key)
261+
}
262+
sample2.keySet.foreach { key =>
263+
m += sample2(key)
264+
}
265+
val combinedKeys = sample1.keySet.union(sample2.keySet)
266+
var linfSimple = 0.0
267+
268+
combinedKeys.foreach { key =>
269+
val cdf1 = sample1.getOrElse(key, 0L) / n
270+
val cdf2 = sample2.getOrElse(key, 0L) / m
271+
val cdfDiff = Math.abs(cdf1 - cdf2)
272+
linfSimple = Math.max(linfSimple, cdfDiff)
273+
}
274+
selectMetrics(linfSimple, n, m, correctForLowNumberOfSamples, alpha)
275+
}
69276

70277
/** Select which metrics to compute (linf_simple or linf_robust)
71278
* based on whether samples are enough */
72279
private[this] def selectMetrics(
73280
linfSimple: Double,
74281
n: Double,
75282
m: Double,
76-
correctForLowNumberOfSamples: Boolean = false)
283+
correctForLowNumberOfSamples: Boolean = false,
284+
alpha: Option[Double])
77285
: Double = {
78286
if (correctForLowNumberOfSamples) {
79287
linfSimple
80288
} else {
81289
// This formula is based on “Two-sample Kolmogorov–Smirnov test"
82290
// Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
83-
val linfRobust = Math.max(0.0, linfSimple - 1.8 * Math.sqrt((n + m) / (n * m)))
291+
292+
val cAlpha : Double = alpha match {
293+
case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2)
294+
case None => defaultCAlpha
295+
}
296+
val linfRobust = Math.max(0.0, linfSimple - cAlpha * Math.sqrt((n + m) / (n * m)))
84297
linfRobust
85298
}
86299
}

0 commit comments

Comments
 (0)