feat(pyspark): implement covariance and correlation

cpcloud · kszucs · commit ae818fb71b8f · 2022-06-16T16:15:40.000+02:00
diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py
@@ -690,6 +690,44 @@ def compile_variance(t, expr, scope, timecontext, context=None, **kwargs):
     )
 
 
+@compiles(ops.Covariance)
+def compile_covariance(t, expr, scope, timecontext, context=None, **kwargs):
+    """Compile covariance with F.covar_samp/F.covar_pop on cast operands."""
+    node = expr.op()
+    kind = node.how
+    # A ``how`` other than "sample"/"pop" fails here with KeyError.
+    spark_fn = {"sample": F.covar_samp, "pop": F.covar_pop}[kind]
+
+    double = ibis_dtype_to_spark_dtype(dtypes.double)
+    lhs = node.left.cast(double)
+    rhs = node.right.cast(double)
+    # Rebuild the same op class around the cast operands; ``where`` is kept.
+    rebuilt = node.__class__(left=lhs, right=rhs, how=kind, where=node.where)
+    casted = rebuilt.to_expr()
+    return compile_aggregator(
+        t, casted, scope, timecontext, fn=spark_fn, context=context
+    )
+
+
+@compiles(ops.Correlation)
+def compile_correlation(t, expr, scope, timecontext, context=None, **kwargs):
+    """Compile correlation with F.corr; how == "pop" raises ValueError."""
+    node = expr.op()
+    kind = node.how
+    if kind == "pop":
+        raise ValueError("PySpark only implements sample correlation")
+
+    double = ibis_dtype_to_spark_dtype(dtypes.double)
+    lhs = node.left.cast(double)
+    rhs = node.right.cast(double)
+    # Rebuild the same op class around the cast operands; ``where`` is kept.
+    rebuilt = node.__class__(left=lhs, right=rhs, how=kind, where=node.where)
+    casted = rebuilt.to_expr()
+    return compile_aggregator(
+        t, casted, scope, timecontext, fn=F.corr, context=context
+    )
+
+
 @compiles(ops.Arbitrary)
 def compile_arbitrary(t, expr, scope, timecontext, context=None, **kwargs):
     how = expr.op().how
diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py
@@ -200,8 +200,6 @@ def test_aggregate_grouped(
                         "impala",
                         "mysql",
                         "pandas",
-                        "postgres",
-                        "pyspark",
                         "sqlite",
                     ]
                 )
@@ -220,8 +218,6 @@ def test_aggregate_grouped(
                         "impala",
                         "mysql",
                         "pandas",
-                        "postgres",
-                        "pyspark",
                         "sqlite",
                     ]
                 )

Original file line number	Diff line number	Diff line change
`@@ -200,8 +200,6 @@ def test_aggregate_grouped(`
`200`	`200`	`"impala",`
`201`	`201`	`"mysql",`
`202`	`202`	`"pandas",`
`203`		`- "postgres",`
`204`		`- "pyspark",`
`205`	`203`	`"sqlite",`
`206`	`204`	`]`
`207`	`205`	`)`
`@@ -220,8 +218,6 @@ def test_aggregate_grouped(`
`220`	`218`	`"impala",`
`221`	`219`	`"mysql",`
`222`	`220`	`"pandas",`
`223`		`- "postgres",`
`224`		`- "pyspark",`
`225`	`221`	`"sqlite",`
`226`	`222`	`]`
`227`	`223`	`)`