2
2
3
3
import sys
4
4
5
- import pandas .testing as tm
6
5
import pytest
7
6
8
7
import ibis
9
8
import ibis .common .exceptions as com
10
9
from ibis import _
11
- from ibis .backends .tests .errors import (
12
- PsycoPg2InternalError ,
13
- Py4JJavaError ,
14
- PyDruidProgrammingError ,
15
- )
10
+ from ibis .backends .tests .errors import Py4JJavaError
11
+
12
+ tm = pytest . importorskip ( "pandas.testing" )
13
+
14
+ pytestmark = pytest . mark . xdist_group ( "impure" )
16
15
17
16
no_randoms = [
18
17
pytest .mark .notimpl (
19
- ["dask" , "pandas" , "polars" ], raises = com .OperationNotDefinedError
20
- ),
21
- pytest .mark .notimpl ("druid" , raises = PyDruidProgrammingError ),
22
- pytest .mark .notyet (
23
- "risingwave" ,
24
- raises = PsycoPg2InternalError ,
25
- reason = "function random() does not exist" ,
18
+ ["polars" , "druid" , "risingwave" ], raises = com .OperationNotDefinedError
26
19
),
27
20
]
28
21
32
25
[
33
26
"bigquery" ,
34
27
"clickhouse" ,
35
- "dask" ,
36
28
"druid" ,
37
29
"exasol" ,
38
30
"impala" ,
39
31
"mssql" ,
40
32
"mysql" ,
41
33
"oracle" ,
42
- "pandas" ,
43
34
"trino" ,
44
35
"risingwave" ,
45
36
]
46
37
),
47
- pytest .mark .notimpl ("pyspark" , reason = "only supports pandas UDFs" ),
48
38
pytest .mark .notyet (
49
39
"flink" ,
50
40
condition = sys .version_info >= (3 , 11 ),
55
45
56
46
no_uuids = [
57
47
pytest .mark .notimpl (
58
- [
59
- "druid" ,
60
- "exasol" ,
61
- "oracle" ,
62
- "polars" ,
63
- "pyspark" ,
64
- "risingwave" ,
65
- "pandas" ,
66
- "dask" ,
67
- ],
48
+ ["druid" , "exasol" , "oracle" , "polars" , "pyspark" , "risingwave" ],
68
49
raises = com .OperationNotDefinedError ,
69
50
),
70
51
pytest .mark .notyet ("mssql" , reason = "Unrelated bug: Incorrect syntax near '('" ),
@@ -82,11 +63,7 @@ def my_random(x: float) -> float:
82
63
mark_impures = pytest .mark .parametrize (
83
64
"impure" ,
84
65
[
85
- pytest .param (
86
- lambda _ : ibis .random (),
87
- marks = no_randoms ,
88
- id = "random" ,
89
- ),
66
+ pytest .param (lambda _ : ibis .random (), marks = no_randoms , id = "random" ),
90
67
pytest .param (
91
68
lambda _ : ibis .uuid ().cast (str ).contains ("a" ).ifelse (1 , 0 ),
92
69
marks = [
@@ -107,6 +84,7 @@ def my_random(x: float) -> float:
107
84
)
108
85
109
86
87
+ # You can work around this by calling .cache() on the table.
110
88
@pytest .mark .notyet ("sqlite" , reason = "instances are uncorrelated" )
111
89
@mark_impures
112
90
def test_impure_correlated (alltypes , impure ):
@@ -120,14 +98,12 @@ def test_impure_correlated(alltypes, impure):
120
98
# t AS (SELECT random() AS common)
121
99
# SELECT common as x, common as y FROM t
122
100
# Then both x and y should have the same value.
123
- df = (
124
- alltypes .select (common = impure (alltypes ))
125
- .select (x = _ .common , y = _ .common )
126
- .execute ()
127
- )
101
+ expr = alltypes .select (common = impure (alltypes )).select (x = _ .common , y = _ .common )
102
+ df = expr .execute ()
128
103
tm .assert_series_equal (df .x , df .y , check_names = False )
129
104
130
105
106
+ # You can work around this by calling .cache() on the table.
131
107
@pytest .mark .notyet ("sqlite" , reason = "instances are uncorrelated" )
132
108
@mark_impures
133
109
def test_chained_selections (alltypes , impure ):
@@ -153,9 +129,7 @@ def test_chained_selections(alltypes, impure):
153
129
lambda _ : ibis .random (),
154
130
marks = [
155
131
* no_randoms ,
156
- pytest .mark .notyet (
157
- ["impala" , "trino" ], reason = "instances are correlated"
158
- ),
132
+ pytest .mark .notyet (["impala" ], reason = "instances are correlated" ),
159
133
],
160
134
id = "random" ,
161
135
),
@@ -164,24 +138,24 @@ def test_chained_selections(alltypes, impure):
164
138
lambda _ : ibis .uuid ().cast (str ).contains ("a" ).ifelse (1 , 0 ),
165
139
marks = [
166
140
* no_uuids ,
167
- pytest .mark .notyet (
168
- ["mysql" , "trino" ], reason = "instances are correlated"
169
- ),
141
+ pytest .mark .notyet (["mysql" ], reason = "instances are correlated" ),
170
142
],
171
143
id = "uuid" ,
172
144
),
173
145
pytest .param (
174
146
lambda table : my_random (table .float_col ),
175
147
marks = [
176
148
* no_udfs ,
177
- pytest .mark .notyet ("duckdb" , reason = "instances are correlated" ),
149
+ # no "impure" argument for pyspark yet
150
+ pytest .mark .notimpl ("pyspark" ),
178
151
],
179
152
id = "udf" ,
180
153
),
181
154
],
182
155
)
183
156
184
157
158
+ # You can work around this by caching between the selections: .select().cache().select()
185
159
@pytest .mark .notyet (["clickhouse" ], reason = "instances are correlated" )
186
160
@impure_params_uncorrelated
187
161
def test_impure_uncorrelated_different_id (alltypes , impure ):
@@ -191,15 +165,57 @@ def test_impure_uncorrelated_different_id(alltypes, impure):
191
165
# eg if you look at the following SQL:
192
166
# select random() as x, random() as y
193
167
# Then x and y should be uncorrelated.
194
- df = alltypes .select (x = impure (alltypes ), y = impure (alltypes )).execute ()
168
+ expr = alltypes .select (x = impure (alltypes ), y = impure (alltypes ))
169
+ df = expr .execute ()
195
170
assert (df .x != df .y ).any ()
196
171
197
172
173
+ # You can work around this by caching between the selections: .select().cache().select()
198
174
@pytest .mark .notyet (["clickhouse" ], reason = "instances are correlated" )
199
175
@impure_params_uncorrelated
200
176
def test_impure_uncorrelated_same_id (alltypes , impure ):
201
177
# Similar to test_impure_uncorrelated_different_id, but the two expressions
202
178
# have the same ID. Still, they should be uncorrelated.
203
179
common = impure (alltypes )
204
- df = alltypes .select (x = common , y = common ).execute ()
180
+ expr = alltypes .select (x = common , y = common )
181
+ df = expr .execute ()
205
182
assert (df .x != df .y ).any ()
183
+
184
+
185
+ @pytest .mark .notyet (
186
+ [
187
+ "duckdb" ,
188
+ "clickhouse" ,
189
+ "datafusion" ,
190
+ "mysql" ,
191
+ "impala" ,
192
+ "mssql" ,
193
+ "trino" ,
194
+ "flink" ,
195
+ "bigquery" ,
196
+ ],
197
+ raises = AssertionError ,
198
+ reason = "instances are not correlated but ideally they would be" ,
199
+ )
200
+ @pytest .mark .notyet (
201
+ ["sqlite" ],
202
+ raises = AssertionError ,
203
+ reason = "instances are *sometimes* correlated but ideally they would always be" ,
204
+ strict = False ,
205
+ )
206
+ @pytest .mark .notimpl (
207
+ ["polars" , "risingwave" , "druid" , "exasol" , "oracle" , "pyspark" ],
208
+ raises = com .OperationNotDefinedError ,
209
+ )
210
+ def test_self_join_with_generated_keys (con ):
211
+ # Even with CTEs in the generated SQL, the backends still
212
+ # materialize a new value every time the generated key is referenced.
213
+ # This isn't ideal behavior, but there is nothing we can do about it
214
+ # on the ibis side. The best you can do is to call .cache() on the table
215
+ # right after you assign the uuid().
216
+ # https://github.com/ibis-project/ibis/pull/9014#issuecomment-2399449665
217
+ left = ibis .memtable ({"idx" : list (range (5 ))}).mutate (key = ibis .uuid ())
218
+ right = left .filter (left .idx < 3 )
219
+ expr = left .join (right , "key" )
220
+ result = con .execute (expr .count ())
221
+ assert result == 3
0 commit comments