Skip to content

Commit 422c98d

Browse files
committed
perf(pyspark): ensure that pyspark DDL doesn't use VALUES
1 parent aac9524 commit 422c98d

File tree

2 files changed

+15
-12
lines changed

2 files changed

+15
-12
lines changed

ibis/backends/pyspark/__init__.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,17 +88,15 @@ def __exit__(self, exc_type, exc_value, traceback):
8888

8989
class PySparkTableSetFormatter(TableSetFormatter):
9090
def _format_in_memory_table(self, op):
91-
names = op.schema.names
92-
rows = ", ".join(
93-
f"({', '.join(map(repr, row))})"
94-
for row in op.data.itertuples(index=False)
95-
)
96-
signature = ", ".join(map(self._quote_identifier, names))
97-
name = self._quote_identifier(op.name or "_")
98-
return f"(VALUES {rows} AS {name} ({signature}))"
91+
# we don't need to compile the table to a VALUES statement because the
92+
# table has been registered already by createOrReplaceTempView.
93+
#
94+
# The only place where the SQL API is currently used is DDL operations
95+
return op.name
9996

10097

10198
class PySparkCompiler(Compiler):
99+
cheap_in_memory_tables = True
102100
table_set_formatter_class = PySparkTableSetFormatter
103101

104102

@@ -463,6 +461,8 @@ def create_table(
463461
table_name, format=format, mode=mode
464462
)
465463
return
464+
else:
465+
self._register_in_memory_tables(obj)
466466

467467
ast = self.compiler.to_ast(obj)
468468
select = ast.queries[0]
@@ -487,6 +487,10 @@ def create_table(
487487

488488
return self.raw_sql(statement.compile())
489489

490+
def _register_in_memory_table(self, table_op):
491+
spark_df = self.compile(table_op.to_expr())
492+
spark_df.createOrReplaceTempView(table_op.name)
493+
490494
def create_view(
491495
self,
492496
name: str,

ibis/backends/pyspark/compiler.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2168,17 +2168,16 @@ def compile_random(*args, **kwargs):
21682168
return F.rand()
21692169

21702170

2171+
@compiles(ops.InMemoryTable)
21712172
@compiles(PandasInMemoryTable)
21722173
def compile_in_memory_table(t, expr, scope, timecontext, session, **kwargs):
21732174
op = expr.op()
21742175
fields = [
21752176
pt.StructField(name, ibis_dtype_to_spark_dtype(dtype), dtype.nullable)
21762177
for name, dtype in op.schema.items()
21772178
]
2178-
return session.createDataFrame(
2179-
data=op.data._df,
2180-
schema=pt.StructType(fields),
2181-
)
2179+
schema = pt.StructType(fields)
2180+
return session.createDataFrame(data=op.data.to_frame(), schema=schema)
21822181

21832182

21842183
@compiles(ops.BitwiseAnd)

0 commit comments

Comments
 (0)