Skip to content

Commit 758ec25

Browse files
committed
fix(pyspark): custom format converter to handle pyspark timestamps
1 parent bb92e9f commit 758ec25

File tree

2 files changed

+19
-9
lines changed

2 files changed

+19
-9
lines changed

ibis/backends/pyspark/__init__.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
from ibis.backends.pyspark.client import PySparkTable
3232
from ibis.backends.pyspark.compiler import PySparkExprTranslator
3333
from ibis.backends.pyspark.datatypes import PySparkType
34+
from ibis.common.temporal import normalize_timezone
35+
from ibis.formats.pandas import PandasData
3436

3537
if TYPE_CHECKING:
3638
from collections.abc import Mapping, Sequence
@@ -104,6 +106,18 @@ class PySparkCompiler(Compiler):
104106
table_set_formatter_class = PySparkTableSetFormatter
105107

106108

109+
class PySparkPandasData(PandasData):
    """PandasData variant that fixes up timestamps coming out of PySpark.

    PySpark's ``toPandas`` hands back timestamps whose timezone handling
    does not match ibis's expectations, so element conversion is
    overridden here.
    """

    @classmethod
    def convert_Timestamp_element(cls, dtype):
        """Return an element converter for a timestamp *dtype*.

        For a timezone-aware dtype the value is shifted into that zone;
        for a naive dtype the value is normalized to UTC and its tzinfo
        stripped.
        """
        tz = dtype.timezone

        def converter(value, tz=tz):
            if tz is None:
                # Naive target: canonicalize to UTC, then drop the tzinfo.
                return value.astimezone(normalize_timezone("UTC")).replace(
                    tzinfo=None
                )
            # Aware target: shift the value into the requested zone.
            return value.astimezone(normalize_timezone(tz))

        return converter
107121
class Backend(BaseSQLBackend, CanCreateDatabase):
108122
compiler = PySparkCompiler
109123
name = "pyspark"
@@ -219,7 +233,9 @@ def execute(self, expr: ir.Expr, **kwargs: Any) -> Any:
219233
df = self.compile(table_expr, **kwargs).toPandas()
220234

221235
# TODO: remove the extra conversion
222-
return expr.__pandas_result__(table_expr.__pandas_result__(df))
236+
return expr.__pandas_result__(
237+
PySparkPandasData.convert_table(df, table_expr.schema())
238+
)
223239

224240
def _fully_qualified_name(self, name, database):
225241
if is_fully_qualified(name):
@@ -232,17 +248,15 @@ def close(self):
232248
self._context.stop()
233249

234250
def fetch_from_cursor(self, cursor, schema):
235-
df = cursor.query.toPandas() # blocks until finished
236-
return schema.apply_to(df)
251+
return cursor.query.toPandas() # blocks until finished
237252

238253
def raw_sql(self, query: str) -> _PySparkCursor:
239254
query = self._session.sql(query)
240255
return _PySparkCursor(query)
241256

242257
def _get_schema_using_query(self, query):
243258
cursor = self.raw_sql(f"SELECT * FROM ({query}) t0 LIMIT 0")
244-
struct = PySparkType.to_ibis(cursor.query.schema)
245-
return sch.Schema(struct)
259+
return sch.Schema(PySparkType.to_ibis(cursor.query.schema))
246260

247261
def _get_jtable(self, name, database=None):
248262
get_table = self._catalog._jcatalog.getTable

ibis/backends/tests/test_temporal.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -344,10 +344,6 @@ def test_timestamp_extract_milliseconds(backend, alltypes, df):
344344
raises=GoogleBadRequest,
345345
reason="UNIX_SECONDS does not support DATETIME arguments",
346346
)
347-
@pytest.mark.xfail_version(
348-
pyspark=["pandas<2.1"],
349-
reason="test was adjusted to work with pandas 2.1 output; pyspark doesn't support pandas 2",
350-
)
351347
@pytest.mark.notimpl(["exasol"], raises=com.OperationNotDefinedError)
352348
def test_timestamp_extract_epoch_seconds(backend, alltypes, df):
353349
expr = alltypes.timestamp_col.epoch_seconds().name("tmp")

0 commit comments

Comments
 (0)