@@ -22,6 +22,7 @@
     PySparkAttributeError,
 )
 from pyspark.resource import ResourceProfile
+from pyspark.sql.connect.logging import logger
 from pyspark.sql.connect.utils import check_dependencies
 
 check_dependencies(__name__)
@@ -69,6 +70,7 @@
     PySparkRuntimeError,
 )
 from pyspark.util import PythonEvalType
+from pyspark.serializers import CPickleSerializer
 from pyspark.storagelevel import StorageLevel
 import pyspark.sql.connect.plan as plan
 from pyspark.sql.conversion import ArrowTableToRowsConversion
@@ -141,6 +143,7 @@ def __init__(
         # by __repr__ and _repr_html_ while eager evaluation opens.
         self._support_repr_html = False
         self._cached_schema: Optional[StructType] = None
+        self._cached_schema_serialized: Optional[bytes] = None
         self._execution_info: Optional["ExecutionInfo"] = None
 
     def __reduce__(self) -> Tuple:
@@ -1836,11 +1839,24 @@ def _schema(self) -> StructType:
         if self._cached_schema is None:
             query = self._plan.to_proto(self._session.client)
             self._cached_schema = self._session.client.schema(query)
+            try:
+                self._cached_schema_serialized = CPickleSerializer().dumps(self._cached_schema)
+            except Exception as e:
+                logger.warning(f"DataFrame schema pickle dumps failed with exception: {e}.")
+                self._cached_schema_serialized = None
         return self._cached_schema
 
     @property
     def schema(self) -> StructType:
-        return copy.deepcopy(self._schema)
+        # Accessing self._schema caches the schema and its pickled form if not cached yet.
+        _schema = self._schema
+        if self._cached_schema_serialized is not None:
+            try:
+                return CPickleSerializer().loads(self._cached_schema_serialized)
+            except Exception as e:
+                logger.warning(f"DataFrame schema pickle loads failed with exception: {e}.")
+        # If pickle ser/de failed, fall back to returning a deepcopy.
+        return copy.deepcopy(_schema)
 
     @functools.cache
     def isLocal(self) -> bool:
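The motivation for the pickle round-trip above is that copy.deepcopy walks the whole StructType object graph on every df.schema access, whereas pickle.loads on pre-serialized bytes rebuilds the same object far more cheaply for wide schemas. Below is a minimal standalone sketch of that trade-off; it uses stdlib pickle in place of CPickleSerializer (which wraps the same protocol), and the 500-field schema, field names, and iteration count are illustrative assumptions, not part of the patch.

import copy
import pickle
import timeit

from pyspark.sql.types import StructField, StringType, StructType

# Build a wide schema, standing in for a cached DataFrame schema.
schema = StructType([StructField(f"col_{i}", StringType()) for i in range(500)])

# Serialize once, as _schema now does after fetching the schema from the server.
schema_bytes = pickle.dumps(schema)

# Compare returning a fresh copy via deepcopy vs. deserializing cached bytes,
# which is what the new schema property does on each access.
deepcopy_s = timeit.timeit(lambda: copy.deepcopy(schema), number=100)
loads_s = timeit.timeit(lambda: pickle.loads(schema_bytes), number=100)
print(f"deepcopy: {deepcopy_s:.3f}s  pickle.loads: {loads_s:.3f}s")

Either path still hands each caller its own StructType instance, so mutating the returned schema cannot corrupt the cached one; the pickle path only changes how that private copy is produced.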