
Commit 9dc8ed5

Merge branch 'main' into main
2 parents 370086c + ca82618

16 files changed, +837 -149 lines

.github/workflows/ci-tests.yml

+8 -2

@@ -100,14 +100,20 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
+        os:
+          - ubuntu-latest
+          - windows-latest
+          - macos-13
+          # - macos-latest  # see: https://github.com/actions/setup-python/issues/696
         python-version: ["3.8", "3.9", "3.10", "3.11"]
         pandas-version: ["1.5.3", "2.0.3", "2.2.0"]
         pydantic-version: ["1.10.11", "2.3.0"]
         include:
           - os: ubuntu-latest
             pip-cache: ~/.cache/pip
-          - os: macos-latest
+          # - os: macos-latest
+          #   pip-cache: ~/Library/Caches/pip
+          - os: macos-13
             pip-cache: ~/Library/Caches/pip
           - os: windows-latest
             pip-cache: ~/AppData/Local/pip/Cache

pandera/api/polars/model.py

+27 -30

@@ -1,7 +1,6 @@
 """Class-based api for polars models."""

 from typing import (
-    Any,
     Dict,
     List,
     Tuple,
@@ -19,8 +18,10 @@
 from pandera.api.polars.container import DataFrameSchema
 from pandera.api.polars.components import Column
 from pandera.api.polars.model_config import BaseConfig
+from pandera.engines import polars_engine as pe
 from pandera.errors import SchemaInitError
 from pandera.typing import AnnotationInfo
+from pandera.typing.polars import Series


 class DataFrameModel(_DataFrameModel[pl.LazyFrame, DataFrameSchema]):
@@ -52,24 +53,30 @@ def _build_columns(  # pylint:disable=too-many-locals
             field_name = field.name
             check_name = getattr(field, "check_name", None)

-            if annotation.metadata:
-                if field.dtype_kwargs:
-                    raise TypeError(
-                        "Cannot specify redundant 'dtype_kwargs' "
-                        + f"for {annotation.raw_annotation}."
-                        + "\n Usage Tip: Drop 'typing.Annotated'."
-                    )
-                dtype_kwargs = get_dtype_kwargs(annotation)
-                dtype = annotation.arg(**dtype_kwargs)  # type: ignore
-            elif annotation.default_dtype:
-                dtype = annotation.default_dtype
-            else:
-                dtype = annotation.arg
-
-            dtype = None if dtype is Any else dtype
-
-            if annotation.origin is None or isinstance(
-                annotation.origin, pl.datatypes.DataTypeClass
+            engine_dtype = None
+            try:
+                engine_dtype = pe.Engine.dtype(annotation.raw_annotation)
+                dtype = engine_dtype.type
+            except TypeError as exc:
+                if annotation.metadata:
+                    if field.dtype_kwargs:
+                        raise TypeError(
+                            "Cannot specify redundant 'dtype_kwargs' "
+                            + f"for {annotation.raw_annotation}."
+                            + "\n Usage Tip: Drop 'typing.Annotated'."
+                        ) from exc
+                    dtype_kwargs = get_dtype_kwargs(annotation)
+                    dtype = annotation.arg(**dtype_kwargs)  # type: ignore
+                elif annotation.default_dtype:
+                    dtype = annotation.default_dtype
+                else:
+                    dtype = annotation.arg
+
+            if (
+                annotation.origin is None
+                or isinstance(annotation.origin, pl.datatypes.DataTypeClass)
+                or annotation.origin is Series
+                or engine_dtype
             ):
                 if check_name is False:
                     raise SchemaInitError(
@@ -89,19 +96,9 @@ def _build_columns(  # pylint:disable=too-many-locals
                 columns[field_name] = Column(**column_kwargs)

             else:
-                origin_name = (
-                    f"{annotation.origin.__module__}."
-                    f"{annotation.origin.__name__}"
-                )
-                msg = (
-                    " Series[TYPE] annotations are not supported for polars. "
-                    "Use the bare TYPE directly"
-                    if origin_name == "pandera.typing.pandas.Series"
-                    else ""
-                )
                 raise SchemaInitError(
                     f"Invalid annotation '{field_name}: "
-                    f"{annotation.raw_annotation}'.{msg}"
+                    f"{annotation.raw_annotation}'."
                 )

         return columns
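
Note on this hunk: `_build_columns` now tries to resolve the raw annotation through the polars engine first, falling back to the metadata/default-dtype logic only on a TypeError, which also makes `pandera.typing.polars.Series` annotations valid column types. A minimal sketch of the resulting usage (the model and column names are illustrative, not taken from the diff):

import polars as pl
import pandera.polars as pa
from pandera.typing.polars import Series

class Model(pa.DataFrameModel):
    a: pl.Int64         # bare polars dtype, as before
    b: Series[pl.Utf8]  # Series[TYPE] annotations now also resolve

lf = pl.LazyFrame({"a": [1, 2], "b": ["x", "y"]})
print(Model.validate(lf).collect())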

pandera/api/pyspark/container.py

+30 -1

@@ -8,7 +8,8 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union, cast, overload

-from pyspark.sql import DataFrame
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import StructType, StructField

 from pandera import errors
 from pandera.api.base.schema import BaseSchema
@@ -563,6 +564,34 @@ def to_json(

         return pandera.io.to_json(self, target, **kwargs)

+    def to_structtype(self) -> StructType:
+        """Recover fields of DataFrameSchema as a Pyspark StructType object.
+
+        Because the output of this method is used to specify a read schema in
+        Pyspark (avoiding automatic schema inference), any `nullable=False`
+        properties are simply ignored here; that check is executed by the
+        Pandera validations after a dataset is read.
+
+        :returns: StructType object with current schema fields.
+        """
+        fields = [
+            StructField(column, self.columns[column].dtype.type, True)
+            for column in self.columns
+        ]
+        return StructType(fields)
+
+    def to_ddl(self) -> str:
+        """Recover fields of DataFrameSchema as a Pyspark DDL string.
+
+        :returns: String with current schema fields, in compact DDL format.
+        """
+        # `StructType.toDDL()` is only available in internal Java classes
+        spark = SparkSession.builder.getOrCreate()
+        # Create a base dataframe from which we access the underlying Java classes
+        empty_df_with_schema = spark.createDataFrame([], self.to_structtype())
+
+        return empty_df_with_schema._jdf.schema().toDDL()
+

 def _validate_columns(
     column_dict: dict[Any, "pandera.api.pyspark.components.Column"],  # type: ignore [name-defined]
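
For illustration, a short usage sketch of the two new methods (the schema below is hypothetical, and the DDL string in the comment shows the expected compact format rather than verified output):

import pandera.pyspark as pa
import pyspark.sql.types as T

schema = pa.DataFrameSchema(
    {
        "name": pa.Column(T.StringType()),
        "age": pa.Column(T.IntegerType()),
    }
)

read_schema = schema.to_structtype()  # StructType, usable as spark.read.schema(...)
ddl = schema.to_ddl()                 # e.g. "name STRING,age INT"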

pandera/api/pyspark/model.py

+18

@@ -1,4 +1,5 @@
 """Class-based api for pyspark models."""
+
 # pylint:disable=abstract-method
 import copy
 import inspect
@@ -22,6 +23,7 @@
 )

 import pyspark.sql as ps
+from pyspark.sql.types import StructType

 from pandera.api.base.model import BaseModel
 from pandera.api.checks import Check
@@ -271,6 +273,22 @@ def to_yaml(cls, stream: Optional[os.PathLike] = None):
         """
         return cls.to_schema().to_yaml(stream)

+    @classmethod
+    def to_structtype(cls) -> StructType:
+        """Recover fields of DataFrameModel as a Pyspark StructType object.
+
+        :returns: StructType object with current model fields.
+        """
+        return cls.to_schema().to_structtype()
+
+    @classmethod
+    def to_ddl(cls) -> str:
+        """Recover fields of DataFrameModel as a Pyspark DDL string.
+
+        :returns: String with current model fields, in compact DDL format.
+        """
+        return cls.to_schema().to_ddl()
+
     @classmethod
     @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)
     def validate(
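
Since the model-level methods simply delegate to DataFrameSchema, they can be called straight off a DataFrameModel subclass; a hypothetical example (the model fields and file path are made up):

import pandera.pyspark as pa
import pyspark.sql.types as T

class Purchase(pa.DataFrameModel):
    item: T.StringType()
    price: T.IntegerType()

# read with an explicit schema instead of relying on inference,
# then let pandera run nullability and other checks afterwards:
# df = spark.read.schema(Purchase.to_structtype()).json("purchases.json")
# validated = Purchase.validate(df)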

pandera/engines/polars_engine.py

+40 -3

@@ -5,7 +5,16 @@
 import decimal
 import inspect
 import warnings
-from typing import Any, Union, Optional, Iterable, Literal, Sequence, Tuple
+from typing import (
+    Any,
+    Union,
+    Optional,
+    Iterable,
+    Literal,
+    Sequence,
+    Tuple,
+    Type,
+)


 import polars as pl
@@ -416,16 +425,26 @@ class Date(DataType, dtypes.Date):
 class DateTime(DataType, dtypes.DateTime):
     """Polars datetime data type."""

-    type = pl.Datetime
+    type: Type[pl.Datetime] = pl.Datetime
+    time_zone_agnostic: bool = False

     def __init__(  # pylint:disable=super-init-not-called
         self,
         time_zone: Optional[str] = None,
         time_unit: Optional[str] = None,
+        time_zone_agnostic: bool = False,
     ) -> None:
+
+        _kwargs = {}
+        if time_unit is not None:
+            # avoid deprecation warning when initializing pl.Datetime:
+            # passing time_unit=None is deprecated.
+            _kwargs["time_unit"] = time_unit
+
         object.__setattr__(
-            self, "type", pl.Datetime(time_zone=time_zone, time_unit=time_unit)
+            self, "type", pl.Datetime(time_zone=time_zone, **_kwargs)
         )
+        object.__setattr__(self, "time_zone_agnostic", time_zone_agnostic)

     @classmethod
     def from_parametrized_dtype(cls, polars_dtype: pl.Datetime):
@@ -435,6 +454,24 @@ def from_parametrized_dtype(cls, polars_dtype: pl.Datetime):
             time_zone=polars_dtype.time_zone, time_unit=polars_dtype.time_unit
         )

+    def check(
+        self,
+        pandera_dtype: dtypes.DataType,
+        data_container: Optional[PolarsDataContainer] = None,
+    ) -> Union[bool, Iterable[bool]]:
+        try:
+            pandera_dtype = Engine.dtype(pandera_dtype)
+        except TypeError:
+            return False
+
+        if self.time_zone_agnostic:
+            return (
+                isinstance(pandera_dtype.type, pl.Datetime)
+                and pandera_dtype.type.time_unit == self.type.time_unit
+            )
+
+        return self.type == pandera_dtype.type and super().check(pandera_dtype)
+

 @Engine.register_dtype(
     equivalents=[
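
To make the new flag concrete: with `time_zone_agnostic=True`, the check accepts any (or no) time zone as long as the time unit matches. A sketch of the intended behavior, constructing the engine dtype directly (inferred from the diff above, not from documentation):

import polars as pl
from pandera.engines import polars_engine as pe

dt = pe.DateTime(time_unit="us", time_zone_agnostic=True)

print(dt.check(pe.Engine.dtype(pl.Datetime("us", "UTC"))))  # True: tz ignored
print(dt.check(pe.Engine.dtype(pl.Datetime("us"))))         # True: naive ok
print(dt.check(pe.Engine.dtype(pl.Datetime("ns", "UTC"))))  # False: unit differs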

pandera/external_config.py

+2

@@ -21,6 +21,8 @@
     os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

     import pyspark.pandas
+except (ImportError, ModuleNotFoundError):
+    pass
 finally:
     if is_spark_local_ip_dirty:
         os.environ.pop("SPARK_LOCAL_IP")

pandera/io/pandas_io.py

+1 -1

@@ -740,7 +740,7 @@ def from_frictionless_schema(
     schema: Union[str, Path, Dict, FrictionlessSchema]
 ) -> DataFrameSchema:
     # pylint: disable=line-too-long,anomalous-backslash-in-string
-    """Create a :class:`~pandera.api.pandas.container.DataFrameSchema` from either a
+    r"""Create a :class:`~pandera.api.pandas.container.DataFrameSchema` from either a
     frictionless json/yaml schema file saved on disk, or from a frictionless
     schema already loaded into memory.
