Enhancement: Add support for timezone-flexible DateTime (unionai-oss#1352)

max-raphael · max-raphael · commit 1acadbe08112 · 2025-01-24T23:25:24.000-08:00
Signed-off-by: Max Raphael <mrap96@gmail.com> Enhancement: Add support for timezone-flexible DateTime (unionai-oss#1352) Signed-off-by: Max Raphael <mrap96@gmail.com>
diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py
@@ -860,6 +860,13 @@ class DateTime(_BaseDateTime, dtypes.Timestamp):
     tz: Optional[datetime.tzinfo] = None
     """The timezone."""
 
+    timezone_flexible: bool = False
+    """
+    A flag indicating whether the datetime data should be handled flexibly with respect to timezones.
+    When set to True, validation ignores 'tz' and allows timezone-aware datetimes with any timezone(s). If coerce is
+    also True, 'tz' is required: timezone-naive datetimes are accepted and all datetimes are converted to that tz.
+    """
+
     to_datetime_kwargs: Dict[str, Any] = dataclasses.field(
         default_factory=dict, compare=False, repr=False
     )
@@ -936,14 +943,94 @@ def from_parametrized_dtype(cls, pd_dtype: pd.DatetimeTZDtype):
         return cls(unit=pd_dtype.unit, tz=pd_dtype.tz)  # type: ignore
 
     def coerce(self, data_container: PandasObject) -> PandasObject:
+        if self.timezone_flexible:
+            data_container = self._prepare_coerce_timezone_flexible(
+                data_container=data_container
+            )
         return self._coerce(data_container, pandas_dtype=self.type)
 
+    def _prepare_coerce_timezone_flexible(
+        self, data_container: PandasObject
+    ) -> PandasObject:
+        """
+        Prepare data for timezone-flexible coercion: set 'type' to the timezone-aware dtype for 'tz' and, for
+        object data holding only datetimes, convert/localize every value to 'tz'. Raises ParserError without 'tz'.
+        """
+        if not self.tz:
+            raise errors.ParserError(
+                "Cannot coerce datetimes with 'timezone_flexible=True' when 'tz' is not specified. "
+                "Specify the target timezone using the 'tz' parameter, or set 'coerce=False' "
+                "to validate timezone-aware datetimes without converting them.",
+                failure_cases=utils.numpy_pandas_coerce_failure_cases(
+                    data_container, self
+                ),
+            )
+        # If there is a single timezone, define the type as a timezone-aware DatetimeTZDtype
+        if isinstance(data_container.dtype, pd.DatetimeTZDtype):
+            # 'tz' is guaranteed by the guard above; only the unit may fall back to the data's unit
+            unit = self.unit if self.unit else data_container.dtype.unit
+            type_ = pd.DatetimeTZDtype(unit, self.tz)
+            object.__setattr__(self, "type", type_)
+        # If the values are all datetimes (mixed and/or missing timezones), convert them to 'tz'
+        elif all(isinstance(x, datetime.datetime) for x in data_container):
+            tz = self.tz
+            unit = self.unit if self.unit else data_container.dtype.unit
+            # Map element-wise instead of rebuilding the container so its index and name are kept
+            data_container = data_container.map(
+                lambda ts: (
+                    pd.Timestamp(ts).tz_convert(tz)
+                    if pd.Timestamp(ts).tzinfo
+                    else pd.Timestamp(ts).tz_localize(tz)
+                )
+            )
+            type_ = pd.DatetimeTZDtype(unit, tz)
+            # Update 'type' so that coerce() hands the timezone-aware dtype to _coerce
+            object.__setattr__(self, "type", type_)
+        else:
+            # Prepare to raise exception, adding type strictly for the check_dtype error message
+            object.__setattr__(self, "type", "datetime64[ns, <timezone>]")
+        return data_container
+
     def coerce_value(self, value: Any) -> Any:
         """Coerce an value to specified datatime type."""
         return self._get_to_datetime_fn(value)(
             value, **self.to_datetime_kwargs
         )
 
+    def check(
+        self,
+        pandera_dtype: dtypes.DataType,
+        data_container: Optional[PandasObject] = None,
+    ) -> Union[bool, Iterable[bool]]:
+        """Check the dtype; with timezone_flexible set, first adapt 'tz'/'type' to the checked data."""
+        if self.timezone_flexible:
+            # Mutates 'tz'/'type' in place before delegating to the parent check
+            self._prepare_check_timezone_flexible(pandera_dtype, data_container)
+        return super().check(pandera_dtype, data_container)
+
+    def _prepare_check_timezone_flexible(
+        self,
+        pandera_dtype: dtypes.DataType,
+        data_container: Optional[PandasObject],
+    ) -> None:
+        """Align 'tz' and 'type' with the dtype/data passed to check() so any timezone-aware data is accepted."""
+        if (
+            isinstance(pandera_dtype, DateTime)
+            and pandera_dtype.tz is not None
+        ):
+            type_ = pd.DatetimeTZDtype(self.unit, pandera_dtype.tz)
+            object.__setattr__(self, "tz", pandera_dtype.tz)
+            object.__setattr__(self, "type", type_)
+        # If the data has a mix of timezones, pandas defines the dtype as 'object'
+        elif data_container is not None and all(
+            isinstance(x, datetime.datetime) and x.tzinfo is not None
+            for x in data_container  # type: ignore
+        ):
+            object.__setattr__(self, "type", np.dtype("O"))
+        else:
+            # Prepare to raise exception, adding type strictly for the check_dtype error message
+            object.__setattr__(self, "type", "datetime64[ns, <timezone>]")
+
     def __str__(self) -> str:
         if self.type == np.dtype("datetime64[ns]"):
             return "datetime64[ns]"
diff --git a/tests/core/test_pandas_engine.py b/tests/core/test_pandas_engine.py
@@ -1,7 +1,8 @@
 """Test pandas engine."""
 
-from datetime import date
-from typing import Any, Set
+import datetime as dt
+from typing import Tuple, List, Optional, Any, Set
+from zoneinfo import ZoneInfo
 
 import hypothesis
 import hypothesis.extra.pandas as pd_st
@@ -13,8 +14,9 @@
 import pytz
 from hypothesis import given
 
+from pandera import Field, DataFrameModel
 from pandera.engines import pandas_engine
-from pandera.errors import ParserError
+from pandera.errors import ParserError, SchemaError
 
 UNSUPPORTED_DTYPE_CLS: Set[Any] = set()
 
@@ -202,6 +204,143 @@ def test_pandas_datetimetz_dtype(timezone_aware, data, timezone):
         assert coerced_data.dt.tz == timezone
 
 
+def generate_test_cases_timezone_flexible() -> List[
+    Tuple[
+        List[dt.datetime],
+        Optional[dt.tzinfo],
+        bool,
+        List[dt.datetime],
+        bool,
+    ]
+]:
+    """
+    Generate the test parameter combinations for the timezone_flexible tests.
+
+    Returns:
+        List of tuples:
+        - List of input datetimes
+        - tz for DateTime constructor
+        - coerce flag for Field constructor
+        - expected output datetimes
+        - raises flag (True if an exception is expected, False otherwise)
+    """
+    datetimes = [
+        # multi tz and tz naive
+        [
+            dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
+            dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo("America/Los_Angeles")),
+            dt.datetime(2023, 3, 1, 5),
+        ],
+        # multi tz
+        [
+            dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
+            dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo("America/Los_Angeles")),
+        ],
+        # tz naive
+        [dt.datetime(2023, 3, 1, 4), dt.datetime(2023, 3, 1, 5)],
+        # single tz
+        [
+            dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
+            dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo("America/New_York")),
+        ],
+    ]
+
+    test_cases = []
+
+    for datetime_list in datetimes:
+        for coerce in [True, False]:
+            for tz in [
+                None,
+                ZoneInfo("America/Chicago"),
+                dt.timezone(dt.timedelta(hours=2)),
+            ]:
+                # Determine if the test should raise an exception
+                # Should raise error when:
+                # * coerce is False but there is a timezone-naive datetime
+                # * coerce is True but tz is not set
+                has_naive_datetime = any(
+                    value.tzinfo is None for value in datetime_list
+                )
+                raises = (not coerce and has_naive_datetime) or (
+                    coerce and tz is None
+                )
+
+                # Generate expected output
+                if raises:
+                    expected_output = None  # No expected output since an exception will be raised
+                else:
+                    if coerce:
+                        # tz is never None here (coerce without tz raises): localize / convert the inputs to it
+                        use_tz = tz
+                        expected_output_naive = [
+                            value.replace(tzinfo=use_tz)
+                            for value in datetime_list
+                            if value.tzinfo is None
+                        ]
+                        expected_output_aware = [
+                            value.astimezone(use_tz)
+                            for value in datetime_list
+                            if value.tzinfo is not None
+                        ]
+                        expected_output = (
+                            expected_output_naive + expected_output_aware
+                        )
+                    else:
+                        # ignore tz
+                        expected_output = datetime_list
+
+                test_case = (
+                    datetime_list,
+                    tz,
+                    coerce,
+                    expected_output,
+                    raises,
+                )
+                test_cases.append(test_case)
+
+    # define final test cases with improper type
+    datetime_list = [
+        dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
+        "hello world",
+    ]
+    tz = None
+    expected_output = None
+    raises = True
+
+    bad_type_coerce = (datetime_list, tz, True, expected_output, raises)
+    bad_type_no_coerce = (datetime_list, tz, False, expected_output, raises)
+    test_cases.extend([bad_type_coerce, bad_type_no_coerce])  # type: ignore
+
+    return test_cases  # type: ignore
+
+
+@pytest.mark.parametrize(
+    "examples, tz, coerce, expected_output, raises",
+    generate_test_cases_timezone_flexible(),
+)
+def test_dt_timezone_flexible(examples, tz, coerce, expected_output, raises):
+    """Validate timezone_flexible DateTime columns against the generated cases."""
+
+    # The dtype is exercised through a DataFrameModel instead of calling coerce or validate on it directly,
+    # because with timezone_flexible the dtype is set dynamically based on the input data
+    class SimpleSchema(DataFrameModel):
+        # pylint: disable=unexpected-keyword-arg,no-value-for-parameter
+        datetime_column: pandas_engine.DateTime(
+            timezone_flexible=True, tz=tz
+        ) = Field(coerce=coerce)
+
+    frame = pd.DataFrame({"datetime_column": examples})
+
+    if raises:
+        with pytest.raises(SchemaError):
+            SimpleSchema.validate(frame)
+        return
+
+    validated = SimpleSchema.validate(frame)
+    actual = sorted(validated["datetime_column"].tolist())
+    assert actual == sorted(expected_output)
+
+
 @hypothesis.settings(max_examples=1000)
 @pytest.mark.parametrize("to_df", [True, False])
 @given(
@@ -225,7 +364,7 @@ def test_pandas_date_coerce_dtype(to_df, data):
         )
 
         assert (
-            coerced_data.applymap(lambda x: isinstance(x, date))
+            coerced_data.applymap(lambda x: isinstance(x, dt.date))
             | coerced_data.isna()
         ).all(axis=None)
         return
@@ -234,7 +373,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
         coerced_data.isna().all() and coerced_data.dtype == "datetime64[ns]"
     )
     assert (
-        coerced_data.map(lambda x: isinstance(x, date)) | coerced_data.isna()
+        coerced_data.map(lambda x: isinstance(x, dt.date))
+        | coerced_data.isna()
     ).all()
 
 
@@ -246,8 +386,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
         pyarrow.struct([("foo", pyarrow.int64()), ("bar", pyarrow.string())]),
     ),
     (pd.Series([None, pd.NA, np.nan]), pyarrow.null),
-    (pd.Series([None, date(1970, 1, 1)]), pyarrow.date32),
-    (pd.Series([None, date(1970, 1, 1)]), pyarrow.date64),
+    (pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date32),
+    (pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date64),
     (pd.Series([1, 2]), pyarrow.duration("ns")),
     (pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time32("ms")),
     (pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time64("ns")),
@@ -292,8 +432,8 @@ def test_pandas_arrow_dtype(data, dtype):
         pyarrow.struct([("foo", pyarrow.string()), ("bar", pyarrow.int64())]),
     ),
     (pd.Series(["a", "1"]), pyarrow.null),
-    (pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
-    (pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
+    (pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
+    (pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
     (pd.Series(["a"]), pyarrow.duration("ns")),
     (pd.Series(["a", "b"]), pyarrow.time32("ms")),
     (pd.Series(["a", "b"]), pyarrow.time64("ns")),