Enhancement: Add support for timezone-flexible DateTime (#1352) (#1902)

max-raphael · web-flow · commit 32b08fde3b5b · 2025-02-12T10:08:04.000-05:00
Enhancement: Add support for timezone-flexible DateTime (#1352) Signed-off-by: Max Raphael <mrap96@gmail.com>
diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py
@@ -860,6 +860,19 @@ class DateTime(_BaseDateTime, dtypes.Timestamp):
     tz: Optional[datetime.tzinfo] = None
     """The timezone."""
 
+    time_zone_agnostic: bool = False
+    """
+    A flag indicating whether the datetime data should be handled flexibly with respect to timezones.
+
+    - If set to `True` and `coerce` is `False`, datetimes with any timezone(s) are accepted, but
+        timezone-naive datetimes are rejected. Any `tz` argument is ignored in this mode; to require
+        one specific timezone, set `time_zone_agnostic=False` instead.
+
+    - If set to `True` and `coerce` is `True`, a `tz` must also be specified. The function will then
+        accept datetimes with any timezone(s) and convert them to the specified tz, as well as
+        timezone-naive datetimes, and localize them to the specified tz.
+    """
+
     to_datetime_kwargs: Dict[str, Any] = dataclasses.field(
         default_factory=dict, compare=False, repr=False
     )
@@ -936,14 +949,111 @@ def from_parametrized_dtype(cls, pd_dtype: pd.DatetimeTZDtype):
         return cls(unit=pd_dtype.unit, tz=pd_dtype.tz)  # type: ignore
 
     def coerce(self, data_container: PandasObject) -> PandasObject:
+        if self.time_zone_agnostic:
+            data_container = self._prepare_coerce_time_zone_agnostic(
+                data_container=data_container
+            )
         return self._coerce(data_container, pandas_dtype=self.type)
 
+    def _prepare_coerce_time_zone_agnostic(
+        self, data_container: PandasObject
+    ) -> PandasObject:
+        if not self.tz:
+            raise errors.ParserError(
+                "Cannot coerce datetimes when 'time_zone_agnostic=True' and 'tz' is not specified. "
+                "When using 'time_zone_agnostic' and 'coerce', you must specify a timezone using 'tz' parameter.",
+                failure_cases=utils.numpy_pandas_coerce_failure_cases(
+                    data_container, self
+                ),
+            )
+        # If there is a single timezone, define the type as a timezone-aware DatetimeTZDtype
+        if isinstance(data_container.dtype, pd.DatetimeTZDtype):
+            tz = self.tz
+            unit = self.unit if self.unit else data_container.dtype.unit
+            type_ = pd.DatetimeTZDtype(unit, tz)
+            object.__setattr__(self, "tz", tz)
+            object.__setattr__(self, "type", type_)
+        # If the values are datetimes with mixed or missing timezones, convert/localize each one to the specified tz
+        elif all(isinstance(x, datetime.datetime) for x in data_container):
+            container_type = type(data_container)
+            tz = self.tz
+            unit = self.unit if self.unit else data_container.dtype.unit
+            data_container = container_type(
+                [
+                    (
+                        pd.Timestamp(ts).tz_convert(tz)
+                        if pd.Timestamp(ts).tzinfo
+                        else pd.Timestamp(ts).tz_localize(tz)
+                    )
+                    for ts in data_container
+                ]
+            )
+            type_ = pd.DatetimeTZDtype(unit, tz)
+            object.__setattr__(self, "tz", tz)
+            object.__setattr__(self, "type", type_)
+        else:
+            raise errors.ParserError(
+                "When time_zone_agnostic=True, data must either be:\n"
+                "1. A Series with DatetimeTZDtype (timezone-aware datetime series), or\n"
+                "2. A Series of datetime objects\n"
+                f"Got data with dtype: {data_container.dtype}",
+                failure_cases=utils.numpy_pandas_coerce_failure_cases(
+                    data_container, self
+                ),
+            )
+        return data_container
+
     def coerce_value(self, value: Any) -> Any:
         """Coerce an value to specified datatime type."""
         return self._get_to_datetime_fn(value)(
             value, **self.to_datetime_kwargs
         )
 
+    def check(
+        self,
+        pandera_dtype: dtypes.DataType,
+        data_container: Optional[PandasObject] = None,
+    ) -> Union[bool, Iterable[bool]]:
+        if self.time_zone_agnostic:
+            self._prepare_check_time_zone_agnostic(
+                pandera_dtype=pandera_dtype, data_container=data_container
+            )
+        return super().check(pandera_dtype, data_container)
+
+    def _prepare_check_time_zone_agnostic(
+        self,
+        pandera_dtype: dtypes.DataType,
+        data_container: Optional[PandasObject],
+    ) -> None:
+        # If there is a single timezone, define the type as a timezone-aware DatetimeTZDtype
+        if (
+            isinstance(pandera_dtype, DateTime)
+            and pandera_dtype.tz is not None
+        ):
+            type_ = pd.DatetimeTZDtype(self.unit, pandera_dtype.tz)
+            object.__setattr__(self, "tz", pandera_dtype.tz)
+            object.__setattr__(self, "type", type_)
+        # If the data has a mix of timezones, pandas defines the dtype as 'object'
+        elif all(
+            isinstance(x, datetime.datetime) and x.tzinfo is not None
+            for x in data_container  # type: ignore
+        ):
+            object.__setattr__(self, "type", np.dtype("O"))
+        else:
+            raise errors.ParserError(
+                "When time_zone_agnostic=True, data must either be:\n"
+                "1. A Series with DatetimeTZDtype (timezone-aware datetime series), or\n"
+                "2. A Series of timezone-aware datetime objects\n"
+                f"Got data with dtype: {data_container.dtype if data_container is not None else 'None'}",
+                failure_cases=(
+                    utils.numpy_pandas_coerce_failure_cases(
+                        data_container, self
+                    )
+                    if data_container is not None
+                    else None
+                ),
+            )
+
     def __str__(self) -> str:
         if self.type == np.dtype("datetime64[ns]"):
             return "datetime64[ns]"
diff --git a/tests/core/test_pandas_engine.py b/tests/core/test_pandas_engine.py
@@ -1,7 +1,7 @@
 """Test pandas engine."""
 
-from datetime import date
-from typing import Any, Set
+import datetime as dt
+from typing import Tuple, List, Optional, Any, Set
 
 import hypothesis
 import hypothesis.extra.pandas as pd_st
@@ -13,8 +13,9 @@
 import pytz
 from hypothesis import given
 
+from pandera import Field, DataFrameModel, errors
 from pandera.engines import pandas_engine
-from pandera.errors import ParserError
+from pandera.errors import ParserError, SchemaError
 
 UNSUPPORTED_DTYPE_CLS: Set[Any] = set()
 
@@ -202,6 +203,165 @@ def test_pandas_datetimetz_dtype(timezone_aware, data, timezone):
         assert coerced_data.dt.tz == timezone
 
 
+def generate_test_cases_time_zone_agnostic() -> List[
+    Tuple[
+        List[dt.datetime],
+        Optional[dt.tzinfo],
+        bool,
+        List[dt.datetime],
+        bool,
+    ]
+]:
+    """
+    Generate test parameter combinations from a fixed set of datetime lists.
+
+    Returns:
+        List of tuples:
+        - List of input datetimes
+        - tz for DateTime constructor
+        - coerce flag for Field constructor
+        - expected output datetimes
+        - raises flag (True if an exception is expected, False otherwise)
+    """
+    datetimes = [
+        # multi tz and tz naive
+        [
+            pytz.timezone("America/New_York").localize(
+                dt.datetime(2023, 3, 1, 4)
+            ),
+            pytz.timezone("America/Los_Angeles").localize(
+                dt.datetime(2023, 3, 1, 5)
+            ),
+            dt.datetime(2023, 3, 1, 5),  # naive datetime
+        ],
+        # multi tz
+        [
+            pytz.timezone("America/New_York").localize(
+                dt.datetime(2023, 3, 1, 4)
+            ),
+            pytz.timezone("America/Los_Angeles").localize(
+                dt.datetime(2023, 3, 1, 5)
+            ),
+        ],
+        # tz naive
+        [dt.datetime(2023, 3, 1, 4), dt.datetime(2023, 3, 1, 5)],
+        # single tz
+        [
+            pytz.timezone("America/New_York").localize(
+                dt.datetime(2023, 3, 1, 4)
+            ),
+            pytz.timezone("America/New_York").localize(
+                dt.datetime(2023, 3, 1, 5)
+            ),
+        ],
+    ]
+
+    test_cases = []
+
+    for datetime_list in datetimes:
+        for coerce in [True, False]:
+            for tz in [
+                None,
+                pytz.timezone("America/Chicago"),
+                pytz.FixedOffset(120),  # 120 minutes = 2 hours offset
+            ]:
+                # Determine if the test should raise an exception
+                # Should raise error when:
+                # * coerce is False but there is a timezone-naive datetime
+                # * coerce is True but tz is not set
+                has_naive_datetime = any(
+                    dt.tzinfo is None for dt in datetime_list
+                )
+                raises = (not coerce and has_naive_datetime) or (
+                    coerce and tz is None
+                )
+
+                # Generate expected output
+                if raises:
+                    expected_output = None  # No expected output since an exception will be raised
+                else:
+                    if coerce:
+                        # Replace naive datetimes with localized ones
+                        expected_output_naive = [
+                            tz.localize(dtime) if tz is not None else dtime
+                            for dtime in datetime_list
+                            if dtime.tzinfo is None
+                        ]
+
+                        # Convert timezone-aware datetimes to the desired timezone
+                        expected_output_aware = [
+                            dtime.astimezone(
+                                tz
+                            )  # Use .astimezone() for aware datetimes
+                            for dtime in datetime_list
+                            if dtime.tzinfo is not None
+                        ]
+                        expected_output = (
+                            expected_output_naive + expected_output_aware
+                        )
+                    else:
+                        # ignore tz
+                        expected_output = datetime_list
+
+                test_case = (
+                    datetime_list,
+                    tz,
+                    coerce,
+                    expected_output,
+                    raises,
+                )
+                test_cases.append(test_case)
+
+    # define final test cases with improper type
+    datetime_list = [
+        pytz.timezone("America/New_York").localize(
+            dt.datetime(
+                2023,
+                3,
+                1,
+                4,
+            )
+        ),
+        "hello world",
+    ]
+    tz = None
+    expected_output = None
+    raises = True
+
+    bad_type_coerce = (datetime_list, tz, True, expected_output, raises)
+    bad_type_no_coerce = (datetime_list, tz, False, expected_output, raises)
+    test_cases.extend([bad_type_coerce, bad_type_no_coerce])  # type: ignore
+
+    return test_cases  # type: ignore
+
+
+@pytest.mark.parametrize(
+    "examples, tz, coerce, expected_output, raises",
+    generate_test_cases_time_zone_agnostic(),
+)
+def test_dt_time_zone_agnostic(examples, tz, coerce, expected_output, raises):
+    """Test that time_zone_agnostic works as expected"""
+
+    # Testing using a pandera DataFrameModel rather than directly calling dtype coerce or validate because with
+    # time_zone_agnostic, dtype is set dynamically based on the input data
+    class SimpleSchema(DataFrameModel):
+        # pylint: disable=unexpected-keyword-arg,no-value-for-parameter
+        datetime_column: pandas_engine.DateTime(
+            time_zone_agnostic=True, tz=tz
+        ) = Field(coerce=coerce)
+
+    data = pd.DataFrame({"datetime_column": examples})
+
+    if raises:
+        with pytest.raises((SchemaError, errors.ParserError)):
+            SimpleSchema.validate(data)
+    else:
+        validated_df = SimpleSchema.validate(data)
+        assert sorted(validated_df["datetime_column"].tolist()) == sorted(
+            expected_output
+        )
+
+
 @hypothesis.settings(max_examples=1000)
 @pytest.mark.parametrize("to_df", [True, False])
 @given(
@@ -225,7 +385,7 @@ def test_pandas_date_coerce_dtype(to_df, data):
         )
 
         assert (
-            coerced_data.applymap(lambda x: isinstance(x, date))
+            coerced_data.applymap(lambda x: isinstance(x, dt.date))
             | coerced_data.isna()
         ).all(axis=None)
         return
@@ -234,7 +394,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
         coerced_data.isna().all() and coerced_data.dtype == "datetime64[ns]"
     )
     assert (
-        coerced_data.map(lambda x: isinstance(x, date)) | coerced_data.isna()
+        coerced_data.map(lambda x: isinstance(x, dt.date))
+        | coerced_data.isna()
     ).all()
 
 
@@ -246,8 +407,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
         pyarrow.struct([("foo", pyarrow.int64()), ("bar", pyarrow.string())]),
     ),
     (pd.Series([None, pd.NA, np.nan]), pyarrow.null),
-    (pd.Series([None, date(1970, 1, 1)]), pyarrow.date32),
-    (pd.Series([None, date(1970, 1, 1)]), pyarrow.date64),
+    (pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date32),
+    (pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date64),
     (pd.Series([1, 2]), pyarrow.duration("ns")),
     (pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time32("ms")),
     (pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time64("ns")),
@@ -292,8 +453,8 @@ def test_pandas_arrow_dtype(data, dtype):
         pyarrow.struct([("foo", pyarrow.string()), ("bar", pyarrow.int64())]),
     ),
     (pd.Series(["a", "1"]), pyarrow.null),
-    (pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
-    (pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
+    (pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
+    (pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
     (pd.Series(["a"]), pyarrow.duration("ns")),
     (pd.Series(["a", "b"]), pyarrow.time32("ms")),
     (pd.Series(["a", "b"]), pyarrow.time64("ns")),