Skip to content

Commit 32b08fd

Browse files
authored
Enhancement: Add support for timezone-flexible DateTime (#1352) (#1902)
Enhancement: Add support for timezone-flexible DateTime (#1352) Signed-off-by: Max Raphael <[email protected]>
1 parent 754e66d commit 32b08fd

File tree

2 files changed

+280
-9
lines changed

2 files changed

+280
-9
lines changed

pandera/engines/pandas_engine.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,6 +860,19 @@ class DateTime(_BaseDateTime, dtypes.Timestamp):
860860
tz: Optional[datetime.tzinfo] = None
861861
"""The timezone."""
862862

863+
time_zone_agnostic: bool = False
864+
"""
865+
A flag indicating whether the datetime data should be handled flexibly with respect to timezones.
866+
867+
- If set to `True` and `coerce` is `False`, the function will accept datetimes with any timezone(s)
868+
but not timezone-naive datetimes. If passed, the `tz` argument will be ignored, as this use
869+
case is handled by setting `time_zone_agnostic=False`.
870+
871+
- If set to `True` and `coerce` is `True`, a `tz` must also be specified. The function will then
872+
accept datetimes with any timezone(s) and convert them to the specified tz, as well as
873+
timezone-naive datetimes, and localize them to the specified tz.
874+
"""
875+
863876
to_datetime_kwargs: Dict[str, Any] = dataclasses.field(
864877
default_factory=dict, compare=False, repr=False
865878
)
@@ -936,14 +949,111 @@ def from_parametrized_dtype(cls, pd_dtype: pd.DatetimeTZDtype):
936949
return cls(unit=pd_dtype.unit, tz=pd_dtype.tz) # type: ignore
937950

938951
def coerce(self, data_container: PandasObject) -> PandasObject:
    """Coerce ``data_container`` to this data type.

    When ``time_zone_agnostic`` is set, the data is first routed through
    ``_prepare_coerce_time_zone_agnostic``; whatever that call returns is
    what gets handed to ``_coerce``. ``self.type`` is read only after the
    preparation step has run.
    """
    prepared = (
        self._prepare_coerce_time_zone_agnostic(data_container=data_container)
        if self.time_zone_agnostic
        else data_container
    )
    return self._coerce(prepared, pandas_dtype=self.type)
940957

958+
def _prepare_coerce_time_zone_agnostic(
959+
self, data_container: PandasObject
960+
) -> PandasObject:
961+
if not self.tz:
962+
raise errors.ParserError(
963+
"Cannot coerce datetimes when 'time_zone_agnostic=True' and 'tz' is not specified. "
964+
"When using 'time_zone_agnostic' and 'coerce', you must specify a timezone using 'tz' parameter.",
965+
failure_cases=utils.numpy_pandas_coerce_failure_cases(
966+
data_container, self
967+
),
968+
)
969+
# If there is a single timezone, define the type as a timezone-aware DatetimeTZDtype
970+
if isinstance(data_container.dtype, pd.DatetimeTZDtype):
971+
tz = self.tz
972+
unit = self.unit if self.unit else data_container.dtype.unit
973+
type_ = pd.DatetimeTZDtype(unit, tz)
974+
object.__setattr__(self, "tz", tz)
975+
object.__setattr__(self, "type", type_)
976+
# If there are multiple timezones, convert them to the specified tz and set the type accordingly
977+
elif all(isinstance(x, datetime.datetime) for x in data_container):
978+
container_type = type(data_container)
979+
tz = self.tz
980+
unit = self.unit if self.unit else data_container.dtype.unit
981+
data_container = container_type(
982+
[
983+
(
984+
pd.Timestamp(ts).tz_convert(tz)
985+
if pd.Timestamp(ts).tzinfo
986+
else pd.Timestamp(ts).tz_localize(tz)
987+
)
988+
for ts in data_container
989+
]
990+
)
991+
type_ = pd.DatetimeTZDtype(unit, tz)
992+
object.__setattr__(self, "tz", tz)
993+
object.__setattr__(self, "type", type_)
994+
else:
995+
raise errors.ParserError(
996+
"When time_zone_agnostic=True, data must either be:\n"
997+
"1. A Series with DatetimeTZDtype (timezone-aware datetime series), or\n"
998+
"2. A Series of datetime objects\n"
999+
f"Got data with dtype: {data_container.dtype}",
1000+
failure_cases=utils.numpy_pandas_coerce_failure_cases(
1001+
data_container, self
1002+
),
1003+
)
1004+
return data_container
1005+
9411006
def coerce_value(self, value: Any) -> Any:
    """Coerce a single value to the specified datetime type.

    The conversion function is looked up via ``_get_to_datetime_fn`` and
    called with ``value`` plus the configured ``to_datetime_kwargs``.
    """
    to_datetime = self._get_to_datetime_fn(value)
    return to_datetime(value, **self.to_datetime_kwargs)
9461011

1012+
def check(
    self,
    pandera_dtype: dtypes.DataType,
    data_container: Optional[PandasObject] = None,
) -> Union[bool, Iterable[bool]]:
    """Check ``pandera_dtype`` (and optionally the data) against this type.

    With ``time_zone_agnostic`` enabled,
    ``_prepare_check_time_zone_agnostic`` runs first; the actual comparison
    is then delegated to the parent implementation.
    """
    agnostic = self.time_zone_agnostic
    if agnostic:
        self._prepare_check_time_zone_agnostic(
            pandera_dtype=pandera_dtype,
            data_container=data_container,
        )
    return super().check(pandera_dtype, data_container)
1022+
1023+
def _prepare_check_time_zone_agnostic(
    self,
    pandera_dtype: dtypes.DataType,
    data_container: Optional[PandasObject],
) -> None:
    """Adjust ``self.type`` (and ``self.tz``) before a timezone-agnostic check.

    * If ``pandera_dtype`` is a ``DateTime`` carrying a timezone, ``self.tz``
      is set to that timezone and ``self.type`` to the matching
      ``pd.DatetimeTZDtype``.
    * Otherwise, if every element of ``data_container`` is a timezone-aware
      ``datetime.datetime``, ``self.type`` is set to the object dtype.
    * Anything else, including ``data_container`` being ``None``, is
      rejected.

    :param pandera_dtype: the data type being checked against this one.
    :param data_container: the data being validated, if available.
    :raises errors.ParserError: if neither accepted case applies.
    """
    # The compared dtype already names a single timezone: adopt it.
    if isinstance(pandera_dtype, DateTime) and pandera_dtype.tz is not None:
        type_ = pd.DatetimeTZDtype(self.unit, pandera_dtype.tz)
        object.__setattr__(self, "tz", pandera_dtype.tz)
        object.__setattr__(self, "type", type_)
        return

    # If the data has a mix of timezones, pandas defines the dtype as
    # 'object'. Guard against ``None`` first: iterating it would raise
    # TypeError and bypass the ParserError below, which handles ``None``.
    if data_container is not None and all(
        isinstance(x, datetime.datetime) and x.tzinfo is not None
        for x in data_container  # type: ignore
    ):
        object.__setattr__(self, "type", np.dtype("O"))
        return

    raise errors.ParserError(
        "When time_zone_agnostic=True, data must either be:\n"
        "1. A Series with DatetimeTZDtype (timezone-aware datetime series), or\n"
        "2. A Series of timezone-aware datetime objects\n"
        f"Got data with dtype: {data_container.dtype if data_container is not None else 'None'}",
        failure_cases=(
            utils.numpy_pandas_coerce_failure_cases(data_container, self)
            if data_container is not None
            else None
        ),
    )
1056+
9471057
def __str__(self) -> str:
9481058
if self.type == np.dtype("datetime64[ns]"):
9491059
return "datetime64[ns]"

tests/core/test_pandas_engine.py

Lines changed: 170 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Test pandas engine."""
22

3-
from datetime import date
4-
from typing import Any, Set
3+
import datetime as dt
4+
from typing import Tuple, List, Optional, Any, Set
55

66
import hypothesis
77
import hypothesis.extra.pandas as pd_st
@@ -13,8 +13,9 @@
1313
import pytz
1414
from hypothesis import given
1515

16+
from pandera import Field, DataFrameModel, errors
1617
from pandera.engines import pandas_engine
17-
from pandera.errors import ParserError
18+
from pandera.errors import ParserError, SchemaError
1819

1920
UNSUPPORTED_DTYPE_CLS: Set[Any] = set()
2021

@@ -202,6 +203,165 @@ def test_pandas_datetimetz_dtype(timezone_aware, data, timezone):
202203
assert coerced_data.dt.tz == timezone
203204

204205

206+
def generate_test_cases_time_zone_agnostic() -> List[
    Tuple[
        List[dt.datetime],
        Optional[dt.tzinfo],
        bool,
        List[dt.datetime],
        bool,
    ]
]:
    """
    Generate test parameter combinations for a given list of datetime lists.

    Every datetime list is combined with each ``coerce`` flag and each
    candidate timezone. A case is expected to raise when:

    * ``coerce`` is False but the list contains a timezone-naive datetime, or
    * ``coerce`` is True but no ``tz`` is given.

    Two further cases (one per ``coerce`` flag) mix a datetime with a
    string and are always expected to raise.

    Returns:
        List of tuples:
        - List of input datetimes
        - tz for DateTime constructor
        - coerce flag for Field constructor
        - expected output datetimes (None when an exception is expected)
        - raises flag (True if an exception is expected, False otherwise)
    """
    new_york = pytz.timezone("America/New_York")
    los_angeles = pytz.timezone("America/Los_Angeles")

    datetimes = [
        # multi tz and tz naive
        [
            new_york.localize(dt.datetime(2023, 3, 1, 4)),
            los_angeles.localize(dt.datetime(2023, 3, 1, 5)),
            dt.datetime(2023, 3, 1, 5),  # naive datetime
        ],
        # multi tz
        [
            new_york.localize(dt.datetime(2023, 3, 1, 4)),
            los_angeles.localize(dt.datetime(2023, 3, 1, 5)),
        ],
        # tz naive
        [dt.datetime(2023, 3, 1, 4), dt.datetime(2023, 3, 1, 5)],
        # single tz
        [
            new_york.localize(dt.datetime(2023, 3, 1, 4)),
            new_york.localize(dt.datetime(2023, 3, 1, 5)),
        ],
    ]
    timezones = [
        None,
        pytz.timezone("America/Chicago"),
        pytz.FixedOffset(120),  # 120 minutes = 2 hours offset
    ]

    test_cases = []

    for datetime_list in datetimes:
        # Depends only on the datetime list, so compute it once per list.
        has_naive_datetime = any(
            dtime.tzinfo is None for dtime in datetime_list
        )
        for coerce in [True, False]:
            for tz in timezones:
                raises = (not coerce and has_naive_datetime) or (
                    coerce and tz is None
                )

                if raises:
                    # No expected output since an exception will be raised
                    expected_output = None
                elif coerce:
                    # ``tz`` cannot be None here: coerce with no tz raises.
                    # Naive datetimes are localized, aware ones converted.
                    expected_output_naive = [
                        tz.localize(dtime)
                        for dtime in datetime_list
                        if dtime.tzinfo is None
                    ]
                    expected_output_aware = [
                        dtime.astimezone(tz)
                        for dtime in datetime_list
                        if dtime.tzinfo is not None
                    ]
                    expected_output = (
                        expected_output_naive + expected_output_aware
                    )
                else:
                    # Without coercion the tz is ignored and data is unchanged
                    expected_output = datetime_list

                test_cases.append(
                    (datetime_list, tz, coerce, expected_output, raises)
                )

    # Final test cases with improper type: always expected to raise.
    bad_type_examples = [
        new_york.localize(dt.datetime(2023, 3, 1, 4)),
        "hello world",
    ]
    test_cases.extend(  # type: ignore
        (bad_type_examples, None, coerce, None, True)
        for coerce in (True, False)
    )

    return test_cases  # type: ignore
336+
337+
338+
@pytest.mark.parametrize(
    "examples, tz, coerce, expected_output, raises",
    generate_test_cases_time_zone_agnostic(),
)
def test_dt_time_zone_agnostic(examples, tz, coerce, expected_output, raises):
    """Check time_zone_agnostic validation against each generated case."""

    # A DataFrameModel is used instead of calling the dtype's coerce or
    # validate directly because, with time_zone_agnostic, the dtype is set
    # dynamically based on the input data.
    class SimpleSchema(DataFrameModel):
        # pylint: disable=unexpected-keyword-arg,no-value-for-parameter
        datetime_column: pandas_engine.DateTime(
            time_zone_agnostic=True, tz=tz
        ) = Field(coerce=coerce)

    frame = pd.DataFrame({"datetime_column": examples})

    if not raises:
        validated = SimpleSchema.validate(frame)
        # Order is not part of the expectation, so compare sorted values.
        actual = sorted(validated["datetime_column"].tolist())
        assert actual == sorted(expected_output)
        return

    with pytest.raises((SchemaError, errors.ParserError)):
        SimpleSchema.validate(frame)
363+
364+
205365
@hypothesis.settings(max_examples=1000)
206366
@pytest.mark.parametrize("to_df", [True, False])
207367
@given(
@@ -225,7 +385,7 @@ def test_pandas_date_coerce_dtype(to_df, data):
225385
)
226386

227387
assert (
228-
coerced_data.applymap(lambda x: isinstance(x, date))
388+
coerced_data.applymap(lambda x: isinstance(x, dt.date))
229389
| coerced_data.isna()
230390
).all(axis=None)
231391
return
@@ -234,7 +394,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
234394
coerced_data.isna().all() and coerced_data.dtype == "datetime64[ns]"
235395
)
236396
assert (
237-
coerced_data.map(lambda x: isinstance(x, date)) | coerced_data.isna()
397+
coerced_data.map(lambda x: isinstance(x, dt.date))
398+
| coerced_data.isna()
238399
).all()
239400

240401

@@ -246,8 +407,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
246407
pyarrow.struct([("foo", pyarrow.int64()), ("bar", pyarrow.string())]),
247408
),
248409
(pd.Series([None, pd.NA, np.nan]), pyarrow.null),
249-
(pd.Series([None, date(1970, 1, 1)]), pyarrow.date32),
250-
(pd.Series([None, date(1970, 1, 1)]), pyarrow.date64),
410+
(pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date32),
411+
(pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date64),
251412
(pd.Series([1, 2]), pyarrow.duration("ns")),
252413
(pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time32("ms")),
253414
(pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time64("ns")),
@@ -292,8 +453,8 @@ def test_pandas_arrow_dtype(data, dtype):
292453
pyarrow.struct([("foo", pyarrow.string()), ("bar", pyarrow.int64())]),
293454
),
294455
(pd.Series(["a", "1"]), pyarrow.null),
295-
(pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
296-
(pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
456+
(pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
457+
(pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
297458
(pd.Series(["a"]), pyarrow.duration("ns")),
298459
(pd.Series(["a", "b"]), pyarrow.time32("ms")),
299460
(pd.Series(["a", "b"]), pyarrow.time64("ns")),

0 commit comments

Comments
 (0)