Skip to content

Commit d19a963

Browse files
committed
Enhancement: Add support for timezone-flexible DateTime (unionai-oss#1352)
Signed-off-by: Max Raphael <[email protected]>
1 parent 0b2068e commit d19a963

File tree

3 files changed

+174
-4
lines changed

3 files changed

+174
-4
lines changed

pandera/engines/pandas_engine.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,7 +862,11 @@ class DateTime(_BaseDateTime, dtypes.Timestamp):
862862

863863
timezone_flexible: bool = False
864864
"""
865+
<<<<<<< HEAD
865866
A flag indicating whether the datetime data should be handled flexibly with respect to timezones.
867+
=======
868+
A flag indicating whether the datetime data should be handled flexibly with respect to timezones.
869+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
866870
When set to True, the function will ignore 'tz' and allow datetimes with any timezone(s). If coerce is set to True,
867871
the function can accept timezone-naive datetimes, and will convert all datetimes to the specified tz (or 'UTC').
868872
"""
@@ -944,6 +948,7 @@ def from_parametrized_dtype(cls, pd_dtype: pd.DatetimeTZDtype):
944948

945949
def coerce(self, data_container: PandasObject) -> PandasObject:
946950
if self.timezone_flexible:
951+
<<<<<<< HEAD
947952
data_container = self._prepare_coerce_timezone_flexible(data_container=data_container)
948953
return self._coerce(data_container, pandas_dtype=self.type)
949954

@@ -953,18 +958,45 @@ def _prepare_coerce_timezone_flexible(self, data_container: PandasObject) -> Pan
953958
tz = self.tz if self.tz else data_container.dtype.tz
954959
unit = self.unit if self.unit else data_container.dtype.unit
955960
type_ = pd.DatetimeTZDtype(unit, tz)
961+
=======
962+
data_container = self._prepare_coerce_timezone_flexible(
963+
data_container=data_container
964+
)
965+
return self._coerce(data_container, pandas_dtype=self.type)
966+
967+
def _prepare_coerce_timezone_flexible(
968+
self, data_container: PandasObject
969+
) -> PandasObject:
970+
# If there is a single timezone, define the type as a timezone-aware DatetimeTZDtype
971+
if isinstance(data_container.dtype, pd.DatetimeTZDtype):
972+
tz = self.tz if self.tz else data_container.dtype.tz
973+
type_ = pd.DatetimeTZDtype("ns", tz)
974+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
956975
object.__setattr__(self, "tz", tz)
957976
object.__setattr__(self, "type", type_)
958977
# If there are multiple timezones, convert them to the specified tz (default 'UTC') and set the type accordingly
959978
elif all(isinstance(x, datetime.datetime) for x in data_container):
960979
container_type = type(data_container)
980+
<<<<<<< HEAD
961981
tz = self.tz if self.tz else 'UTC'
962982
unit = self.unit if self.unit else data_container.dtype.unit
963983
data_container = container_type(
964984
[pd.Timestamp(ts).tz_convert(tz) if pd.Timestamp(ts).tzinfo else pd.Timestamp(ts).tz_localize(tz)
965985
for ts in data_container]
966986
)
967987
type_ = pd.DatetimeTZDtype(unit, tz)
988+
=======
989+
tz = self.tz if self.tz else "UTC" # type: ignore
990+
data_container = container_type(
991+
[
992+
pd.Timestamp(ts).tz_convert(tz) # type: ignore
993+
if pd.Timestamp(ts).tzinfo # type: ignore
994+
else pd.Timestamp(ts).tz_localize(tz) # type: ignore
995+
for ts in data_container
996+
]
997+
)
998+
type_ = pd.DatetimeTZDtype("ns", tz)
999+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
9681000
object.__setattr__(self, "tz", tz)
9691001
object.__setattr__(self, "type", type_)
9701002
else:
@@ -984,6 +1016,7 @@ def check(
9841016
data_container: Optional[PandasObject] = None,
9851017
) -> Union[bool, Iterable[bool]]:
9861018
if self.timezone_flexible:
1019+
<<<<<<< HEAD
9871020
self._prepare_check_timezone_flexible(pandera_dtype=pandera_dtype, data_container=data_container)
9881021
return super().check(pandera_dtype, data_container)
9891022

@@ -998,6 +1031,34 @@ def _prepare_check_timezone_flexible(
9981031
# If the data has a mix of timezones, pandas defines the dtype as 'object'
9991032
elif all(isinstance(x, datetime.datetime) and x.tzinfo is not None for x in data_container):
10001033
object.__setattr__(self, "type", np.dtype('O'))
1034+
=======
1035+
self._prepare_check_timezone_flexible(
1036+
pandera_dtype=pandera_dtype, data_container=data_container
1037+
)
1038+
return super().check(pandera_dtype, data_container)
1039+
1040+
def _prepare_check_timezone_flexible(
1041+
self,
1042+
pandera_dtype: dtypes.DataType,
1043+
data_container: Optional[PandasObject],
1044+
) -> None:
1045+
if data_container is None:
1046+
return
1047+
# If there is a single timezone, define the type as a timezone-aware DatetimeTZDtype
1048+
if (
1049+
isinstance(pandera_dtype, DateTime)
1050+
and pandera_dtype.tz is not None
1051+
):
1052+
type_ = pd.DatetimeTZDtype("ns", pandera_dtype.tz)
1053+
object.__setattr__(self, "tz", pandera_dtype.tz)
1054+
object.__setattr__(self, "type", type_)
1055+
# If the data has a mix of timezones, pandas defines the dtype as 'object'
1056+
elif all(
1057+
isinstance(x, datetime.datetime) and x.tzinfo is not None
1058+
for x in data_container
1059+
):
1060+
object.__setattr__(self, "type", np.dtype("O"))
1061+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
10011062
else:
10021063
# Prepare to raise exception, adding type strictly for the check_dtype error message
10031064
object.__setattr__(self, "type", "datetime64[ns, <timezone>]")

tests/core/test_pandas_engine.py

Lines changed: 112 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Test pandas engine."""
22

3+
<<<<<<< HEAD
34
<<<<<<< HEAD
45
import datetime as dt
56
from typing import Tuple, List, Optional
@@ -8,6 +9,11 @@
89
from datetime import date
910
from typing import Any, Set
1011
>>>>>>> add pandas pyarrow backend support (#1628)
12+
=======
13+
import datetime as dt
14+
from typing import Any, Set, Tuple, List, Optional
15+
from zoneinfo import ZoneInfo
16+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
1117

1218
import hypothesis
1319
import hypothesis.extra.pandas as pd_st
@@ -210,7 +216,13 @@ def test_pandas_datetimetz_dtype(timezone_aware, data, timezone):
210216

211217

212218
def generate_test_cases_timezone_flexible() -> List[
219+
<<<<<<< HEAD
213220
Tuple[List[dt.datetime], Optional[dt.tzinfo], bool, List[dt.datetime], bool]
221+
=======
222+
Tuple[
223+
List[dt.datetime], Optional[dt.tzinfo], bool, List[dt.datetime], bool
224+
]
225+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
214226
]:
215227
"""
216228
Generate test parameter combinations for a given list of datetime lists.
@@ -226,6 +238,7 @@ def generate_test_cases_timezone_flexible() -> List[
226238
datetimes = [
227239
# multi tz and tz naive
228240
[
241+
<<<<<<< HEAD
229242
dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo('America/New_York')),
230243
dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo('America/Los_Angeles')),
231244
dt.datetime(2023, 3, 1, 5)
@@ -245,15 +258,45 @@ def generate_test_cases_timezone_flexible() -> List[
245258
dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo('America/New_York')),
246259
dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo('America/New_York'))
247260
]
261+
=======
262+
dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
263+
dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo("America/Los_Angeles")),
264+
dt.datetime(2023, 3, 1, 5),
265+
],
266+
# multi tz
267+
[
268+
dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
269+
dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo("America/Los_Angeles")),
270+
],
271+
# tz naive
272+
[dt.datetime(2023, 3, 1, 4), dt.datetime(2023, 3, 1, 5)],
273+
# single tz
274+
[
275+
dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
276+
dt.datetime(2023, 3, 1, 5, tzinfo=ZoneInfo("America/New_York")),
277+
],
278+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
248279
]
249280

250281
test_cases = []
251282

252283
for datetime_list in datetimes:
253284
for coerce in [True, False]:
285+
<<<<<<< HEAD
254286
for tz in [None, ZoneInfo("America/Chicago"), dt.timezone(dt.timedelta(hours=2))]:
255287
# Determine if the test should raise an exception
256288
has_naive_datetime = any([dt.tzinfo is None for dt in datetime_list])
289+
=======
290+
for tz in [
291+
None,
292+
ZoneInfo("America/Chicago"),
293+
dt.timezone(dt.timedelta(hours=2)),
294+
]:
295+
# Determine if the test should raise an exception
296+
has_naive_datetime = any(
297+
dt.tzinfo is None for dt in datetime_list
298+
)
299+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
257300
raises = has_naive_datetime and not coerce
258301

259302
# Generate expected output
@@ -264,52 +307,112 @@ def generate_test_cases_timezone_flexible() -> List[
264307
# localize / convert the input datetimes to the specified tz or 'UTC' (default)
265308
use_tz = tz if tz else ZoneInfo("UTC")
266309
expected_output_naive = [
310+
<<<<<<< HEAD
267311
dt.replace(tzinfo=use_tz) for dt in datetime_list if dt.tzinfo is None
268312
]
269313
expected_output_aware = [
270314
dt.astimezone(use_tz) for dt in datetime_list if dt.tzinfo is not None
271315
]
272316
expected_output = expected_output_naive + expected_output_aware
317+
=======
318+
dt.replace(tzinfo=use_tz)
319+
for dt in datetime_list
320+
if dt.tzinfo is None
321+
]
322+
expected_output_aware = [
323+
dt.astimezone(use_tz)
324+
for dt in datetime_list
325+
if dt.tzinfo is not None
326+
]
327+
expected_output = (
328+
expected_output_naive + expected_output_aware
329+
)
330+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
273331
else:
274332
# ignore tz
275333
expected_output = datetime_list
276334

335+
<<<<<<< HEAD
277336
test_case = (datetime_list, tz, coerce, expected_output, raises)
278337
test_cases.append(test_case)
279338

280339
# define final test cases with improper type
281340
datetime_list = [dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo('America/New_York')), "hello world"]
341+
=======
342+
test_case = (
343+
datetime_list,
344+
tz,
345+
coerce,
346+
expected_output,
347+
raises,
348+
)
349+
test_cases.append(test_case)
350+
351+
# define final test cases with improper type
352+
datetime_list = [
353+
dt.datetime(2023, 3, 1, 4, tzinfo=ZoneInfo("America/New_York")),
354+
"hello world",
355+
]
356+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
282357
tz = None
283358
expected_output = None
284359
raises = True
285360

286361
bad_type_coerce = (datetime_list, tz, True, expected_output, raises)
287362
bad_type_no_coerce = (datetime_list, tz, False, expected_output, raises)
363+
<<<<<<< HEAD
288364
test_cases.extend([bad_type_coerce, bad_type_no_coerce])
289365

290366
return test_cases
367+
=======
368+
test_cases.extend([bad_type_coerce, bad_type_no_coerce]) # type: ignore
369+
370+
return test_cases # type: ignore
371+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
291372

292373

293374
@pytest.mark.parametrize(
294375
"examples, tz, coerce, expected_output, raises",
376+
<<<<<<< HEAD
295377
generate_test_cases_timezone_flexible()
378+
=======
379+
generate_test_cases_timezone_flexible(),
380+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
296381
)
297382
def test_dt_timezone_flexible(examples, tz, coerce, expected_output, raises):
298383
"""Test that timezone_flexible works as expected"""
299384

300385
# Testing using a pandera DataFrameModel rather than directly calling dtype coerce or validate because with
301386
# timezone_flexible, dtype is set dynamically based on the input data
302387
class SimpleSchema(DataFrameModel):
388+
<<<<<<< HEAD
303389
datetime_column: pandas_engine.DateTime(timezone_flexible=True, tz=tz) = Field(coerce=coerce)
304390

305391
data = pd.DataFrame({'datetime_column': examples})
392+
=======
393+
"""Simple DF Model for testing"""
394+
395+
datetime_column: pandas_engine.DateTime( # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
396+
timezone_flexible=True, tz=tz
397+
) = Field(
398+
coerce=coerce
399+
)
400+
401+
data = pd.DataFrame({"datetime_column": examples})
402+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
306403

307404
if raises:
308405
with pytest.raises(SchemaError):
309406
SimpleSchema.validate(data)
310407
else:
311408
validated_df = SimpleSchema.validate(data)
409+
<<<<<<< HEAD
312410
assert sorted(validated_df['datetime_column'].tolist()) == sorted(expected_output)
411+
=======
412+
assert sorted(validated_df["datetime_column"].tolist()) == sorted(
413+
expected_output
414+
)
415+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
313416

314417

315418
@hypothesis.settings(max_examples=1000)
@@ -344,7 +447,12 @@ def test_pandas_date_coerce_dtype(to_df, data):
344447
coerced_data.isna().all() and coerced_data.dtype == "datetime64[ns]"
345448
)
346449
assert (
450+
<<<<<<< HEAD
347451
coerced_data.map(lambda x: isinstance(x, dt.date)) | coerced_data.isna()
452+
=======
453+
coerced_data.map(lambda x: isinstance(x, dt.date))
454+
| coerced_data.isna()
455+
>>>>>>> Enhancement: Add support for timezone-flexible DateTime (#1352)
348456
).all()
349457

350458

@@ -356,8 +464,8 @@ def test_pandas_date_coerce_dtype(to_df, data):
356464
pyarrow.struct([("foo", pyarrow.int64()), ("bar", pyarrow.string())]),
357465
),
358466
(pd.Series([None, pd.NA, np.nan]), pyarrow.null),
359-
(pd.Series([None, date(1970, 1, 1)]), pyarrow.date32),
360-
(pd.Series([None, date(1970, 1, 1)]), pyarrow.date64),
467+
(pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date32),
468+
(pd.Series([None, dt.date(1970, 1, 1)]), pyarrow.date64),
361469
(pd.Series([1, 2]), pyarrow.duration("ns")),
362470
(pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time32("ms")),
363471
(pd.Series([1, 1e3, 1e6, 1e9, None]), pyarrow.time64("ns")),
@@ -402,8 +510,8 @@ def test_pandas_arrow_dtype(data, dtype):
402510
pyarrow.struct([("foo", pyarrow.string()), ("bar", pyarrow.int64())]),
403511
),
404512
(pd.Series(["a", "1"]), pyarrow.null),
405-
(pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
406-
(pd.Series(["a", date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
513+
(pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date32),
514+
(pd.Series(["a", dt.date(1970, 1, 1), "1970-01-01"]), pyarrow.date64),
407515
(pd.Series(["a"]), pyarrow.duration("ns")),
408516
(pd.Series(["a", "b"]), pyarrow.time32("ms")),
409517
(pd.Series(["a", "b"]), pyarrow.time64("ns")),

tests/pyspark/test_schemas_on_pyspark_pandas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,7 @@ def test_nullable(
292292
data: st.DataObject,
293293
):
294294
"""Test nullable checks on pyspark.pandas dataframes."""
295+
print(dtype)
295296

296297
if version.parse(np.__version__) >= version.parse(
297298
"1.24.0"

0 commit comments

Comments
 (0)