Skip to content

Commit 593ec22

Browse files
author
Arne Recknagel
committed
Merge remote-tracking branch 'upstream/main' into feature/unionai-oss#992
2 parents 1ad2023 + e2bc5b9 commit 593ec22

21 files changed

+152
-110
lines changed

.github/CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ pip install -e .
5151
pip install virtualenv
5252
virtualenv .venv/pandera-dev
5353
source .venv/pandera-dev/bin/activate
54+
pip install --upgrade pip
5455
pip install -r requirements-dev.txt
5556
pip install -e .
5657
```

.github/workflows/ci-tests.yml

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ env:
1919
# Increase this value to reset cache if environment.yml has not changed
2020
CACHE_VERSION: 6
2121

22+
concurrency:
23+
group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
24+
cancel-in-progress: true
25+
2226
jobs:
2327

2428
lint:
@@ -93,14 +97,12 @@ jobs:
9397
PYTEST_FLAGS: --cov=pandera --cov-report=term-missing --cov-report=xml --cov-append
9498
HYPOTHESIS_FLAGS: -n=auto -q --hypothesis-profile=ci
9599
strategy:
96-
fail-fast: true
100+
fail-fast: false
97101
matrix:
98102
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
99103
python-version: ["3.7", "3.8", "3.9", "3.10"]
100-
pandas-version: ["1.2.0", "1.3.0", "latest"]
104+
pandas-version: ["1.3.0", "latest"]
101105
exclude:
102-
- python-version: "3.10"
103-
pandas-version: "1.2.0"
104106
- python-version: "3.10"
105107
pandas-version: "1.3.0"
106108
include:
@@ -151,6 +153,11 @@ jobs:
151153
if: ${{ matrix.pandas-version != 'latest' }}
152154
run: mamba install -c conda-forge pandas==${{ matrix.pandas-version }} geopandas
153155

156+
# ray currently cannot be installed on python 3.10, windows
157+
- name: Remove Ray from Deps
158+
if: ${{ matrix.os == 'windows-latest' && matrix.python-version == '3.10' }}
159+
run: sed -i 's/^ray//g' requirements-dev.txt
160+
154161
- name: Install Pip Deps
155162
run: |
156163
python -m pip install -U pip
@@ -199,7 +206,11 @@ jobs:
199206
CI_MODIN_ENGINES: dask
200207

201208
- name: Unit Tests - Modin-Ray
202-
if: ${{ matrix.python-version != '3.10' }}
209+
# ray CI issues with the following:
210+
# - windows, python 3.10
211+
# - mac, python 3.7
212+
# Tracking issue: https://github.com/modin-project/modin/issues/5466
213+
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
203214
run: pytest tests/modin ${{ env.PYTEST_FLAGS }}
204215
env:
205216
CI_MODIN_ENGINES: ray
@@ -208,9 +219,9 @@ jobs:
208219
uses: codecov/codecov-action@v3
209220

210221
- name: Check Docstrings
211-
if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.python-version != '3.7' }}
222+
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
212223
run: nox ${{ env.NOX_FLAGS }} --session doctests
213224

214225
- name: Check Docs
215-
if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.python-version != '3.7' }}
226+
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
216227
run: nox ${{ env.NOX_FLAGS }} --session docs

.pre-commit-config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ repos:
5454
entry: mypy
5555
language: python
5656
types: [python]
57-
files: (^pandera/|^tests/|^scripts/)
57+
pass_filenames: false
5858
exclude: (^docs/|^tests/mypy/modules/)
5959
require_serial: true
60+
args: ["pandera", "tests", "scripts"]
61+
verbose: true

environment.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies:
2121
- pydantic
2222

2323
# mypy extra
24-
- pandas-stubs
24+
- pandas-stubs <= 1.5.2.221213
2525

2626
# pyspark extra
2727
- pyspark >= 3.2.0
@@ -80,7 +80,7 @@ dependencies:
8080

8181
- pip:
8282
- furo
83-
- ray <= 1.7.0; python_version < '3.10'
83+
- ray
8484
- types-click
8585
- types-pyyaml
8686
- types-pkg_resources

mypy.ini

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
[mypy]
2+
ignore_missing_imports = True
3+
follow_imports = skip
4+
allow_redefinition = True
5+
warn_return_any = False
6+
warn_unused_configs = True
7+
show_error_codes = True
8+
exclude = tests/mypy/modules

noxfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ def requirements(session: Session) -> None: # pylint:disable=unused-argument
265265
print(f"{REQUIREMENT_PATH} has been re-generated ✨ 🍰 ✨")
266266
raise err
267267

268-
ignored_pkgs = {"black", "pandas"}
268+
ignored_pkgs = {"black", "pandas", "pandas-stubs"}
269269
mismatched = []
270270
# only compare package versions, not python version markers.
271271
str_dev_reqs = [str(x) for x in DEV_REQUIREMENTS]

pandera/checks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ def _prepare_series_input(
325325
if check_utils.is_field(df_or_series):
326326
return df_or_series # type: ignore[return-value]
327327
elif self.groupby is None:
328-
return df_or_series[column] # type: ignore[index]
328+
return df_or_series[column] # type: ignore
329329
elif isinstance(self.groupby, list):
330330
return self._format_groupby_input( # type: ignore[return-value]
331331
df_or_series.groupby(self.groupby)[column], # type: ignore[index]

pandera/decorators.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,4 +752,11 @@ def _wrapper(
752752
out = wrapped_(*validated_pos.values(), **validated_kwd)
753753
return _check_arg("return", out)
754754

755-
return _wrapper(wrapped) # pylint:disable=no-value-for-parameter
755+
wrapped_fn = _wrapper(wrapped) # pylint:disable=no-value-for-parameter
756+
757+
# The wrapt.decorator function returns a FunctionWrapper, which
758+
# exposes an __iter__ method that causes the function to be recognized as
759+
# an iterable. This causes unintended downstream issues, see for example:
760+
# https://github.com/unionai-oss/pandera/issues/1021
761+
wrapped_fn.__iter__ = None
762+
return wrapped_fn

pandera/engines/pandas_engine.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ def coerce_value(self, value: Any) -> decimal.Decimal:
525525
return dec.quantize(self._exp, context=self._ctx)
526526

527527
def coerce(self, data_container: PandasObject) -> PandasObject:
528-
return data_container.apply(self.coerce_value)
528+
return data_container.apply(self.coerce_value) # type: ignore
529529

530530
def check( # type: ignore
531531
self,
@@ -577,7 +577,7 @@ def __init__( # pylint:disable=super-init-not-called
577577
object.__setattr__(
578578
self,
579579
"type",
580-
pd.CategoricalDtype(self.categories, self.ordered),
580+
pd.CategoricalDtype(self.categories, self.ordered), # type: ignore
581581
)
582582

583583
def coerce(self, data_container: PandasObject) -> PandasObject:
@@ -639,13 +639,13 @@ def __str__(self) -> str:
639639
else:
640640

641641
@Engine.register_dtype(
642-
equivalents=["string", pd.StringDtype, pd.StringDtype()]
643-
) # type: ignore
642+
equivalents=["string", pd.StringDtype, pd.StringDtype()] # type: ignore
643+
)
644644
@immutable
645645
class STRING(DataType, dtypes.String): # type: ignore
646646
"""Semantic representation of a :class:`pandas.StringDtype`."""
647647

648-
type = pd.StringDtype()
648+
type = pd.StringDtype() # type: ignore
649649

650650

651651
@Engine.register_dtype(
@@ -984,8 +984,8 @@ def __post_init__(self):
984984
def from_parametrized_dtype(cls, pd_dtype: pd.SparseDtype):
985985
"""Convert a :class:`pandas.SparseDtype` to
986986
a Pandera :class:`pandera.engines.pandas_engine.Sparse`."""
987-
return cls( # type: ignore
988-
dtype=pd_dtype.subtype, fill_value=pd_dtype.fill_value
987+
return cls(
988+
dtype=pd_dtype.subtype, fill_value=pd_dtype.fill_value # type: ignore
989989
)
990990

991991

pandera/schemas.py

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -180,13 +180,16 @@ def __init__(
180180
See :ref:`here<DataFrameSchemas>` for more usage details.
181181
182182
"""
183+
if columns is None:
184+
columns = {}
185+
_validate_columns(columns)
186+
columns = _columns_renamed(columns)
187+
183188
if checks is None:
184189
checks = []
185190
if isinstance(checks, (Check, Hypothesis)):
186191
checks = [checks]
187192

188-
self.columns: Dict[Any, Column] = {} if columns is None else columns
189-
190193
if strict not in (
191194
False,
192195
True,
@@ -197,6 +200,7 @@ def __init__(
197200
"or `'filter'`."
198201
)
199202

203+
self.columns: Dict[Any, Column] = columns
200204
self.checks: CheckListProperty = checks
201205
self.index = index
202206
self.strict: StrictType = strict
@@ -209,8 +213,6 @@ def __init__(
209213
self._unique_column_names = unique_column_names
210214
self._title = title
211215
self._description = description
212-
self._validate_schema()
213-
self._set_column_names()
214216

215217
# this attribute is not meant to be accessed by users and is explicitly
216218
# set to True in the case that a schema is created by infer_schema.
@@ -277,36 +279,6 @@ def _is_inferred(self) -> bool:
277279
def _is_inferred(self, value: bool) -> None:
278280
self._IS_INFERRED = value
279281

280-
def _validate_schema(self) -> None:
281-
for column_name, column in self.columns.items():
282-
for check in column.checks:
283-
if check.groupby is None or callable(check.groupby):
284-
continue
285-
nonexistent_groupby_columns = [
286-
c for c in check.groupby if c not in self.columns
287-
]
288-
if nonexistent_groupby_columns:
289-
raise errors.SchemaInitError(
290-
f"groupby argument {nonexistent_groupby_columns} in "
291-
f"Check for Column {column_name} not "
292-
"specified in the DataFrameSchema."
293-
)
294-
295-
def _set_column_names(self) -> None:
296-
def _set_column_handler(column, column_name):
297-
if column.name is not None and column.name != column_name:
298-
warnings.warn(
299-
f"resetting column for {column} to '{column_name}'."
300-
)
301-
elif column.name == column_name:
302-
return column
303-
return column.set_name(column_name)
304-
305-
self.columns = {
306-
column_name: _set_column_handler(column, column_name)
307-
for column_name, column in self.columns.items()
308-
}
309-
310282
@property
311283
def dtypes(self) -> Dict[str, DataType]:
312284
# pylint:disable=anomalous-backslash-in-string
@@ -347,7 +319,7 @@ def get_dtypes(self, dataframe: pd.DataFrame) -> Dict[str, DataType]:
347319
)
348320
return {
349321
**{n: c.dtype for n, c in self.columns.items() if not c.regex},
350-
**regex_dtype,
322+
**regex_dtype, # type: ignore
351323
}
352324

353325
@property
@@ -595,7 +567,7 @@ def _validate(
595567
is_schema_col = column in expanded_column_names
596568
if (self.strict is True) and not is_schema_col:
597569
msg = (
598-
f"column '{column}' not in {self.__class__.__name__}"
570+
f"column {column!r} not in {self.__class__.__name__}"
599571
f" {self.columns}"
600572
)
601573
error_handler.collect_error(
@@ -621,7 +593,7 @@ def _validate(
621593
errors.SchemaError(
622594
self,
623595
check_obj,
624-
message=f"column '{column}' out-of-order",
596+
message=f"column {column!r} out-of-order",
625597
failure_cases=scalar_failure_case(column),
626598
check="column_ordered",
627599
),
@@ -2541,3 +2513,31 @@ def convert_uniquesettings(unique: UniqueSettings) -> Union[bool, str]:
25412513
str(unique) + " is not a recognized report_duplicates value"
25422514
)
25432515
return keep_argument
2516+
2517+
2518+
def _validate_columns(column_dict: dict[Any, Column]) -> None:
2519+
for column_name, column in column_dict.items():
2520+
for check in column.checks:
2521+
if check.groupby is None or callable(check.groupby):
2522+
continue
2523+
nonexistent_groupby_columns = [
2524+
c for c in check.groupby if c not in column_dict
2525+
]
2526+
if nonexistent_groupby_columns:
2527+
raise errors.SchemaInitError(
2528+
f"groupby argument {nonexistent_groupby_columns} in "
2529+
f"Check for Column {column_name} not "
2530+
"specified in the DataFrameSchema."
2531+
)
2532+
2533+
2534+
def _columns_renamed(columns: dict[Any, Column]) -> dict[Any, Column]:
2535+
def renamed(column, new_name):
2536+
column = copy.deepcopy(column)
2537+
column.set_name(new_name)
2538+
return column
2539+
2540+
return {
2541+
column_name: renamed(column, column_name)
2542+
for column_name, column in columns.items()
2543+
}

pandera/strategies.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def _mask(
7474
) -> Union[pd.Series, pd.Index]:
7575
if pd.api.types.is_timedelta64_dtype(val): # type: ignore [arg-type]
7676
return val.mask(null_mask, pd.NaT) # type: ignore [union-attr,arg-type]
77-
elif val.dtype == pd.StringDtype():
77+
elif val.dtype == pd.StringDtype(): # type: ignore [call-arg]
7878
return val.mask(null_mask, pd.NA) # type: ignore [union-attr,arg-type]
7979
return val.mask(null_mask) # type: ignore [union-attr]
8080

requirements-dev.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ typing_extensions >= 3.7.4.3
1414
frictionless
1515
pyarrow
1616
pydantic
17-
pandas-stubs
17+
pandas-stubs <= 1.5.2.221213
1818
pyspark >= 3.2.0
1919
modin
2020
protobuf <= 3.20.3
@@ -47,7 +47,7 @@ twine
4747
asv
4848
pre_commit
4949
furo
50-
ray <= 1.7.0; python_version < '3.10'
50+
ray
5151
types-click
5252
types-pyyaml
5353
types-pkg_resources

setup.cfg

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,3 @@
11
[isort]
22
float_to_top = true
33
profile = black
4-
5-
[mypy]
6-
ignore_missing_imports = True
7-
allow_redefinition = True
8-
warn_return_any = False
9-
warn_unused_configs = True
10-
show_error_codes = True
11-
exclude = tests/mypy/modules

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from setuptools import setup, find_packages
1+
from setuptools import find_packages, setup
22

33
with open("README.md") as f:
44
long_description = f.read()
@@ -12,8 +12,8 @@
1212
"hypotheses": ["scipy"],
1313
"io": ["pyyaml >= 5.1", "black", "frictionless"],
1414
"pyspark": ["pyspark >= 3.2.0"],
15-
"modin": ["modin", "ray <= 1.7.0", "dask"],
16-
"modin-ray": ["modin", "ray <= 1.7.0"],
15+
"modin": ["modin", "ray", "dask"],
16+
"modin-ray": ["modin", "ray"],
1717
"modin-dask": ["modin", "dask"],
1818
"dask": ["dask"],
1919
"mypy": ["pandas-stubs"],

0 commit comments

Comments
 (0)