Merge remote-tracking branch 'upstream/main' into feature/unionai-oss#992

Arne Recknagel · Arne Recknagel · commit 593ec224b527 · 2023-01-30T09:09:37.000+01:00
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -51,6 +51,7 @@ pip install -e .
 pip install virtualenv
 virtualenv .venv/pandera-dev
 source .venv/pandera-dev/bin/activate
+pip install --upgrade pip
 pip install -r requirements-dev.txt
 pip install -e .
 ```
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
@@ -19,6 +19,10 @@ env:
   # Increase this value to reset cache if environment.yml has not changed
   CACHE_VERSION: 6
 
+concurrency:
+  group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
+  cancel-in-progress: true
+
 jobs:
 
   lint:
@@ -93,14 +97,12 @@ jobs:
       PYTEST_FLAGS: --cov=pandera --cov-report=term-missing --cov-report=xml --cov-append
       HYPOTHESIS_FLAGS: -n=auto -q --hypothesis-profile=ci
     strategy:
-      fail-fast: true
+      fail-fast: false
       matrix:
         os: ["ubuntu-latest", "macos-latest", "windows-latest"]
         python-version: ["3.7", "3.8", "3.9", "3.10"]
-        pandas-version: ["1.2.0", "1.3.0", "latest"]
+        pandas-version: ["1.3.0", "latest"]
         exclude:
-        - python-version: "3.10"
-          pandas-version: "1.2.0"
         - python-version: "3.10"
           pandas-version: "1.3.0"
         include:
@@ -151,6 +153,11 @@ jobs:
         if: ${{ matrix.pandas-version != 'latest' }}
         run: mamba install -c conda-forge pandas==${{ matrix.pandas-version }} geopandas
 
+      # ray currently cannot be installed on python 3.10, windows
+      - name: Remove Ray from Deps
+        if: ${{ matrix.os == 'windows-latest' && matrix.python-version == '3.10' }}
+        run: sed -i 's/^ray//g' requirements-dev.txt
+
       - name: Install Pip Deps
         run: |
           python -m pip install -U pip
@@ -199,7 +206,11 @@ jobs:
           CI_MODIN_ENGINES: dask
 
       - name: Unit Tests - Modin-Ray
-        if: ${{ matrix.python-version != '3.10' }}
+        # ray has CI issues on windows/python 3.10 and mac/python 3.7, so
+        # this step is skipped on every windows runner and on python 3.7 and
+        # 3.10 regardless of OS.
+        # Tracking issue: https://github.com/modin-project/modin/issues/5466
+        if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
         run: pytest tests/modin ${{ env.PYTEST_FLAGS }}
         env:
           CI_MODIN_ENGINES: ray
@@ -208,9 +219,9 @@ jobs:
         uses: codecov/codecov-action@v3
 
       - name: Check Docstrings
-        if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.python-version != '3.7' }}
+        if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
         run: nox ${{ env.NOX_FLAGS }} --session doctests
 
       - name: Check Docs
-        if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.python-version != '3.7' }}
+        if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
         run: nox ${{ env.NOX_FLAGS }} --session docs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -54,6 +54,8 @@ repos:
         entry: mypy
         language: python
         types: [python]
-        files: (^pandera/|^tests/|^scripts/)
+        pass_filenames: false
         exclude: (^docs/|^tests/mypy/modules/)
         require_serial: true
+        args: ["pandera", "tests", "scripts"]
+        verbose: true
diff --git a/environment.yml b/environment.yml
@@ -21,7 +21,7 @@ dependencies:
   - pydantic
 
   # mypy extra
-  - pandas-stubs
+  - pandas-stubs <= 1.5.2.221213
 
   # pyspark extra
   - pyspark >= 3.2.0
@@ -80,7 +80,7 @@ dependencies:
 
   - pip:
       - furo
-      - ray <= 1.7.0; python_version < '3.10'
+      - ray
       - types-click
       - types-pyyaml
       - types-pkg_resources
diff --git a/mypy.ini b/mypy.ini
@@ -0,0 +1,8 @@
+[mypy]
+ignore_missing_imports = True
+follow_imports = skip
+allow_redefinition = True
+warn_return_any = False
+warn_unused_configs = True
+show_error_codes = True
+exclude = tests/mypy/modules
diff --git a/noxfile.py b/noxfile.py
@@ -265,7 +265,7 @@ def requirements(session: Session) -> None:  # pylint:disable=unused-argument
         print(f"{REQUIREMENT_PATH} has been re-generated ✨ 🍰 ✨")
         raise err
 
-    ignored_pkgs = {"black", "pandas"}
+    ignored_pkgs = {"black", "pandas", "pandas-stubs"}
     mismatched = []
     # only compare package versions, not python version markers.
     str_dev_reqs = [str(x) for x in DEV_REQUIREMENTS]
diff --git a/pandera/checks.py b/pandera/checks.py
@@ -325,7 +325,7 @@ def _prepare_series_input(
         if check_utils.is_field(df_or_series):
             return df_or_series  # type: ignore[return-value]
         elif self.groupby is None:
-            return df_or_series[column]  # type: ignore[index]
+            return df_or_series[column]  # type: ignore
         elif isinstance(self.groupby, list):
             return self._format_groupby_input(  # type: ignore[return-value]
                 df_or_series.groupby(self.groupby)[column],  # type: ignore[index]
diff --git a/pandera/decorators.py b/pandera/decorators.py
@@ -752,4 +752,11 @@ def _wrapper(
                 out = wrapped_(*validated_pos.values(), **validated_kwd)
             return _check_arg("return", out)
 
-    return _wrapper(wrapped)  # pylint:disable=no-value-for-parameter
+    wrapped_fn = _wrapper(wrapped)  # pylint:disable=no-value-for-parameter
+
+    # The wrapt.decorator function returns a FunctionWrapper, which
+    # exposes an __iter__ method that causes the function to be recognized as
+    # an iterable. This causes unintended downstream issues, see for example:
+    # https://github.com/unionai-oss/pandera/issues/1021
+    wrapped_fn.__iter__ = None
+    return wrapped_fn
diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py
@@ -525,7 +525,7 @@ def coerce_value(self, value: Any) -> decimal.Decimal:
         return dec.quantize(self._exp, context=self._ctx)
 
     def coerce(self, data_container: PandasObject) -> PandasObject:
-        return data_container.apply(self.coerce_value)
+        return data_container.apply(self.coerce_value)  # type: ignore
 
     def check(  # type: ignore
         self,
@@ -577,7 +577,7 @@ def __init__(  # pylint:disable=super-init-not-called
         object.__setattr__(
             self,
             "type",
-            pd.CategoricalDtype(self.categories, self.ordered),
+            pd.CategoricalDtype(self.categories, self.ordered),  # type: ignore
         )
 
     def coerce(self, data_container: PandasObject) -> PandasObject:
@@ -639,13 +639,13 @@ def __str__(self) -> str:
 else:
 
     @Engine.register_dtype(
-        equivalents=["string", pd.StringDtype, pd.StringDtype()]
-    )  # type: ignore
+        equivalents=["string", pd.StringDtype, pd.StringDtype()]  # type: ignore
+    )
     @immutable
     class STRING(DataType, dtypes.String):  # type: ignore
         """Semantic representation of a :class:`pandas.StringDtype`."""
 
-        type = pd.StringDtype()
+        type = pd.StringDtype()  # type: ignore
 
 
 @Engine.register_dtype(
@@ -984,8 +984,8 @@ def __post_init__(self):
     def from_parametrized_dtype(cls, pd_dtype: pd.SparseDtype):
         """Convert a :class:`pandas.SparseDtype` to
         a Pandera :class:`pandera.engines.pandas_engine.Sparse`."""
-        return cls(  # type: ignore
-            dtype=pd_dtype.subtype, fill_value=pd_dtype.fill_value
+        return cls(
+            dtype=pd_dtype.subtype, fill_value=pd_dtype.fill_value  # type: ignore
         )
 
 
diff --git a/pandera/schemas.py b/pandera/schemas.py
@@ -180,13 +180,16 @@ def __init__(
         See :ref:`here<DataFrameSchemas>` for more usage details.
 
         """
+        if columns is None:
+            columns = {}
+        _validate_columns(columns)
+        columns = _columns_renamed(columns)
+
         if checks is None:
             checks = []
         if isinstance(checks, (Check, Hypothesis)):
             checks = [checks]
 
-        self.columns: Dict[Any, Column] = {} if columns is None else columns
-
         if strict not in (
             False,
             True,
@@ -197,6 +200,7 @@ def __init__(
                 "or `'filter'`."
             )
 
+        self.columns: Dict[Any, Column] = columns
         self.checks: CheckListProperty = checks
         self.index = index
         self.strict: StrictType = strict
@@ -209,8 +213,6 @@ def __init__(
         self._unique_column_names = unique_column_names
         self._title = title
         self._description = description
-        self._validate_schema()
-        self._set_column_names()
 
         # this attribute is not meant to be accessed by users and is explicitly
         # set to True in the case that a schema is created by infer_schema.
@@ -277,36 +279,6 @@ def _is_inferred(self) -> bool:
     def _is_inferred(self, value: bool) -> None:
         self._IS_INFERRED = value
 
-    def _validate_schema(self) -> None:
-        for column_name, column in self.columns.items():
-            for check in column.checks:
-                if check.groupby is None or callable(check.groupby):
-                    continue
-                nonexistent_groupby_columns = [
-                    c for c in check.groupby if c not in self.columns
-                ]
-                if nonexistent_groupby_columns:
-                    raise errors.SchemaInitError(
-                        f"groupby argument {nonexistent_groupby_columns} in "
-                        f"Check for Column {column_name} not "
-                        "specified in the DataFrameSchema."
-                    )
-
-    def _set_column_names(self) -> None:
-        def _set_column_handler(column, column_name):
-            if column.name is not None and column.name != column_name:
-                warnings.warn(
-                    f"resetting column for {column} to '{column_name}'."
-                )
-            elif column.name == column_name:
-                return column
-            return column.set_name(column_name)
-
-        self.columns = {
-            column_name: _set_column_handler(column, column_name)
-            for column_name, column in self.columns.items()
-        }
-
     @property
     def dtypes(self) -> Dict[str, DataType]:
         # pylint:disable=anomalous-backslash-in-string
@@ -347,7 +319,7 @@ def get_dtypes(self, dataframe: pd.DataFrame) -> Dict[str, DataType]:
                 )
         return {
             **{n: c.dtype for n, c in self.columns.items() if not c.regex},
-            **regex_dtype,
+            **regex_dtype,  # type: ignore
         }
 
     @property
@@ -595,7 +567,7 @@ def _validate(
                 is_schema_col = column in expanded_column_names
                 if (self.strict is True) and not is_schema_col:
                     msg = (
-                        f"column '{column}' not in {self.__class__.__name__}"
+                        f"column {column!r} not in {self.__class__.__name__}"
                         f" {self.columns}"
                     )
                     error_handler.collect_error(
@@ -621,7 +593,7 @@ def _validate(
                             errors.SchemaError(
                                 self,
                                 check_obj,
-                                message=f"column '{column}' out-of-order",
+                                message=f"column {column!r} out-of-order",
                                 failure_cases=scalar_failure_case(column),
                                 check="column_ordered",
                             ),
@@ -2541,3 +2513,48 @@ def convert_uniquesettings(unique: UniqueSettings) -> Union[bool, str]:
             str(unique) + " is not a recognized report_duplicates value"
         )
     return keep_argument
+
+
+def _validate_columns(column_dict: Dict[Any, Column]) -> None:
+    """Reject column checks whose ``groupby`` names an unknown column.
+
+    Checks whose ``groupby`` is ``None`` or a callable are not inspected.
+
+    :param column_dict: mapping of column name to column; its keys are the
+        only names a check's ``groupby`` entries may refer to.
+    :raises SchemaInitError: if any ``groupby`` entry is not a key of
+        ``column_dict``.
+    """
+    for column_name, column in column_dict.items():
+        for check in column.checks:
+            if check.groupby is None or callable(check.groupby):
+                continue
+            nonexistent_groupby_columns = [
+                c for c in check.groupby if c not in column_dict
+            ]
+            if nonexistent_groupby_columns:
+                raise errors.SchemaInitError(
+                    f"groupby argument {nonexistent_groupby_columns} in "
+                    f"Check for Column {column_name} not "
+                    "specified in the DataFrameSchema."
+                )
+
+
+def _columns_renamed(columns: Dict[Any, Column]) -> Dict[Any, Column]:
+    """Return a new dict whose columns are deep copies named after their keys.
+
+    The column objects passed in are not modified: each one is deep-copied
+    before ``set_name`` is called on the copy.
+
+    :param columns: mapping of column name to column.
+    :returns: a dict with the same keys, mapping each key to its renamed copy.
+    """
+    def renamed(column, new_name):
+        column = copy.deepcopy(column)
+        column.set_name(new_name)
+        return column
+
+    return {
+        column_name: renamed(column, column_name)
+        for column_name, column in columns.items()
+    }
diff --git a/pandera/strategies.py b/pandera/strategies.py
@@ -74,7 +74,7 @@ def _mask(
 ) -> Union[pd.Series, pd.Index]:
     if pd.api.types.is_timedelta64_dtype(val):  # type: ignore [arg-type]
         return val.mask(null_mask, pd.NaT)  # type: ignore [union-attr,arg-type]
-    elif val.dtype == pd.StringDtype():
+    elif val.dtype == pd.StringDtype():  # type: ignore [call-arg]
         return val.mask(null_mask, pd.NA)  # type: ignore [union-attr,arg-type]
     return val.mask(null_mask)  # type: ignore [union-attr]
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -14,7 +14,7 @@ typing_extensions >= 3.7.4.3
 frictionless
 pyarrow
 pydantic
-pandas-stubs
+pandas-stubs <= 1.5.2.221213
 pyspark >= 3.2.0
 modin
 protobuf <= 3.20.3
@@ -47,7 +47,7 @@ twine
 asv
 pre_commit
 furo
-ray <= 1.7.0; python_version < '3.10'
+ray
 types-click
 types-pyyaml
 types-pkg_resources
diff --git a/setup.cfg b/setup.cfg
@@ -1,11 +1,3 @@
 [isort]
 float_to_top = true
 profile = black
-
-[mypy]
-ignore_missing_imports = True
-allow_redefinition = True
-warn_return_any = False
-warn_unused_configs = True
-show_error_codes = True
-exclude = tests/mypy/modules
diff --git a/setup.py b/setup.py
@@ -1,4 +1,4 @@
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
 
 with open("README.md") as f:
     long_description = f.read()
@@ -12,8 +12,8 @@
     "hypotheses": ["scipy"],
     "io": ["pyyaml >= 5.1", "black", "frictionless"],
     "pyspark": ["pyspark >= 3.2.0"],
-    "modin": ["modin", "ray <= 1.7.0", "dask"],
-    "modin-ray": ["modin", "ray <= 1.7.0"],
+    "modin": ["modin", "ray", "dask"],
+    "modin-ray": ["modin", "ray"],
     "modin-dask": ["modin", "dask"],
     "dask": ["dask"],
     "mypy": ["pandas-stubs"],
diff --git a/tests/core/test_decorators.py b/tests/core/test_decorators.py
diff --git a/tests/core/test_logical_dtypes.py b/tests/core/test_logical_dtypes.py
diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py
diff --git a/tests/modin/test_schemas_on_modin.py b/tests/modin/test_schemas_on_modin.py
diff --git a/tests/mypy/modules/pandas_dataframe.py b/tests/mypy/modules/pandas_dataframe.py
diff --git a/tests/mypy/modules/pandas_time.py b/tests/mypy/modules/pandas_time.py
diff --git a/tests/mypy/test_static_type_checking.py b/tests/mypy/test_static_type_checking.py