From abaa1767608e4dd68b8874191acbd4f1e1d8c726 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Mon, 3 Feb 2025 22:43:33 +1100 Subject: [PATCH 1/9] trial type annotations Signed-off-by: Matt Richards --- pandera/api/dataframe/model.py | 46 ++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/pandera/api/dataframe/model.py b/pandera/api/dataframe/model.py index 95c8e835a..3fb89f94c 100644 --- a/pandera/api/dataframe/model.py +++ b/pandera/api/dataframe/model.py @@ -18,8 +18,14 @@ TypeVar, Union, cast, + overload, ) +# TODO hard dependence on pandas and polars, use string forwardref instead? +import pandas as pd # TODO tmp +import polars as pl +from pandera.typing.polars import LazyFrame + from pandera.api.base.model import BaseModel from pandera.api.base.schema import BaseSchema from pandera.api.checks import Check @@ -271,24 +277,50 @@ def to_yaml(cls, stream: Optional[os.PathLike] = None): """ return cls.to_schema().to_yaml(stream) + # Overloads specify effectively check_obj: TDataFrame -> TDataFrame[TDataFrameModel] + # but to do this directly would required higher kinded typevars (https://github.com/python/typing/issues/548) + + @overload + @classmethod + def validate( + cls: Type[TDataFrameModel], + check_obj: pl.LazyFrame, + head: Optional[int] = None, + tail: Optional[int] = None, + sample: Optional[int] = None, + random_state: Optional[int] = None, + lazy: bool = False, + inplace: bool = False, + ) -> LazyFrame[TDataFrameModel]: ... + + @overload + @classmethod + def validate( + cls: Type[TDataFrameModel], + check_obj: pd.DataFrame, + head: Optional[int] = None, + tail: Optional[int] = None, + sample: Optional[int] = None, + random_state: Optional[int] = None, + lazy: bool = False, + inplace: bool = False, + ) -> DataFrame[TDataFrameModel]: ... + @classmethod @docstring_substitution(validate_doc=BaseSchema.validate.__doc__) def validate( cls: Type[TDataFrameModel], - check_obj: TDataFrame, + check_obj: pd.DataFrame | pl.LazyFrame, head: Optional[int] = None, tail: Optional[int] = None, sample: Optional[int] = None, random_state: Optional[int] = None, lazy: bool = False, inplace: bool = False, - ) -> DataFrameBase[TDataFrameModel]: + ) -> DataFrame[TDataFrameModel] | LazyFrame[TDataFrameModel]: """%(validate_doc)s""" - return cast( - DataFrameBase[TDataFrameModel], - cls.to_schema().validate( - check_obj, head, tail, sample, random_state, lazy, inplace - ), + return cls.to_schema().validate( + check_obj, head, tail, sample, random_state, lazy, inplace ) # TODO: add docstring_substitution using generic class From 3491aea0f35902eb5a816877c15494288c9b5ce3 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Thu, 6 Feb 2025 21:29:55 +1100 Subject: [PATCH 2/9] changes in individual api files Signed-off-by: Matt Richards --- pandera/api/dataframe/model.py | 46 ++++++---------------------------- pandera/api/pandas/model.py | 24 +++++++++++++++++- pandera/api/polars/model.py | 27 ++++++++++++++++++-- pandera/api/pyspark/model.py | 5 ++-- 4 files changed, 58 insertions(+), 44 deletions(-) diff --git a/pandera/api/dataframe/model.py b/pandera/api/dataframe/model.py index 3fb89f94c..95c8e835a 100644 --- a/pandera/api/dataframe/model.py +++ b/pandera/api/dataframe/model.py @@ -18,14 +18,8 @@ TypeVar, Union, cast, - overload, ) -# TODO hard dependence on pandas and polars, use string forwardref instead? -import pandas as pd # TODO tmp -import polars as pl -from pandera.typing.polars import LazyFrame - from pandera.api.base.model import BaseModel from pandera.api.base.schema import BaseSchema from pandera.api.checks import Check @@ -277,50 +271,24 @@ def to_yaml(cls, stream: Optional[os.PathLike] = None): """ return cls.to_schema().to_yaml(stream) - # Overloads specify effectively check_obj: TDataFrame -> TDataFrame[TDataFrameModel] - # but to do this directly would required higher kinded typevars (https://github.com/python/typing/issues/548) - - @overload - @classmethod - def validate( - cls: Type[TDataFrameModel], - check_obj: pl.LazyFrame, - head: Optional[int] = None, - tail: Optional[int] = None, - sample: Optional[int] = None, - random_state: Optional[int] = None, - lazy: bool = False, - inplace: bool = False, - ) -> LazyFrame[TDataFrameModel]: ... - - @overload - @classmethod - def validate( - cls: Type[TDataFrameModel], - check_obj: pd.DataFrame, - head: Optional[int] = None, - tail: Optional[int] = None, - sample: Optional[int] = None, - random_state: Optional[int] = None, - lazy: bool = False, - inplace: bool = False, - ) -> DataFrame[TDataFrameModel]: ... - @classmethod @docstring_substitution(validate_doc=BaseSchema.validate.__doc__) def validate( cls: Type[TDataFrameModel], - check_obj: pd.DataFrame | pl.LazyFrame, + check_obj: TDataFrame, head: Optional[int] = None, tail: Optional[int] = None, sample: Optional[int] = None, random_state: Optional[int] = None, lazy: bool = False, inplace: bool = False, - ) -> DataFrame[TDataFrameModel] | LazyFrame[TDataFrameModel]: + ) -> DataFrameBase[TDataFrameModel]: """%(validate_doc)s""" - return cls.to_schema().validate( - check_obj, head, tail, sample, random_state, lazy, inplace + return cast( + DataFrameBase[TDataFrameModel], + cls.to_schema().validate( + check_obj, head, tail, sample, random_state, lazy, inplace + ), ) # TODO: add docstring_substitution using generic class diff --git a/pandera/api/pandas/model.py b/pandera/api/pandas/model.py index 6e668a1dc..188725193 100644 --- a/pandera/api/pandas/model.py +++ b/pandera/api/pandas/model.py @@ -2,10 +2,11 @@ import copy import sys -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast import pandas as pd +from pandera.api.base.schema import BaseSchema from pandera.api.checks import Check from pandera.api.dataframe.model import DataFrameModel as _DataFrameModel from pandera.api.dataframe.model import get_dtype_kwargs @@ -22,6 +23,7 @@ AnnotationInfo, DataFrame, ) +from pandera.utils import docstring_substitution # if python version is < 3.11, import Self from typing_extensions if sys.version_info < (3, 11): @@ -171,6 +173,26 @@ def _build_columns_index( # pylint:disable=too-many-locals,too-many-branches return columns, _build_schema_index(indices, **multiindex_kwargs) + @classmethod + @docstring_substitution(validate_doc=BaseSchema.validate.__doc__) + def validate( + cls: Type[Self], + check_obj: pd.DataFrame, + head: Optional[int] = None, + tail: Optional[int] = None, + sample: Optional[int] = None, + random_state: Optional[int] = None, + lazy: bool = False, + inplace: bool = False, + ) -> DataFrame[Self]: + """%(validate_doc)s""" + return cast( + DataFrame[Self], + cls.to_schema().validate( + check_obj, head, tail, sample, random_state, lazy, inplace + ), + ) + @classmethod def to_json_schema(cls): """Serialize schema metadata into json-schema format. diff --git a/pandera/api/polars/model.py b/pandera/api/polars/model.py index caa345977..a333e431c 100644 --- a/pandera/api/polars/model.py +++ b/pandera/api/polars/model.py @@ -1,11 +1,13 @@ """Class-based api for polars models.""" import inspect -from typing import Dict, List, Tuple, Type +from typing import Dict, List, Tuple, Type, cast, Optional +from typing_extensions import Self import pandas as pd import polars as pl +from pandera.api.base.schema import BaseSchema from pandera.api.checks import Check from pandera.api.dataframe.model import DataFrameModel as _DataFrameModel from pandera.api.dataframe.model import get_dtype_kwargs @@ -16,7 +18,8 @@ from pandera.engines import polars_engine as pe from pandera.errors import SchemaInitError from pandera.typing import AnnotationInfo -from pandera.typing.polars import Series +from pandera.typing.polars import Series, LazyFrame +from pandera.utils import docstring_substitution class DataFrameModel(_DataFrameModel[pl.LazyFrame, DataFrameSchema]): @@ -109,6 +112,26 @@ def _build_columns( # pylint:disable=too-many-locals return columns + @classmethod + @docstring_substitution(validate_doc=BaseSchema.validate.__doc__) + def validate( + cls: Type[Self], + check_obj: pl.LazyFrame, + head: Optional[int] = None, + tail: Optional[int] = None, + sample: Optional[int] = None, + random_state: Optional[int] = None, + lazy: bool = False, + inplace: bool = False, + ) -> LazyFrame[Self]: + """%(validate_doc)s""" + return cast( + LazyFrame[Self], + cls.to_schema().validate( + check_obj, head, tail, sample, random_state, lazy, inplace + ), + ) + @classmethod def to_json_schema(cls): """Serialize schema metadata into json-schema format. diff --git a/pandera/api/pyspark/model.py b/pandera/api/pyspark/model.py index 9fbc6866c..088c8f0b3 100644 --- a/pandera/api/pyspark/model.py +++ b/pandera/api/pyspark/model.py @@ -41,6 +41,7 @@ from pandera.errors import SchemaInitError from pandera.typing import AnnotationInfo from pandera.typing.common import DataFrameBase +from pandera.typing.pyspark import DataFrame try: from typing_extensions import get_type_hints @@ -300,10 +301,10 @@ def validate( random_state: Optional[int] = None, lazy: bool = True, inplace: bool = False, - ) -> Optional[DataFrameBase[TDataFrameModel]]: + ) -> Optional[DataFrame[TDataFrameModel]]: """%(validate_doc)s""" return cast( - DataFrameBase[TDataFrameModel], + DataFrame[TDataFrameModel], cls.to_schema().validate( check_obj, head, tail, sample, random_state, lazy, inplace ), From b4228c6340faa47f06994c140a9f63219780a549 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Fri, 7 Feb 2025 21:08:34 +1100 Subject: [PATCH 3/9] pl.dataframe working in local test Signed-off-by: Matt Richards --- .pre-commit-config.yaml | 1 + pandera/api/polars/model.py | 45 ++++++++++++++++++++++++++++-------- pandera/api/pyspark/model.py | 2 +- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5d6520d23..77fd1c5f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -54,6 +54,7 @@ repos: - types-pyyaml - types-requests - types-setuptools + - polars args: ["pandera", "tests", "scripts"] exclude: (^docs/|^tests/mypy/modules/) pass_filenames: false diff --git a/pandera/api/polars/model.py b/pandera/api/polars/model.py index a333e431c..0badaed0a 100644 --- a/pandera/api/polars/model.py +++ b/pandera/api/polars/model.py @@ -1,7 +1,7 @@ """Class-based api for polars models.""" import inspect -from typing import Dict, List, Tuple, Type, cast, Optional +from typing import Dict, List, Tuple, Type, cast, Optional, overload from typing_extensions import Self import pandas as pd @@ -18,7 +18,7 @@ from pandera.engines import polars_engine as pe from pandera.errors import SchemaInitError from pandera.typing import AnnotationInfo -from pandera.typing.polars import Series, LazyFrame +from pandera.typing.polars import Series, LazyFrame, DataFrame from pandera.utils import docstring_substitution @@ -113,7 +113,20 @@ def _build_columns( # pylint:disable=too-many-locals return columns @classmethod - @docstring_substitution(validate_doc=BaseSchema.validate.__doc__) + @overload + def validate( + cls: Type[Self], + check_obj: pl.DataFrame, + head: Optional[int] = None, + tail: Optional[int] = None, + sample: Optional[int] = None, + random_state: Optional[int] = None, + lazy: bool = False, + inplace: bool = False, + ) -> DataFrame[Self]: ... + + @classmethod + @overload def validate( cls: Type[Self], check_obj: pl.LazyFrame, @@ -123,14 +136,28 @@ def validate( random_state: Optional[int] = None, lazy: bool = False, inplace: bool = False, - ) -> LazyFrame[Self]: + ) -> LazyFrame[Self]: ... + + @classmethod + @docstring_substitution(validate_doc=BaseSchema.validate.__doc__) + def validate( + cls: Type[Self], + check_obj: pl.LazyFrame | pl.DataFrame, + head: Optional[int] = None, + tail: Optional[int] = None, + sample: Optional[int] = None, + random_state: Optional[int] = None, + lazy: bool = False, + inplace: bool = False, + ) -> LazyFrame[Self] | DataFrame[Self]: """%(validate_doc)s""" - return cast( - LazyFrame[Self], - cls.to_schema().validate( - check_obj, head, tail, sample, random_state, lazy, inplace - ), + result = cls.to_schema().validate( + check_obj, head, tail, sample, random_state, lazy, inplace ) + if isinstance(check_obj, pl.LazyFrame): + return cast(LazyFrame[Self], result) + else: + return cast(DataFrame[Self], result) @classmethod def to_json_schema(cls): diff --git a/pandera/api/pyspark/model.py b/pandera/api/pyspark/model.py index 088c8f0b3..dffc7332f 100644 --- a/pandera/api/pyspark/model.py +++ b/pandera/api/pyspark/model.py @@ -301,7 +301,7 @@ def validate( random_state: Optional[int] = None, lazy: bool = True, inplace: bool = False, - ) -> Optional[DataFrame[TDataFrameModel]]: + ) -> DataFrame[TDataFrameModel]: """%(validate_doc)s""" return cast( DataFrame[TDataFrameModel], From a6380aaa5fc6d419b5734e0b0032c9fe918b71d2 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 8 Feb 2025 13:06:19 +1100 Subject: [PATCH 4/9] older python union compat Signed-off-by: Matt Richards --- pandera/api/polars/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandera/api/polars/model.py b/pandera/api/polars/model.py index 0badaed0a..d4a35e59a 100644 --- a/pandera/api/polars/model.py +++ b/pandera/api/polars/model.py @@ -1,7 +1,7 @@ """Class-based api for polars models.""" import inspect -from typing import Dict, List, Tuple, Type, cast, Optional, overload +from typing import Dict, List, Tuple, Type, cast, Optional, overload, Union from typing_extensions import Self import pandas as pd @@ -149,7 +149,7 @@ def validate( random_state: Optional[int] = None, lazy: bool = False, inplace: bool = False, - ) -> LazyFrame[Self] | DataFrame[Self]: + ) -> Union[LazyFrame[Self], DataFrame[Self]]: """%(validate_doc)s""" result = cls.to_schema().validate( check_obj, head, tail, sample, random_state, lazy, inplace From a8fa84c9ce1a7fcc002b9eadc3bb419b0d1418e8 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 8 Feb 2025 13:06:34 +1100 Subject: [PATCH 5/9] try polars in the mypy env on ci Signed-off-by: Matt Richards --- .github/workflows/ci-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 26fe16c07..46259cb16 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -57,6 +57,7 @@ jobs: types-pyyaml \ types-requests \ types-setuptools + polars - name: Pip info run: python -m pip list From 2c284e167f9a5448c90e47ded443ea348f0c31c7 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 15 Feb 2025 11:47:12 +1100 Subject: [PATCH 6/9] translate toplevel mypy skip into module specific skips Signed-off-by: Matt Richards --- mypy.ini | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mypy.ini b/mypy.ini index ebec0d32e..0fa9ad32c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,6 @@ [mypy] ignore_missing_imports = True -follow_imports = skip +follow_imports = normal allow_redefinition = True warn_return_any = False warn_unused_configs = True @@ -12,3 +12,25 @@ exclude=(?x)( | ^pandera/backends/pyspark | ^tests/pyspark ) +[mypy-polars.*] +follow_imports = skip +[mypy-mypy.*] +follow_imports = skip +[mypy-tests.mypy.modules.*] +follow_imports = skip +[mypy-pandera.api.pyspark.*] +follow_imports = skip +[mypy-pandera.backends.pyspark.*] +follow_imports = skip +[mypy-pandera.engines.pyspark_engine] +follow_imports = skip +[mypy-tests.mypy] +follow_imports = skip +[mypy-tests.geopandas] +follow_imports = skip +[mypy-docs.*] +follow_imports = skip + +# potentially not required / not ideal, pre-commit mypy sometimes scans the pre-commit venv +[mypy-urllib3.*] +follow_imports = skip From 60be3a99ddbde8889570279ef32dd1d8a9c379ae Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 15 Feb 2025 13:44:13 +1100 Subject: [PATCH 7/9] mypy passes Signed-off-by: Matt Richards --- mypy.ini | 26 +++++++++----------------- pandera/api/polars/components.py | 2 +- pandera/backends/polars/base.py | 5 ++++- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/mypy.ini b/mypy.ini index 0fa9ad32c..ea3653b16 100644 --- a/mypy.ini +++ b/mypy.ini @@ -12,25 +12,17 @@ exclude=(?x)( | ^pandera/backends/pyspark | ^tests/pyspark ) -[mypy-polars.*] -follow_imports = skip -[mypy-mypy.*] -follow_imports = skip -[mypy-tests.mypy.modules.*] -follow_imports = skip [mypy-pandera.api.pyspark.*] follow_imports = skip -[mypy-pandera.backends.pyspark.*] -follow_imports = skip -[mypy-pandera.engines.pyspark_engine] -follow_imports = skip -[mypy-tests.mypy] -follow_imports = skip -[mypy-tests.geopandas] -follow_imports = skip + [mypy-docs.*] follow_imports = skip -# potentially not required / not ideal, pre-commit mypy sometimes scans the pre-commit venv -[mypy-urllib3.*] -follow_imports = skip +[mypy-pandera.engines.polars_engine] +ignore_errors = True + +[mypy-pandera.backends.polars.builtin_checks] +ignore_errors = True + +[mypy-tests.polars.*] +ignore_errors = True diff --git a/pandera/api/polars/components.py b/pandera/api/polars/components.py index 823912d1d..e697c10be 100644 --- a/pandera/api/polars/components.py +++ b/pandera/api/polars/components.py @@ -23,7 +23,7 @@ class Column(ComponentSchema[PolarsCheckObjects]): def __init__( self, - dtype: PolarsDtypeInputTypes = None, + dtype: Optional[PolarsDtypeInputTypes] = None, checks: Optional[CheckList] = None, nullable: bool = False, unique: bool = False, diff --git a/pandera/backends/polars/base.py b/pandera/backends/polars/base.py index ef7f7da13..766efc9a1 100644 --- a/pandera/backends/polars/base.py +++ b/pandera/backends/polars/base.py @@ -47,7 +47,10 @@ def subsample( obj_subsample.append(check_obj.tail(tail)) if sample is not None: obj_subsample.append( - check_obj.sample(sample, random_state=random_state) + # mypy is detecting a bug https://github.com/unionai-oss/pandera/issues/1912 + check_obj.sample( # type:ignore [attr-defined] + sample, random_state=random_state + ) ) return ( check_obj From 2d9bebe2f682410740928fbceb68ce2713101fcc Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 15 Feb 2025 13:48:32 +1100 Subject: [PATCH 8/9] missing line continuation Signed-off-by: Matt Richards --- .github/workflows/ci-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 46259cb16..1161ca9ca 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -56,7 +56,7 @@ jobs: types-pytz \ types-pyyaml \ types-requests \ - types-setuptools + types-setuptools \ polars - name: Pip info run: python -m pip list From 7ca2b863e51504ad959657b5e4249a5203147373 Mon Sep 17 00:00:00 2001 From: Matt Richards Date: Sat, 15 Feb 2025 14:51:05 +1100 Subject: [PATCH 9/9] python 3.8 Signed-off-by: Matt Richards --- pandera/api/polars/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandera/api/polars/model.py b/pandera/api/polars/model.py index d4a35e59a..b65944e12 100644 --- a/pandera/api/polars/model.py +++ b/pandera/api/polars/model.py @@ -142,7 +142,7 @@ def validate( @docstring_substitution(validate_doc=BaseSchema.validate.__doc__) def validate( cls: Type[Self], - check_obj: pl.LazyFrame | pl.DataFrame, + check_obj: Union[pl.LazyFrame, pl.DataFrame], head: Optional[int] = None, tail: Optional[int] = None, sample: Optional[int] = None,