Skip to content

feature/koalas-beta #651

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ env:
DEFAULT_PYTHON: 3.8
CI: "true"
# Increase this value to reset cache if environment.yml has not changed
CACHE_VERSION: 4
CACHE_VERSION: 5

jobs:

Expand Down Expand Up @@ -145,6 +145,13 @@ jobs:
--non-interactive
--session "tests-${{ matrix.python-version }}(extra='strategies', pandas='${{ matrix.pandas-version }}')"

- name: Unit Tests - Koalas
run: >
nox
-db virtualenv -r -v
--non-interactive
--session "tests-${{ matrix.python-version }}(extra='koalas', pandas='${{ matrix.pandas-version }}')"

- name: Upload coverage to Codecov
uses: "codecov/codecov-action@v1"

Expand Down
10 changes: 7 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ repos:
- id: flynt

- repo: https://github.com/psf/black
rev: 20.8b1
rev: 21.9b0
hooks:
- id: black
args: ["--line-length=79"]

- repo: https://github.com/pycqa/pylint
rev: v2.10.2
rev: v2.11.1
hooks:
- id: pylint
args: ["--disable=import-error"]
Expand All @@ -50,10 +50,14 @@ repos:
- repo: https://github.com/pre-commit/mirrors-mypy
# TODO: in mypy 0.900+ one needs to pip install type stubs separately (i.e. typeshed is no longer included)
# TODO: but pre-commit downloads mypy in a different location (~/.cache)
rev: v0.812
rev: v0.910
hooks:
- id: mypy
entry: mypy pandera tests
files: (^pandera/|^tests|^scripts)
exclude: (^docs/)
pass_filenames: false
additional_dependencies:
- types-click
- types-pyyaml
- types-pkg_resources
8 changes: 8 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,18 @@
else:
SKIP_STRATEGY = False

try:
    # NOTE: the pip/conda package is "koalas" but the importable module is
    # "databricks.koalas" (consistent with pandera/check_utils.py).
    import databricks.koalas
except ImportError:
    # import failed -> koalas is NOT installed
    KOALAS_INSTALLED = False
else:
    # import succeeded -> koalas IS installed
    # (original code had these two branches inverted, which would have
    # enabled koalas-only doctest skips on machines *without* koalas)
    KOALAS_INSTALLED = True

SKIP = sys.version_info < (3, 6)
PY36 = sys.version_info < (3, 7)
SKIP_PANDAS_LT_V1 = version.parse(pd.__version__).release < (1, 0) or PY36
SKIP_SCALING = True
SKIP_SCHEMA_MODEL = SKIP_PANDAS_LT_V1 or KOALAS_INSTALLED
"""

doctest_default_flags = (
Expand Down
4 changes: 2 additions & 2 deletions docs/source/schema_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -194,13 +194,13 @@ You must give a **type**, not an **instance**.
:red:`✘` Bad:

.. testcode:: dataframe_schema_model
:skipif: SKIP_PANDAS_LT_V1
:skipif: SKIP_SCHEMA_MODEL

class Schema(pa.SchemaModel):
a: Series[pd.StringDtype()]

.. testoutput:: dataframe_schema_model
:skipif: SKIP_PANDAS_LT_V1
:skipif: SKIP_SCHEMA_MODEL

Traceback (most recent call last):
...
Expand Down
7 changes: 5 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,23 @@ dependencies:
- frictionless
- pyarrow

# koalas extra
- koalas
- pyspark

# testing and dependencies
- black >= 20.8b1

# testing
- isort >= 5.7.0
- codecov
- mypy >= 0.902 # mypy no longer bundle stubs for third-party libraries
- pylint >= 2.7.2
- pylint = 2.11.1
- pytest
- pytest-cov
- pytest-xdist
- pytest-asyncio
- xdoctest
- setuptools < 58.0.0
- nox = 2020.12.31 # pinning due to UnicodeDecodeError, see https://github.com/pandera-dev/pandera/pull/504/checks?check_run_id=2841360122
- importlib_metadata # required if python < 3.8

Expand Down
6 changes: 5 additions & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,13 @@ def install_extras(
specs.append(
spec if spec != "pandas" else f"pandas{pandas_version}"
)
if extra == "core":
if extra in {"core", "koalas"}:
specs.append(REQUIRES["all"]["hypothesis"])

# this is a temporary measure to install setuptools due to this issue:
# https://github.com/pandera-dev/pandera/pull/602#issuecomment-915622823
session.install("setuptools < 58.0.0")

# CI installs conda dependencies, so only run this for local runs
if (
isinstance(session.virtualenv, nox.virtualenv.CondaEnv)
Expand Down
2 changes: 2 additions & 0 deletions pandera/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""A flexible and expressive pandas validation library."""
import platform

from pandera import external_config
from pandera.dtypes import (
Bool,
Category,
Expand Down Expand Up @@ -56,4 +57,5 @@
from .version import __version__

if platform.system() != "Windows":
# pylint: disable=ungrouped-imports
from pandera.dtypes import Complex256, Float128
98 changes: 94 additions & 4 deletions pandera/check_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,88 @@
"""Utility functions for validation."""

from typing import Optional, Tuple, Union
from functools import lru_cache
from typing import NamedTuple, Optional, Tuple, Union

import pandas as pd

SupportedTypes = NamedTuple(
"SupportedTypes",
(
("table_types", Tuple[type]),
("field_types", Tuple[type]),
("index_types", Tuple[type]),
("multiindex_types", Tuple[type]),
),
)


@lru_cache(maxsize=None)
def _supported_types():
# pylint: disable=import-outside-toplevel
table_types = [pd.DataFrame]
field_types = [pd.Series]
index_types = [pd.Index]
multiindex_types = [pd.MultiIndex]

try:
import databricks.koalas as ks

table_types.append(ks.DataFrame)
field_types.append(ks.Series)
index_types.append(ks.Index)
multiindex_types.append(ks.MultiIndex)
except ImportError:
pass
try: # pragma: no cover
import modin.pandas as mpd

table_types.append(mpd.DataFrame)
field_types.append(mpd.Series)
index_types.append(mpd.Index)
multiindex_types.append(mpd.MultiIndex)
except ImportError:
pass

return SupportedTypes(
tuple(table_types),
tuple(field_types),
tuple(index_types),
tuple(multiindex_types),
)


def is_table(obj):
    """Verifies whether an object is table-like.

    Where a table is a 2-dimensional data matrix of rows and columns, which
    can be indexed in multiple different ways.
    """
    table_types = _supported_types().table_types
    return isinstance(obj, table_types)


def is_field(obj):
    """Verifies whether an object is field-like.

    Where a field is a columnar representation of data in a table-like
    data structure.
    """
    field_types = _supported_types().field_types
    return isinstance(obj, field_types)


def is_index(obj):
    """Verifies whether an object is a table index."""
    index_types = _supported_types().index_types
    return isinstance(obj, index_types)


def is_multiindex(obj):
    """Verifies whether an object is a multi-level table index."""
    multiindex_types = _supported_types().multiindex_types
    return isinstance(obj, multiindex_types)


def is_supported_check_obj(obj):
    """Verifies whether an object is table- or field-like."""
    # short-circuits exactly like ``is_table(obj) or is_field(obj)``
    return any(check(obj) for check in (is_table, is_field))


def prepare_series_check_output(
check_obj: Union[pd.Series, pd.DataFrame],
Expand All @@ -25,9 +104,20 @@ def prepare_series_check_output(
check_output = check_output | isna
failure_cases = check_obj[~check_output]
if not failure_cases.empty and n_failure_cases is not None:
failure_cases = failure_cases.groupby(check_output).head(
n_failure_cases
)
# NOTE: this is a hack to support koalas, since you can't use groupby
# on a dataframe with another dataframe
if type(failure_cases).__module__.startswith("databricks.koalas"):
failure_cases = (
failure_cases.rename("failure_cases")
.to_frame()
.assign(check_output=check_output)
.groupby("check_output")
.head(n_failure_cases)["failure_cases"]
)
else:
failure_cases = failure_cases.groupby(check_output).head(
n_failure_cases
)
return check_output, failure_cases


Expand Down
29 changes: 14 additions & 15 deletions pandera/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,27 +369,27 @@ def __call__(
``failure_cases``: subset of the check_object that failed.
"""
# prepare check object
if isinstance(df_or_series, pd.Series) or (
column is not None and isinstance(df_or_series, pd.DataFrame)
if check_utils.is_field(df_or_series) or (
column is not None and check_utils.is_table(df_or_series)
):
check_obj = self._prepare_series_input(df_or_series, column)
elif isinstance(df_or_series, pd.DataFrame):
elif check_utils.is_table(df_or_series):
check_obj = self._prepare_dataframe_input(df_or_series)
else:
raise ValueError(
f"object of type {df_or_series} not supported. Must be a "
"Series, a dictionary of Series, or DataFrame"
f"object of type {type(df_or_series)} not supported. Must be "
"a Series, a dictionary of Series, or DataFrame"
)

# apply check function to check object
check_fn = partial(self._check_fn, **self._check_kwargs)

if self.element_wise:
check_output = (
check_obj.apply(check_fn, axis=1)
if isinstance(check_obj, pd.DataFrame)
else check_obj.map(check_fn)
if isinstance(check_obj, pd.Series)
check_obj.apply(check_fn, axis=1) # type: ignore
if check_utils.is_table(check_obj)
else check_obj.map(check_fn) # type: ignore
if check_utils.is_field(check_obj)
else check_fn(check_obj)
)
else:
Expand All @@ -401,12 +401,12 @@ def __call__(
if (
isinstance(check_obj, dict)
or isinstance(check_output, bool)
or not isinstance(check_output, (pd.Series, pd.DataFrame))
or not check_utils.is_supported_check_obj(check_output)
or check_obj.shape[0] != check_output.shape[0]
or (check_obj.index != check_output.index).all()
):
failure_cases = None
elif isinstance(check_output, pd.Series):
elif check_utils.is_field(check_output):
(
check_output,
failure_cases,
Expand All @@ -416,7 +416,7 @@ def __call__(
ignore_na=self.ignore_na,
n_failure_cases=self.n_failure_cases,
)
elif isinstance(check_output, pd.DataFrame):
elif check_utils.is_table(check_output):
(
check_output,
failure_cases,
Expand All @@ -434,12 +434,11 @@ def __call__(

check_passed = (
check_output.all()
if isinstance(check_output, pd.Series)
if check_utils.is_field(check_output)
else check_output.all(axis=None)
if isinstance(check_output, pd.DataFrame)
if check_utils.is_table(check_output)
else check_output
)

return CheckResult(
check_output, check_passed, check_obj, failure_cases
)
Expand Down
8 changes: 8 additions & 0 deletions pandera/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ def coerce(self, data_container: Any):
"""Coerce data container to the data type."""
raise NotImplementedError()

def try_coerce(self, data_container: Any):
    """Coerce ``data_container`` to this data type.

    Unlike :meth:`coerce`, implementations of this method must raise a
    :class:`~pandera.errors.ParserError` when the coercion fails.

    :raises: :class:`~pandera.errors.ParserError`: if coercion fails
    """
    raise NotImplementedError()

def __call__(self, data_container: Any):
    """Coerce data container to the data type.

    Alias for :meth:`coerce`, allowing the data type instance itself to be
    used as a coercion callable.
    """
    return self.coerce(data_container)
Expand Down
Loading