|
1 | 1 | """Built-in checks for polars."""
|
2 | 2 |
|
3 |
| -from typing import Any, TypeVar, Iterable, Union |
| 3 | +from typing import Any, TypeVar, Iterable, Union, Optional |
4 | 4 |
|
5 | 5 | import re
|
6 | 6 | import polars as pl
|
7 | 7 |
|
8 | 8 |
|
9 | 9 | from pandera.api.extensions import register_builtin_check
|
10 | 10 | from pandera.api.polars.types import PolarsData
|
11 |
| -from pandera.backends.polars.constants import CHECK_OUTPUT_KEY |
12 | 11 |
|
13 | 12 | T = TypeVar("T")
|
14 | 13 |
|
@@ -180,42 +179,24 @@ def notin(data: PolarsData, forbidden_values: Iterable) -> pl.LazyFrame:
|
180 | 179 | )
|
181 | 180 |
|
182 | 181 |
|
183 |
| -@register_builtin_check( |
184 |
| - error="str_matches('{pattern}')", |
185 |
| -) |
186 |
| -def str_matches( |
187 |
| - data: PolarsData, |
188 |
| - pattern: Union[str, re.Pattern], |
189 |
| -) -> pl.LazyFrame: |
190 |
| - """Ensure that string values match a regular expression. |
191 |
| -
|
192 |
| - :param data: NamedTuple PolarsData contains the dataframe and column name for the check. The keys |
193 |
| - to access the dataframe is "dataframe" and column name using "key". |
194 |
| - :param pattern: Regular expression pattern to use for matching |
195 |
| - """ |
196 |
| - |
197 |
| - return data.dataframe.select( |
198 |
| - pl.col(data.key).str.contains(pattern=pattern).alias(CHECK_OUTPUT_KEY) |
199 |
| - ) |
200 |
| - |
201 |
| - |
202 | 182 | @register_builtin_check(
|
203 | 183 | error="str_contains('{pattern}')",
|
204 | 184 | )
|
205 | 185 | def str_contains(
|
206 | 186 | data: PolarsData,
|
207 |
| - pattern: str, |
| 187 | + pattern: re.Pattern, |
208 | 188 | ) -> pl.LazyFrame:
|
209 | 189 | """Ensure that a pattern can be found within each row.
|
210 | 190 |
|
211 | 191 | :param data: NamedTuple PolarsData containing the dataframe and column name for the check. The
|
212 | 192 | dataframe is accessed with the key "dataframe" and the column name with the key "key".
|
213 | 193 | :param pattern: Regular expression pattern to use for searching
|
214 | 194 | """
|
| 195 | + |
215 | 196 | return data.dataframe.select(
|
216 |
| - pl.col(data.key) |
217 |
| - .str.contains(pattern=pattern, literal=True) |
218 |
| - .alias(CHECK_OUTPUT_KEY) |
| 197 | + pl.col(data.key).str.contains( |
| 198 | + pattern=f"{pattern.pattern}", literal=False |
| 199 | + ) |
219 | 200 | )
|
220 | 201 |
|
221 | 202 |
|
@@ -249,26 +230,26 @@ def str_endswith(data: PolarsData, string: str) -> pl.LazyFrame:
|
249 | 230 | )
|
250 | 231 | def str_length(
|
251 | 232 | data: PolarsData,
|
252 |
| - min_value: int = None, |
253 |
| - max_value: int = None, |
| 233 | + min_value: Optional[int] = None, |
| 234 | + max_value: Optional[int] = None, |
254 | 235 | ) -> pl.LazyFrame:
|
255 | 236 | """Ensure that the length of strings is within a specified range.
|
256 | 237 |
|
257 | 238 | :param data: NamedTuple PolarsData containing the dataframe and column name for the check. The
|
258 | 239 | dataframe is accessed with the key "dataframe" and the column name with the key "key".
|
259 |
| - :param min_value: Minimum length of strings (default: no minimum) |
260 |
| - :param max_value: Maximum length of strings (default: no maximum) |
| 240 | + :param min_value: Minimum length of strings (inclusive) (default: no minimum) |
| 241 | + :param max_value: Maximum length of strings (inclusive) (default: no maximum) |
261 | 242 | """
|
262 | 243 | # NOTE: consider using len_bytes (faster, but returns != n_chars for non-ASCII strings)
|
263 |
| - n_chars = pl.col("string_col").str.n_chars() |
| 244 | + n_chars = pl.col(data.key).str.n_chars() |
264 | 245 | is_in_min = (
|
265 | 246 | n_chars.ge(min_value) if min_value is not None else pl.lit(True)
|
266 | 247 | )
|
267 | 248 | is_in_max = (
|
268 | 249 | n_chars.le(max_value) if max_value is not None else pl.lit(True)
|
269 | 250 | )
|
270 | 251 |
|
271 |
| - return data.dataframe.select(is_in_min.and_(is_in_max)) |
| 252 | + return data.dataframe.select(is_in_min.and_(is_in_max).alias(data.key)) |
272 | 253 |
|
273 | 254 |
|
274 | 255 | @register_builtin_check(
|
|
0 commit comments