Implement gt and ge checks for the Ibis backend

deepyaman · deepyaman · commit 43b97ffae19b · 2025-04-06T21:48:48.000-06:00
Signed-off-by: Deepyaman Datta <deepyaman.datta@utexas.edu>
diff --git a/pandera/backends/ibis/builtin_checks.py b/pandera/backends/ibis/builtin_checks.py
@@ -53,3 +53,36 @@ def not_equal_to(data: IbisData, value: Any) -> ir.Table:
     """
     value = _infer_interval_with_mixed_units(value)
     return data.table[data.key] != value
+
+
+@register_builtin_check(
+    aliases=["gt"],
+    error="greater_than({min_value})",
+)
+def greater_than(data: IbisData, min_value: Any) -> ir.Table:
+    """Ensure values of a data container are strictly greater than a minimum
+    value.
+
+    :param data: NamedTuple IbisData contains the table and column name for the check. The key
+        to access the table is "table", and the key to access the column name is "key".
+    :param min_value: Lower bound to be exceeded. Must be a type comparable
+        to the dtype of the :class:`ir.Column` to be validated.
+    """
+    value = _infer_interval_with_mixed_units(min_value)
+    return data.table[data.key] > value
+
+
+@register_builtin_check(
+    aliases=["ge"],
+    error="greater_than_or_equal_to({min_value})",
+)
+def greater_than_or_equal_to(data: IbisData, min_value: Any) -> ir.Table:
+    """Ensure all values are greater than or equal to a certain value.
+
+    :param data: NamedTuple IbisData contains the table and column name for the check. The key
+        to access the table is "table", and the key to access the column name is "key".
+    :param min_value: Allowed minimum value. Must be a type comparable
+        to the dtype of the :class:`ir.Column` to be validated.
+    """
+    value = _infer_interval_with_mixed_units(min_value)
+    return data.table[data.key] >= value
diff --git a/pandera/backends/pandas/builtin_checks.py b/pandera/backends/pandas/builtin_checks.py
@@ -88,8 +88,7 @@ def greater_than(data: PandasData, min_value: Any) -> PandasData:
     value.
 
     :param min_value: Lower bound to be exceeded. Must be a type comparable
-        to the dtype of the :class:`pandas.Series` to be validated (e.g. a
-        numerical type for float or int and a datetime for datetime).
+        to the dtype of the :class:`pandas.Series` to be validated.
     """
     return data > min_value
 
@@ -100,11 +99,10 @@ def greater_than(data: PandasData, min_value: Any) -> PandasData:
     error="greater_than_or_equal_to({min_value})",
 )
 def greater_than_or_equal_to(data: PandasData, min_value: Any) -> PandasData:
-    """Ensure all values are greater or equal a certain value.
+    """Ensure all values are greater than or equal to a certain value.
 
-    :param min_value: Allowed minimum value for values of a series. Must be
-        a type comparable to the dtype of the :class:`pandas.Series` to be
-        validated.
+    :param min_value: Allowed minimum value. Must be a type comparable
+        to the dtype of the :class:`pandas.Series` to be validated.
     """
     return data >= min_value
 
diff --git a/pandera/backends/polars/builtin_checks.py b/pandera/backends/polars/builtin_checks.py
@@ -62,12 +62,12 @@ def greater_than(data: PolarsData, min_value: Any) -> pl.LazyFrame:
     error="greater_than_or_equal_to({min_value})",
 )
 def greater_than_or_equal_to(data: PolarsData, min_value: Any) -> pl.LazyFrame:
-    """Ensure all values are greater or equal a certain value.
+    """Ensure all values are greater than or equal to a certain value.
 
     :param data: NamedTuple PolarsData contains the dataframe and column name for the check. The key
         to access the dataframe is "dataframe", and the key the to access the column name is "key".
-    :param min_value: Allowed minimum value for values of a series. Must be
-        a type comparable to the dtype of the series datatype of Polars.
+    :param min_value: Allowed minimum value. Must be a type comparable
+        to the dtype of the series datatype of Polars.
     """
     return data.lazyframe.select(pl.col(data.key).ge(min_value))
 
diff --git a/pandera/backends/pyspark/builtin_checks.py b/pandera/backends/pyspark/builtin_checks.py
@@ -60,7 +60,7 @@ def equal_to(data: PysparkDataframeColumnObject, value: Any) -> bool:
 def not_equal_to(data: PysparkDataframeColumnObject, value: Any) -> bool:
     """Ensure no element of a data container equals a certain value.
 
-    :param data: NamedTuple PysparkDataframeColumnObject contains the dataframe and column name for the check. The keys
+    :param data: NamedTuple PysparkDataframeColumnObject contains the dataframe and column name for the check. The key
         to access the dataframe is "dataframe" and column name using "column_name".
     :param value: This value must not occur in the checked
     """
@@ -76,11 +76,11 @@ def not_equal_to(data: PysparkDataframeColumnObject, value: Any) -> bool:
     acceptable_datatypes=convert_to_list(ALL_NUMERIC_TYPE, ALL_DATE_TYPE)
 )
 def greater_than(data: PysparkDataframeColumnObject, min_value: Any) -> bool:
-    """
-    Ensure values of a data container are strictly greater than a minimum
+    """Ensure values of a data container are strictly greater than a minimum
     value.
-    :param data: NamedTuple PysparkDataframeColumnObject contains the dataframe and column name for the check. The keys
-                to access the dataframe is "dataframe" and column name using "column_name".
+
+    :param data: NamedTuple PysparkDataframeColumnObject contains the dataframe and column name for the check. The key
+        to access the dataframe is "dataframe" and column name using "column_name".
     :param min_value: Lower bound to be exceeded.
     """
     cond = col(data.column_name) > min_value
@@ -98,11 +98,12 @@ def greater_than(data: PysparkDataframeColumnObject, min_value: Any) -> bool:
 def greater_than_or_equal_to(
     data: PysparkDataframeColumnObject, min_value: Any
 ) -> bool:
-    """Ensure all values are greater or equal a certain value.
+    """Ensure all values are greater than or equal to a certain value.
+
     :param data: NamedTuple PysparkDataframeColumnObject contains the dataframe and column name for the check. The keys
-                to access the dataframe is "dataframe" and column name using "column_name".
-    :param min_value: Allowed minimum value for values of a series. Must be
-        a type comparable to the dtype of the column datatype of pyspark
+        to access the dataframe is "dataframe" and column name using "column_name".
+    :param min_value: Allowed minimum value. Must be a type comparable
+        to the dtype of the column datatype of pyspark.
     """
     cond = col(data.column_name) >= min_value
     return data.dataframe.filter(~cond).limit(1).count() == 0
diff --git a/tests/ibis/test_ibis_builtin_checks.py b/tests/ibis/test_ibis_builtin_checks.py
@@ -373,3 +373,219 @@ def test_not_equal_to_check(self, check_fn, datatype, data) -> None:
             datatype,
             data["test_expression"],
         )
+
+
+class TestGreaterThanCheck(BaseClass):
+    """This class is used to test the greater than check"""
+
+    sample_numeric_data = {
+        "test_pass_data": [("foo", 31), ("bar", 32)],
+        "test_fail_data": [("foo", 30), ("bar", 31)],
+        "test_expression": 30,
+    }
+
+    sample_datetime_data = {
+        "test_pass_data": [
+            ("foo", datetime.datetime(2020, 10, 2, 11, 0)),
+            ("bar", datetime.datetime(2020, 10, 2, 11, 0)),
+        ],
+        "test_fail_data": [
+            ("foo", datetime.datetime(2020, 10, 1, 10, 0)),
+            ("bar", datetime.datetime(2020, 10, 2, 11, 0)),
+        ],
+        "test_expression": datetime.datetime(2020, 10, 1, 10, 0),
+    }
+
+    sample_duration_data = {
+        "test_pass_data": [
+            ("foo", datetime.timedelta(100, 11, 1)),
+            ("bar", datetime.timedelta(100, 12, 1)),
+        ],
+        "test_fail_data": [
+            ("foo", datetime.timedelta(100, 10, 1)),
+            ("bar", datetime.timedelta(100, 11, 1)),
+        ],
+        "test_expression": datetime.timedelta(100, 10, 1),
+    }
+
+    def pytest_generate_tests(self, metafunc):
+        """This function passes the parameters for each test function based on the get_data_param function"""
+        # called once per each test function
+        funcarglist = self.get_data_param()[metafunc.function.__name__]
+        argnames = sorted(funcarglist[0])
+        metafunc.parametrize(
+            argnames,
+            [
+                [funcargs[name] for name in argnames]
+                for funcargs in funcarglist
+            ],
+        )
+
+    def get_data_param(self):
+        """Generate the params which will be used to test this function. All the acceptable
+        data types would be tested"""
+        return {
+            "test_greater_than_check": [
+                {"datatype": dt.UInt8, "data": self.sample_numeric_data},
+                {"datatype": dt.UInt16, "data": self.sample_numeric_data},
+                {"datatype": dt.UInt32, "data": self.sample_numeric_data},
+                {"datatype": dt.UInt64, "data": self.sample_numeric_data},
+                {"datatype": dt.Int8, "data": self.sample_numeric_data},
+                {"datatype": dt.Int16, "data": self.sample_numeric_data},
+                {"datatype": dt.Int32, "data": self.sample_numeric_data},
+                {"datatype": dt.Int64, "data": self.sample_numeric_data},
+                {
+                    "datatype": dt.Float32,
+                    "data": self.convert_data(
+                        self.sample_numeric_data, "float32"
+                    ),
+                },
+                {
+                    "datatype": dt.Float64,
+                    "data": self.convert_data(
+                        self.sample_numeric_data, "float64"
+                    ),
+                },
+                {
+                    "datatype": dt.Date,
+                    "data": self.convert_data(
+                        self.sample_datetime_data, "date"
+                    ),
+                },
+                {
+                    "datatype": dt.Timestamp.from_unit("us"),
+                    "data": self.sample_datetime_data,
+                },
+                {
+                    "datatype": dt.Time,
+                    "data": self.convert_data(
+                        self.sample_datetime_data, "time"
+                    ),
+                },
+                {
+                    "datatype": dt.Interval(unit="us"),
+                    "data": self.sample_duration_data,
+                },
+            ]
+        }
+
+    @pytest.mark.parametrize("check_fn", [pa.Check.greater_than, pa.Check.gt])
+    def test_greater_than_check(self, check_fn, datatype, data) -> None:
+        """Test the Check to see if all the values are greater than the defined value"""
+        self.check_function(
+            check_fn,
+            data["test_pass_data"],
+            data["test_fail_data"],
+            datatype,
+            data["test_expression"],
+        )
+
+
+class TestGreaterThanEqualToCheck(BaseClass):
+    """This class is used to test the greater than or equal to check"""
+
+    sample_numeric_data = {
+        "test_pass_data": [("foo", 31), ("bar", 32)],
+        "test_fail_data": [("foo", 30), ("bar", 31)],
+        "test_expression": 31,
+    }
+
+    sample_datetime_data = {
+        "test_pass_data": [
+            ("foo", datetime.datetime(2020, 10, 1, 11, 0)),
+            ("bar", datetime.datetime(2020, 10, 2, 11, 0)),
+        ],
+        "test_fail_data": [
+            ("foo", datetime.datetime(2020, 10, 1, 11, 0)),
+            ("bar", datetime.datetime(2020, 9, 1, 10, 0)),
+        ],
+        "test_expression": datetime.datetime(2020, 10, 1, 11, 0),
+    }
+
+    sample_duration_data = {
+        "test_pass_data": [
+            ("foo", datetime.timedelta(100, 10, 1)),
+            ("bar", datetime.timedelta(100, 11, 1)),
+        ],
+        "test_fail_data": [
+            ("foo", datetime.timedelta(100, 11, 1)),
+            ("bar", datetime.timedelta(100, 9, 1)),
+        ],
+        "test_expression": datetime.timedelta(100, 10, 1),
+    }
+
+    def pytest_generate_tests(self, metafunc):
+        """This function passes the parameters for each test function based on the get_data_param function"""
+        # called once per each test function
+        funcarglist = self.get_data_param()[metafunc.function.__name__]
+        argnames = sorted(funcarglist[0])
+        metafunc.parametrize(
+            argnames,
+            [
+                [funcargs[name] for name in argnames]
+                for funcargs in funcarglist
+            ],
+        )
+
+    def get_data_param(self):
+        """Generate the params which will be used to test this function. All the acceptable
+        data types would be tested"""
+        return {
+            "test_greater_than_or_equal_to_check": [
+                {"datatype": dt.UInt8, "data": self.sample_numeric_data},
+                {"datatype": dt.UInt16, "data": self.sample_numeric_data},
+                {"datatype": dt.UInt32, "data": self.sample_numeric_data},
+                {"datatype": dt.UInt64, "data": self.sample_numeric_data},
+                {"datatype": dt.Int8, "data": self.sample_numeric_data},
+                {"datatype": dt.Int16, "data": self.sample_numeric_data},
+                {"datatype": dt.Int32, "data": self.sample_numeric_data},
+                {"datatype": dt.Int64, "data": self.sample_numeric_data},
+                {
+                    "datatype": dt.Float32,
+                    "data": self.convert_data(
+                        self.sample_numeric_data, "float32"
+                    ),
+                },
+                {
+                    "datatype": dt.Float64,
+                    "data": self.convert_data(
+                        self.sample_numeric_data, "float64"
+                    ),
+                },
+                {
+                    "datatype": dt.Date,
+                    "data": self.convert_data(
+                        self.sample_datetime_data, "date"
+                    ),
+                },
+                {
+                    "datatype": dt.Timestamp.from_unit("us"),
+                    "data": self.sample_datetime_data,
+                },
+                {
+                    "datatype": dt.Time,
+                    "data": self.convert_data(
+                        self.sample_datetime_data, "time"
+                    ),
+                },
+                {
+                    "datatype": dt.Interval(unit="us"),
+                    "data": self.sample_duration_data,
+                },
+            ]
+        }
+
+    @pytest.mark.parametrize(
+        "check_fn", [pa.Check.greater_than_or_equal_to, pa.Check.ge]
+    )
+    def test_greater_than_or_equal_to_check(
+        self, check_fn, datatype, data
+    ) -> None:
+        """Test the Check to see if all the values are greater than or equal to the defined value"""
+        self.check_function(
+            check_fn,
+            data["test_pass_data"],
+            data["test_fail_data"],
+            datatype,
+            data["test_expression"],
+        )