From 1b244edb40dff718cb9fbf9a9eac1fe1078343d2 Mon Sep 17 00:00:00 2001
From: Ferenc Hechler
Date: Wed, 4 Sep 2024 16:05:50 +0200
Subject: [PATCH 01/40] Add support for custom transformers (not ML.) in
 ColumnTransformer.

---
 bigframes/ml/compose.py | 147 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 143 insertions(+), 4 deletions(-)

diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py
index 4ea63d2e81..e083b65598 100644
--- a/bigframes/ml/compose.py
+++ b/bigframes/ml/compose.py
@@ -21,7 +21,9 @@
 import re
 import types
 import typing
-from typing import cast, Iterable, List, Optional, Set, Tuple, Union
+from typing import cast, Iterable, List, Optional, Set, Tuple, Union, Dict, Type
+import abc
+import json
 
 import bigframes_vendored.sklearn.compose._column_transformer
 from google.cloud import bigquery
@@ -46,6 +48,106 @@
 )
 
 
+CUSTOM_TRANSFORMER_SQL_RX = re.compile("^(?P<sql>.*)/[*]CT.(?P<id>[A-Z]+)[(](?P<config>[^*]*)[)][*]/$", re.IGNORECASE)
+
+class CustomTransformer(base.BaseTransformer):
+    _CTID = None
+    _custom_transformer_classes = {}
+    @classmethod
+    def register(cls, transformer_cls:Type[base.BaseTransformer]):
+        assert transformer_cls._CTID
+        cls._custom_transformer_classes[transformer_cls._CTID] = transformer_cls
+    @classmethod
+    def find_matching_transformer(cls, transform_sql:str) -> Optional[Type[base.BaseTransformer]]:
+        for transform_cls in cls._custom_transformer_classes.values():
+            if transform_cls.understands(transform_sql):
+                return transform_cls
+        return None
+    @classmethod
+    def understands(cls, transform_sql:str) -> bool:
+        """
+        may be overwritten to have a more advanced matching, possibly without comments in SQL
+        """
+        m = CUSTOM_TRANSFORMER_SQL_RX.match(transform_sql)
+        if m and m.group("id").strip() == cls._CTID:
+            return True
+        return False
+
+    def __init__(self):
+        super().__init__()
+
+    def _compile_to_sql(self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None) -> List[str]:
+        if columns is None:
+            columns = X.columns
+        return [
+            f"{self.custom_compile_to_sql(X, column)} {self._get_sql_comment(column)} AS {self.get_target_column_name(column)}"
+            for column in columns
+        ]
+
+    def get_target_column_name(self, column:str):
+        return f"{self._CTID.lower()}_{column}"
+
+    @classmethod
+    def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str:
+        pass
+
+    @abc.abstractmethod
+    def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str:
+        pass
+
+    def get_persistent_config(self) -> Optional[Union[Dict, List]]:
+        """
+        return structure to be persisted in the comment of the sql
+        """
+        return None
+
+    def _get_pc_as_args(self) -> str:
+        config = self.get_persistent_config()
+        if not config:
+            return ""
+        return json.dumps(config)
+
+
+    def _get_sql_comment(self, column:str):
+        args = self._get_pc_as_args()
+        return f"/*CT.{self._CTID}({args})*/"
+
+    @classmethod
+    def _parse_from_sql(cls, transform_sql: str) -> tuple[base.BaseTransformer, str]:
+        m = CUSTOM_TRANSFORMER_SQL_RX.match(transform_sql)
+        if m and m.group("id").strip() != cls._CTID:
+            raise ValueError("understand() does not match _parse_from_sql!")
+        args = m.group("config").strip()
+        if args != "":
+            config = json.loads(args)
+        else:
+            config = None
+        sql = m.group("sql").strip()
+        return cls.custom_parse_from_sql(config, sql)
+
+    @abc.abstractclassmethod
+    def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str):
+        """
+        return transformer instance and column name
+        """
+        pass
+
+    def _keys(self):
+        """
+        this method is used for merging multiple transformers together.
+        So, keys can be derived from get_persistent_config()
+        """
+        return self._get_pc_as_args()
+
+    # CustomTransformers are thought to be used inside a column transformer.
+    # So there is no need to implement fit() and transform() directly.
+    # This can lead to problems if merg() detects only 1 transformer left.
+    def fit(self, y: Union[bpd.DataFrame, bpd.Series]) -> base.BaseTransformer:
+        raise NotImplementedError("Unsupported")
+    def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+        raise NotImplementedError("Unsupported")
+
+
 @log_adapter.class_logger
 class ColumnTransformer(
     base.Transformer,
@@ -130,10 +232,7 @@ def camel_to_snake(name):
             if "transformSql" not in transform_col_dict:
                 continue
             transform_sql: str = transform_col_dict["transformSql"]
-            if not transform_sql.startswith("ML."):
-                continue
-            output_names.append(transform_col_dict["name"])
 
             found_transformer = False
             for prefix in _BQML_TRANSFROM_TYPE_MAPPING:
                 if transform_sql.startswith(prefix):
@@ -147,11 +246,28 @@
                     found_transformer = True
                     break
 
+            if not found_transformer:
+                transformer_cls = CustomTransformer.find_matching_transformer(transform_sql)
+                if transformer_cls:
+                    transformers_set.add(
+                        (
+                            camel_to_snake(transformer_cls.__name__),
+                            *transformer_cls._parse_from_sql(transform_sql),  # type: ignore
+                        )
+                    )
+                    found_transformer = True
+
             if not found_transformer:
+                if not transform_sql.startswith("ML.") and not transform_sql.startswith("/*CT."):
+                    continue  # ignore other patterns, only report unhandled known patterns
                 raise NotImplementedError(
                     f"Unsupported transformer type. {constants.FEEDBACK_LINK}"
                 )
 
+            output_names.append(transform_col_dict["name"])
+
         transformer = cls(transformers=list(transformers_set))
         transformer._output_names = output_names
@@ -167,6 +283,8 @@ def _merge(
         assert len(transformers) > 0
 
         _, transformer_0, column_0 = transformers[0]
+        if isinstance(transformer_0, CustomTransformer):
+            return self  # CustomTransformers only work inside ColumnTransformer
         feature_columns_sorted = sorted(
             [
                 cast(str, feature_column.name)
@@ -234,6 +352,26 @@ def fit(
         self._extract_output_names()
         return self
 
+    # Overwrite the implementation in BaseTransformer, as it only supports the "ML." transformers.
+    def _extract_output_names(self):
+        """Extract transform output column names. Save the results to self._output_names."""
+        assert self._bqml_model is not None
+
+        output_names = []
+        for transform_col in self._bqml_model._model._properties["transformColumns"]:
+            transform_col_dict = cast(dict, transform_col)
+            # pass the columns that are not transformed
+            if "transformSql" not in transform_col_dict:
+                continue
+            transform_sql: str = transform_col_dict["transformSql"]
+            if not transform_sql.startswith("ML."):
+                if not CustomTransformer.find_matching_transformer(transform_sql):
+                    continue
+
+            output_names.append(transform_col_dict["name"])
+
+        self._output_names = output_names
+
     def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
         if not self._bqml_model:
             raise RuntimeError("Must be fitted before transform")
@@ -245,3 +383,4 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
         bpd.DataFrame,
         df[self._output_names],
     )
+

From 72510d4fdde777e6a60a35d427424544241b34ed Mon Sep 17 00:00:00 2001
From: Ferenc Hechler
Date: Wed, 4 Sep 2024 17:15:42 +0200
Subject: [PATCH 02/40] allow numbers in Custom-Transformer-IDs.
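
The ID grammar is widened from letters only to a leading letter followed by letters or
digits, so IDs such as LEN1 and LEN2 are now accepted. A minimal sketch of the new
behaviour (the pattern is the one from this patch, with the group names that the
m.group(...) calls in compose.py refer to; the sample SQL strings are illustrative):

    import re

    CUSTOM_TRANSFORMER_SQL_RX = re.compile(
        "^(?P<sql>.*)/[*]CT.(?P<id>[A-Z]+[A-Z0-9]*)[(](?P<config>[^*]*)[)][*]/$",
        re.IGNORECASE,
    )

    # an ID with trailing digits now matches, and its parts are captured
    m = CUSTOM_TRANSFORMER_SQL_RX.match("LENGTH(col) /*CT.LEN2([99])*/")
    assert m and m.group("id") == "LEN2" and m.group("config") == "[99]"
    # plain SQL without the marker comment still does not match
    assert CUSTOM_TRANSFORMER_SQL_RX.match("LENGTH(col)") is None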
--- bigframes/ml/compose.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index e083b65598..0f2f0b778f 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -48,7 +48,7 @@ ) -CUSTOM_TRANSFORMER_SQL_RX = re.compile("^(?P.*)/[*]CT.(?P[A-Z]+)[(](?P[^*]*)[)][*]/$", re.IGNORECASE) +CUSTOM_TRANSFORMER_SQL_RX = re.compile("^(?P.*)/[*]CT.(?P[A-Z]+[A-Z0-9]*)[(](?P[^*]*)[)][*]/$", re.IGNORECASE) class CustomTransformer(base.BaseTransformer): _CTID = None @@ -87,29 +87,24 @@ def _compile_to_sql(self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = N def get_target_column_name(self, column:str): return f"{self._CTID.lower()}_{column}" - @classmethod - def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: - pass - - @abc.abstractmethod + @abc.abstractclassmethod def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: pass - def get_persistent_config(self) -> Optional[Union[Dict, List]]: + def get_persistent_config(self, column: str) -> Optional[Union[Dict, List]]: """ return structure to be persisted in the comment of the sql """ return None - def _get_pc_as_args(self) -> str: - config = self.get_persistent_config() + def _get_pc_as_args(self, column:str) -> str: + config = self.get_persistent_config(column) if not config: return "" return json.dumps(config) - def _get_sql_comment(self, column:str): - args = self._get_pc_as_args() + args = self._get_pc_as_args(column) return f"/*CT.{self._CTID}({args})*/" @classmethod @@ -133,11 +128,7 @@ def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str): pass def _keys(self): - """ - this method is used for merging multiple transformers together. - So, keys can be derived from get_persistent_config() - """ - return self._get_pc_as_args() + return () # CustomTransformers are thought to be used inside a column transformer. # So there is no need to implement fit() and transform() directly. From 43ba050b063a8a77b3a26ea5879af790aae04835 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Wed, 4 Sep 2024 18:50:33 +0200 Subject: [PATCH 03/40] comment was moved to the end of the sql. --- bigframes/ml/compose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 0f2f0b778f..68e54ecc43 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -251,7 +251,7 @@ def camel_to_snake(name): if not found_transformer: - if not transform_sql.startswith("ML.") and not transform_sql.startswith("/*CT."): + if not transform_sql.startswith("ML.") and not "/*CT." in transform_sql: continue # ignore other patterns, only report unhandled known patterns raise NotImplementedError( f"Unsupported transformer type. {constants.FEEDBACK_LINK}" From d6e7bb23a277ad4bbb664523438e160007b31de2 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Wed, 4 Sep 2024 18:55:45 +0200 Subject: [PATCH 04/40] Do not offer the feedback link for missing custom transformers. --- bigframes/ml/compose.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 68e54ecc43..ec8cfbbf4c 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -253,8 +253,12 @@ def camel_to_snake(name): if not found_transformer: if not transform_sql.startswith("ML.") and not "/*CT." 
in transform_sql: continue # ignore other patterns, only report unhandled known patterns - raise NotImplementedError( - f"Unsupported transformer type. {constants.FEEDBACK_LINK}" + if transform_sql.startswith("ML."): + raise NotImplementedError( + f"Unsupported transformer type. {constants.FEEDBACK_LINK}" + ) + raise ValueError( + f"Missing custom transformer" ) output_names.append(transform_col_dict["name"]) From 24e39ec6593ae5851d38c08dd658021345195a11 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Thu, 5 Sep 2024 07:58:47 +0200 Subject: [PATCH 05/40] cleanup typing hints. --- bigframes/ml/compose.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index ec8cfbbf4c..e96300e705 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -84,7 +84,7 @@ def _compile_to_sql(self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = N for column in columns ] - def get_target_column_name(self, column:str): + def get_target_column_name(self, column:str) -> str: return f"{self._CTID.lower()}_{column}" @abc.abstractclassmethod @@ -103,12 +103,12 @@ def _get_pc_as_args(self, column:str) -> str: return "" return json.dumps(config) - def _get_sql_comment(self, column:str): + def _get_sql_comment(self, column:str) -> str: args = self._get_pc_as_args(column) return f"/*CT.{self._CTID}({args})*/" @classmethod - def _parse_from_sql(cls, transform_sql: str) -> tuple[base.BaseTransformer, str]: + def _parse_from_sql(cls, transform_sql: str) -> Tuple[base.BaseTransformer, str]: m = CUSTOM_TRANSFORMER_SQL_RX.match(transform_sql) if m and m.group("id").strip() != cls._CTID: raise ValueError("understand() does not match _parse_from_sql!") @@ -121,7 +121,7 @@ def _parse_from_sql(cls, transform_sql: str) -> tuple[base.BaseTransformer, str] return cls.custom_parse_from_sql(config, sql) @abc.abstractclassmethod - def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str): + def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> Tuple[base.BaseTransformer, str]: """ return transformer instance and column name """ @@ -132,7 +132,8 @@ def _keys(self): # CustomTransformers are thought to be used inside a column transformer. # So there is no need to implement fit() and transform() directly. - # This can lead to problems if merg() detects only 1 transformer left. + # ColumnTransformer.merge() takes care, that a single custom transformer + # is not returned as a standalone transformer. def fit(self, y: Union[bpd.DataFrame, bpd.Series]) -> base.BaseTransformer: raise NotImplementedError("Unsupported") def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: From 1fd25de124d826fa50a8c792437959148fd96b78 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Thu, 5 Sep 2024 21:42:26 +0200 Subject: [PATCH 06/40] Add unit tests for CustomTransformer. 
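
The new helper module registers three sample transformers (IdentityTransformer,
Length1Transformer, Length2Transformer) that the tests exercise end to end: compiling
to SQL with the /*CT.<id>(...)*/ marker, looking the class up again via
find_matching_transformer(), and parsing an instance back out of the SQL. For
orientation, a minimal sketch of the subclass contract (UpperTransformer and its regex
are illustrative only and are not part of the test suite):

    import re
    from typing import Dict, List, Optional, Union

    import bigframes.pandas as bpd
    from bigframes.ml.compose import CustomTransformer

    class UpperTransformer(CustomTransformer):
        _CTID = "UPPER"  # unique ID embedded in the /*CT.UPPER(...)*/ comment
        UPPER_BQSQL_RX = re.compile(
            "^UPPER[(](?P<colname>[a-z][a-z0-9_]*)[)]$", flags=re.IGNORECASE
        )

        def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str:
            return f"UPPER({column})"

        @classmethod
        def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str):
            # recover the input column name from the compiled SQL
            return cls(), cls.UPPER_BQSQL_RX.match(sql).group("colname")

    CustomTransformer.register(UpperTransformer)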
---
 tests/unit/ml/compose_custom_transformers.py |  75 +++++
 tests/unit/ml/test_compose.py                | 273 +++++++++++++++++++
 2 files changed, 348 insertions(+)
 create mode 100644 tests/unit/ml/compose_custom_transformers.py

diff --git a/tests/unit/ml/compose_custom_transformers.py b/tests/unit/ml/compose_custom_transformers.py
new file mode 100644
index 0000000000..8cec0eca82
--- /dev/null
+++ b/tests/unit/ml/compose_custom_transformers.py
@@ -0,0 +1,75 @@
+import bigframes.pandas as bpd
+from bigframes.ml.compose import CustomTransformer
+from typing import List, Optional, Union, Dict
+import re
+
+
+
+class IdentityTransformer(CustomTransformer):
+    _CTID = "IDENT"
+    IDENT_BQSQL_RX = re.compile("^(?P<colname>[a-z][a-z0-9_]+)$", flags=re.IGNORECASE)
+
+    def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str:
+        return f"{column}"
+
+    @classmethod
+    def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> tuple[CustomTransformer, str]:
+        col_label = cls.IDENT_BQSQL_RX.match(sql).group("colname")
+        return cls(), col_label
+
+CustomTransformer.register(IdentityTransformer)
+
+
+class Length1Transformer(CustomTransformer):
+    _CTID = "LEN1"
+    _DEFAULT_VALUE_DEFAULT = -1
+    LEN1_BQSQL_RX = re.compile("^CASE WHEN (?P<colname>[a-z][a-z0-9_]*) IS NULL THEN (?P<defaultvalue>[-]?[0-9]+) ELSE LENGTH[(](?P=colname)[)] END$", flags=re.IGNORECASE)
+
+    def __init__(self, default_value: Optional[int] = None):
+        self.default_value = default_value
+
+    def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str:
+        default_value = (
+            self.default_value
+            if self.default_value is not None
+            else Length1Transformer._DEFAULT_VALUE_DEFAULT
+        )
+        return f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END"
+
+    @classmethod
+    def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> tuple[CustomTransformer, str]:
+        m = cls.LEN1_BQSQL_RX.match(sql)
+        col_label = m.group("colname")
+        default_value = int(m.group("defaultvalue"))
+        return cls(default_value), col_label
+
+CustomTransformer.register(Length1Transformer)
+
+
+class Length2Transformer(CustomTransformer):
+    _CTID = "LEN2"
+    _DEFAULT_VALUE_DEFAULT = -1
+    LEN2_BQSQL_RX = re.compile("^CASE WHEN (?P<colname>[a-z][a-z0-9_]*) .*$", flags=re.IGNORECASE)
+
+    def __init__(self, default_value: Optional[int] = None):
+        self.default_value = default_value
+
+    def get_persistent_config(self, column: str) -> Optional[Union[Dict, List]]:
+        return [self.default_value]
+
+    def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str:
+        default_value = (
+            self.default_value
+            if self.default_value is not None
+            else Length2Transformer._DEFAULT_VALUE_DEFAULT
+        )
+        return f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END"
+
+    @classmethod
+    def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> tuple[CustomTransformer, str]:
+        col_label = cls.LEN2_BQSQL_RX.match(sql).group("colname")
+        default_value = config[0]  # get default value from persistent_config
+        return cls(default_value), col_label
+
+CustomTransformer.register(Length2Transformer)
+
diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py
index 60dcc75b63..63e0c89835 100644
--- a/tests/unit/ml/test_compose.py
+++ b/tests/unit/ml/test_compose.py
@@ -11,11 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import pytest import sklearn.compose as sklearn_compose # type: ignore import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing +from bigframes.ml.compose import ColumnTransformer + +from tests.unit.ml.compose_custom_transformers import IdentityTransformer, Length1Transformer, Length2Transformer + +from google.cloud import bigquery +from unittest import mock +from bigframes.ml.preprocessing import LabelEncoder def test_columntransformer_init_expectedtransforms(): @@ -173,3 +181,268 @@ def test_columntransformer_repr_matches_sklearn(): ) assert bf_column_transformer.__repr__() == sk_column_transformer.__repr__() + + +def test_columntransformer_init_with_customtransforms(): + ident_transformer = IdentityTransformer() + len1_transformer = Length1Transformer(-2) + len2_transformer = Length2Transformer(99) + label_transformer = preprocessing.LabelEncoder() + column_transformer = compose.ColumnTransformer( + [ + ("ident_trafo", ident_transformer, ["culmen_length_mm", "flipper_length_mm"]), + ( + "len1_trafo", + len1_transformer, + ["species"], + ), + ( + "len2_trafo", + len2_transformer, + ["species"], + ), + ("label", label_transformer, "species"), + ] + ) + + assert column_transformer.transformers_ == [ + ("ident_trafo", ident_transformer, "culmen_length_mm"), + ("ident_trafo", ident_transformer, "flipper_length_mm"), + ("len1_trafo", len1_transformer, "species"), + ("len2_trafo", len2_transformer, "species"), + ("label", label_transformer, "species"), + ] + +def test_columntransformer_repr_customtransforms(): + ident_transformer = IdentityTransformer() + len1_transformer = Length1Transformer(-2) + len2_transformer = Length2Transformer(99) + label_transformer = preprocessing.LabelEncoder() + column_transformer = compose.ColumnTransformer( + [ + ("ident_trafo", ident_transformer, ["culmen_length_mm", "flipper_length_mm"]), + ("len1_trafo", len1_transformer, ["species"]), + ("len2_trafo", len2_transformer, ["species"]), + ("label", label_transformer, "species") + ] + ) + + assert ( + column_transformer.__repr__() + == """ColumnTransformer(transformers=[('ident_trafo', IdentityTransformer(), + ['culmen_length_mm', 'flipper_length_mm']), + ('len1_trafo', + Length1Transformer(default_value=-2), + ['species']), + ('len2_trafo', + Length2Transformer(default_value=99), + ['species']), + ('label', LabelEncoder(), 'species')])""" + ) + + +IDENT_SQL = "column /*CT.IDENT()*/" +LEN1_SQL = "CASE WHEN column IS NULL THEN -5 ELSE LENGTH(column) END /*CT.LEN1()*/" +LEN2_SQL = "CASE WHEN column IS NULL THEN 99 ELSE LENGTH(column) END /*CT.LEN2([99])*/" +UNKNOWN_CT_SQL = "column /*CT.UNKNOWN()*/" +FOREIGN_SQL = "column" + +def test_customtransformer_registry(): + + compose.CustomTransformer.register(IdentityTransformer) + compose.CustomTransformer.register(Length1Transformer) + compose.CustomTransformer.register(Length2Transformer) + + transformer_cls = compose.CustomTransformer.find_matching_transformer(IDENT_SQL) + assert transformer_cls == IdentityTransformer + + transformer_cls = compose.CustomTransformer.find_matching_transformer(LEN1_SQL) + assert transformer_cls == Length1Transformer + + transformer_cls = compose.CustomTransformer.find_matching_transformer(LEN2_SQL) + assert transformer_cls == Length2Transformer + + transformer_cls = compose.CustomTransformer.find_matching_transformer(UNKNOWN_CT_SQL) + assert transformer_cls == None + + transformer_cls = compose.CustomTransformer.find_matching_transformer(FOREIGN_SQL) + assert transformer_cls == 
None + + +def test_customtransformer_compile_sql(): + + ident_trafo = IdentityTransformer() + sqls = ident_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) + assert sqls == [ + 'col1 /*CT.IDENT()*/ AS ident_col1', + 'col2 /*CT.IDENT()*/ AS ident_col2' + ] + + len1_trafo = Length1Transformer(-5) + sqls = len1_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) + assert sqls == [ + 'CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END /*CT.LEN1()*/ AS len1_col1', + 'CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END /*CT.LEN1()*/ AS len1_col2' + ] + + len2_trafo = Length2Transformer(99) + sqls = len2_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) + assert sqls == [ + 'CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END /*CT.LEN2([99])*/ AS len2_col1', + 'CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END /*CT.LEN2([99])*/ AS len2_col2' + ] + + +def test_customtransformer_parse_sql(): + + ident_trafo, col_label = IdentityTransformer._parse_from_sql(IDENT_SQL) + assert col_label == "column" + assert ident_trafo + assert isinstance(ident_trafo, IdentityTransformer) + + len1_trafo, col_label = Length1Transformer._parse_from_sql(LEN1_SQL) + assert col_label == "column" + assert len1_trafo + assert isinstance(len1_trafo, Length1Transformer) + assert len1_trafo.default_value == -5 + + len2_trafo, col_label = Length2Transformer._parse_from_sql(LEN2_SQL) + assert col_label == "column" + assert len2_trafo + assert isinstance(len2_trafo, Length2Transformer) + assert len2_trafo.default_value == 99 + + fake_len2_sql = LEN2_SQL.replace("/*CT.LEN2([99])*/", "/*CT.LEN2([77])*/") + len2_trafo, col_label = Length2Transformer._parse_from_sql(fake_len2_sql) + assert col_label == "column" + assert len2_trafo + assert isinstance(len2_trafo, Length2Transformer) + assert len2_trafo.default_value == 77 + + + + +def create_bq_model_mock(transform_columns, feature_columns=None): + class _NameClass: + def __init__(self, name): + self.name = name + properties = {"transformColumns": transform_columns} + mock_bq_model = bigquery.Model("model_project.model_dataset.model_id") + type(mock_bq_model)._properties = mock.PropertyMock(return_value=properties) + if feature_columns: + type(mock_bq_model).feature_columns = mock.PropertyMock(return_value=[_NameClass(col) for col in feature_columns]) + return mock_bq_model + + +@pytest.fixture +def bq_model_good(): + return create_bq_model_mock([ + {'name': 'ident_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm /*CT.IDENT()*/'}, + {'name': 'ident_flipper_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'flipper_length_mm /*CT.IDENT()*/'}, + {'name': 'len1_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'CASE WHEN species IS NULL THEN -5 ELSE LENGTH(species) END /*CT.LEN1()*/'}, + {'name': 'len2_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'CASE WHEN species IS NULL THEN 99 ELSE LENGTH(address) END /*CT.LEN2([99])*/'}, + {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, + {'name': 'labelencoded_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(species, 1000000, 0) OVER()'} + ]) + +@pytest.fixture +def bq_model_merge(): + return create_bq_model_mock([ + {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, + {'name': 'labelencoded_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(species, 1000000, 0) 
OVER()'} + ], + ["county", "species"]) + +@pytest.fixture +def bq_model_no_merge(): + return create_bq_model_mock([ + {'name': 'ident_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm /*CT.IDENT()*/'} + ], + ["culmen_length_mm"]) + +@pytest.fixture +def bq_model_unknown_CT(): + return create_bq_model_mock([ + {'name': 'unknownct_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm /*CT.UNKNOWN()*/'}, + {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, + ]) + +@pytest.fixture +def bq_model_unknown_ML(): + return create_bq_model_mock([ + {'name': 'unknownml_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.UNKNOWN(culmen_length_mm)'}, + {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, + ]) + +@pytest.fixture +def bq_model_foreign(): + return create_bq_model_mock([ + {'name': 'foreign_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm+1'}, + {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, + ]) + + +def test_columntransformer_extract_from_bq_model_good(bq_model_good): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) + assert len(col_trans.transformers) == 6 + # normalize the representation for string comparing + col_trans.transformers.sort() + actual = col_trans.__repr__() + expected = """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), + 'culmen_length_mm'), + ('identity_transformer', IdentityTransformer(), + 'flipper_length_mm'), + ('label_encoder', + LabelEncoder(max_categories=1000001, + min_frequency=0), + 'county'), + ('label_encoder', + LabelEncoder(max_categories=1000001, + min_frequency=0), + 'species'), + ('length1_transformer', + Length1Transformer(default_value=-5), + 'species'), + ('length2_transformer', + Length2Transformer(default_value=99), + 'species')])""" + assert expected == actual + + +def test_columntransformer_extract_from_bq_model_merge(bq_model_merge): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_merge) + assert isinstance(col_trans, ColumnTransformer) + col_trans = col_trans._merge(bq_model_merge) + assert isinstance(col_trans, LabelEncoder) + assert col_trans.__repr__() == """LabelEncoder(max_categories=1000001, min_frequency=0)""" + assert col_trans._output_names == ['labelencoded_county', 'labelencoded_species'] + +def test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_no_merge) + col_trans = col_trans._merge(bq_model_no_merge) + assert isinstance(col_trans, ColumnTransformer) + assert col_trans.__repr__() == """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), + 'culmen_length_mm')])""" + + +def test_columntransformer_extract_from_bq_model_unknown_CT(bq_model_unknown_CT): + try: + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_unknown_CT) + assert False + except ValueError as e: + assert "Missing custom transformer" == e.args[0] + +def test_columntransformer_extract_from_bq_model_unknown_ML(bq_model_unknown_ML): + try: + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_unknown_ML) + assert False + except NotImplementedError as e: + assert "Unsupported transformer type" in e.args[0] + +def 
test_columntransformer_extract_from_bq_model_foreign(bq_model_foreign): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_foreign) + assert col_trans.__repr__() == """ColumnTransformer(transformers=[('label_encoder', + LabelEncoder(max_categories=1000001, + min_frequency=0), + 'county')])""" + From 6dd90381cb87a6cfa43c19076628023de69d348e Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Fri, 6 Sep 2024 00:19:06 +0200 Subject: [PATCH 07/40] added unit tests for _extract_output_names() and _compile_to_sql(). --- tests/unit/ml/test_compose.py | 52 +++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 63e0c89835..6fe558e7b6 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -191,16 +191,8 @@ def test_columntransformer_init_with_customtransforms(): column_transformer = compose.ColumnTransformer( [ ("ident_trafo", ident_transformer, ["culmen_length_mm", "flipper_length_mm"]), - ( - "len1_trafo", - len1_transformer, - ["species"], - ), - ( - "len2_trafo", - len2_transformer, - ["species"], - ), + ("len1_trafo", len1_transformer, ["species"]), + ("len2_trafo", len2_transformer, ["species"]), ("label", label_transformer, "species"), ] ) @@ -424,7 +416,6 @@ def test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): assert col_trans.__repr__() == """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), 'culmen_length_mm')])""" - def test_columntransformer_extract_from_bq_model_unknown_CT(bq_model_unknown_CT): try: col_trans = ColumnTransformer._extract_from_bq_model(bq_model_unknown_CT) @@ -446,3 +437,42 @@ def test_columntransformer_extract_from_bq_model_foreign(bq_model_foreign): min_frequency=0), 'county')])""" + +def test_columntransformer_extract_output_names(bq_model_good): + class BQMLModel: + def __init__(self, bq_model): + self._model = bq_model + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) + col_trans._bqml_model = BQMLModel(bq_model_good) + col_trans._extract_output_names() + assert col_trans._output_names == [ + 'ident_culmen_length_mm', + 'ident_flipper_length_mm', + 'len1_species', + 'len2_species', + 'labelencoded_county', + 'labelencoded_species' + ] + + +def test_columntransformer_compile_to_sql(): + ident_transformer = IdentityTransformer() + len1_transformer = Length1Transformer(-2) + len2_transformer = Length2Transformer(99) + label_transformer = preprocessing.LabelEncoder() + column_transformer = compose.ColumnTransformer( + [ + ("ident_trafo", ident_transformer, ["culmen_length_mm", "flipper_length_mm"]), + ("len1_trafo", len1_transformer, ["species"]), + ("len2_trafo", len2_transformer, ["species"]), + ("label", label_transformer, "species"), + ] + ) + sqls = column_transformer._compile_to_sql(None) + assert sqls == [ + 'culmen_length_mm /*CT.IDENT()*/ AS ident_culmen_length_mm', + 'flipper_length_mm /*CT.IDENT()*/ AS ident_flipper_length_mm', + 'CASE WHEN species IS NULL THEN -2 ELSE LENGTH(species) END /*CT.LEN1()*/ AS len1_species', + 'CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/ AS len2_species', + 'ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species' + ] From 9d8d8c49ee55a5b0f48dbf149fd83d14c4e043cb Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Fri, 6 Sep 2024 00:41:44 +0200 Subject: [PATCH 08/40] run black and flake8 linter. 
--- bigframes/ml/compose.py | 65 +++-- tests/unit/ml/compose_custom_transformers.py | 44 +++- tests/unit/ml/test_compose.py | 262 +++++++++++++------ 3 files changed, 256 insertions(+), 115 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index e96300e705..005fffdaed 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -48,23 +48,32 @@ ) -CUSTOM_TRANSFORMER_SQL_RX = re.compile("^(?P.*)/[*]CT.(?P[A-Z]+[A-Z0-9]*)[(](?P[^*]*)[)][*]/$", re.IGNORECASE) +CUSTOM_TRANSFORMER_SQL_RX = re.compile( + "^(?P.*)/[*]CT.(?P[A-Z]+[A-Z0-9]*)[(](?P[^*]*)[)][*]/$", + re.IGNORECASE, +) + class CustomTransformer(base.BaseTransformer): _CTID = None _custom_transformer_classes = {} + @classmethod - def register(cls, transformer_cls:Type[base.BaseTransformer]): + def register(cls, transformer_cls: Type[base.BaseTransformer]): assert transformer_cls._CTID cls._custom_transformer_classes[transformer_cls._CTID] = transformer_cls + @classmethod - def find_matching_transformer(cls, transform_sql:str) -> Optional[Type[base.BaseTransformer]]: + def find_matching_transformer( + cls, transform_sql: str + ) -> Optional[Type[base.BaseTransformer]]: for transform_cls in cls._custom_transformer_classes.values(): if transform_cls.understands(transform_sql): return transform_cls return None + @classmethod - def understands(cls, transform_sql:str) -> bool: + def understands(cls, transform_sql: str) -> bool: """ may be overwritten to have a more advanced matching, possibly without comments in SQL """ @@ -72,38 +81,40 @@ def understands(cls, transform_sql:str) -> bool: if m and m.group("id").strip() == cls._CTID: return True return False - + def __init__(self): super().__init__() - def _compile_to_sql(self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None) -> List[str]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: if columns is None: columns = X.columns return [ - f"{self.custom_compile_to_sql(X, column)} {self._get_sql_comment(column)} AS {self.get_target_column_name(column)}" + f"{self.custom_compile_to_sql(X, column)} {self._get_sql_comment(column)} AS {self.get_target_column_name(column)}" for column in columns ] - def get_target_column_name(self, column:str) -> str: + def get_target_column_name(self, column: str) -> str: return f"{self._CTID.lower()}_{column}" @abc.abstractclassmethod def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: pass - + def get_persistent_config(self, column: str) -> Optional[Union[Dict, List]]: """ return structure to be persisted in the comment of the sql """ return None - def _get_pc_as_args(self, column:str) -> str: + def _get_pc_as_args(self, column: str) -> str: config = self.get_persistent_config(column) if not config: return "" return json.dumps(config) - def _get_sql_comment(self, column:str) -> str: + def _get_sql_comment(self, column: str) -> str: args = self._get_pc_as_args(column) return f"/*CT.{self._CTID}({args})*/" @@ -121,7 +132,10 @@ def _parse_from_sql(cls, transform_sql: str) -> Tuple[base.BaseTransformer, str] return cls.custom_parse_from_sql(config, sql) @abc.abstractclassmethod - def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> Tuple[base.BaseTransformer, str]: + @classmethod + def custom_parse_from_sql( + cls, config: Optional[Union[Dict, List]], sql: str + ) -> Tuple[base.BaseTransformer, str]: """ return transformer instance and column name """ @@ -129,13 +143,14 @@ def custom_parse_from_sql(cls, config: 
Optional[Union[Dict, List]], sql: str) -> def _keys(self): return () - + # CustomTransformers are thought to be used inside a column transformer. # So there is no need to implement fit() and transform() directly. - # ColumnTransformer.merge() takes care, that a single custom transformer + # ColumnTransformer.merge() takes care, that a single custom transformer # is not returned as a standalone transformer. def fit(self, y: Union[bpd.DataFrame, bpd.Series]) -> base.BaseTransformer: raise NotImplementedError("Unsupported") + def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: raise NotImplementedError("Unsupported") @@ -238,9 +253,11 @@ def camel_to_snake(name): found_transformer = True break - + if not found_transformer: - transformer_cls = CustomTransformer.find_matching_transformer(transform_sql) + transformer_cls = CustomTransformer.find_matching_transformer( + transform_sql + ) if transformer_cls: transformers_set.add( ( @@ -249,18 +266,15 @@ def camel_to_snake(name): ) ) found_transformer = True - - + if not found_transformer: - if not transform_sql.startswith("ML.") and not "/*CT." in transform_sql: - continue # ignore other patterns, only report unhandled known patterns - if transform_sql.startswith("ML."): + if not transform_sql.startswith("ML.") and "/*CT." not in transform_sql: + continue # ignore other patterns, only report unhandled known patterns + if transform_sql.startswith("ML."): raise NotImplementedError( f"Unsupported transformer type. {constants.FEEDBACK_LINK}" ) - raise ValueError( - f"Missing custom transformer" - ) + raise ValueError("Missing custom transformer") output_names.append(transform_col_dict["name"]) @@ -280,7 +294,7 @@ def _merge( assert len(transformers) > 0 _, transformer_0, column_0 = transformers[0] if isinstance(transformer_0, CustomTransformer): - return self # CustomTransformers only work inside ColumnTransformer + return self # CustomTransformers only work inside ColumnTransformer feature_columns_sorted = sorted( [ cast(str, feature_column.name) @@ -379,4 +393,3 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) - diff --git a/tests/unit/ml/compose_custom_transformers.py b/tests/unit/ml/compose_custom_transformers.py index 8cec0eca82..2034f92001 100644 --- a/tests/unit/ml/compose_custom_transformers.py +++ b/tests/unit/ml/compose_custom_transformers.py @@ -4,7 +4,6 @@ import re - class IdentityTransformer(CustomTransformer): _CTID = "IDENT" IDENT_BQSQL_RX = re.compile("^(?P[a-z][a-z0-9_]+)$", flags=re.IGNORECASE) @@ -13,18 +12,24 @@ def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: return f"{column}" @classmethod - def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> tuple[CustomTransformer, str]: + def custom_parse_from_sql( + cls, config: Optional[Union[Dict, List]], sql: str + ) -> tuple[CustomTransformer, str]: col_label = cls.IDENT_BQSQL_RX.match(sql).group("colname") return cls(), col_label + CustomTransformer.register(IdentityTransformer) class Length1Transformer(CustomTransformer): _CTID = "LEN1" _DEFAULT_VALUE_DEFAULT = -1 - LEN1_BQSQL_RX = re.compile("^CASE WHEN (?P[a-z][a-z0-9_]*) IS NULL THEN (?P[-]?[0-9]+) ELSE LENGTH[(](?P=colname)[)] END$", flags=re.IGNORECASE) - + LEN1_BQSQL_RX = re.compile( + "^CASE WHEN (?P[a-z][a-z0-9_]*) IS NULL THEN (?P[-]?[0-9]+) ELSE LENGTH[(](?P=colname)[)] END$", + flags=re.IGNORECASE, + ) + def __init__(self, default_value: Optional[int] = None): self.default_value = 
default_value @@ -34,23 +39,30 @@ def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: if self.default_value is not None else Length1Transformer._DEFAULT_VALUE_DEFAULT ) - return f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END" - + return ( + f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END" + ) + @classmethod - def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> tuple[CustomTransformer, str]: + def custom_parse_from_sql( + cls, config: Optional[Union[Dict, List]], sql: str + ) -> tuple[CustomTransformer, str]: m = cls.LEN1_BQSQL_RX.match(sql) col_label = m.group("colname") default_value = int(m.group("defaultvalue")) return cls(default_value), col_label + CustomTransformer.register(Length1Transformer) class Length2Transformer(CustomTransformer): _CTID = "LEN2" _DEFAULT_VALUE_DEFAULT = -1 - LEN2_BQSQL_RX = re.compile("^CASE WHEN (?P[a-z][a-z0-9_]*) .*$", flags=re.IGNORECASE) - + LEN2_BQSQL_RX = re.compile( + "^CASE WHEN (?P[a-z][a-z0-9_]*) .*$", flags=re.IGNORECASE + ) + def __init__(self, default_value: Optional[int] = None): self.default_value = default_value @@ -63,13 +75,17 @@ def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: if self.default_value is not None else Length2Transformer._DEFAULT_VALUE_DEFAULT ) - return f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END" - + return ( + f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END" + ) + @classmethod - def custom_parse_from_sql(cls, config: Optional[Union[Dict, List]], sql: str) -> tuple[CustomTransformer, str]: + def custom_parse_from_sql( + cls, config: Optional[Union[Dict, List]], sql: str + ) -> tuple[CustomTransformer, str]: col_label = cls.LEN2_BQSQL_RX.match(sql).group("colname") - default_value = config[0] # get default value from persistent_config + default_value = config[0] # get default value from persistent_config return cls(default_value), col_label -CustomTransformer.register(Length2Transformer) +CustomTransformer.register(Length2Transformer) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 6fe558e7b6..c5b86ca3d4 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -19,7 +19,11 @@ from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer -from tests.unit.ml.compose_custom_transformers import IdentityTransformer, Length1Transformer, Length2Transformer +from tests.unit.ml.compose_custom_transformers import ( + IdentityTransformer, + Length1Transformer, + Length2Transformer, +) from google.cloud import bigquery from unittest import mock @@ -190,7 +194,11 @@ def test_columntransformer_init_with_customtransforms(): label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ - ("ident_trafo", ident_transformer, ["culmen_length_mm", "flipper_length_mm"]), + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), ("len1_trafo", len1_transformer, ["species"]), ("len2_trafo", len2_transformer, ["species"]), ("label", label_transformer, "species"), @@ -205,6 +213,7 @@ def test_columntransformer_init_with_customtransforms(): ("label", label_transformer, "species"), ] + def test_columntransformer_repr_customtransforms(): ident_transformer = IdentityTransformer() len1_transformer = Length1Transformer(-2) @@ -212,10 +221,14 @@ def test_columntransformer_repr_customtransforms(): 
label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ - ("ident_trafo", ident_transformer, ["culmen_length_mm", "flipper_length_mm"]), + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), ("len1_trafo", len1_transformer, ["species"]), ("len2_trafo", len2_transformer, ["species"]), - ("label", label_transformer, "species") + ("label", label_transformer, "species"), ] ) @@ -239,54 +252,57 @@ def test_columntransformer_repr_customtransforms(): UNKNOWN_CT_SQL = "column /*CT.UNKNOWN()*/" FOREIGN_SQL = "column" + def test_customtransformer_registry(): - + compose.CustomTransformer.register(IdentityTransformer) compose.CustomTransformer.register(Length1Transformer) compose.CustomTransformer.register(Length2Transformer) transformer_cls = compose.CustomTransformer.find_matching_transformer(IDENT_SQL) assert transformer_cls == IdentityTransformer - + transformer_cls = compose.CustomTransformer.find_matching_transformer(LEN1_SQL) assert transformer_cls == Length1Transformer - + transformer_cls = compose.CustomTransformer.find_matching_transformer(LEN2_SQL) assert transformer_cls == Length2Transformer - - transformer_cls = compose.CustomTransformer.find_matching_transformer(UNKNOWN_CT_SQL) - assert transformer_cls == None - + + transformer_cls = compose.CustomTransformer.find_matching_transformer( + UNKNOWN_CT_SQL + ) + assert transformer_cls is None + transformer_cls = compose.CustomTransformer.find_matching_transformer(FOREIGN_SQL) - assert transformer_cls == None + assert transformer_cls is None def test_customtransformer_compile_sql(): - + ident_trafo = IdentityTransformer() sqls = ident_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) assert sqls == [ - 'col1 /*CT.IDENT()*/ AS ident_col1', - 'col2 /*CT.IDENT()*/ AS ident_col2' + "col1 /*CT.IDENT()*/ AS ident_col1", + "col2 /*CT.IDENT()*/ AS ident_col2", ] len1_trafo = Length1Transformer(-5) sqls = len1_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) assert sqls == [ - 'CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END /*CT.LEN1()*/ AS len1_col1', - 'CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END /*CT.LEN1()*/ AS len1_col2' + "CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END /*CT.LEN1()*/ AS len1_col1", + "CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END /*CT.LEN1()*/ AS len1_col2", ] len2_trafo = Length2Transformer(99) sqls = len2_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) assert sqls == [ - 'CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END /*CT.LEN2([99])*/ AS len2_col1', - 'CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END /*CT.LEN2([99])*/ AS len2_col2' + "CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END /*CT.LEN2([99])*/ AS len2_col1", + "CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END /*CT.LEN2([99])*/ AS len2_col2", ] def test_customtransformer_parse_sql(): - + ident_trafo, col_label = IdentityTransformer._parse_from_sql(IDENT_SQL) assert col_label == "column" assert ident_trafo @@ -304,7 +320,7 @@ def test_customtransformer_parse_sql(): assert isinstance(len2_trafo, Length2Transformer) assert len2_trafo.default_value == 99 - fake_len2_sql = LEN2_SQL.replace("/*CT.LEN2([99])*/", "/*CT.LEN2([77])*/") + fake_len2_sql = LEN2_SQL.replace("/*CT.LEN2([99])*/", "/*CT.LEN2([77])*/") len2_trafo, col_label = Length2Transformer._parse_from_sql(fake_len2_sql) assert col_label == "column" assert len2_trafo @@ -312,72 +328,150 @@ def test_customtransformer_parse_sql(): assert len2_trafo.default_value == 77 - - 
def create_bq_model_mock(transform_columns, feature_columns=None): class _NameClass: def __init__(self, name): self.name = name + properties = {"transformColumns": transform_columns} mock_bq_model = bigquery.Model("model_project.model_dataset.model_id") type(mock_bq_model)._properties = mock.PropertyMock(return_value=properties) if feature_columns: - type(mock_bq_model).feature_columns = mock.PropertyMock(return_value=[_NameClass(col) for col in feature_columns]) + type(mock_bq_model).feature_columns = mock.PropertyMock( + return_value=[_NameClass(col) for col in feature_columns] + ) return mock_bq_model @pytest.fixture def bq_model_good(): - return create_bq_model_mock([ - {'name': 'ident_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm /*CT.IDENT()*/'}, - {'name': 'ident_flipper_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'flipper_length_mm /*CT.IDENT()*/'}, - {'name': 'len1_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'CASE WHEN species IS NULL THEN -5 ELSE LENGTH(species) END /*CT.LEN1()*/'}, - {'name': 'len2_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'CASE WHEN species IS NULL THEN 99 ELSE LENGTH(address) END /*CT.LEN2([99])*/'}, - {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, - {'name': 'labelencoded_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(species, 1000000, 0) OVER()'} - ]) + return create_bq_model_mock( + [ + { + "name": "ident_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm /*CT.IDENT()*/", + }, + { + "name": "ident_flipper_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "flipper_length_mm /*CT.IDENT()*/", + }, + { + "name": "len1_species", + "type": {"typeKind": "INT64"}, + "transformSql": "CASE WHEN species IS NULL THEN -5 ELSE LENGTH(species) END /*CT.LEN1()*/", + }, + { + "name": "len2_species", + "type": {"typeKind": "INT64"}, + "transformSql": "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(address) END /*CT.LEN2([99])*/", + }, + { + "name": "labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + { + "name": "labelencoded_species", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(species, 1000000, 0) OVER()", + }, + ] + ) + @pytest.fixture def bq_model_merge(): - return create_bq_model_mock([ - {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, - {'name': 'labelencoded_species', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(species, 1000000, 0) OVER()'} - ], - ["county", "species"]) + return create_bq_model_mock( + [ + { + "name": "labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + { + "name": "labelencoded_species", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(species, 1000000, 0) OVER()", + }, + ], + ["county", "species"], + ) + @pytest.fixture def bq_model_no_merge(): - return create_bq_model_mock([ - {'name': 'ident_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm /*CT.IDENT()*/'} + return create_bq_model_mock( + [ + { + "name": "ident_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm /*CT.IDENT()*/", + } ], - ["culmen_length_mm"]) + ["culmen_length_mm"], + ) + @pytest.fixture 
def bq_model_unknown_CT(): - return create_bq_model_mock([ - {'name': 'unknownct_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm /*CT.UNKNOWN()*/'}, - {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, - ]) + return create_bq_model_mock( + [ + { + "name": "unknownct_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm /*CT.UNKNOWN()*/", + }, + { + "name": "labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + ] + ) + @pytest.fixture def bq_model_unknown_ML(): - return create_bq_model_mock([ - {'name': 'unknownml_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.UNKNOWN(culmen_length_mm)'}, - {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, - ]) + return create_bq_model_mock( + [ + { + "name": "unknownml_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.UNKNOWN(culmen_length_mm)", + }, + { + "name": "labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + ] + ) + @pytest.fixture def bq_model_foreign(): - return create_bq_model_mock([ - {'name': 'foreign_culmen_length_mm', 'type': {'typeKind': 'INT64'}, 'transformSql': 'culmen_length_mm+1'}, - {'name': 'labelencoded_county', 'type': {'typeKind': 'INT64'}, 'transformSql': 'ML.LABEL_ENCODER(county, 1000000, 0) OVER()'}, - ]) + return create_bq_model_mock( + [ + { + "name": "foreign_culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm+1", + }, + { + "name": "labelencoded_county", + "type": {"typeKind": "INT64"}, + "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", + }, + ] + ) def test_columntransformer_extract_from_bq_model_good(bq_model_good): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) assert len(col_trans.transformers) == 6 - # normalize the representation for string comparing + # normalize the representation for string comparing col_trans.transformers.sort() actual = col_trans.__repr__() expected = """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), @@ -406,63 +500,81 @@ def test_columntransformer_extract_from_bq_model_merge(bq_model_merge): assert isinstance(col_trans, ColumnTransformer) col_trans = col_trans._merge(bq_model_merge) assert isinstance(col_trans, LabelEncoder) - assert col_trans.__repr__() == """LabelEncoder(max_categories=1000001, min_frequency=0)""" - assert col_trans._output_names == ['labelencoded_county', 'labelencoded_species'] - + assert ( + col_trans.__repr__() + == """LabelEncoder(max_categories=1000001, min_frequency=0)""" + ) + assert col_trans._output_names == ["labelencoded_county", "labelencoded_species"] + + def test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_no_merge) col_trans = col_trans._merge(bq_model_no_merge) assert isinstance(col_trans, ColumnTransformer) - assert col_trans.__repr__() == """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), + assert ( + col_trans.__repr__() + == """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), 'culmen_length_mm')])""" - + ) + + def test_columntransformer_extract_from_bq_model_unknown_CT(bq_model_unknown_CT): try: - 
col_trans = ColumnTransformer._extract_from_bq_model(bq_model_unknown_CT) + _ = ColumnTransformer._extract_from_bq_model(bq_model_unknown_CT) assert False except ValueError as e: assert "Missing custom transformer" == e.args[0] + def test_columntransformer_extract_from_bq_model_unknown_ML(bq_model_unknown_ML): try: - col_trans = ColumnTransformer._extract_from_bq_model(bq_model_unknown_ML) + _ = ColumnTransformer._extract_from_bq_model(bq_model_unknown_ML) assert False except NotImplementedError as e: assert "Unsupported transformer type" in e.args[0] + def test_columntransformer_extract_from_bq_model_foreign(bq_model_foreign): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_foreign) - assert col_trans.__repr__() == """ColumnTransformer(transformers=[('label_encoder', + assert ( + col_trans.__repr__() + == """ColumnTransformer(transformers=[('label_encoder', LabelEncoder(max_categories=1000001, min_frequency=0), 'county')])""" + ) def test_columntransformer_extract_output_names(bq_model_good): class BQMLModel: def __init__(self, bq_model): self._model = bq_model + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) col_trans._bqml_model = BQMLModel(bq_model_good) col_trans._extract_output_names() assert col_trans._output_names == [ - 'ident_culmen_length_mm', - 'ident_flipper_length_mm', - 'len1_species', - 'len2_species', - 'labelencoded_county', - 'labelencoded_species' + "ident_culmen_length_mm", + "ident_flipper_length_mm", + "len1_species", + "len2_species", + "labelencoded_county", + "labelencoded_species", ] - -def test_columntransformer_compile_to_sql(): + +def test_columntransformer_compile_to_sql(): ident_transformer = IdentityTransformer() len1_transformer = Length1Transformer(-2) len2_transformer = Length2Transformer(99) label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ - ("ident_trafo", ident_transformer, ["culmen_length_mm", "flipper_length_mm"]), + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), ("len1_trafo", len1_transformer, ["species"]), ("len2_trafo", len2_transformer, ["species"]), ("label", label_transformer, "species"), @@ -470,9 +582,9 @@ def test_columntransformer_compile_to_sql(): ) sqls = column_transformer._compile_to_sql(None) assert sqls == [ - 'culmen_length_mm /*CT.IDENT()*/ AS ident_culmen_length_mm', - 'flipper_length_mm /*CT.IDENT()*/ AS ident_flipper_length_mm', - 'CASE WHEN species IS NULL THEN -2 ELSE LENGTH(species) END /*CT.LEN1()*/ AS len1_species', - 'CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/ AS len2_species', - 'ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species' + "culmen_length_mm /*CT.IDENT()*/ AS ident_culmen_length_mm", + "flipper_length_mm /*CT.IDENT()*/ AS ident_flipper_length_mm", + "CASE WHEN species IS NULL THEN -2 ELSE LENGTH(species) END /*CT.LEN1()*/ AS len1_species", + "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/ AS len2_species", + "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] From 2665bc78512827857b1f01cdd8580e32e87f1bd6 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Fri, 6 Sep 2024 00:44:43 +0200 Subject: [PATCH 09/40] fixed wrong @classmethod annotation. 
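
Worth noting for later cleanup (not changed by this patch): abc.abstractclassmethod
has been deprecated since Python 3.3. The modern equivalent, shown here only for
reference, stacks the two standard decorators with @classmethod outermost:

    import abc

    class Example(abc.ABC):
        @classmethod
        @abc.abstractmethod
        def custom_compile_to_sql(cls, X, column: str) -> str:
            ...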
--- bigframes/ml/compose.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 005fffdaed..efff8ac243 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -98,8 +98,9 @@ def _compile_to_sql( def get_target_column_name(self, column: str) -> str: return f"{self._CTID.lower()}_{column}" + @classmethod @abc.abstractclassmethod - def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: + def custom_compile_to_sql(cls, X: bpd.DataFrame, column: str) -> str: pass def get_persistent_config(self, column: str) -> Optional[Union[Dict, List]]: @@ -131,8 +132,8 @@ def _parse_from_sql(cls, transform_sql: str) -> Tuple[base.BaseTransformer, str] sql = m.group("sql").strip() return cls.custom_parse_from_sql(config, sql) - @abc.abstractclassmethod @classmethod + @abc.abstractclassmethod def custom_parse_from_sql( cls, config: Optional[Union[Dict, List]], sql: str ) -> Tuple[base.BaseTransformer, str]: From ffc6824213b36a5aea62c2d7d112eb62f21b0336 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Fri, 20 Sep 2024 06:39:31 +0200 Subject: [PATCH 10/40] on the way to SQLScalarColumnTransformer --- bigframes/ml/compose.py | 59 ++++++++++++------ tests/unit/ml/test_compose.py | 113 +++++++++++++--------------------- 2 files changed, 83 insertions(+), 89 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 1059929360..7ad163e88a 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -54,6 +54,31 @@ ) +class SQLScalarColumnTransformer(base.BaseTransformer): + def __init__(self, sql: str, target_column="transformed_{0}"): + super().__init__() + self.sql = sql + self.target_column = target_column + + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: + if columns is None: + columns = X.columns + result = [] + for column in columns: + current_sql = self.sql.format(column) + current_target_column = self.target_column.format(column) + result.append(f"{current_sql} AS {current_target_column}") + return result + + def _keys(self): + return (self.sql, self.target_column) + + def __lt__(self, other): + return self.target_column < other.target_column + + class CustomTransformer(base.BaseTransformer): _CTID = None _custom_transformer_classes = {} @@ -256,26 +281,24 @@ def camel_to_snake(name): break if not found_transformer: - transformer_cls = CustomTransformer.find_matching_transformer( - transform_sql - ) - if transformer_cls: - transformers_set.add( - ( - camel_to_snake(transformer_cls.__name__), - *transformer_cls._parse_from_sql(transform_sql), # type: ignore - ) - ) - found_transformer = True - - if not found_transformer: - if not transform_sql.startswith("ML.") and "/*CT." not in transform_sql: - continue # ignore other patterns, only report unhandled known patterns if transform_sql.startswith("ML."): raise NotImplementedError( f"Unsupported transformer type. {constants.FEEDBACK_LINK}" ) - raise ValueError("Missing custom transformer") + + target_column = transform_col_dict["name"] + transformer = SQLScalarColumnTransformer( + transform_sql, target_column=target_column + ) + input_column_name = "?" 
+ transformers_set.add( + ( + camel_to_snake(transformer.__class__.__name__), + transformer, + input_column_name, + ) + ) + found_transformer = True output_names.append(transform_col_dict["name"]) @@ -294,8 +317,8 @@ def _merge( assert len(transformers) > 0 _, transformer_0, column_0 = transformers[0] - if isinstance(transformer_0, CustomTransformer): - return self # CustomTransformers only work inside ColumnTransformer + if isinstance(transformer_0, SQLScalarColumnTransformer): + return self # SQLScalarColumnTransformer only work inside ColumnTransformer feature_columns_sorted = sorted( [ cast(str, feature_column.name) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index c5b86ca3d4..e8e55f0d45 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -414,24 +414,6 @@ def bq_model_no_merge(): ) -@pytest.fixture -def bq_model_unknown_CT(): - return create_bq_model_mock( - [ - { - "name": "unknownct_culmen_length_mm", - "type": {"typeKind": "INT64"}, - "transformSql": "culmen_length_mm /*CT.UNKNOWN()*/", - }, - { - "name": "labelencoded_county", - "type": {"typeKind": "INT64"}, - "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", - }, - ] - ) - - @pytest.fixture def bq_model_unknown_ML(): return create_bq_model_mock( @@ -450,35 +432,13 @@ def bq_model_unknown_ML(): ) -@pytest.fixture -def bq_model_foreign(): - return create_bq_model_mock( - [ - { - "name": "foreign_culmen_length_mm", - "type": {"typeKind": "INT64"}, - "transformSql": "culmen_length_mm+1", - }, - { - "name": "labelencoded_county", - "type": {"typeKind": "INT64"}, - "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", - }, - ] - ) - - def test_columntransformer_extract_from_bq_model_good(bq_model_good): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) assert len(col_trans.transformers) == 6 # normalize the representation for string comparing col_trans.transformers.sort() actual = col_trans.__repr__() - expected = """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), - 'culmen_length_mm'), - ('identity_transformer', IdentityTransformer(), - 'flipper_length_mm'), - ('label_encoder', + expected = """ColumnTransformer(transformers=[('label_encoder', LabelEncoder(max_categories=1000001, min_frequency=0), 'county'), @@ -486,12 +446,36 @@ def test_columntransformer_extract_from_bq_model_good(bq_model_good): LabelEncoder(max_categories=1000001, min_frequency=0), 'species'), - ('length1_transformer', - Length1Transformer(default_value=-5), - 'species'), - ('length2_transformer', - Length2Transformer(default_value=99), - 'species')])""" + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='culmen_length_mm ' + '/*CT.IDENT()*/', + target_column='ident_culmen_length_mm'), + '?'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='flipper_length_mm ' + '/*CT.IDENT()*/', + target_column='ident_flipper_length_mm'), + '?'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='CASE WHEN ' + 'species IS ' + 'NULL THEN -5 ' + 'ELSE ' + 'LENGTH(species) ' + 'END ' + '/*CT.LEN1()*/', + target_column='len1_species'), + '?'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='CASE WHEN ' + 'species IS ' + 'NULL THEN 99 ' + 'ELSE ' + 'LENGTH(address) ' + 'END ' + '/*CT.LEN2([99])*/', + target_column='len2_species'), + '?')])""" assert expected == actual @@ -511,19 +495,13 @@ def test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): 
col_trans = ColumnTransformer._extract_from_bq_model(bq_model_no_merge) col_trans = col_trans._merge(bq_model_no_merge) assert isinstance(col_trans, ColumnTransformer) - assert ( - col_trans.__repr__() - == """ColumnTransformer(transformers=[('identity_transformer', IdentityTransformer(), - 'culmen_length_mm')])""" - ) - - -def test_columntransformer_extract_from_bq_model_unknown_CT(bq_model_unknown_CT): - try: - _ = ColumnTransformer._extract_from_bq_model(bq_model_unknown_CT) - assert False - except ValueError as e: - assert "Missing custom transformer" == e.args[0] + expected = """ColumnTransformer(transformers=[('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='culmen_length_mm ' + '/*CT.IDENT()*/', + target_column='ident_culmen_length_mm'), + '?')])""" + actual = col_trans.__repr__() + assert expected == actual def test_columntransformer_extract_from_bq_model_unknown_ML(bq_model_unknown_ML): @@ -534,17 +512,6 @@ def test_columntransformer_extract_from_bq_model_unknown_ML(bq_model_unknown_ML) assert "Unsupported transformer type" in e.args[0] -def test_columntransformer_extract_from_bq_model_foreign(bq_model_foreign): - col_trans = ColumnTransformer._extract_from_bq_model(bq_model_foreign) - assert ( - col_trans.__repr__() - == """ColumnTransformer(transformers=[('label_encoder', - LabelEncoder(max_categories=1000001, - min_frequency=0), - 'county')])""" - ) - - def test_columntransformer_extract_output_names(bq_model_good): class BQMLModel: def __init__(self, bq_model): @@ -588,3 +555,7 @@ def test_columntransformer_compile_to_sql(): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/ AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] + + +if __name__ == "__main__": + pytest.main(["test_compose.py", "-s"]) From a53b51ccb359f342a0605fb8feb3b32a3fd59dda Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Fri, 20 Sep 2024 06:43:03 +0200 Subject: [PATCH 11/40] remove pytest.main call. --- tests/unit/ml/test_compose.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index e8e55f0d45..88114b3277 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -555,7 +555,3 @@ def test_columntransformer_compile_to_sql(): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/ AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] - - -if __name__ == "__main__": - pytest.main(["test_compose.py", "-s"]) From 41ca2ff1cb0e191420249b04f75edbe16a14f794 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Fri, 20 Sep 2024 11:36:30 +0200 Subject: [PATCH 12/40] remove CustomTransformer class and implementations. 
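
The comment-marker registry (CustomTransformer.register, understands, and the
/*CT.<ID>(...)*/ round-trip) is replaced by the much simpler
SQLScalarColumnTransformer, which carries the SQL template directly. A usage
sketch under the new API (column and target names are illustrative):

    from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer

    column_transformer = ColumnTransformer([
        ("len_trafo",
         SQLScalarColumnTransformer(
             "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END",
             target_column="len_{0}",
         ),
         ["species"]),
    ])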
--- bigframes/ml/compose.py | 126 +------------- tests/unit/ml/compose_custom_transformers.py | 91 ---------- tests/unit/ml/test_compose.py | 168 ++++++++----------- 3 files changed, 71 insertions(+), 314 deletions(-) delete mode 100644 tests/unit/ml/compose_custom_transformers.py diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 7ad163e88a..14bbcdfe17 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -21,9 +21,7 @@ import re import types import typing -from typing import cast, Iterable, List, Optional, Set, Tuple, Union, Dict, Type -import abc -import json +from typing import cast, Iterable, List, Optional, Set, Tuple, Union from bigframes_vendored import constants import bigframes_vendored.sklearn.compose._column_transformer @@ -48,12 +46,6 @@ ) -CUSTOM_TRANSFORMER_SQL_RX = re.compile( - "^(?P.*)/[*]CT.(?P[A-Z]+[A-Z0-9]*)[(](?P[^*]*)[)][*]/$", - re.IGNORECASE, -) - - class SQLScalarColumnTransformer(base.BaseTransformer): def __init__(self, sql: str, target_column="transformed_{0}"): super().__init__() @@ -75,111 +67,6 @@ def _compile_to_sql( def _keys(self): return (self.sql, self.target_column) - def __lt__(self, other): - return self.target_column < other.target_column - - -class CustomTransformer(base.BaseTransformer): - _CTID = None - _custom_transformer_classes = {} - - @classmethod - def register(cls, transformer_cls: Type[base.BaseTransformer]): - assert transformer_cls._CTID - cls._custom_transformer_classes[transformer_cls._CTID] = transformer_cls - - @classmethod - def find_matching_transformer( - cls, transform_sql: str - ) -> Optional[Type[base.BaseTransformer]]: - for transform_cls in cls._custom_transformer_classes.values(): - if transform_cls.understands(transform_sql): - return transform_cls - return None - - @classmethod - def understands(cls, transform_sql: str) -> bool: - """ - may be overwritten to have a more advanced matching, possibly without comments in SQL - """ - m = CUSTOM_TRANSFORMER_SQL_RX.match(transform_sql) - if m and m.group("id").strip() == cls._CTID: - return True - return False - - def __init__(self): - super().__init__() - - def _compile_to_sql( - self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None - ) -> List[str]: - if columns is None: - columns = X.columns - return [ - f"{self.custom_compile_to_sql(X, column)} {self._get_sql_comment(column)} AS {self.get_target_column_name(column)}" - for column in columns - ] - - def get_target_column_name(self, column: str) -> str: - return f"{self._CTID.lower()}_{column}" - - @classmethod - @abc.abstractclassmethod - def custom_compile_to_sql(cls, X: bpd.DataFrame, column: str) -> str: - pass - - def get_persistent_config(self, column: str) -> Optional[Union[Dict, List]]: - """ - return structure to be persisted in the comment of the sql - """ - return None - - def _get_pc_as_args(self, column: str) -> str: - config = self.get_persistent_config(column) - if not config: - return "" - return json.dumps(config) - - def _get_sql_comment(self, column: str) -> str: - args = self._get_pc_as_args(column) - return f"/*CT.{self._CTID}({args})*/" - - @classmethod - def _parse_from_sql(cls, transform_sql: str) -> Tuple[base.BaseTransformer, str]: - m = CUSTOM_TRANSFORMER_SQL_RX.match(transform_sql) - if m and m.group("id").strip() != cls._CTID: - raise ValueError("understand() does not match _parse_from_sql!") - args = m.group("config").strip() - if args != "": - config = json.loads(args) - else: - config = None - sql = m.group("sql").strip() - return 
cls.custom_parse_from_sql(config, sql) - - @classmethod - @abc.abstractclassmethod - def custom_parse_from_sql( - cls, config: Optional[Union[Dict, List]], sql: str - ) -> Tuple[base.BaseTransformer, str]: - """ - return transformer instance and column name - """ - pass - - def _keys(self): - return () - - # CustomTransformers are thought to be used inside a column transformer. - # So there is no need to implement fit() and transform() directly. - # ColumnTransformer.merge() takes care, that a single custom transformer - # is not returned as a standalone transformer. - def fit(self, y: Union[bpd.DataFrame, bpd.Series]) -> base.BaseTransformer: - raise NotImplementedError("Unsupported") - - def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: - raise NotImplementedError("Unsupported") - @log_adapter.class_logger class ColumnTransformer( @@ -266,6 +153,7 @@ def camel_to_snake(name): continue transform_sql: str = transform_col_dict["transformSql"] + output_names.append(transform_col_dict["name"]) found_transformer = False for prefix in _BQML_TRANSFROM_TYPE_MAPPING: if transform_sql.startswith(prefix): @@ -279,7 +167,6 @@ def camel_to_snake(name): found_transformer = True break - if not found_transformer: if transform_sql.startswith("ML."): raise NotImplementedError( @@ -298,9 +185,6 @@ def camel_to_snake(name): input_column_name, ) ) - found_transformer = True - - output_names.append(transform_col_dict["name"]) transformer = cls(transformers=list(transformers_set)) transformer._output_names = output_names @@ -387,6 +271,7 @@ def fit( return self # Overwrite the implementation in BaseTransformer, as it only supports the "ML." transformers. + # TODO: clarify if this should be changed in base.BaseTransformer? def _extract_output_names(self): """Extract transform output column names. 
Save the results to self._output_names.""" assert self._bqml_model is not None @@ -397,11 +282,6 @@ def _extract_output_names(self): # pass the columns that are not transformed if "transformSql" not in transform_col_dict: continue - transform_sql: str = transform_col_dict["transformSql"] - if not transform_sql.startswith("ML."): - if not CustomTransformer.find_matching_transformer(transform_sql): - continue - output_names.append(transform_col_dict["name"]) self._output_names = output_names diff --git a/tests/unit/ml/compose_custom_transformers.py b/tests/unit/ml/compose_custom_transformers.py deleted file mode 100644 index 2034f92001..0000000000 --- a/tests/unit/ml/compose_custom_transformers.py +++ /dev/null @@ -1,91 +0,0 @@ -import bigframes.pandas as bpd -from bigframes.ml.compose import CustomTransformer -from typing import List, Optional, Union, Dict -import re - - -class IdentityTransformer(CustomTransformer): - _CTID = "IDENT" - IDENT_BQSQL_RX = re.compile("^(?P[a-z][a-z0-9_]+)$", flags=re.IGNORECASE) - - def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: - return f"{column}" - - @classmethod - def custom_parse_from_sql( - cls, config: Optional[Union[Dict, List]], sql: str - ) -> tuple[CustomTransformer, str]: - col_label = cls.IDENT_BQSQL_RX.match(sql).group("colname") - return cls(), col_label - - -CustomTransformer.register(IdentityTransformer) - - -class Length1Transformer(CustomTransformer): - _CTID = "LEN1" - _DEFAULT_VALUE_DEFAULT = -1 - LEN1_BQSQL_RX = re.compile( - "^CASE WHEN (?P[a-z][a-z0-9_]*) IS NULL THEN (?P[-]?[0-9]+) ELSE LENGTH[(](?P=colname)[)] END$", - flags=re.IGNORECASE, - ) - - def __init__(self, default_value: Optional[int] = None): - self.default_value = default_value - - def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: - default_value = ( - self.default_value - if self.default_value is not None - else Length1Transformer._DEFAULT_VALUE_DEFAULT - ) - return ( - f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END" - ) - - @classmethod - def custom_parse_from_sql( - cls, config: Optional[Union[Dict, List]], sql: str - ) -> tuple[CustomTransformer, str]: - m = cls.LEN1_BQSQL_RX.match(sql) - col_label = m.group("colname") - default_value = int(m.group("defaultvalue")) - return cls(default_value), col_label - - -CustomTransformer.register(Length1Transformer) - - -class Length2Transformer(CustomTransformer): - _CTID = "LEN2" - _DEFAULT_VALUE_DEFAULT = -1 - LEN2_BQSQL_RX = re.compile( - "^CASE WHEN (?P[a-z][a-z0-9_]*) .*$", flags=re.IGNORECASE - ) - - def __init__(self, default_value: Optional[int] = None): - self.default_value = default_value - - def get_persistent_config(self, column: str) -> Optional[Union[Dict, List]]: - return [self.default_value] - - def custom_compile_to_sql(self, X: bpd.DataFrame, column: str) -> str: - default_value = ( - self.default_value - if self.default_value is not None - else Length2Transformer._DEFAULT_VALUE_DEFAULT - ) - return ( - f"CASE WHEN {column} IS NULL THEN {default_value} ELSE LENGTH({column}) END" - ) - - @classmethod - def custom_parse_from_sql( - cls, config: Optional[Union[Dict, List]], sql: str - ) -> tuple[CustomTransformer, str]: - col_label = cls.LEN2_BQSQL_RX.match(sql).group("colname") - default_value = config[0] # get default value from persistent_config - return cls(default_value), col_label - - -CustomTransformer.register(Length2Transformer) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 88114b3277..5f88a37b39 
100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -17,17 +17,10 @@ import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing -from bigframes.ml.compose import ColumnTransformer - -from tests.unit.ml.compose_custom_transformers import ( - IdentityTransformer, - Length1Transformer, - Length2Transformer, -) +from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer from google.cloud import bigquery from unittest import mock -from bigframes.ml.preprocessing import LabelEncoder def test_columntransformer_init_expectedtransforms(): @@ -188,9 +181,13 @@ def test_columntransformer_repr_matches_sklearn(): def test_columntransformer_init_with_customtransforms(): - ident_transformer = IdentityTransformer() - len1_transformer = Length1Transformer(-2) - len2_transformer = Length2Transformer(99) + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ @@ -215,9 +212,13 @@ def test_columntransformer_init_with_customtransforms(): def test_columntransformer_repr_customtransforms(): - ident_transformer = IdentityTransformer() - len1_transformer = Length1Transformer(-2) - len2_transformer = Length2Transformer(99) + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ @@ -232,18 +233,29 @@ def test_columntransformer_repr_customtransforms(): ] ) - assert ( - column_transformer.__repr__() - == """ColumnTransformer(transformers=[('ident_trafo', IdentityTransformer(), + expected = """ColumnTransformer(transformers=[('ident_trafo', + SQLScalarColumnTransformer(sql='{0}', + target_column='ident_{0}'), ['culmen_length_mm', 'flipper_length_mm']), ('len1_trafo', - Length1Transformer(default_value=-2), + SQLScalarColumnTransformer(sql='CASE WHEN {0} ' + 'IS NULL THEN ' + '-2 ELSE ' + 'LENGTH({0}) ' + 'END', + target_column='len1_{0}'), ['species']), ('len2_trafo', - Length2Transformer(default_value=99), + SQLScalarColumnTransformer(sql='CASE WHEN {0} ' + 'IS NULL THEN ' + '99 ELSE ' + 'LENGTH({0}) ' + 'END', + target_column='len2_{0}'), ['species']), ('label', LabelEncoder(), 'species')])""" - ) + actual = column_transformer.__repr__() + assert expected == actual IDENT_SQL = "column /*CT.IDENT()*/" @@ -253,81 +265,33 @@ def test_columntransformer_repr_customtransforms(): FOREIGN_SQL = "column" -def test_customtransformer_registry(): - - compose.CustomTransformer.register(IdentityTransformer) - compose.CustomTransformer.register(Length1Transformer) - compose.CustomTransformer.register(Length2Transformer) - - transformer_cls = compose.CustomTransformer.find_matching_transformer(IDENT_SQL) - assert transformer_cls == IdentityTransformer - - transformer_cls = compose.CustomTransformer.find_matching_transformer(LEN1_SQL) - assert 
transformer_cls == Length1Transformer - - transformer_cls = compose.CustomTransformer.find_matching_transformer(LEN2_SQL) - assert transformer_cls == Length2Transformer - - transformer_cls = compose.CustomTransformer.find_matching_transformer( - UNKNOWN_CT_SQL - ) - assert transformer_cls is None - - transformer_cls = compose.CustomTransformer.find_matching_transformer(FOREIGN_SQL) - assert transformer_cls is None - - def test_customtransformer_compile_sql(): - - ident_trafo = IdentityTransformer() + ident_trafo = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") sqls = ident_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) assert sqls == [ - "col1 /*CT.IDENT()*/ AS ident_col1", - "col2 /*CT.IDENT()*/ AS ident_col2", + "col1 AS ident_col1", + "col2 AS ident_col2", ] - len1_trafo = Length1Transformer(-5) + len1_trafo = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -5 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) sqls = len1_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) assert sqls == [ - "CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END /*CT.LEN1()*/ AS len1_col1", - "CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END /*CT.LEN1()*/ AS len1_col2", + "CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END AS len1_col1", + "CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END AS len1_col2", ] - len2_trafo = Length2Transformer(99) + len2_trafo = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) sqls = len2_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) assert sqls == [ - "CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END /*CT.LEN2([99])*/ AS len2_col1", - "CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END /*CT.LEN2([99])*/ AS len2_col2", + "CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END AS len2_col1", + "CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END AS len2_col2", ] -def test_customtransformer_parse_sql(): - - ident_trafo, col_label = IdentityTransformer._parse_from_sql(IDENT_SQL) - assert col_label == "column" - assert ident_trafo - assert isinstance(ident_trafo, IdentityTransformer) - - len1_trafo, col_label = Length1Transformer._parse_from_sql(LEN1_SQL) - assert col_label == "column" - assert len1_trafo - assert isinstance(len1_trafo, Length1Transformer) - assert len1_trafo.default_value == -5 - - len2_trafo, col_label = Length2Transformer._parse_from_sql(LEN2_SQL) - assert col_label == "column" - assert len2_trafo - assert isinstance(len2_trafo, Length2Transformer) - assert len2_trafo.default_value == 99 - - fake_len2_sql = LEN2_SQL.replace("/*CT.LEN2([99])*/", "/*CT.LEN2([77])*/") - len2_trafo, col_label = Length2Transformer._parse_from_sql(fake_len2_sql) - assert col_label == "column" - assert len2_trafo - assert isinstance(len2_trafo, Length2Transformer) - assert len2_trafo.default_value == 77 - - def create_bq_model_mock(transform_columns, feature_columns=None): class _NameClass: def __init__(self, name): @@ -436,7 +400,7 @@ def test_columntransformer_extract_from_bq_model_good(bq_model_good): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) assert len(col_trans.transformers) == 6 # normalize the representation for string comparing - col_trans.transformers.sort() + col_trans.transformers.sort(key=lambda trafo: str(trafo)) actual = col_trans.__repr__() expected = """ColumnTransformer(transformers=[('label_encoder', LabelEncoder(max_categories=1000001, @@ -446,16 +410,6 @@ def 
test_columntransformer_extract_from_bq_model_good(bq_model_good): LabelEncoder(max_categories=1000001, min_frequency=0), 'species'), - ('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='culmen_length_mm ' - '/*CT.IDENT()*/', - target_column='ident_culmen_length_mm'), - '?'), - ('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='flipper_length_mm ' - '/*CT.IDENT()*/', - target_column='ident_flipper_length_mm'), - '?'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='CASE WHEN ' 'species IS ' @@ -475,6 +429,16 @@ def test_columntransformer_extract_from_bq_model_good(bq_model_good): 'END ' '/*CT.LEN2([99])*/', target_column='len2_species'), + '?'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='culmen_length_mm ' + '/*CT.IDENT()*/', + target_column='ident_culmen_length_mm'), + '?'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='flipper_length_mm ' + '/*CT.IDENT()*/', + target_column='ident_flipper_length_mm'), '?')])""" assert expected == actual @@ -483,7 +447,7 @@ def test_columntransformer_extract_from_bq_model_merge(bq_model_merge): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_merge) assert isinstance(col_trans, ColumnTransformer) col_trans = col_trans._merge(bq_model_merge) - assert isinstance(col_trans, LabelEncoder) + assert isinstance(col_trans, preprocessing.LabelEncoder) assert ( col_trans.__repr__() == """LabelEncoder(max_categories=1000001, min_frequency=0)""" @@ -531,9 +495,13 @@ def __init__(self, bq_model): def test_columntransformer_compile_to_sql(): - ident_transformer = IdentityTransformer() - len1_transformer = Length1Transformer(-2) - len2_transformer = Length2Transformer(99) + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ @@ -549,9 +517,9 @@ def test_columntransformer_compile_to_sql(): ) sqls = column_transformer._compile_to_sql(None) assert sqls == [ - "culmen_length_mm /*CT.IDENT()*/ AS ident_culmen_length_mm", - "flipper_length_mm /*CT.IDENT()*/ AS ident_flipper_length_mm", - "CASE WHEN species IS NULL THEN -2 ELSE LENGTH(species) END /*CT.LEN1()*/ AS len1_species", - "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/ AS len2_species", + "culmen_length_mm AS ident_culmen_length_mm", + "flipper_length_mm AS ident_flipper_length_mm", + "CASE WHEN species IS NULL THEN -2 ELSE LENGTH(species) END AS len1_species", + "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] From 5cd967246cb898825d73ea2a4fad634b7306dee3 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Sun, 22 Sep 2024 23:35:29 +0200 Subject: [PATCH 13/40] fix typing. 
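
SQLScalarColumnTransformer is now accepted wherever transformer tuples are
typed, so the Union hints widen roughly like this (abridged from the diff
below):

    transformers: Iterable[
        Tuple[
            str,
            Union[
                preprocessing.PreprocessingType,
                impute.SimpleImputer,
                SQLScalarColumnTransformer,
            ],
            Union[str, Iterable[str]],
        ]
    ]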
--- bigframes/ml/compose.py | 10 +++++----- tests/unit/ml/test_compose.py | 32 ++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 14bbcdfe17..2f970ebdb6 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -82,7 +82,7 @@ def __init__( transformers: Iterable[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer], + Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], Union[str, Iterable[str]], ] ], @@ -101,13 +101,13 @@ def _keys(self): def transformers_( self, ) -> List[ - Tuple[str, Union[preprocessing.PreprocessingType, impute.SimpleImputer], str] + Tuple[str, Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], str] ]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer], + Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], str, ] ] = [] @@ -136,7 +136,7 @@ def _extract_from_bq_model( transformers_set: Set[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer], + Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], Union[str, List[str]], ] ] = set() @@ -194,7 +194,7 @@ def camel_to_snake(name): def _merge( self, bq_model: bigquery.Model ) -> Union[ - ColumnTransformer, Union[preprocessing.PreprocessingType, impute.SimpleImputer] + ColumnTransformer, Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer] ]: """Try to merge the column transformer to a simple transformer. Depends on all the columns in bq_model are transformed with the same transformer.""" transformers = self.transformers diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 5f88a37b39..c6b917a217 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -18,6 +18,7 @@ from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer +import bigframes.pandas as bpd from google.cloud import bigquery from unittest import mock @@ -180,7 +181,14 @@ def test_columntransformer_repr_matches_sklearn(): assert bf_column_transformer.__repr__() == sk_column_transformer.__repr__() -def test_columntransformer_init_with_customtransforms(): +@pytest.fixture(scope="session") +def mock_X(): + mock_df = mock.create_autospec(spec=bpd.DataFrame) + return mock_df + + + +def test_columntransformer_init_with_sqltransformers(): ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") len1_transformer = SQLScalarColumnTransformer( "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" @@ -211,7 +219,7 @@ def test_columntransformer_init_with_customtransforms(): ] -def test_columntransformer_repr_customtransforms(): +def test_columntransformer_repr_sqltransformers(): ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") len1_transformer = SQLScalarColumnTransformer( "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" @@ -258,16 +266,9 @@ def test_columntransformer_repr_customtransforms(): assert expected == actual -IDENT_SQL = "column /*CT.IDENT()*/" -LEN1_SQL = "CASE WHEN column IS NULL THEN -5 ELSE LENGTH(column) END /*CT.LEN1()*/" -LEN2_SQL = "CASE WHEN column IS NULL THEN 99 ELSE 
LENGTH(column) END /*CT.LEN2([99])*/" -UNKNOWN_CT_SQL = "column /*CT.UNKNOWN()*/" -FOREIGN_SQL = "column" - - -def test_customtransformer_compile_sql(): +def test_customtransformer_compile_sql(mock_X): ident_trafo = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") - sqls = ident_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) + sqls = ident_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) assert sqls == [ "col1 AS ident_col1", "col2 AS ident_col2", @@ -276,7 +277,7 @@ def test_customtransformer_compile_sql(): len1_trafo = SQLScalarColumnTransformer( "CASE WHEN {0} IS NULL THEN -5 ELSE LENGTH({0}) END", target_column="len1_{0}" ) - sqls = len1_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) + sqls = len1_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) assert sqls == [ "CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END AS len1_col1", "CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END AS len1_col2", @@ -285,7 +286,7 @@ def test_customtransformer_compile_sql(): len2_trafo = SQLScalarColumnTransformer( "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" ) - sqls = len2_trafo._compile_to_sql(X=None, columns=["col1", "col2"]) + sqls = len2_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) assert sqls == [ "CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END AS len2_col1", "CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END AS len2_col2", @@ -395,7 +396,6 @@ def bq_model_unknown_ML(): ] ) - def test_columntransformer_extract_from_bq_model_good(bq_model_good): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) assert len(col_trans.transformers) == 6 @@ -523,3 +523,7 @@ def test_columntransformer_compile_to_sql(): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] + + +if __name__ == "__main__": + pytest.main(["test_compose.py", "-s"]) \ No newline at end of file From 980cd1aba8057759380a84b5ef73eb813d35ec90 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Sun, 22 Sep 2024 23:46:39 +0200 Subject: [PATCH 14/40] fix typing. --- bigframes/ml/compose.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 2f970ebdb6..86a1ebff54 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -174,14 +174,14 @@ def camel_to_snake(name): ) target_column = transform_col_dict["name"] - transformer = SQLScalarColumnTransformer( + sql_transformer = SQLScalarColumnTransformer( transform_sql, target_column=target_column ) input_column_name = "?" transformers_set.add( ( - camel_to_snake(transformer.__class__.__name__), - transformer, + camel_to_snake(sql_transformer.__class__.__name__), + sql_transformer, input_column_name, ) ) From e9b3ab0a89f547286c6731eb5559919717506ca7 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 01:19:30 +0200 Subject: [PATCH 15/40] fixed mock typing. 
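
bigquery.Model.feature_columns is a read-only property, so it presumably
cannot simply be assigned on the mocked class; the fixtures now patch it via
pytest-mock instead. For reference, the stock unittest.mock pattern for a
read-only property looks roughly like this (a sketch, not the exact fixture
code):

    from unittest import mock

    with mock.patch(
        "google.cloud.bigquery.model.Model.feature_columns",
        new_callable=mock.PropertyMock,
        return_value=["culmen_length_mm"],
    ):
        ...  # Model instances now report the mocked feature columns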
--- tests/unit/ml/test_compose.py | 55 ++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index c6b917a217..9dc0c7c1af 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -22,6 +22,7 @@ from google.cloud import bigquery from unittest import mock +from bigframes.ml.core import BqmlModel def test_columntransformer_init_expectedtransforms(): @@ -187,7 +188,6 @@ def mock_X(): return mock_df - def test_columntransformer_init_with_sqltransformers(): ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") len1_transformer = SQLScalarColumnTransformer( @@ -293,7 +293,7 @@ def test_customtransformer_compile_sql(mock_X): ] -def create_bq_model_mock(transform_columns, feature_columns=None): +def create_bq_model_mock(mocker, transform_columns, feature_columns=None): class _NameClass: def __init__(self, name): self.name = name @@ -302,15 +302,19 @@ def __init__(self, name): mock_bq_model = bigquery.Model("model_project.model_dataset.model_id") type(mock_bq_model)._properties = mock.PropertyMock(return_value=properties) if feature_columns: - type(mock_bq_model).feature_columns = mock.PropertyMock( - return_value=[_NameClass(col) for col in feature_columns] + result = [_NameClass(col) for col in feature_columns] + mocker.patch( + "google.cloud.bigquery.model.Model.feature_columns", + new_callable=mock.PropertyMock(return_value=result), ) + return mock_bq_model @pytest.fixture -def bq_model_good(): +def bq_model_good(mocker): return create_bq_model_mock( + mocker, [ { "name": "ident_culmen_length_mm", @@ -342,13 +346,14 @@ def bq_model_good(): "type": {"typeKind": "INT64"}, "transformSql": "ML.LABEL_ENCODER(species, 1000000, 0) OVER()", }, - ] + ], ) @pytest.fixture -def bq_model_merge(): +def bq_model_merge(mocker): return create_bq_model_mock( + mocker, [ { "name": "labelencoded_county", @@ -366,8 +371,9 @@ def bq_model_merge(): @pytest.fixture -def bq_model_no_merge(): +def bq_model_no_merge(mocker): return create_bq_model_mock( + mocker, [ { "name": "ident_culmen_length_mm", @@ -380,8 +386,9 @@ def bq_model_no_merge(): @pytest.fixture -def bq_model_unknown_ML(): +def bq_model_unknown_ML(mocker): return create_bq_model_mock( + mocker, [ { "name": "unknownml_culmen_length_mm", @@ -393,9 +400,10 @@ def bq_model_unknown_ML(): "type": {"typeKind": "INT64"}, "transformSql": "ML.LABEL_ENCODER(county, 1000000, 0) OVER()", }, - ] + ], ) + def test_columntransformer_extract_from_bq_model_good(bq_model_good): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) assert len(col_trans.transformers) == 6 @@ -446,25 +454,28 @@ def test_columntransformer_extract_from_bq_model_good(bq_model_good): def test_columntransformer_extract_from_bq_model_merge(bq_model_merge): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_merge) assert isinstance(col_trans, ColumnTransformer) - col_trans = col_trans._merge(bq_model_merge) - assert isinstance(col_trans, preprocessing.LabelEncoder) + merged_col_trans = col_trans._merge(bq_model_merge) + assert isinstance(merged_col_trans, preprocessing.LabelEncoder) assert ( - col_trans.__repr__() + merged_col_trans.__repr__() == """LabelEncoder(max_categories=1000001, min_frequency=0)""" ) - assert col_trans._output_names == ["labelencoded_county", "labelencoded_species"] + assert merged_col_trans._output_names == [ + "labelencoded_county", + "labelencoded_species", + ] def 
test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_no_merge) - col_trans = col_trans._merge(bq_model_no_merge) - assert isinstance(col_trans, ColumnTransformer) + merged_col_trans = col_trans._merge(bq_model_no_merge) + assert isinstance(merged_col_trans, ColumnTransformer) expected = """ColumnTransformer(transformers=[('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='culmen_length_mm ' '/*CT.IDENT()*/', target_column='ident_culmen_length_mm'), '?')])""" - actual = col_trans.__repr__() + actual = merged_col_trans.__repr__() assert expected == actual @@ -477,7 +488,7 @@ def test_columntransformer_extract_from_bq_model_unknown_ML(bq_model_unknown_ML) def test_columntransformer_extract_output_names(bq_model_good): - class BQMLModel: + class BQMLModel(BqmlModel): def __init__(self, bq_model): self._model = bq_model @@ -494,7 +505,7 @@ def __init__(self, bq_model): ] -def test_columntransformer_compile_to_sql(): +def test_columntransformer_compile_to_sql(mock_X): ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") len1_transformer = SQLScalarColumnTransformer( "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" @@ -515,7 +526,7 @@ def test_columntransformer_compile_to_sql(): ("label", label_transformer, "species"), ] ) - sqls = column_transformer._compile_to_sql(None) + sqls = column_transformer._compile_to_sql(mock_X) assert sqls == [ "culmen_length_mm AS ident_culmen_length_mm", "flipper_length_mm AS ident_flipper_length_mm", @@ -523,7 +534,3 @@ def test_columntransformer_compile_to_sql(): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] - - -if __name__ == "__main__": - pytest.main(["test_compose.py", "-s"]) \ No newline at end of file From d858035f54c05c4465e86d0886471f737c70a6b6 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 08:35:48 +0200 Subject: [PATCH 16/40] replace _NameClass. --- tests/unit/ml/test_compose.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 9dc0c7c1af..9b2f38ee56 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -294,15 +294,11 @@ def test_customtransformer_compile_sql(mock_X): def create_bq_model_mock(mocker, transform_columns, feature_columns=None): - class _NameClass: - def __init__(self, name): - self.name = name - properties = {"transformColumns": transform_columns} mock_bq_model = bigquery.Model("model_project.model_dataset.model_id") type(mock_bq_model)._properties = mock.PropertyMock(return_value=properties) if feature_columns: - result = [_NameClass(col) for col in feature_columns] + result = [bigquery.standard_sql.StandardSqlField(col, None) for col in feature_columns] mocker.patch( "google.cloud.bigquery.model.Model.feature_columns", new_callable=mock.PropertyMock(return_value=result), @@ -534,3 +530,6 @@ def test_columntransformer_compile_to_sql(mock_X): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] + +if __name__ == "__main__": + pytest.main(["test_compose.py", "-s"]) From b21b2f4dd350accba934727c591c21d6d100dfbd Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 08:36:10 +0200 Subject: [PATCH 17/40] black formating. 
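
Formatting-only commit; presumably produced by a plain black run such as

    black tests/unit/ml/test_compose.py

(the exact invocation is not recorded). The Eclipse/PyDev project files added
here are removed again in PATCH 21.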
---
 .project                      | 17 +++++++++++++++++
 .pydevproject                 | 22 ++++++++++++++++++++++
 tests/unit/ml/test_compose.py |  7 +++----
 3 files changed, 42 insertions(+), 4 deletions(-)
 create mode 100644 .project
 create mode 100644 .pydevproject

diff --git a/.project b/.project
new file mode 100644
index 0000000000..aae0c562a8
--- /dev/null
+++ b/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>python-bigquery-dataframes_FORK</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>
diff --git a/.pydevproject b/.pydevproject
new file mode 100644
index 0000000000..512d546c8c
--- /dev/null
+++ b/.pydevproject
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?>
+<pydev_project>
+
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">
+python interpreter
+</pydev_property>
+
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">
+bftest2
+</pydev_property>
+
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+
+<path>/${PROJECT_DIR_NAME}</path>
+
+<path>/${PROJECT_DIR_NAME}/third_party</path>
+
+</pydev_pathproperty>
+
+</pydev_project>
+
diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py
index 9b2f38ee56..bbef82f2e4 100644
--- a/tests/unit/ml/test_compose.py
+++ b/tests/unit/ml/test_compose.py
@@ -298,7 +298,9 @@ def create_bq_model_mock(mocker, transform_columns, feature_columns=None):
     mock_bq_model = bigquery.Model("model_project.model_dataset.model_id")
     type(mock_bq_model)._properties = mock.PropertyMock(return_value=properties)
     if feature_columns:
-        result = [bigquery.standard_sql.StandardSqlField(col, None) for col in feature_columns]
+        result = [
+            bigquery.standard_sql.StandardSqlField(col, None) for col in feature_columns
+        ]
         mocker.patch(
             "google.cloud.bigquery.model.Model.feature_columns",
             new_callable=mock.PropertyMock(return_value=result),
         )
@@ -530,6 +532,3 @@ def test_columntransformer_compile_to_sql(mock_X):
         "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species",
         "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species",
     ]
-
-if __name__ == "__main__":
-    pytest.main(["test_compose.py", "-s"])
From acb74ab860b1accdd7ba80b8543150b1b7acb3ad Mon Sep 17 00:00:00 2001
From: Ferenc Hechler
Date: Mon, 23 Sep 2024 10:21:38 +0200
Subject: [PATCH 18/40] add target_column as input_column with a "?" prefix
 when parsing SQLScalarColumnTransformer from sql.
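
A transformer parsed back from plain SQL has no reliably recoverable input
column, so the target column, prefixed with "?", is stored in that slot to
keep the extracted tuples distinct. A round-tripped entry now looks like this
(taken from the updated test expectations):

    ('sql_scalar_column_transformer',
     SQLScalarColumnTransformer(sql='culmen_length_mm /*CT.IDENT()*/',
                                target_column='ident_culmen_length_mm'),
     '?ident_culmen_length_mm')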
--- bigframes/ml/compose.py | 41 +++++++++++++++++++++++++++-------- tests/unit/ml/test_compose.py | 14 ++++++------ 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 86a1ebff54..6cb2574dc5 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -82,7 +82,11 @@ def __init__( transformers: Iterable[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], + Union[ + preprocessing.PreprocessingType, + impute.SimpleImputer, + SQLScalarColumnTransformer, + ], Union[str, Iterable[str]], ] ], @@ -101,13 +105,25 @@ def _keys(self): def transformers_( self, ) -> List[ - Tuple[str, Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], str] + Tuple[ + str, + Union[ + preprocessing.PreprocessingType, + impute.SimpleImputer, + SQLScalarColumnTransformer, + ], + str, + ] ]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], + Union[ + preprocessing.PreprocessingType, + impute.SimpleImputer, + SQLScalarColumnTransformer, + ], str, ] ] = [] @@ -136,7 +152,11 @@ def _extract_from_bq_model( transformers_set: Set[ Tuple[ str, - Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer], + Union[ + preprocessing.PreprocessingType, + impute.SimpleImputer, + SQLScalarColumnTransformer, + ], Union[str, List[str]], ] ] = set() @@ -177,7 +197,7 @@ def camel_to_snake(name): sql_transformer = SQLScalarColumnTransformer( transform_sql, target_column=target_column ) - input_column_name = "?" + input_column_name = f"?{target_column}" transformers_set.add( ( camel_to_snake(sql_transformer.__class__.__name__), @@ -191,10 +211,13 @@ def camel_to_snake(name): return transformer - def _merge( - self, bq_model: bigquery.Model - ) -> Union[ - ColumnTransformer, Union[preprocessing.PreprocessingType, impute.SimpleImputer, SQLScalarColumnTransformer] + def _merge(self, bq_model: bigquery.Model) -> Union[ + ColumnTransformer, + Union[ + preprocessing.PreprocessingType, + impute.SimpleImputer, + SQLScalarColumnTransformer, + ], ]: """Try to merge the column transformer to a simple transformer. 
Depends on all the columns in bq_model are transformed with the same transformer.""" transformers = self.transformers diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index bbef82f2e4..891e9421ac 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -332,7 +332,7 @@ def bq_model_good(mocker): { "name": "len2_species", "type": {"typeKind": "INT64"}, - "transformSql": "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(address) END /*CT.LEN2([99])*/", + "transformSql": "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/", }, { "name": "labelencoded_county", @@ -425,27 +425,27 @@ def test_columntransformer_extract_from_bq_model_good(bq_model_good): 'END ' '/*CT.LEN1()*/', target_column='len1_species'), - '?'), + '?len1_species'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='CASE WHEN ' 'species IS ' 'NULL THEN 99 ' 'ELSE ' - 'LENGTH(address) ' + 'LENGTH(species) ' 'END ' '/*CT.LEN2([99])*/', target_column='len2_species'), - '?'), + '?len2_species'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='culmen_length_mm ' '/*CT.IDENT()*/', target_column='ident_culmen_length_mm'), - '?'), + '?ident_culmen_length_mm'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='flipper_length_mm ' '/*CT.IDENT()*/', target_column='ident_flipper_length_mm'), - '?')])""" + '?ident_flipper_length_mm')])""" assert expected == actual @@ -472,7 +472,7 @@ def test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): SQLScalarColumnTransformer(sql='culmen_length_mm ' '/*CT.IDENT()*/', target_column='ident_culmen_length_mm'), - '?')])""" + '?ident_culmen_length_mm')])""" actual = merged_col_trans.__repr__() assert expected == actual From 78f964ad2ebca58d83018ea3e567b49bbeaf6af4 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 16:59:26 +0200 Subject: [PATCH 19/40] reformatted with black version 22.3.0. --- bigframes/ml/compose.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 6cb2574dc5..104eae3c44 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -211,7 +211,9 @@ def camel_to_snake(name): return transformer - def _merge(self, bq_model: bigquery.Model) -> Union[ + def _merge( + self, bq_model: bigquery.Model + ) -> Union[ ColumnTransformer, Union[ preprocessing.PreprocessingType, From 7dc544a0e9d3035fc044247f75dbbaf0e579f7d4 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 23 Sep 2024 16:32:35 +0000 Subject: [PATCH 20/40] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/unit/ml/test_compose.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 891e9421ac..aa95c36862 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -11,18 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import pytest +from unittest import mock +from google.cloud import bigquery +import pytest import sklearn.compose as sklearn_compose # type: ignore import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer -import bigframes.pandas as bpd - -from google.cloud import bigquery -from unittest import mock from bigframes.ml.core import BqmlModel +import bigframes.pandas as bpd def test_columntransformer_init_expectedtransforms(): From e9a9410c5dc122cac718bb3a96a623c57e603df1 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 21:34:05 +0200 Subject: [PATCH 21/40] remove eclipse project files --- .project | 17 ----------------- .pydevproject | 22 ---------------------- 2 files changed, 39 deletions(-) delete mode 100644 .project delete mode 100644 .pydevproject diff --git a/.project b/.project deleted file mode 100644 index aae0c562a8..0000000000 --- a/.project +++ /dev/null @@ -1,17 +0,0 @@ - - - python-bigquery-dataframes_FORK - - - - - - org.python.pydev.PyDevBuilder - - - - - - org.python.pydev.pythonNature - - diff --git a/.pydevproject b/.pydevproject deleted file mode 100644 index 512d546c8c..0000000000 --- a/.pydevproject +++ /dev/null @@ -1,22 +0,0 @@ - - - - - - python interpreter - - - - bftest2 - - - - - - /${PROJECT_DIR_NAME} - - /${PROJECT_DIR_NAME}/third_party - - - - From 744f27299a0180bcf6636bc1ac6bf95401e89c3f Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 22:00:32 +0200 Subject: [PATCH 22/40] SQLScalarColumnTransformer needs not to be inherited from base.BaseTransformer. --- bigframes/ml/compose.py | 6 ++--- tests/unit/ml/test_compose.py | 47 ++++++----------------------------- 2 files changed, 11 insertions(+), 42 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 104eae3c44..1e9fbea277 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -46,7 +46,7 @@ ) -class SQLScalarColumnTransformer(base.BaseTransformer): +class SQLScalarColumnTransformer: def __init__(self, sql: str, target_column="transformed_{0}"): super().__init__() self.sql = sql @@ -64,8 +64,8 @@ def _compile_to_sql( result.append(f"{current_sql} AS {current_target_column}") return result - def _keys(self): - return (self.sql, self.target_column) + def __repr__(self): + return f"SQLScalarColumnTransformer(sql='{self.sql}', target_column='{self.target_column}')" @log_adapter.class_logger diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index aa95c36862..3a0fc3577c 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -241,24 +241,13 @@ def test_columntransformer_repr_sqltransformers(): ) expected = """ColumnTransformer(transformers=[('ident_trafo', - SQLScalarColumnTransformer(sql='{0}', - target_column='ident_{0}'), + SQLScalarColumnTransformer(sql='{0}', target_column='ident_{0}'), ['culmen_length_mm', 'flipper_length_mm']), ('len1_trafo', - SQLScalarColumnTransformer(sql='CASE WHEN {0} ' - 'IS NULL THEN ' - '-2 ELSE ' - 'LENGTH({0}) ' - 'END', - target_column='len1_{0}'), + SQLScalarColumnTransformer(sql='CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END', target_column='len1_{0}'), ['species']), ('len2_trafo', - SQLScalarColumnTransformer(sql='CASE WHEN {0} ' - 'IS NULL THEN ' - '99 ELSE ' - 'LENGTH({0}) ' - 'END', - target_column='len2_{0}'), + SQLScalarColumnTransformer(sql='CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END', 
target_column='len2_{0}'), ['species']), ('label', LabelEncoder(), 'species')])""" actual = column_transformer.__repr__() @@ -416,34 +405,16 @@ def test_columntransformer_extract_from_bq_model_good(bq_model_good): min_frequency=0), 'species'), ('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='CASE WHEN ' - 'species IS ' - 'NULL THEN -5 ' - 'ELSE ' - 'LENGTH(species) ' - 'END ' - '/*CT.LEN1()*/', - target_column='len1_species'), + SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN -5 ELSE LENGTH(species) END /*CT.LEN1()*/', target_column='len1_species'), '?len1_species'), ('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='CASE WHEN ' - 'species IS ' - 'NULL THEN 99 ' - 'ELSE ' - 'LENGTH(species) ' - 'END ' - '/*CT.LEN2([99])*/', - target_column='len2_species'), + SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END /*CT.LEN2([99])*/', target_column='len2_species'), '?len2_species'), ('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='culmen_length_mm ' - '/*CT.IDENT()*/', - target_column='ident_culmen_length_mm'), + SQLScalarColumnTransformer(sql='culmen_length_mm /*CT.IDENT()*/', target_column='ident_culmen_length_mm'), '?ident_culmen_length_mm'), ('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='flipper_length_mm ' - '/*CT.IDENT()*/', - target_column='ident_flipper_length_mm'), + SQLScalarColumnTransformer(sql='flipper_length_mm /*CT.IDENT()*/', target_column='ident_flipper_length_mm'), '?ident_flipper_length_mm')])""" assert expected == actual @@ -468,9 +439,7 @@ def test_columntransformer_extract_from_bq_model_no_merge(bq_model_no_merge): merged_col_trans = col_trans._merge(bq_model_no_merge) assert isinstance(merged_col_trans, ColumnTransformer) expected = """ColumnTransformer(transformers=[('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='culmen_length_mm ' - '/*CT.IDENT()*/', - target_column='ident_culmen_length_mm'), + SQLScalarColumnTransformer(sql='culmen_length_mm /*CT.IDENT()*/', target_column='ident_culmen_length_mm'), '?ident_culmen_length_mm')])""" actual = merged_col_trans.__repr__() assert expected == actual From cc9840c52c8f44476718529b55ab54aa24c75861 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 22:11:38 +0200 Subject: [PATCH 23/40] remove filter for "ML." sqls in _extract_output_names() of BaseTransformer --- bigframes/ml/base.py | 4 ---- bigframes/ml/compose.py | 16 ---------------- 2 files changed, 20 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 6ae06c9d9f..81181b58cf 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -198,10 +198,6 @@ def _extract_output_names(self): # pass the columns that are not transformed if "transformSql" not in transform_col_dict: continue - transform_sql: str = transform_col_dict["transformSql"] - if not transform_sql.startswith("ML."): - continue - output_names.append(transform_col_dict["name"]) self._output_names = output_names diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 1e9fbea277..d1c5b710af 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -295,22 +295,6 @@ def fit( self._extract_output_names() return self - # Overwrite the implementation in BaseTransformer, as it only supports the "ML." transformers. - # TODO: clarify if this should be changed in base.BaseTransformer? - def _extract_output_names(self): - """Extract transform output column names. 
Save the results to self._output_names.""" - assert self._bqml_model is not None - - output_names = [] - for transform_col in self._bqml_model._model._properties["transformColumns"]: - transform_col_dict = cast(dict, transform_col) - # pass the columns that are not transformed - if "transformSql" not in transform_col_dict: - continue - output_names.append(transform_col_dict["name"]) - - self._output_names = output_names - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") From bb142f5931a2725bfbbd2accf3fd21e4a0a39401 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 22:27:08 +0200 Subject: [PATCH 24/40] introduced type hint SingleColTransformer for transformers contained in ColumnTransformer --- bigframes/ml/compose.py | 45 ++++++++++++----------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index d1c5b710af..e7891d5214 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -68,6 +68,14 @@ def __repr__(self): return f"SQLScalarColumnTransformer(sql='{self.sql}', target_column='{self.target_column}')" +# Type hints for transformers contained in ColumnTransformer +SingleColTransformer = Union[ + preprocessing.PreprocessingType, + impute.SimpleImputer, + SQLScalarColumnTransformer, +] + + @log_adapter.class_logger class ColumnTransformer( base.Transformer, @@ -82,11 +90,7 @@ def __init__( transformers: Iterable[ Tuple[ str, - Union[ - preprocessing.PreprocessingType, - impute.SimpleImputer, - SQLScalarColumnTransformer, - ], + SingleColTransformer, Union[str, Iterable[str]], ] ], @@ -104,26 +108,12 @@ def _keys(self): @property def transformers_( self, - ) -> List[ - Tuple[ - str, - Union[ - preprocessing.PreprocessingType, - impute.SimpleImputer, - SQLScalarColumnTransformer, - ], - str, - ] - ]: + ) -> List[Tuple[str, SingleColTransformer, str,]]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - Union[ - preprocessing.PreprocessingType, - impute.SimpleImputer, - SQLScalarColumnTransformer, - ], + SingleColTransformer, str, ] ] = [] @@ -152,11 +142,7 @@ def _extract_from_bq_model( transformers_set: Set[ Tuple[ str, - Union[ - preprocessing.PreprocessingType, - impute.SimpleImputer, - SQLScalarColumnTransformer, - ], + SingleColTransformer, Union[str, List[str]], ] ] = set() @@ -214,12 +200,7 @@ def camel_to_snake(name): def _merge( self, bq_model: bigquery.Model ) -> Union[ - ColumnTransformer, - Union[ - preprocessing.PreprocessingType, - impute.SimpleImputer, - SQLScalarColumnTransformer, - ], + ColumnTransformer, Union[preprocessing.PreprocessingType, impute.SimpleImputer] ]: """Try to merge the column transformer to a simple transformer. 
Depends on all the columns in bq_model are transformed with the same transformer.""" transformers = self.transformers From 4ecdec6203520763a6a8b6822a4bacbe203f0cbe Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 22:38:25 +0200 Subject: [PATCH 25/40] make sql and target_column private in SQLScalarColumnTransformer --- bigframes/ml/compose.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index e7891d5214..6ebdea5c4c 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -47,10 +47,10 @@ class SQLScalarColumnTransformer: - def __init__(self, sql: str, target_column="transformed_{0}"): + def __init__(self, sql: str, target_column: str = "transformed_{0}"): super().__init__() - self.sql = sql - self.target_column = target_column + self._sql = sql + self._target_column = target_column def _compile_to_sql( self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None @@ -59,13 +59,13 @@ def _compile_to_sql( columns = X.columns result = [] for column in columns: - current_sql = self.sql.format(column) - current_target_column = self.target_column.format(column) + current_sql = self._sql.format(column) + current_target_column = self._target_column.format(column) result.append(f"{current_sql} AS {current_target_column}") return result def __repr__(self): - return f"SQLScalarColumnTransformer(sql='{self.sql}', target_column='{self.target_column}')" + return f"SQLScalarColumnTransformer(sql='{self._sql}', target_column='{self._target_column}')" # Type hints for transformers contained in ColumnTransformer From a36845bfc26218ebec60f9db2ce161cba330fff9 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Mon, 23 Sep 2024 23:44:43 +0200 Subject: [PATCH 26/40] Add documentation for SQLScalarColumnTransformer. --- bigframes/ml/compose.py | 47 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 6ebdea5c4c..7d0d45031c 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -47,7 +47,54 @@ class SQLScalarColumnTransformer: + """ + Wrapper for plain SQL code contained in a ColumnTransformer. + + **Examples:** + + >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer + >>> import bigframes.pandas as bpd + + >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) + >>> col_trans = ColumnTransformer([ + ... ("strlen", + ... SQLScalarColumnTransformer("CASE WHEN {0} IS NULL THEN 15 ELSE LENGTH({0}) END"), + ... ['name', 'city']), + ... ]) + >>> col_trans = col_trans.fit(df) + >>> df_transformed = col_trans.transform(df) + >>> df_transformed + transformed_name transformed_city + 0 5 8 + 1 15 6 + 2 4 15 + + [3 rows x 2 columns] + + """ + def __init__(self, sql: str, target_column: str = "transformed_{0}"): + """ + Create a single column transformer in plain sql. + This transformer can only be used inside ColumnTransformer. + + **Examples:** + + col_trans = ColumnTransformer([ + ("identity", SQLScalarColumnTransformer("{0}", target_column="{0}"), ["col1", col5"]), + ("increment", SQLScalarColumnTransformer("{0}+1", target_column="inc_{0}"), ["col2"]), + ... + ]) + + Args: + sql (string): + sql code to transform the column. + for the column name the placeholder '{0}' can be used. + target_column (string): + name of the target column. + the placeholder '{0}' can be used for the input column name. 
+ default value is 'transformed_{0}'. + """ super().__init__() self._sql = sql self._target_column = target_column From 517edf60bf30e2f15d96486d28bd15094447917e Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Tue, 24 Sep 2024 14:23:01 +0200 Subject: [PATCH 27/40] add first system test for SQLScalarColumnTransformer. --- tests/system/large/ml/test_compose.py | 32 +++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 59c5a1538f..e34bc756e9 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -36,6 +36,32 @@ def test_columntransformer_standalone_fit_and_transform( preprocessing.MinMaxScaler(), ["culmen_length_mm"], ), + ( + "increment", + compose.SQLScalarColumnTransformer("{0}+1"), + ["culmen_length_mm", "flipper_length_mm"], + ), + ( + "length", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="len_{0}", + ), + "species", + ), + ( + "ohe", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0}='Adelie Penguin (Pygoscelis adeliae)' THEN 1 ELSE 0 END", + target_column="ohe_adelie", + ), + "species", + ), + ( + "identity", + compose.SQLScalarColumnTransformer("{0}", target_column="{0}"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) @@ -51,6 +77,12 @@ def test_columntransformer_standalone_fit_and_transform( "standard_scaled_culmen_length_mm", "min_max_scaled_culmen_length_mm", "standard_scaled_flipper_length_mm", + "transformed_culmen_length_mm", + "transformed_flipper_length_mm", + "len_species", + "ohe_adelie", + "culmen_length_mm", + "flipper_length_mm", ], index=[1633, 1672, 1690], col_exact=False, From 29fed2025c22236dffb3307993b55e2f74a6ccdb Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Tue, 24 Sep 2024 15:48:28 +0200 Subject: [PATCH 28/40] SQLScalarColumnTransformer system tests for fit-transform and save-load --- tests/system/large/ml/test_compose.py | 54 +++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index e34bc756e9..4f4ea67cc8 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -102,6 +102,19 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "length", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="len_{0}", + ), + "species", + ), + ( + "identity", + compose.SQLScalarColumnTransformer("{0}", target_column="{0}"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) @@ -115,6 +128,9 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): "onehotencoded_species", "standard_scaled_culmen_length_mm", "standard_scaled_flipper_length_mm", + "len_species", + "culmen_length_mm", + "flipper_length_mm", ], index=[1633, 1672, 1690], col_exact=False, @@ -134,6 +150,19 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "length", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="len_{0}", + ), + "species", + ), + ( + "identity", + compose.SQLScalarColumnTransformer("{0}", target_column="{0}"), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) transformer.fit( @@ -154,6 
+183,28 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ), ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END", + target_column="len_species", + ), + "?len_species", + ), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "flipper_length_mm", target_column="flipper_length_mm" + ), + "?flipper_length_mm", + ), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "culmen_length_mm", target_column="culmen_length_mm" + ), + "?culmen_length_mm", + ), ] assert set(reloaded_transformer.transformers) == set(expected) assert reloaded_transformer._bqml_model is not None @@ -168,6 +219,9 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): "onehotencoded_species", "standard_scaled_culmen_length_mm", "standard_scaled_flipper_length_mm", + "len_species", + "culmen_length_mm", + "flipper_length_mm", ], index=[1633, 1672, 1690], col_exact=False, From 1b67957e5e9cf0472043d3db34b469f0de69ba8c Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Tue, 24 Sep 2024 17:54:53 +0200 Subject: [PATCH 29/40] make SQLScalarColumnTransformer comparable (equals) for comparing sets in tests --- bigframes/ml/compose.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 7d0d45031c..68747ab3ec 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -114,6 +114,12 @@ def _compile_to_sql( def __repr__(self): return f"SQLScalarColumnTransformer(sql='{self._sql}', target_column='{self._target_column}')" + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + # Type hints for transformers contained in ColumnTransformer SingleColTransformer = Union[ From 9b12eac4a607cc071643abd3cd8b98f3146319fd Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Tue, 24 Sep 2024 18:20:16 +0200 Subject: [PATCH 30/40] implement hash and eq (copied from BaseTransformer) --- .project | 17 + .pydevproject | 29 ++ .settings/org.eclipse.core.resources.prefs | 2 + bigframes/ml/compose.py | 13 +- expected.txt | 1 + reloaded.txt | 1 + test_sqltrafo.py | 7 + tests/system/conftest.py | 22 +- tests/system/utils_extract.py | 381 +++++++++++++++++++++ tests/unit/ml/test_compose.py | 3 + 10 files changed, 469 insertions(+), 7 deletions(-) create mode 100644 .project create mode 100644 .pydevproject create mode 100644 .settings/org.eclipse.core.resources.prefs create mode 100644 expected.txt create mode 100644 reloaded.txt create mode 100644 test_sqltrafo.py create mode 100644 tests/system/utils_extract.py diff --git a/.project b/.project new file mode 100644 index 0000000000..aae0c562a8 --- /dev/null +++ b/.project @@ -0,0 +1,17 @@ + + + python-bigquery-dataframes_FORK + + + + + + org.python.pydev.PyDevBuilder + + + + + + org.python.pydev.pythonNature + + diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 0000000000..c0c20df199 --- /dev/null +++ b/.pydevproject @@ -0,0 +1,29 @@ + + + + + + + python interpreter + + + + + bftest2 + + + + + + + + /${PROJECT_DIR_NAME} + + + /${PROJECT_DIR_NAME}/third_party + + + + + + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000000..b87fd68992 --- 
/dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding/noxfile.py=utf-8 diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 68747ab3ec..69ece962fe 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -114,11 +114,14 @@ def _compile_to_sql( def __repr__(self): return f"SQLScalarColumnTransformer(sql='{self._sql}', target_column='{self._target_column}')" - def __eq__(self, other): - if isinstance(other, self.__class__): - return self.__dict__ == other.__dict__ - else: - return False + def __eq__(self, other) -> bool: + return type(self) is type(other) and self._keys() == other._keys() + + def __hash__(self) -> int: + return hash(self._keys()) + + def _keys(self): + return (self._sql, self._target_column) # Type hints for transformers contained in ColumnTransformer diff --git a/expected.txt b/expected.txt new file mode 100644 index 0000000000..8572a71f54 --- /dev/null +++ b/expected.txt @@ -0,0 +1 @@ +[('one_hot_encoder', OneHotEncoder(max_categories=1000001, min_frequency=0), 'species'), ('standard_scaler', StandardScaler(), 'culmen_length_mm'), ('standard_scaler', StandardScaler(), 'flipper_length_mm'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END', target_column='len_species'), '?len_species'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='flipper_length_mm', target_column='flipper_length_mm'), '?flipper_length_mm'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='culmen_length_mm', target_column='culmen_length_mm'), '?culmen_length_mm')] \ No newline at end of file diff --git a/reloaded.txt b/reloaded.txt new file mode 100644 index 0000000000..293c05b85e --- /dev/null +++ b/reloaded.txt @@ -0,0 +1 @@ +[('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='flipper_length_mm', target_column='flipper_length_mm'), '?flipper_length_mm'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END', target_column='len_species'), '?len_species'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='culmen_length_mm', target_column='culmen_length_mm'), '?culmen_length_mm'), ('one_hot_encoder', OneHotEncoder(max_categories=1000001, min_frequency=0), 'species'), ('standard_scaler', StandardScaler(), 'culmen_length_mm'), ('standard_scaler', StandardScaler(), 'flipper_length_mm')] \ No newline at end of file diff --git a/test_sqltrafo.py b/test_sqltrafo.py new file mode 100644 index 0000000000..d6b73f8d8e --- /dev/null +++ b/test_sqltrafo.py @@ -0,0 +1,7 @@ +from bigframes.ml import compose + +trafo1 = compose.SQLScalarColumnTransformer("{0}+1") +trafo2 = compose.SQLScalarColumnTransformer("{0}-1") +print(trafo1) +print(trafo1) +print(trafo1==trafo2) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 5ee2dc6397..68bb7ee7c0 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -34,7 +34,7 @@ import pandas as pd import pytest import pytz -import test_utils.prefixer +#import test_utils.prefixer import bigframes import bigframes.dataframe @@ -61,7 +61,25 @@ PERMANENT_DATASET = "bigframes_testing" PERMANENT_DATASET_TOKYO = "bigframes_testing_tokyo" TOKYO_LOCATION = "asia-northeast1" -prefixer = test_utils.prefixer.Prefixer("bigframes", "tests/system") + + +import datetime +class Prefixer: + def __init__(self, root, path): + self.root = root + self.path = path + self.norm 
= f'temp_prfx_{root.lower()}_{path.replace("/","").lower()}' + def should_cleanup(self, dataset_id): + result = dataset_id.startswith(self.norm) + print(f"\nSHOULD_CLEANUP(dataset_id={dataset_id}) -> {result}") + return + def create_prefix(self): + NOW = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S") + result = f"{self.norm}_{NOW}" + print(f"\nCREATED PREFIX {result}") + return result + +prefixer = Prefixer("bigframes", "tests/system") def _hash_digest_file(hasher, filepath): diff --git a/tests/system/utils_extract.py b/tests/system/utils_extract.py new file mode 100644 index 0000000000..bbaa684807 --- /dev/null +++ b/tests/system/utils_extract.py @@ -0,0 +1,381 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import decimal +import functools +from typing import Iterable, Optional, Set, Union + +import geopandas as gpd # type: ignore +import google.api_core.operation +from google.cloud import bigquery, functions_v2 +from google.cloud.functions_v2.types import functions +import numpy as np +import pandas as pd +import pyarrow as pa # type: ignore +import pytest + +import bigframes.functions._utils as functions_utils +import bigframes.pandas + +#=============================================================================== +# ML_REGRESSION_METRICS = [ +# "mean_absolute_error", +# "mean_squared_error", +# "mean_squared_log_error", +# "median_absolute_error", +# "r2_score", +# "explained_variance", +# ] +# ML_CLASSFICATION_METRICS = [ +# "precision", +# "recall", +# "accuracy", +# "f1_score", +# "log_loss", +# "roc_auc", +# ] +# ML_GENERATE_TEXT_OUTPUT = [ +# "ml_generate_text_llm_result", +# "ml_generate_text_status", +# "prompt", +# ] +# +# +# def skip_legacy_pandas(test): +# @functools.wraps(test) +# def wrapper(*args, **kwds): +# if pd.__version__.startswith("1."): +# pytest.skip("Skips pandas 1.x as not compatible with 2.x behavior.") +# return test(*args, **kwds) +# +# return wrapper +# +# +# # Prefer this function for tests that run in both ordered and unordered mode +# def assert_dfs_equivalent( +# pd_df: pd.DataFrame, bf_df: bigframes.pandas.DataFrame, **kwargs +# ): +# bf_df_local = bf_df.to_pandas() +# ignore_order = not bf_df._session._strictly_ordered +# assert_pandas_df_equal(bf_df_local, pd_df, ignore_order=ignore_order, **kwargs) +# +# +# def assert_series_equivalent( +# pd_series: pd.Series, bf_series: bigframes.pandas.Series, **kwargs +# ): +# bf_df_local = bf_series.to_pandas() +# ignore_order = not bf_series._session._strictly_ordered +# assert_series_equal(bf_df_local, pd_series, ignore_order=ignore_order, **kwargs) +# +# +# def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): +# if ignore_order: +# # Sort by a column to get consistent results. 
+# if df0.index.name != "rowindex": +# df0 = df0.sort_values( +# list(df0.columns.drop("geography_col", errors="ignore")) +# ).reset_index(drop=True) +# df1 = df1.sort_values( +# list(df1.columns.drop("geography_col", errors="ignore")) +# ).reset_index(drop=True) +# else: +# df0 = df0.sort_index() +# df1 = df1.sort_index() +# +# pd.testing.assert_frame_equal(df0, df1, **kwargs) +# +# +# def assert_series_equal( +# left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs +# ): +# if ignore_order: +# if left.index.name is None: +# left = left.sort_values().reset_index(drop=True) +# right = right.sort_values().reset_index(drop=True) +# else: +# left = left.sort_index() +# right = right.sort_index() +# +# pd.testing.assert_series_equal(left, right, **kwargs) +# +# +# def _standardize_index(idx): +# return pd.Index(list(idx), name=idx.name) +# +# +# def assert_pandas_index_equal_ignore_index_type(idx0, idx1): +# idx0 = _standardize_index(idx0) +# idx1 = _standardize_index(idx1) +# +# pd.testing.assert_index_equal(idx0, idx1) +# +# +# def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): +# """Convert pandas dataframe dtypes compatible with bigframes dataframe.""" +# +# # TODO(chelsealin): updates the function to accept dtypes as input rather than +# # hard-code the column names here. +# +# # Convert basic types columns +# df["bool_col"] = df["bool_col"].astype(pd.BooleanDtype()) +# df["int64_col"] = df["int64_col"].astype(pd.Int64Dtype()) +# df["int64_too"] = df["int64_too"].astype(pd.Int64Dtype()) +# df["float64_col"] = df["float64_col"].astype(pd.Float64Dtype()) +# df["string_col"] = df["string_col"].astype(pd.StringDtype(storage="pyarrow")) +# +# if "rowindex" in df.columns: +# df["rowindex"] = df["rowindex"].astype(pd.Int64Dtype()) +# if "rowindex_2" in df.columns: +# df["rowindex_2"] = df["rowindex_2"].astype(pd.Int64Dtype()) +# +# # Convert time types columns. The `astype` works for Pandas 2.0 but hits an assert +# # error at Pandas 1.5. Hence, we have to convert to arrow table and convert back +# # to pandas dataframe. 
+# if not isinstance(df["date_col"].dtype, pd.ArrowDtype): +# df["date_col"] = pd.to_datetime(df["date_col"], format="%Y-%m-%d") +# arrow_table = pa.Table.from_pandas( +# pd.DataFrame(df, columns=["date_col"]), +# schema=pa.schema([("date_col", pa.date32())]), +# ) +# df["date_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["date_col"] +# +# if not isinstance(df["datetime_col"].dtype, pd.ArrowDtype): +# df["datetime_col"] = pd.to_datetime( +# df["datetime_col"], format="%Y-%m-%d %H:%M:%S" +# ) +# arrow_table = pa.Table.from_pandas( +# pd.DataFrame(df, columns=["datetime_col"]), +# schema=pa.schema([("datetime_col", pa.timestamp("us"))]), +# ) +# df["datetime_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ +# "datetime_col" +# ] +# +# if not isinstance(df["time_col"].dtype, pd.ArrowDtype): +# df["time_col"] = pd.to_datetime(df["time_col"], format="%H:%M:%S.%f") +# arrow_table = pa.Table.from_pandas( +# pd.DataFrame(df, columns=["time_col"]), +# schema=pa.schema([("time_col", pa.time64("us"))]), +# ) +# df["time_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["time_col"] +# +# if not isinstance(df["timestamp_col"].dtype, pd.ArrowDtype): +# df["timestamp_col"] = pd.to_datetime( +# df["timestamp_col"], format="%Y-%m-%d %H:%M:%S.%f%Z" +# ) +# arrow_table = pa.Table.from_pandas( +# pd.DataFrame(df, columns=["timestamp_col"]), +# schema=pa.schema([("timestamp_col", pa.timestamp("us", tz="UTC"))]), +# ) +# df["timestamp_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ +# "timestamp_col" +# ] +# +# # Convert geography types columns. +# if "geography_col" in df.columns: +# df["geography_col"] = df["geography_col"].astype( +# pd.StringDtype(storage="pyarrow") +# ) +# df["geography_col"] = gpd.GeoSeries.from_wkt( +# df["geography_col"].replace({np.nan: None}) +# ) +# +# if bytes_col and not isinstance(df["bytes_col"].dtype, pd.ArrowDtype): +# df["bytes_col"] = df["bytes_col"].apply( +# lambda value: base64.b64decode(value) if not pd.isnull(value) else value +# ) +# arrow_table = pa.Table.from_pandas( +# pd.DataFrame(df, columns=["bytes_col"]), +# schema=pa.schema([("bytes_col", pa.binary())]), +# ) +# df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"] +# +# if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype): +# # Convert numeric types column. +# df["numeric_col"] = df["numeric_col"].apply( +# lambda value: decimal.Decimal(str(value)) if value else None # type: ignore +# ) +# arrow_table = pa.Table.from_pandas( +# pd.DataFrame(df, columns=["numeric_col"]), +# schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]), +# ) +# df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ +# "numeric_col" +# ] +# +# +# def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): +# """Compare two pandas dataframes representing PCA components. The columns +# required to be present in the dataframes are: +# numerical_value: numeric, +# categorical_value: List[object(category, value)] +# +# The index types of `actual` and `expected` are ignored in the comparison. 
+# +# Args: +# actual: Actual Pandas DataFrame +# +# expected: Expected Pandas DataFrame +# +# kwargs: kwargs to use in `pandas.testing.assert_series_equal` per column +# """ +# # Compare the index, columns and values separately, as the polarity of the +# # PCA vectors can be arbitrary +# pd.testing.assert_index_equal( +# actual.index, expected.index.astype(actual.index.dtype) +# ) # dtype agnostic index comparison +# pd.testing.assert_index_equal(actual.columns, expected.columns) +# for column in expected.columns: +# try: +# pd.testing.assert_series_equal(actual[column], expected[column], **kwargs) +# except AssertionError: +# if column not in {"numerical_value", "categorical_value"}: +# raise +# +# # Allow for sign difference per numeric/categorical column +# if column == "numerical_value": +# actual_ = -actual[column] +# expected_ = expected[column] +# else: +# # In this column each element is an array of objects, where the +# # object has attributes "category" and "value". For the sake of +# # comparison let's normalize by flipping the polarity of "value". +# def normalize_array_of_objects(arr, reverse_polarity=False): +# newarr = [] +# for element in arr: +# newelement = dict(element) +# if reverse_polarity: +# newelement["value"] = -newelement["value"] +# newarr.append(newelement) +# return sorted(newarr, key=lambda d: d["category"]) +# +# actual_ = actual[column].apply(normalize_array_of_objects, args=(True,)) +# expected_ = expected[column].apply(normalize_array_of_objects) +# +# pd.testing.assert_series_equal(actual_, expected_, **kwargs) +# +# +# def assert_pandas_df_equal_pca(actual, expected, **kwargs): +# """Compare two pandas dataframes representing PCA predictions. The columns +# in the dataframes are expected to be numeric. +# +# Args: +# actual: Actual Pandas DataFrame +# +# expected: Expected Pandas DataFrame +# +# kwargs: kwargs to use in `pandas.testing.assert_series_equal` per column +# """ +# # Compare the index, columns and values separately, as the polarity of the +# # PCA vector can be arbitrary +# pd.testing.assert_index_equal(actual.index, expected.index) +# pd.testing.assert_index_equal(actual.columns, expected.columns) +# for column in expected.columns: +# try: +# pd.testing.assert_series_equal(actual[column], expected[column], **kwargs) +# except AssertionError: +# # Allow for sign difference per column +# pd.testing.assert_series_equal(-actual[column], expected[column], **kwargs) +#=============================================================================== + + +def check_pandas_df_schema_and_index( + pd_df: pd.DataFrame, + columns: Iterable, + index: Union[int, Iterable], + col_exact: bool = True, +): + """Check pandas df schema and index. But not the values. + + Args: + pd_df: the input pandas df + columns: target columns to check with + index: int or Iterable. If int, only check the length (index size) of the df. If Iterable, check index values match + col_exact: If True, check the columns param are exact match. 
Otherwise only check the df contains all of those columns + """ + if col_exact: + assert list(pd_df.columns) == list(columns) + else: + assert set(columns) <= set(pd_df.columns) + + if isinstance(index, int): + assert len(pd_df) == index + elif isinstance(index, Iterable): + assert list(pd_df.index) == list(index) + else: + raise ValueError("Unsupported index type.") + + +#=============================================================================== +# def get_remote_function_endpoints( +# bigquery_client: bigquery.Client, dataset_id: str +# ) -> Set[str]: +# """Get endpoints used by the remote functions in a datset""" +# endpoints = set() +# routines = bigquery_client.list_routines(dataset=dataset_id) +# for routine in routines: +# rf_options = routine._properties.get("remoteFunctionOptions") +# if not rf_options: +# continue +# rf_endpoint = rf_options.get("endpoint") +# if rf_endpoint: +# endpoints.add(rf_endpoint) +# return endpoints +# +# +# def get_cloud_functions( +# functions_client: functions_v2.FunctionServiceClient, +# project: str, +# location: str, +# name: Optional[str] = None, +# name_prefix: Optional[str] = None, +# ) -> Iterable[functions.ListFunctionsResponse]: +# """Get the cloud functions in the given project and location.""" +# +# assert ( +# not name or not name_prefix +# ), "Either 'name' or 'name_prefix' can be passed but not both." +# +# _, location = functions_utils.get_remote_function_locations(location) +# parent = f"projects/{project}/locations/{location}" +# request = functions_v2.ListFunctionsRequest(parent=parent) +# page_result = functions_client.list_functions(request=request) +# for response in page_result: +# # If name is provided and it does not match then skip +# if bool(name): +# full_name = parent + f"/functions/{name}" +# if response.name != full_name: +# continue +# # If name prefix is provided and it does not match then skip +# elif bool(name_prefix): +# full_name_prefix = parent + f"/functions/{name_prefix}" +# if not response.name.startswith(full_name_prefix): +# continue +# +# yield response +# +# +# def delete_cloud_function( +# functions_client: functions_v2.FunctionServiceClient, full_name: str +# ) -> google.api_core.operation.Operation: +# """Delete a cloud function with the given fully qualified name.""" +# request = functions_v2.DeleteFunctionRequest(name=full_name) +# operation = functions_client.delete_function(request=request) +# return operation +# +# +# def get_first_file_from_wildcard(path): +# return path.replace("*", "000000000000") +#=============================================================================== diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 3a0fc3577c..63f9ebbd94 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -500,3 +500,6 @@ def test_columntransformer_compile_to_sql(mock_X): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] + +if __name__ == "__main__": + pytest.main(["test_compose.py", "-s"]) \ No newline at end of file From 48ecd3854d78a898f9fd9089cf924e2ba4b92033 Mon Sep 17 00:00:00 2001 From: ferenc-hechler Date: Tue, 24 Sep 2024 18:37:27 +0200 Subject: [PATCH 31/40] undo accidentally checked in files --- .project | 17 -- .pydevproject | 29 --- expected.txt | 1 - reloaded.txt | 1 - test_sqltrafo.py | 7 - tests/system/conftest.py | 22 +- tests/system/utils_extract.py | 381 ---------------------------------- tests/unit/ml/test_compose.py 
| 3 - 8 files changed, 2 insertions(+), 459 deletions(-) delete mode 100644 .project delete mode 100644 .pydevproject delete mode 100644 expected.txt delete mode 100644 reloaded.txt delete mode 100644 test_sqltrafo.py delete mode 100644 tests/system/utils_extract.py diff --git a/.project b/.project deleted file mode 100644 index aae0c562a8..0000000000 --- a/.project +++ /dev/null @@ -1,17 +0,0 @@ - - - python-bigquery-dataframes_FORK - - - - - - org.python.pydev.PyDevBuilder - - - - - - org.python.pydev.pythonNature - - diff --git a/.pydevproject b/.pydevproject deleted file mode 100644 index c0c20df199..0000000000 --- a/.pydevproject +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - - python interpreter - - - - - bftest2 - - - - - - - - /${PROJECT_DIR_NAME} - - - /${PROJECT_DIR_NAME}/third_party - - - - - - diff --git a/expected.txt b/expected.txt deleted file mode 100644 index 8572a71f54..0000000000 --- a/expected.txt +++ /dev/null @@ -1 +0,0 @@ -[('one_hot_encoder', OneHotEncoder(max_categories=1000001, min_frequency=0), 'species'), ('standard_scaler', StandardScaler(), 'culmen_length_mm'), ('standard_scaler', StandardScaler(), 'flipper_length_mm'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END', target_column='len_species'), '?len_species'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='flipper_length_mm', target_column='flipper_length_mm'), '?flipper_length_mm'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='culmen_length_mm', target_column='culmen_length_mm'), '?culmen_length_mm')] \ No newline at end of file diff --git a/reloaded.txt b/reloaded.txt deleted file mode 100644 index 293c05b85e..0000000000 --- a/reloaded.txt +++ /dev/null @@ -1 +0,0 @@ -[('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='flipper_length_mm', target_column='flipper_length_mm'), '?flipper_length_mm'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END', target_column='len_species'), '?len_species'), ('sql_scalar_column_transformer', SQLScalarColumnTransformer(sql='culmen_length_mm', target_column='culmen_length_mm'), '?culmen_length_mm'), ('one_hot_encoder', OneHotEncoder(max_categories=1000001, min_frequency=0), 'species'), ('standard_scaler', StandardScaler(), 'culmen_length_mm'), ('standard_scaler', StandardScaler(), 'flipper_length_mm')] \ No newline at end of file diff --git a/test_sqltrafo.py b/test_sqltrafo.py deleted file mode 100644 index d6b73f8d8e..0000000000 --- a/test_sqltrafo.py +++ /dev/null @@ -1,7 +0,0 @@ -from bigframes.ml import compose - -trafo1 = compose.SQLScalarColumnTransformer("{0}+1") -trafo2 = compose.SQLScalarColumnTransformer("{0}-1") -print(trafo1) -print(trafo1) -print(trafo1==trafo2) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 68bb7ee7c0..5ee2dc6397 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -34,7 +34,7 @@ import pandas as pd import pytest import pytz -#import test_utils.prefixer +import test_utils.prefixer import bigframes import bigframes.dataframe @@ -61,25 +61,7 @@ PERMANENT_DATASET = "bigframes_testing" PERMANENT_DATASET_TOKYO = "bigframes_testing_tokyo" TOKYO_LOCATION = "asia-northeast1" - - -import datetime -class Prefixer: - def __init__(self, root, path): - self.root = root - self.path = path - self.norm = f'temp_prfx_{root.lower()}_{path.replace("/","").lower()}' - def should_cleanup(self, dataset_id): - result = 
dataset_id.startswith(self.norm) - print(f"\nSHOULD_CLEANUP(dataset_id={dataset_id}) -> {result}") - return - def create_prefix(self): - NOW = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S") - result = f"{self.norm}_{NOW}" - print(f"\nCREATED PREFIX {result}") - return result - -prefixer = Prefixer("bigframes", "tests/system") +prefixer = test_utils.prefixer.Prefixer("bigframes", "tests/system") def _hash_digest_file(hasher, filepath): diff --git a/tests/system/utils_extract.py b/tests/system/utils_extract.py deleted file mode 100644 index bbaa684807..0000000000 --- a/tests/system/utils_extract.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import base64 -import decimal -import functools -from typing import Iterable, Optional, Set, Union - -import geopandas as gpd # type: ignore -import google.api_core.operation -from google.cloud import bigquery, functions_v2 -from google.cloud.functions_v2.types import functions -import numpy as np -import pandas as pd -import pyarrow as pa # type: ignore -import pytest - -import bigframes.functions._utils as functions_utils -import bigframes.pandas - -#=============================================================================== -# ML_REGRESSION_METRICS = [ -# "mean_absolute_error", -# "mean_squared_error", -# "mean_squared_log_error", -# "median_absolute_error", -# "r2_score", -# "explained_variance", -# ] -# ML_CLASSFICATION_METRICS = [ -# "precision", -# "recall", -# "accuracy", -# "f1_score", -# "log_loss", -# "roc_auc", -# ] -# ML_GENERATE_TEXT_OUTPUT = [ -# "ml_generate_text_llm_result", -# "ml_generate_text_status", -# "prompt", -# ] -# -# -# def skip_legacy_pandas(test): -# @functools.wraps(test) -# def wrapper(*args, **kwds): -# if pd.__version__.startswith("1."): -# pytest.skip("Skips pandas 1.x as not compatible with 2.x behavior.") -# return test(*args, **kwds) -# -# return wrapper -# -# -# # Prefer this function for tests that run in both ordered and unordered mode -# def assert_dfs_equivalent( -# pd_df: pd.DataFrame, bf_df: bigframes.pandas.DataFrame, **kwargs -# ): -# bf_df_local = bf_df.to_pandas() -# ignore_order = not bf_df._session._strictly_ordered -# assert_pandas_df_equal(bf_df_local, pd_df, ignore_order=ignore_order, **kwargs) -# -# -# def assert_series_equivalent( -# pd_series: pd.Series, bf_series: bigframes.pandas.Series, **kwargs -# ): -# bf_df_local = bf_series.to_pandas() -# ignore_order = not bf_series._session._strictly_ordered -# assert_series_equal(bf_df_local, pd_series, ignore_order=ignore_order, **kwargs) -# -# -# def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): -# if ignore_order: -# # Sort by a column to get consistent results. 
-# if df0.index.name != "rowindex": -# df0 = df0.sort_values( -# list(df0.columns.drop("geography_col", errors="ignore")) -# ).reset_index(drop=True) -# df1 = df1.sort_values( -# list(df1.columns.drop("geography_col", errors="ignore")) -# ).reset_index(drop=True) -# else: -# df0 = df0.sort_index() -# df1 = df1.sort_index() -# -# pd.testing.assert_frame_equal(df0, df1, **kwargs) -# -# -# def assert_series_equal( -# left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs -# ): -# if ignore_order: -# if left.index.name is None: -# left = left.sort_values().reset_index(drop=True) -# right = right.sort_values().reset_index(drop=True) -# else: -# left = left.sort_index() -# right = right.sort_index() -# -# pd.testing.assert_series_equal(left, right, **kwargs) -# -# -# def _standardize_index(idx): -# return pd.Index(list(idx), name=idx.name) -# -# -# def assert_pandas_index_equal_ignore_index_type(idx0, idx1): -# idx0 = _standardize_index(idx0) -# idx1 = _standardize_index(idx1) -# -# pd.testing.assert_index_equal(idx0, idx1) -# -# -# def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): -# """Convert pandas dataframe dtypes compatible with bigframes dataframe.""" -# -# # TODO(chelsealin): updates the function to accept dtypes as input rather than -# # hard-code the column names here. -# -# # Convert basic types columns -# df["bool_col"] = df["bool_col"].astype(pd.BooleanDtype()) -# df["int64_col"] = df["int64_col"].astype(pd.Int64Dtype()) -# df["int64_too"] = df["int64_too"].astype(pd.Int64Dtype()) -# df["float64_col"] = df["float64_col"].astype(pd.Float64Dtype()) -# df["string_col"] = df["string_col"].astype(pd.StringDtype(storage="pyarrow")) -# -# if "rowindex" in df.columns: -# df["rowindex"] = df["rowindex"].astype(pd.Int64Dtype()) -# if "rowindex_2" in df.columns: -# df["rowindex_2"] = df["rowindex_2"].astype(pd.Int64Dtype()) -# -# # Convert time types columns. The `astype` works for Pandas 2.0 but hits an assert -# # error at Pandas 1.5. Hence, we have to convert to arrow table and convert back -# # to pandas dataframe. 
-# if not isinstance(df["date_col"].dtype, pd.ArrowDtype): -# df["date_col"] = pd.to_datetime(df["date_col"], format="%Y-%m-%d") -# arrow_table = pa.Table.from_pandas( -# pd.DataFrame(df, columns=["date_col"]), -# schema=pa.schema([("date_col", pa.date32())]), -# ) -# df["date_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["date_col"] -# -# if not isinstance(df["datetime_col"].dtype, pd.ArrowDtype): -# df["datetime_col"] = pd.to_datetime( -# df["datetime_col"], format="%Y-%m-%d %H:%M:%S" -# ) -# arrow_table = pa.Table.from_pandas( -# pd.DataFrame(df, columns=["datetime_col"]), -# schema=pa.schema([("datetime_col", pa.timestamp("us"))]), -# ) -# df["datetime_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ -# "datetime_col" -# ] -# -# if not isinstance(df["time_col"].dtype, pd.ArrowDtype): -# df["time_col"] = pd.to_datetime(df["time_col"], format="%H:%M:%S.%f") -# arrow_table = pa.Table.from_pandas( -# pd.DataFrame(df, columns=["time_col"]), -# schema=pa.schema([("time_col", pa.time64("us"))]), -# ) -# df["time_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["time_col"] -# -# if not isinstance(df["timestamp_col"].dtype, pd.ArrowDtype): -# df["timestamp_col"] = pd.to_datetime( -# df["timestamp_col"], format="%Y-%m-%d %H:%M:%S.%f%Z" -# ) -# arrow_table = pa.Table.from_pandas( -# pd.DataFrame(df, columns=["timestamp_col"]), -# schema=pa.schema([("timestamp_col", pa.timestamp("us", tz="UTC"))]), -# ) -# df["timestamp_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ -# "timestamp_col" -# ] -# -# # Convert geography types columns. -# if "geography_col" in df.columns: -# df["geography_col"] = df["geography_col"].astype( -# pd.StringDtype(storage="pyarrow") -# ) -# df["geography_col"] = gpd.GeoSeries.from_wkt( -# df["geography_col"].replace({np.nan: None}) -# ) -# -# if bytes_col and not isinstance(df["bytes_col"].dtype, pd.ArrowDtype): -# df["bytes_col"] = df["bytes_col"].apply( -# lambda value: base64.b64decode(value) if not pd.isnull(value) else value -# ) -# arrow_table = pa.Table.from_pandas( -# pd.DataFrame(df, columns=["bytes_col"]), -# schema=pa.schema([("bytes_col", pa.binary())]), -# ) -# df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"] -# -# if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype): -# # Convert numeric types column. -# df["numeric_col"] = df["numeric_col"].apply( -# lambda value: decimal.Decimal(str(value)) if value else None # type: ignore -# ) -# arrow_table = pa.Table.from_pandas( -# pd.DataFrame(df, columns=["numeric_col"]), -# schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]), -# ) -# df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ -# "numeric_col" -# ] -# -# -# def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): -# """Compare two pandas dataframes representing PCA components. The columns -# required to be present in the dataframes are: -# numerical_value: numeric, -# categorical_value: List[object(category, value)] -# -# The index types of `actual` and `expected` are ignored in the comparison. 
-# -# Args: -# actual: Actual Pandas DataFrame -# -# expected: Expected Pandas DataFrame -# -# kwargs: kwargs to use in `pandas.testing.assert_series_equal` per column -# """ -# # Compare the index, columns and values separately, as the polarity of the -# # PCA vectors can be arbitrary -# pd.testing.assert_index_equal( -# actual.index, expected.index.astype(actual.index.dtype) -# ) # dtype agnostic index comparison -# pd.testing.assert_index_equal(actual.columns, expected.columns) -# for column in expected.columns: -# try: -# pd.testing.assert_series_equal(actual[column], expected[column], **kwargs) -# except AssertionError: -# if column not in {"numerical_value", "categorical_value"}: -# raise -# -# # Allow for sign difference per numeric/categorical column -# if column == "numerical_value": -# actual_ = -actual[column] -# expected_ = expected[column] -# else: -# # In this column each element is an array of objects, where the -# # object has attributes "category" and "value". For the sake of -# # comparison let's normalize by flipping the polarity of "value". -# def normalize_array_of_objects(arr, reverse_polarity=False): -# newarr = [] -# for element in arr: -# newelement = dict(element) -# if reverse_polarity: -# newelement["value"] = -newelement["value"] -# newarr.append(newelement) -# return sorted(newarr, key=lambda d: d["category"]) -# -# actual_ = actual[column].apply(normalize_array_of_objects, args=(True,)) -# expected_ = expected[column].apply(normalize_array_of_objects) -# -# pd.testing.assert_series_equal(actual_, expected_, **kwargs) -# -# -# def assert_pandas_df_equal_pca(actual, expected, **kwargs): -# """Compare two pandas dataframes representing PCA predictions. The columns -# in the dataframes are expected to be numeric. -# -# Args: -# actual: Actual Pandas DataFrame -# -# expected: Expected Pandas DataFrame -# -# kwargs: kwargs to use in `pandas.testing.assert_series_equal` per column -# """ -# # Compare the index, columns and values separately, as the polarity of the -# # PCA vector can be arbitrary -# pd.testing.assert_index_equal(actual.index, expected.index) -# pd.testing.assert_index_equal(actual.columns, expected.columns) -# for column in expected.columns: -# try: -# pd.testing.assert_series_equal(actual[column], expected[column], **kwargs) -# except AssertionError: -# # Allow for sign difference per column -# pd.testing.assert_series_equal(-actual[column], expected[column], **kwargs) -#=============================================================================== - - -def check_pandas_df_schema_and_index( - pd_df: pd.DataFrame, - columns: Iterable, - index: Union[int, Iterable], - col_exact: bool = True, -): - """Check pandas df schema and index. But not the values. - - Args: - pd_df: the input pandas df - columns: target columns to check with - index: int or Iterable. If int, only check the length (index size) of the df. If Iterable, check index values match - col_exact: If True, check the columns param are exact match. 
Otherwise only check the df contains all of those columns - """ - if col_exact: - assert list(pd_df.columns) == list(columns) - else: - assert set(columns) <= set(pd_df.columns) - - if isinstance(index, int): - assert len(pd_df) == index - elif isinstance(index, Iterable): - assert list(pd_df.index) == list(index) - else: - raise ValueError("Unsupported index type.") - - -#=============================================================================== -# def get_remote_function_endpoints( -# bigquery_client: bigquery.Client, dataset_id: str -# ) -> Set[str]: -# """Get endpoints used by the remote functions in a datset""" -# endpoints = set() -# routines = bigquery_client.list_routines(dataset=dataset_id) -# for routine in routines: -# rf_options = routine._properties.get("remoteFunctionOptions") -# if not rf_options: -# continue -# rf_endpoint = rf_options.get("endpoint") -# if rf_endpoint: -# endpoints.add(rf_endpoint) -# return endpoints -# -# -# def get_cloud_functions( -# functions_client: functions_v2.FunctionServiceClient, -# project: str, -# location: str, -# name: Optional[str] = None, -# name_prefix: Optional[str] = None, -# ) -> Iterable[functions.ListFunctionsResponse]: -# """Get the cloud functions in the given project and location.""" -# -# assert ( -# not name or not name_prefix -# ), "Either 'name' or 'name_prefix' can be passed but not both." -# -# _, location = functions_utils.get_remote_function_locations(location) -# parent = f"projects/{project}/locations/{location}" -# request = functions_v2.ListFunctionsRequest(parent=parent) -# page_result = functions_client.list_functions(request=request) -# for response in page_result: -# # If name is provided and it does not match then skip -# if bool(name): -# full_name = parent + f"/functions/{name}" -# if response.name != full_name: -# continue -# # If name prefix is provided and it does not match then skip -# elif bool(name_prefix): -# full_name_prefix = parent + f"/functions/{name_prefix}" -# if not response.name.startswith(full_name_prefix): -# continue -# -# yield response -# -# -# def delete_cloud_function( -# functions_client: functions_v2.FunctionServiceClient, full_name: str -# ) -> google.api_core.operation.Operation: -# """Delete a cloud function with the given fully qualified name.""" -# request = functions_v2.DeleteFunctionRequest(name=full_name) -# operation = functions_client.delete_function(request=request) -# return operation -# -# -# def get_first_file_from_wildcard(path): -# return path.replace("*", "000000000000") -#=============================================================================== diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 63f9ebbd94..3a0fc3577c 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -500,6 +500,3 @@ def test_columntransformer_compile_to_sql(mock_X): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] - -if __name__ == "__main__": - pytest.main(["test_compose.py", "-s"]) \ No newline at end of file From 89298f97c47ba1e5a08a6115824774515fc658b0 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Tue, 24 Sep 2024 18:45:28 +0200 Subject: [PATCH 32/40] remove eclipse settings accidentally checked in. 
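This complements the previous patch, which reverted the accidentally committed `.project` and `.pydevproject` files; the Eclipse `.settings/` preferences file was missed in that cleanup. To keep IDE metadata out of the tree for good, entries such as `.project`, `.pydevproject`, and `.settings/` could be added to `.gitignore` (a suggestion only, not part of this series).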
--- .settings/org.eclipse.core.resources.prefs | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 .settings/org.eclipse.core.resources.prefs diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs deleted file mode 100644 index b87fd68992..0000000000 --- a/.settings/org.eclipse.core.resources.prefs +++ /dev/null @@ -1,2 +0,0 @@ -eclipse.preferences.version=1 -encoding/noxfile.py=utf-8 From 0b27eac4b476e032cae63e0a9afed8b244382250 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Tue, 24 Sep 2024 23:21:48 +0200 Subject: [PATCH 33/40] fix docs. --- bigframes/ml/compose.py | 45 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 69ece962fe..ee7bbf5228 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -50,11 +50,24 @@ class SQLScalarColumnTransformer: """ Wrapper for plain SQL code contained in a ColumnTransformer. + Create a single column transformer in plain sql. + This transformer can only be used inside ColumnTransformer. + + When creating an instance '{0}' can be used as placeholder + for the column to transform: + + SQLScalarColumnTransformer("{0}+1") + + The default target column gets the prefix 'transformed\_' + but can also be changed when creating an instance: + + SQLScalarColumnTransformer("{0}+1", "inc_{0}") + **Examples:** >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer >>> import bigframes.pandas as bpd - + >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]}) >>> col_trans = ColumnTransformer([ ... ("strlen", @@ -71,30 +84,18 @@ class SQLScalarColumnTransformer: [3 rows x 2 columns] - """ + SQLScalarColumnTransformer can be combined with other transformers, like StandardScaler: - def __init__(self, sql: str, target_column: str = "transformed_{0}"): - """ - Create a single column transformer in plain sql. - This transformer can only be used inside ColumnTransformer. - - **Examples:** + >>> col_trans = ColumnTransformer([ + ... ("identity", SQLScalarColumnTransformer("{0}", target_column="{0}"), ["col1", col5"]), + ... ("increment", SQLScalarColumnTransformer("{0}+1", target_column="inc_{0}"), "col2"), + ... ("stdscale", preprocessing.StandardScaler(), "col3"), + ... ... + ... ]) - col_trans = ColumnTransformer([ - ("identity", SQLScalarColumnTransformer("{0}", target_column="{0}"), ["col1", col5"]), - ("increment", SQLScalarColumnTransformer("{0}+1", target_column="inc_{0}"), ["col2"]), - ... - ]) + """ - Args: - sql (string): - sql code to transform the column. - for the column name the placeholder '{0}' can be used. - target_column (string): - name of the target column. - the placeholder '{0}' can be used for the input column name. - default value is 'transformed_{0}'. 
- """ + def __init__(self, sql: str, target_column: str = "transformed_{0}"): super().__init__() self._sql = sql self._target_column = target_column From 4ec3b0a6c77b161bf6aee4aa45883ed0f469e88f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 24 Sep 2024 16:53:19 -0500 Subject: [PATCH 34/40] Update bigframes/ml/compose.py --- bigframes/ml/compose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index ee7bbf5228..3773f1266e 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -58,7 +58,7 @@ class SQLScalarColumnTransformer: SQLScalarColumnTransformer("{0}+1") - The default target column gets the prefix 'transformed\_' + The default target column gets the prefix 'transformed_' but can also be changed when creating an instance: SQLScalarColumnTransformer("{0}+1", "inc_{0}") From f48e27b3df405ed6bdd366ae475b07d9f59e5ed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 24 Sep 2024 16:56:23 -0500 Subject: [PATCH 35/40] Update bigframes/ml/compose.py --- bigframes/ml/compose.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 3773f1266e..d54c1247fa 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -47,7 +47,7 @@ class SQLScalarColumnTransformer: - """ + r""" Wrapper for plain SQL code contained in a ColumnTransformer. Create a single column transformer in plain sql. @@ -58,7 +58,7 @@ class SQLScalarColumnTransformer: SQLScalarColumnTransformer("{0}+1") - The default target column gets the prefix 'transformed_' + The default target column gets the prefix 'transformed\_' but can also be changed when creating an instance: SQLScalarColumnTransformer("{0}+1", "inc_{0}") From 5ca8a816f66dd235c25d6c23b0631318b35c56aa Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Wed, 25 Sep 2024 12:13:39 +0200 Subject: [PATCH 36/40] add support for flexible column names. 
--- bigframes/ml/compose.py | 20 +++++++-- tests/unit/ml/test_compose.py | 83 +++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index d54c1247fa..68bb4f71a9 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -98,7 +98,15 @@ class SQLScalarColumnTransformer: def __init__(self, sql: str, target_column: str = "transformed_{0}"): super().__init__() self._sql = sql - self._target_column = target_column + self._target_column = target_column.replace("`", "") + + PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE) + + def escape(self, colname: str): + colname = colname.replace("`", "") + if self.PLAIN_COLNAME_RX.match(colname): + return colname + return f"`{colname}`" def _compile_to_sql( self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None @@ -107,8 +115,8 @@ def _compile_to_sql( columns = X.columns result = [] for column in columns: - current_sql = self._sql.format(column) - current_target_column = self._target_column.format(column) + current_sql = self._sql.format(self.escape(column)) + current_target_column = self.escape(self._target_column.format(column)) result.append(f"{current_sql} AS {current_target_column}") return result @@ -188,6 +196,8 @@ def transformers_( return result + AS_FLEXNAME_SUFFIX_RX = re.compile("^(.*)\\bAS\\s*`[^`]+`\\s*$", re.IGNORECASE) + @classmethod def _extract_from_bq_model( cls, @@ -216,6 +226,10 @@ def camel_to_snake(name): continue transform_sql: str = transform_col_dict["transformSql"] + # workaround for bug in bq_model returning " AS `...`" suffix for flexible names + if cls.AS_FLEXNAME_SUFFIX_RX.match(transform_sql): + transform_sql = cls.AS_FLEXNAME_SUFFIX_RX.match(transform_sql).group(1) + output_names.append(transform_col_dict["name"]) found_transformer = False for prefix in _BQML_TRANSFROM_TYPE_MAPPING: diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 3a0fc3577c..439de95d42 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -390,6 +390,36 @@ def bq_model_unknown_ML(mocker): ) +@pytest.fixture +def bq_model_flexnames(mocker): + return create_bq_model_mock( + mocker, + [ + { + "name": "Flex Name culmen_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "culmen_length_mm", + }, + { + "name": "transformed_Culmen Length MM", + "type": {"typeKind": "INT64"}, + "transformSql": "`Culmen Length MM`*/", + }, + # test workaround for bug in get_model + { + "name": "Flex Name flipper_length_mm", + "type": {"typeKind": "INT64"}, + "transformSql": "flipper_length_mm AS `Flex Name flipper_length_mm`", + }, + { + "name": "transformed_Flipper Length MM", + "type": {"typeKind": "INT64"}, + "transformSql": "`Flipper Length MM` AS `transformed_Flipper Length MM`*/", + }, + ], + ) + + def test_columntransformer_extract_from_bq_model_good(bq_model_good): col_trans = ColumnTransformer._extract_from_bq_model(bq_model_good) assert len(col_trans.transformers) == 6 @@ -500,3 +530,56 @@ def test_columntransformer_compile_to_sql(mock_X): "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", ] + + +def test_columntransformer_flexible_column_names(mock_X): + ident_transformer = SQLScalarColumnTransformer("{0}", target_column="ident {0}") + len1_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -2 ELSE LENGTH({0}) END", target_column="len1_{0}" + ) + 
len2_transformer = SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN 99 ELSE LENGTH({0}) END", target_column="len2_{0}" + ) + column_transformer = compose.ColumnTransformer( + [ + ( + "ident_trafo", + ident_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), + ("len1_trafo", len1_transformer, ["species shortname"]), + ("len2_trafo", len2_transformer, ["`species longname`"]), + ] + ) + sqls = column_transformer._compile_to_sql(mock_X) + assert sqls == [ + "culmen_length_mm AS `ident culmen_length_mm`", + "flipper_length_mm AS `ident flipper_length_mm`", + "CASE WHEN `species shortname` IS NULL THEN -2 ELSE LENGTH(`species shortname`) END AS `len1_species shortname`", + "CASE WHEN `species longname` IS NULL THEN 99 ELSE LENGTH(`species longname`) END AS `len2_species longname`", + ] + + +def test_columntransformer_extract_from_bq_model_flexnames(bq_model_flexnames): + col_trans = ColumnTransformer._extract_from_bq_model(bq_model_flexnames) + assert len(col_trans.transformers) == 4 + # normalize the representation for string comparing + col_trans.transformers.sort(key=lambda trafo: str(trafo)) + actual = col_trans.__repr__() + expected = """ColumnTransformer(transformers=[('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='`Culmen Length MM`*/', target_column='transformed_Culmen Length MM'), + '?transformed_Culmen Length MM'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='`Flipper Length MM` AS `transformed_Flipper Length MM`*/', target_column='transformed_Flipper Length MM'), + '?transformed_Flipper Length MM'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='culmen_length_mm', target_column='Flex Name culmen_length_mm'), + '?Flex Name culmen_length_mm'), + ('sql_scalar_column_transformer', + SQLScalarColumnTransformer(sql='flipper_length_mm ', target_column='Flex Name flipper_length_mm'), + '?Flex Name flipper_length_mm')])""" + assert expected == actual + + +if __name__ == "__main__": + pytest.main(["test_compose.py", "-s"]) From a4ed2fd0dd40b5dd2a60bfed3d7021c1aea4c761 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Wed, 25 Sep 2024 13:05:27 +0200 Subject: [PATCH 37/40] remove main. 
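The `if __name__ == "__main__"` hook added in the previous patch was only a convenience for launching the module directly from an IDE. The suite is collected by pytest (driven through nox), so the hook is dropped again; an equivalent ad-hoc run from the repository root would be something like `python -m pytest tests/unit/ml/test_compose.py -s`.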
--- tests/unit/ml/test_compose.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 439de95d42..7643f76e56 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -579,7 +579,3 @@ def test_columntransformer_extract_from_bq_model_flexnames(bq_model_flexnames): SQLScalarColumnTransformer(sql='flipper_length_mm ', target_column='Flex Name flipper_length_mm'), '?Flex Name flipper_length_mm')])""" assert expected == actual - - -if __name__ == "__main__": - pytest.main(["test_compose.py", "-s"]) From c35086db4df3fa07ec07bffc8a71215a611d3714 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Wed, 25 Sep 2024 13:34:17 +0200 Subject: [PATCH 38/40] add system test for output column with flexible column name --- tests/system/large/ml/test_compose.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 4f4ea67cc8..3213725845 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -163,6 +163,14 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): compose.SQLScalarColumnTransformer("{0}", target_column="{0}"), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "flexname", + compose.SQLScalarColumnTransformer( + "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", + target_column="Flex {0} Name", + ), + "species", + ), ] ) transformer.fit( @@ -205,6 +213,14 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ), "?culmen_length_mm", ), + ( + "sql_scalar_column_transformer", + compose.SQLScalarColumnTransformer( + "CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END ", + target_column="Flex species Name", + ), + "?Flex species Name", + ), ] assert set(reloaded_transformer.transformers) == set(expected) assert reloaded_transformer._bqml_model is not None From cc1b0b178307dbd052c109f36ac7d79b7586d211 Mon Sep 17 00:00:00 2001 From: Ferenc Hechler Date: Wed, 25 Sep 2024 13:43:57 +0200 Subject: [PATCH 39/40] system tests: add new flexible output column to check-df-schema. --- tests/system/large/ml/test_compose.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 3213725845..ba963837e5 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -238,6 +238,7 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): "len_species", "culmen_length_mm", "flipper_length_mm", + "Flex species Name", ], index=[1633, 1672, 1690], col_exact=False, From d82793c140f0f8b0659cb424349057d198a3f0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 25 Sep 2024 09:41:26 -0500 Subject: [PATCH 40/40] Apply suggestions from code review --- bigframes/ml/compose.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 68bb4f71a9..08c9761cc3 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -87,10 +87,10 @@ class SQLScalarColumnTransformer: SQLScalarColumnTransformer can be combined with other transformers, like StandardScaler: >>> col_trans = ColumnTransformer([ - ... ("identity", SQLScalarColumnTransformer("{0}", target_column="{0}"), ["col1", col5"]), + ... ("identity", SQLScalarColumnTransformer("{0}", target_column="{0}"), ["col1", "col5"]), ... 
("increment", SQLScalarColumnTransformer("{0}+1", target_column="inc_{0}"), "col2"), ... ("stdscale", preprocessing.StandardScaler(), "col3"), - ... ... + ... # ... ... ]) """ @@ -227,8 +227,9 @@ def camel_to_snake(name): transform_sql: str = transform_col_dict["transformSql"] # workaround for bug in bq_model returning " AS `...`" suffix for flexible names - if cls.AS_FLEXNAME_SUFFIX_RX.match(transform_sql): - transform_sql = cls.AS_FLEXNAME_SUFFIX_RX.match(transform_sql).group(1) + flex_name_match = cls.AS_FLEXNAME_SUFFIX_RX.match(transform_sql) + if flex_name_match: + transform_sql = flex_name_match.group(1) output_names.append(transform_col_dict["name"]) found_transformer = False