@log_adapter.class_logger
class PolynomialFeatures(
    base.Transformer,
    bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures,
):
    __doc__ = (
        bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures.__doc__
    )

    def __init__(self, degree: int = 2):
        # ML.POLYNOMIAL_EXPAND documents its degree argument as being in the
        # range [1, 4]; reject anything else here instead of letting the
        # failure surface later from the generated SQL in fit().
        if degree not in range(1, 5):
            raise ValueError(f"degree has to be [1, 4], input is {degree}.")

        self.degree = degree
        self._bqml_model: Optional[core.BqmlModel] = None
        self._bqml_model_factory = globals.bqml_model_factory()
        self._base_sql_generator = globals.base_sql_generator()
        # Filled in by fit(); initialised here so the attribute always exists.
        self._output_names: List[str] = []

    # TODO(garrettwu): implement __hash__
    def __eq__(self, other: Any) -> bool:
        # `degree` must take part in the comparison: two unfitted transformers
        # both have `_bqml_model is None`, so without it
        # PolynomialFeatures(2) == PolynomialFeatures(3) would be True.
        return (
            type(other) is PolynomialFeatures
            and self.degree == other.degree
            and self._bqml_model == other._bqml_model
        )

    def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
        """Compile this transformer to a list of SQL expressions that can be included in
        a BQML TRANSFORM clause

        Args:
            columns:
                a list of column names to transform.
            X (default None):
                Ignored.

        Returns: a list of tuples of (sql_expression, output_name)"""
        # All input columns are expanded together by a single expression, so
        # exactly one (sql, name) pair is produced.
        output_name = "poly_feat"
        return [
            (
                self._base_sql_generator.ml_polynomial_expand(
                    columns, self.degree, output_name
                ),
                output_name,
            )
        ]

    @classmethod
    def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, str]:
        """Parse SQL to tuple(PolynomialFeatures, column_label).

        Args:
            sql: SQL string of format "ML.POLYNOMIAL_EXPAND(STRUCT(col_label0, col_label1, ...), degree)"

        Returns:
            tuple(PolynomialFeatures, column_label), where column_label is the
            raw text found between "STRUCT(" and its closing parenthesis.

        Raises:
            ValueError: if the SQL does not have the expected shape or the
                degree is not an integer in [1, 4]."""
        struct_token = "STRUCT("
        struct_start = sql.find(struct_token)
        # The first ")" after "STRUCT(" closes the column list; the last ","
        # and the last ")" delimit the degree argument.
        struct_end = sql.find(")", struct_start) if struct_start >= 0 else -1
        degree_start = sql.rfind(",")
        degree_end = sql.rfind(")")
        if struct_start < 0 or not (struct_end < degree_start < degree_end):
            raise ValueError(
                f"Cannot parse an ML.POLYNOMIAL_EXPAND expression from SQL: {sql!r}"
            )

        col_label = sql[struct_start + len(struct_token) : struct_end]
        try:
            degree = int(sql[degree_start + 1 : degree_end])
        except ValueError as err:
            raise ValueError(
                f"Cannot parse the degree of ML.POLYNOMIAL_EXPAND from SQL: {sql!r}"
            ) from err
        return cls(degree), col_label

    def fit(
        self,
        X: Union[bpd.DataFrame, bpd.Series],
        y=None,  # ignored
    ) -> PolynomialFeatures:
        (X,) = utils.convert_to_dataframe(X)

        compiled_transforms = self._compile_to_sql(X.columns.tolist())
        transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]

        # A "transform_only" model carries just the TRANSFORM clause built above.
        self._bqml_model = self._bqml_model_factory.create_model(
            X,
            options={"model_type": "transform_only"},
            transforms=transform_sqls,
        )

        # Keep only the names of the columns that the model reports as produced
        # by an "ML." expression; those are the columns transform() returns.
        # TODO(garrettwu): generalize the approach to other transformers
        output_names = []
        for transform_col in self._bqml_model._model._properties["transformColumns"]:
            transform_col_dict = cast(dict, transform_col)
            # pass the columns that are not transformed
            if "transformSql" not in transform_col_dict:
                continue
            transform_sql: str = transform_col_dict["transformSql"]
            if not transform_sql.startswith("ML."):
                continue

            output_names.append(transform_col_dict["name"])

        self._output_names = output_names

        return self

    def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
        if not self._bqml_model:
            raise RuntimeError("Must be fitted before transform")

        (X,) = utils.convert_to_dataframe(X)

        df = self._bqml_model.transform(X)
        # Restrict the result to the expanded feature columns recorded by fit().
        return typing.cast(
            bpd.DataFrame,
            df[self._output_names],
        )

    # TODO(garrettwu): to_gbq()
def struct_columns(self, columns: Iterable[str]) -> str:
    """Render column names as the fields of a BQ STRUCT expression.

    Args:
        columns: column names; they are joined with ", " in iteration order
            and inserted as-is (no quoting or escaping is applied).

    Returns:
        The text ``STRUCT(<col0>, <col1>, ...)``.
    """
    return "STRUCT(" + ", ".join(columns) + ")"


def ml_polynomial_expand(
    self, columns: Iterable[str], degree: int, name: str
) -> str:
    """Encode ML.POLYNOMIAL_EXPAND.
    https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand

    Args:
        columns: column names packed into the STRUCT argument.
        degree: value emitted as the second argument of the call.
        name: alias given to the expression via ``AS``.

    Returns:
        ``ML.POLYNOMIAL_EXPAND(STRUCT(...), <degree>) AS <name>``.
    """
    struct_sql = self.struct_columns(columns)
    return f"ML.POLYNOMIAL_EXPAND({struct_sql}, {degree}) AS {name}"
# --- tests/system/small/ml/test_preprocessing.py ---


def test_poly_features_default_params(new_penguins_df):
    # No arguments: the transformer is built with its default degree.
    features = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
    poly = preprocessing.PolynomialFeatures()
    poly.fit(features)

    actual = poly.transform(features).to_pandas()

    expected_columns = {
        "poly_feat_culmen_length_mm": [39.5, 38.5, 37.9],
        "poly_feat_culmen_length_mm_culmen_length_mm": [1560.25, 1482.25, 1436.41],
        "poly_feat_culmen_length_mm_culmen_depth_mm": [742.6, 662.2, 685.99],
        "poly_feat_culmen_depth_mm": [18.8, 17.2, 18.1],
        "poly_feat_culmen_depth_mm_culmen_depth_mm": [353.44, 295.84, 327.61],
    }
    expected_index = pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64")
    expected = pd.DataFrame(expected_columns, dtype="Float64", index=expected_index)

    pd.testing.assert_frame_equal(actual, expected, check_exact=False, rtol=0.1)


def test_poly_features_params(new_penguins_df):
    # degree=3: only the output schema and index are checked, not the values.
    features = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
    poly = preprocessing.PolynomialFeatures(degree=3)
    poly.fit(features)

    actual = poly.transform(features).to_pandas()

    expected_column_names = [
        "poly_feat_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm_culmen_depth_mm",
        "poly_feat_culmen_length_mm_culmen_depth_mm",
        "poly_feat_culmen_length_mm_culmen_depth_mm_culmen_depth_mm",
        "poly_feat_culmen_depth_mm",
        "poly_feat_culmen_depth_mm_culmen_depth_mm",
        "poly_feat_culmen_depth_mm_culmen_depth_mm_culmen_depth_mm",
    ]
    expected_index_values = [1633, 1672, 1690]
    utils.check_pandas_df_schema_and_index(
        actual,
        expected_column_names,
        expected_index_values,
    )


# --- tests/unit/ml/test_sql.py ---


def test_polynomial_expand(
    base_sql_generator: ml_sql.BaseSqlGenerator,
):
    expected_sql = "ML.POLYNOMIAL_EXPAND(STRUCT(col_a, col_b), 2) AS poly_exp"
    actual_sql = base_sql_generator.ml_polynomial_expand(
        ["col_a", "col_b"], 2, "poly_exp"
    )
    assert actual_sql == expected_sql


# --- third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py ---


class PolynomialFeatures(TransformerMixin, BaseEstimator):
    """Generate polynomial and interaction features."""

    def fit(self, X, y=None):
        """Fit the transformer on the training data.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                The DataFrame or Series holding the training data.

            y (default None):
                Ignored.

        Returns:
            PolynomialFeatures: Fitted transformer.
        """
        # Interface stub: concrete subclasses provide the implementation.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def transform(self, X):
        """Expand the input data into polynomial features.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                The DataFrame or Series to be transformed.

        Returns:
            bigframes.dataframe.DataFrame: Transformed result.
        """
        # Interface stub: concrete subclasses provide the implementation.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)