Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions bigframes/ml/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import bigframes_vendored.sklearn.preprocessing._discretization
import bigframes_vendored.sklearn.preprocessing._encoder
import bigframes_vendored.sklearn.preprocessing._label
import bigframes_vendored.sklearn.preprocessing._polynomial

from bigframes.core import log_adapter
from bigframes.ml import base, core, globals, utils
Expand Down Expand Up @@ -661,6 +662,109 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
)


@log_adapter.class_logger
class PolynomialFeatures(
base.Transformer,
bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures,
):
__doc__ = (
bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures.__doc__
)

def __init__(self, degree: int = 2):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we deferring the value validation (range [1, 4]) to BQ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BQ will return an error for invalid input. Still, we can check it first; I'll add that in a subsequent PR.

self.degree = degree
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()
self._base_sql_generator = globals.base_sql_generator()

# TODO(garrettwu): implement __hash__
def __eq__(self, other: Any) -> bool:
    """Compare two transformers by their underlying BQML model.

    Only an object whose exact type is PolynomialFeatures can compare
    equal; subclasses and unrelated types never do.
    """
    if type(other) is not PolynomialFeatures:
        return False
    return self._bqml_model == other._bqml_model

def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
    """Build the SQL expressions for this transformer, suitable for a
    BQML TRANSFORM clause.

    Args:
        columns:
            names of the columns to expand.
        X (default None):
            Ignored.

    Returns: a single-element list holding the (sql_expression, output_name)
        tuple for the polynomial expansion."""
    output_name = "poly_feat"
    # All input columns feed into one ML.POLYNOMIAL_EXPAND expression.
    expand_sql = self._base_sql_generator.ml_polynomial_expand(
        columns, self.degree, output_name
    )
    return [(expand_sql, output_name)]

@classmethod
def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, str]:
    """Parse SQL to tuple(PolynomialFeatures, column_label).

    Args:
        sql: SQL string of format "ML.POLYNOMIAL_EXPAND(STRUCT(col_label0, col_label1, ...), degree)"

    Returns:
        tuple(PolynomialFeatures, column_label), where column_label is the
        raw text found between "STRUCT(" and the first ")", i.e. all of the
        column labels as one comma-separated string.

    Raises:
        ValueError: if the text between the last "," and the last ")" is
            not an integer."""
    struct_prefix = "STRUCT("
    # The first ")" in the expression closes the STRUCT(...) argument list.
    col_label = sql[sql.find(struct_prefix) + len(struct_prefix) : sql.find(")")]
    # The degree is whatever sits between the last "," and the final ")".
    degree = int(sql[sql.rfind(",") + 1 : sql.rfind(")")])
    return cls(degree), col_label

def fit(
    self,
    X: Union[bpd.DataFrame, bpd.Series],
    y=None,  # ignored
) -> PolynomialFeatures:
    """Create a transform-only BQML model for X and record its output columns.

    Args:
        X: DataFrame or Series whose columns are expanded.
        y: Ignored.

    Returns:
        This transformer, with ``_bqml_model`` and ``_output_names`` set.
    """
    (X,) = utils.convert_to_dataframe(X)

    transform_sqls = [
        sql_expr for sql_expr, _ in self._compile_to_sql(X.columns.tolist())
    ]

    self._bqml_model = self._bqml_model_factory.create_model(
        X,
        options={"model_type": "transform_only"},
        transforms=transform_sqls,
    )

    # TODO(garrettwu): generalize the approach to other transformers
    transform_columns = (
        cast(dict, column)
        for column in self._bqml_model._model._properties["transformColumns"]
    )
    # Keep only columns produced by an ML.* transform; pass-through columns
    # either carry no "transformSql" entry or one that is not an ML function.
    self._output_names = [
        column["name"]
        for column in transform_columns
        if "transformSql" in column and column["transformSql"].startswith("ML.")
    ]

    return self

def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
    """Apply the fitted BQML model to X and return only the output columns
    recorded during fit.

    Raises:
        RuntimeError: if no BQML model has been set on this transformer yet.
    """
    if not self._bqml_model:
        raise RuntimeError("Must be fitted before transform")

    (X,) = utils.convert_to_dataframe(X)

    transformed = self._bqml_model.transform(X)
    selected = transformed[self._output_names]
    return typing.cast(bpd.DataFrame, selected)

# TODO(garrettwu): to_gbq()


PreprocessingType = Union[
OneHotEncoder,
StandardScaler,
Expand Down
12 changes: 12 additions & 0 deletions bigframes/ml/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ def struct_options(self, **kwargs: Union[int, float]) -> str:
"""Encode a BQ STRUCT as options."""
return f"STRUCT({self.build_structs(**kwargs)})"

def struct_columns(self, columns: Iterable[str]) -> str:
    """Render column names as a BQ STRUCT expression.

    The names are joined with ", " and inserted verbatim, without quoting
    or escaping.
    """
    return f"STRUCT({', '.join(columns)})"

def input(self, **kwargs: str) -> str:
"""Encode a BQML INPUT clause."""
return f"INPUT({self.build_schema(**kwargs)})"
Expand Down Expand Up @@ -153,6 +158,13 @@ def ml_label_encoder(
https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params."""
return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}"""

def ml_polynomial_expand(
    self, columns: Iterable[str], degree: int, name: str
) -> str:
    """Encode ML.POLYNOMIAL_EXPAND over the given columns, aliased as `name`.
    https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand"""
    struct_sql = self.struct_columns(columns)
    return f"ML.POLYNOMIAL_EXPAND({struct_sql}, {degree}) AS {name}"

def ml_distance(
self,
col_x: str,
Expand Down
67 changes: 67 additions & 0 deletions tests/system/small/ml/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import bigframes.features
from bigframes.ml import preprocessing
from tests.system import utils

ONE_HOT_ENCODED_DTYPE = (
pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])))
Expand Down Expand Up @@ -840,3 +841,69 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):


# TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.


def test_poly_features_default_params(new_penguins_df):
    """Default PolynomialFeatures() yields the degree-1 and degree-2 terms
    of the two culmen columns."""
    features = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
    poly = preprocessing.PolynomialFeatures()
    poly.fit(features)

    actual = poly.transform(features).to_pandas()

    # Column order matters: assert_frame_equal compares columns positionally.
    expected_values = {
        "poly_feat_culmen_length_mm": [39.5, 38.5, 37.9],
        "poly_feat_culmen_length_mm_culmen_length_mm": [1560.25, 1482.25, 1436.41],
        "poly_feat_culmen_length_mm_culmen_depth_mm": [742.6, 662.2, 685.99],
        "poly_feat_culmen_depth_mm": [18.8, 17.2, 18.1],
        "poly_feat_culmen_depth_mm_culmen_depth_mm": [353.44, 295.84, 327.61],
    }
    expected_index = pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64")
    expected = pd.DataFrame(expected_values, dtype="Float64", index=expected_index)

    pd.testing.assert_frame_equal(actual, expected, check_exact=False, rtol=0.1)


def test_poly_features_params(new_penguins_df):
    """PolynomialFeatures(degree=3) yields every term up to degree three;
    only the output schema and index are checked."""
    features = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
    poly = preprocessing.PolynomialFeatures(degree=3)
    poly.fit(features)

    actual = poly.transform(features).to_pandas()

    expected_columns = [
        "poly_feat_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm_culmen_depth_mm",
        "poly_feat_culmen_length_mm_culmen_depth_mm",
        "poly_feat_culmen_length_mm_culmen_depth_mm_culmen_depth_mm",
        "poly_feat_culmen_depth_mm",
        "poly_feat_culmen_depth_mm_culmen_depth_mm",
        "poly_feat_culmen_depth_mm_culmen_depth_mm_culmen_depth_mm",
    ]
    expected_index = [1633, 1672, 1690]

    utils.check_pandas_df_schema_and_index(actual, expected_columns, expected_index)
7 changes: 7 additions & 0 deletions tests/unit/ml/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,13 @@ def test_label_encoder_correct(
assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a"


def test_polynomial_expand(
    base_sql_generator: ml_sql.BaseSqlGenerator,
):
    """ml_polynomial_expand wraps the columns in a STRUCT and appends the alias."""
    expected = "ML.POLYNOMIAL_EXPAND(STRUCT(col_a, col_b), 2) AS poly_exp"
    actual = base_sql_generator.ml_polynomial_expand(["col_a", "col_b"], 2, "poly_exp")
    assert actual == expected


def test_distance_correct(
base_sql_generator: ml_sql.BaseSqlGenerator,
mock_df: bpd.DataFrame,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
This file contains preprocessing tools based on polynomials.
"""

from bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin

from bigframes import constants


class PolynomialFeatures(TransformerMixin, BaseEstimator):
    """Generate polynomial and interaction features.

    Args:
        degree (int, default 2):
            Maximal degree of the polynomial features. The value is passed
            to BigQuery ML's ML.POLYNOMIAL_EXPAND, which accepts degrees in
            the range [1, 4].
    """

    def fit(self, X, y=None):
        """Compute number of output features.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                The Dataframe or Series with training data.

            y (default None):
                Ignored.

        Returns:
            PolynomialFeatures: Fitted transformer.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def transform(self, X):
        """Transform data to polynomial features.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                The DataFrame or Series to be transformed.

        Returns:
            bigframes.dataframe.DataFrame: Transformed result.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)