From f8433344d7b67dfc7e0ea6cb9137ba5ca11e6bda Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 19 Apr 2024 23:10:19 +0000 Subject: [PATCH 1/7] feat: Add .cache() method to persist intermediate dataframe --- bigframes/dataframe.py | 9 +++++++++ bigframes/ml/core.py | 16 +++++----------- bigframes/series.py | 9 +++++++++ tests/system/small/test_dataframe.py | 2 +- tests/unit/ml/test_golden_sql.py | 4 ++-- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ff8404761c..ea6f73acc9 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3384,6 +3384,15 @@ def _set_block(self, block: blocks.Block): def _get_block(self) -> blocks.Block: return self._block + def cache(self): + """ + Materializes the DataFrame to a temporary table. Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. + + Returns: + DataFrame: Self + """ + self._cached(force=True) + def _cached(self, *, force: bool = False) -> DataFrame: """Materialize dataframe to a temporary table. No-op if the dataframe represents a trivial transformation of an existing materialization. diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index b94ae39687..6beeced113 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -83,7 +83,7 @@ def distance( """ assert len(x.columns) == 1 and len(y.columns) == 1 - input_data = x._cached().join(y._cached(), how="outer") + input_data = x.cache().join(y.cache(), how="outer") x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0] return self._apply_sql( @@ -299,11 +299,9 @@ def create_model( # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot if y_train is None: - input_data = X_train._cached(force=True) + input_data = X_train.cache() else: - input_data = X_train._cached(force=True).join( - y_train._cached(force=True), how="outer" - ) + input_data = X_train.cache().join(y_train.cache(), how="outer") options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -343,9 +341,7 @@ def create_llm_remote_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train._cached(force=True).join( - y_train._cached(force=True), how="outer" - ) + input_data = X_train.cache().join(y_train.cache(), how="outer") options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -378,9 +374,7 @@ def create_time_series_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train._cached(force=True).join( - y_train._cached(force=True), how="outer" - ) + input_data = X_train.cache().join(y_train.cache(), how="outer") options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) diff --git a/bigframes/series.py b/bigframes/series.py index 47acfd0afb..4e820b35c4 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1691,6 +1691,15 @@ def _slice( ), ) + def cache(self): + """ + Materializes the DataFrame to a temporary table. Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. + + Returns: + DataFrame: Self + """ + self._cached(force=True) + def _cached(self, *, force: bool = True) -> Series: self._set_block(self._block.cached(force=force)) return self diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 4c598a682d..8512e83e33 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4075,7 +4075,7 @@ def test_df_cached(scalars_df_index): ) df = df[df["rowindex_2"] % 2 == 0] - df_cached_copy = df._cached() + df_cached_copy = df.cache() pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index bcb220b107..48fb7011ea 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -63,7 +63,7 @@ def bqml_model_factory(mocker: pytest_mock.MockerFixture): def mock_y(): mock_y = mock.create_autospec(spec=bpd.DataFrame) mock_y.columns = pd.Index(["input_column_label"]) - mock_y._cached.return_value = mock_y + mock_y.cache.return_value = mock_y return mock_y @@ -83,7 +83,7 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) - mock_X._cached.return_value = mock_X + mock_X.cache.return_value = mock_X return mock_X From 2095a0cdba99ae73625bfd990665d0e3c5209286 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 22 Apr 2024 20:29:29 +0000 Subject: [PATCH 2/7] return self properly and fix docstrings --- bigframes/dataframe.py | 2 +- bigframes/series.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ea6f73acc9..1e91589a07 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3391,7 +3391,7 @@ def cache(self): Returns: DataFrame: Self """ - self._cached(force=True) + return self._cached(force=True) def _cached(self, *, force: bool = False) -> DataFrame: """Materialize dataframe to a temporary table. diff --git a/bigframes/series.py b/bigframes/series.py index 4e820b35c4..4de767cc84 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1693,12 +1693,12 @@ def _slice( def cache(self): """ - Materializes the DataFrame to a temporary table. Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. + Materializes the Series to a temporary table. Useful if the series will be used multiple times, as this will avoid recomputating the shared intermediate value. Returns: - DataFrame: Self + Series: Self """ - self._cached(force=True) + return self._cached(force=True) def _cached(self, *, force: bool = True) -> Series: self._set_block(self._block.cached(force=force)) From b7ea225d47e85d391fdeb3c83c54e8db07d92caa Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 25 Apr 2024 13:30:13 -0700 Subject: [PATCH 3/7] Update bigframes/dataframe.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1e91589a07..eeb6e34c17 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3386,7 +3386,9 @@ def _get_block(self) -> blocks.Block: def cache(self): """ - Materializes the DataFrame to a temporary table. Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. + Materializes the DataFrame to a temporary table. + + Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. Returns: DataFrame: Self From 48287890bd47705f839f465d6808e2b4fdaa73a2 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Thu, 25 Apr 2024 20:32:16 +0000 Subject: [PATCH 4/7] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f2ce5df15f..092c8ab82f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3400,7 +3400,7 @@ def _get_block(self) -> blocks.Block: def cache(self): """ Materializes the DataFrame to a temporary table. - + Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. Returns: From 8764cfe281e19c3b52a0a07228d7fe5b629f39be Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Thu, 25 Apr 2024 20:32:30 +0000 Subject: [PATCH 5/7] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f2ce5df15f..092c8ab82f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3400,7 +3400,7 @@ def _get_block(self) -> blocks.Block: def cache(self): """ Materializes the DataFrame to a temporary table. - + Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. Returns: From b9b06a253af10d7f818a8ef75e498d012c717484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 25 Apr 2024 15:53:00 -0500 Subject: [PATCH 6/7] Update bigframes/series.py --- bigframes/series.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index f481f93821..61bd471c32 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1684,7 +1684,9 @@ def _slice( def cache(self): """ - Materializes the Series to a temporary table. Useful if the series will be used multiple times, as this will avoid recomputating the shared intermediate value. + Materializes the Series to a temporary table. + + Useful if the series will be used multiple times, as this will avoid recomputating the shared intermediate value. Returns: Series: Self From 236fb370f61912386af26b552ccdd60ac4ffb8dc Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 25 Apr 2024 22:42:25 +0000 Subject: [PATCH 7/7] lint docstring --- bigframes/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index 61bd471c32..3986d38445 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1685,7 +1685,7 @@ def _slice( def cache(self): """ Materializes the Series to a temporary table. - + Useful if the series will be used multiple times, as this will avoid recomputating the shared intermediate value. Returns: