From cc18bac003787c9747690506ce9ab5d8465f8818 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 7 Jun 2024 23:44:06 +0000 Subject: [PATCH 1/3] feat: add dataframe.insert --- bigframes/dataframe.py | 30 +++++++++++++ tests/system/small/test_dataframe.py | 38 ++++++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 45 +++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f12c346776..919189cbcc 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1305,6 +1305,36 @@ def nsmallest( column_ids = self._sql_names(columns) return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep)) + def insert( + self, + loc: int, + column: blocks.Label, + value: SingleItemValue, + allow_duplicates: bool = False, + ): + column_count = len(self.columns) + if loc > column_count: + raise IndexError( + f"index {loc} is out of bounds for axis 0 with size {column_count}" + ) + if (column in self.columns) and not allow_duplicates: + raise ValueError(f"cannot insert {column}, already exists") + + temp_column = bigframes.core.guid.generate_guid(prefix=str(column)) + df = self._assign_single_item(temp_column, value) + + block = df._get_block() + value_columns = typing.cast(List, block.value_columns) + value_columns = ( + value_columns + if loc == column_count + else value_columns[:loc] + [value_columns[-1]] + value_columns[loc:-1] + ) + block = block.select_columns(value_columns) + block = block.rename(columns={temp_column: column}) + + self._set_block(block) + def drop( self, labels: typing.Any = None, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ffc09a1a1f..841ff7bbea 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -270,6 +270,44 @@ def test_get_columns_default(scalars_dfs): assert result == "default_val" +@pytest.mark.parametrize( + ("loc", "column", "value", "allow_duplicates"), + [ + (0, 
666, 2, False), + (5, "float64_col", 2.2, True), + (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), + pytest.param( + 14, + "test", + 2, + False, + marks=pytest.mark.xfail( + raises=IndexError, + ), + ), + pytest.param( + 12, + "int64_col", + 2, + False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], +) +def test_insert(scalars_dfs, loc, column, value, allow_duplicates): + scalars_df, scalars_pandas_df = scalars_dfs + # insert works inplace, so will influence other tests. + # make a copy to avoid inplace changes. + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.insert(loc, column, value, allow_duplicates) + pd_df.insert(loc, column, value, allow_duplicates) + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) + + def test_drop_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d46fa4cfc7..f8088f8060 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1067,6 +1067,51 @@ def reindex_like(self, other): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def insert(self, loc, column, value, allow_duplicates=False): + """Insert column into DataFrame at specified location. + + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + + Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5. 
+ + >>> df.insert(1, 'col3', 5) + >>> df + col1 col3 col2 + 0 1 5 3 + 1 2 5 4 + + [2 rows x 3 columns] + + Insert another column named 'col2' at the beginning of the DataFrame with values [5, 6] + + >>> df.insert(0, 'col2', [5, 6], allow_duplicates=True) + >>> df + col2 col1 col3 col2 + 0 5 1 5 3 + 1 6 2 5 4 + + [2 rows x 4 columns] + + Args: + loc (int): + Insertion index. Must verify 0 <= loc <= len(columns). + column (str, number, or hashable object): + Label of the inserted column. + value (Scalar, Series, or array-like): + Content of the inserted column. + allow_duplicates (bool, default False): + Allow duplicate column labels to be created. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None ) -> DataFrame | None: From 3be36476d56f60cb73897d4ecb4fa59c724d34a8 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 10 Jun 2024 19:54:16 +0000 Subject: [PATCH 2/3] update logic. --- bigframes/dataframe.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 919189cbcc..74d0ea1028 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1315,7 +1315,7 @@ def insert( column_count = len(self.columns) if loc > column_count: raise IndexError( - f"index {loc} is out of bounds for axis 0 with size {column_count}" + f"Column index {loc} is out of bounds with {column_count} total columns." 
) if (column in self.columns) and not allow_duplicates: raise ValueError(f"cannot insert {column}, already exists") @@ -1325,11 +1325,9 @@ def insert( block = df._get_block() value_columns = typing.cast(List, block.value_columns) - value_columns = ( - value_columns - if loc == column_count - else value_columns[:loc] + [value_columns[-1]] + value_columns[loc:-1] - ) + value_columns, new_column = value_columns[:-1], value_columns[-1] + value_columns = value_columns.insert(loc, new_column) + block = block.select_columns(value_columns) block = block.rename(columns={temp_column: column}) From db6a70df4cc2017603a5aca6261ba111c3850930 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 10 Jun 2024 19:57:23 +0000 Subject: [PATCH 3/3] fix: call list.insert in place instead of assigning its None result --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 74d0ea1028..f78dee1642 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1326,7 +1326,7 @@ def insert( block = df._get_block() value_columns = typing.cast(List, block.value_columns) value_columns, new_column = value_columns[:-1], value_columns[-1] - value_columns = value_columns.insert(loc, new_column) + value_columns.insert(loc, new_column) block = block.select_columns(value_columns) block = block.rename(columns={temp_column: column})