From aaa545b4992df7ecf8fc8a5fbef66a76807adf74 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 6 May 2024 22:49:03 +0000 Subject: [PATCH 01/17] feat: Support indexless dataframe/series --- bigframes/core/__init__.py | 13 ++ bigframes/core/block_transforms.py | 3 +- bigframes/core/blocks.py | 87 +++++++-- bigframes/core/indexes/base.py | 1 + bigframes/core/rewrite.py | 36 ++-- bigframes/dataframe.py | 45 ++++- bigframes/enums.py | 2 + bigframes/exceptions.py | 4 + bigframes/series.py | 13 +- bigframes/session/__init__.py | 6 + .../session/_io/bigquery/read_gbq_table.py | 6 +- tests/system/conftest.py | 10 ++ tests/system/small/test_empty_index.py | 166 ++++++++++++++++++ 13 files changed, 353 insertions(+), 39 deletions(-) create mode 100644 tests/system/small/test_empty_index.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 185ce7cd4f..e7485de20e 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -450,6 +450,19 @@ def join( return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node)) return ArrayValue(join_node) + def try_align_as_projection( + self, + other: ArrayValue, + join_type: join_def.JoinType, + mappings: typing.Tuple[join_def.JoinColumnMapping, ...], + ) -> typing.Optional[ArrayValue]: + left_side = bigframes.core.rewrite.SquashedSelect.from_node(self.node) + right_side = bigframes.core.rewrite.SquashedSelect.from_node(other.node) + result = left_side.merge(right_side, join_type, mappings) + if result is not None: + return ArrayValue(result.expand()) + return None + def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: assert len(column_ids) > 0 for column_id in column_ids: diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index a221b343a5..d628daf9b8 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -823,7 +823,8 @@ def idxmax(block: blocks.Block) -> blocks.Block: def _idx_extrema( block: blocks.Block, min_or_max: typing.Literal["min", "max"] ) -> blocks.Block: - if len(block.index_columns) != 1: + block._null_index_guard() + if len(block.index_columns) > 1: # TODO: Need support for tuple dtype raise NotImplementedError( f"idxmin not support for multi-index. {constants.FEEDBACK_LINK}" diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 402581eb6f..80576a81fb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -117,19 +117,6 @@ def __init__( f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) - # If no index columns are set, create one. - # - # Note: get_index_cols_and_uniqueness in - # bigframes/session/_io/bigquery/read_gbq_table.py depends on this - # being as sequential integer index column. If this default behavior - # ever changes, please also update get_index_cols_and_uniqueness so - # that users who explicitly request a sequential integer index can - # still get one. - if len(index_columns) == 0: - new_index_col_id = guid.generate_guid() - expr = expr.promote_offsets(new_index_col_id) - index_columns = [new_index_col_id] - self._index_columns = tuple(index_columns) # Index labels don't need complicated hierarchical access so can store as tuple self._index_labels = ( @@ -514,7 +501,8 @@ def _copy_index_to_pandas(self, df: pd.DataFrame): Warning: This method modifies ``df`` inplace. """ - if self.index_columns: + # Note: If BigQuery DataFrame has empty index, a default one will be created for the local materialization. + if len(self.index_columns) > 0: df.set_index(list(self.index_columns), inplace=True) # Pandas names is annotated as list[str] rather than the more # general Sequence[Label] that BigQuery DataFrames has. @@ -1410,7 +1398,8 @@ def retrieve_repr_request_results( computed_df, query_job = head_block.to_pandas() formatted_df = computed_df.set_axis(self.column_labels, axis=1) # we reset the axis and substitute the bf index name(s) for the default - formatted_df.index.names = self.index.names # type: ignore + if len(self.index.names) > 0: + formatted_df.index.names = self.index.names # type: ignore return formatted_df, count, query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: @@ -1901,6 +1890,7 @@ def join( sort=False, block_identity_join: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: + if not isinstance(other, Block): # TODO(swast): We need to improve this error message to be more # actionable for the user. For example, it's possible they @@ -1914,6 +1904,16 @@ def join( raise NotImplementedError( f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}" ) + # Special case for null index, + if ( + (self.index.nlevels == other.index.nlevels == 0) + and (sort is False) + and (block_identity_join is False) + ): + return join_indexless(self, other, how=how) + + self._null_index_guard() + other._null_index_guard() if self.index.nlevels == other.index.nlevels == 1: return join_mono_indexed( self, other, how=how, sort=sort, block_identity_join=block_identity_join @@ -2062,6 +2062,12 @@ def _is_monotonic( self._stats_cache[column_name].update({op_name: result}) return result + def _null_index_guard(self): + if len(self.index_columns) == 0: + raise bigframes.exceptions.NullIndexError( + "Cannot perform this operation without an index. Set an index using set_index." + ) + class BlockIndexProperties: """Accessor for the index-related block properties.""" @@ -2113,6 +2119,10 @@ def __repr__(self) -> str: def to_pandas(self) -> pd.Index: """Executes deferred operations and downloads the results.""" + if len(self.column_ids) == 0: + raise bigframes.exceptions.NullIndexError( + "Cannot perform this operation without an index. Set an index using set_index." + ) # Project down to only the index column. So the query can be cached to visualize other data. index_columns = list(self._block.index_columns) dtypes = dict(zip(index_columns, self.dtypes)) @@ -2154,6 +2164,53 @@ def is_uniquely_named(self: BlockIndexProperties): return len(set(self.names)) == len(self.names) +def join_indexless( + left: Block, + right: Block, + *, + how="left", +) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: + """Joins two blocks""" + left_expr = left.expr + right_expr = right.expr + left_mappings = [ + join_defs.JoinColumnMapping( + source_table=join_defs.JoinSide.LEFT, + source_id=id, + destination_id=guid.generate_guid(), + ) + for id in left_expr.column_ids + ] + right_mappings = [ + join_defs.JoinColumnMapping( + source_table=join_defs.JoinSide.RIGHT, + source_id=id, + destination_id=guid.generate_guid(), + ) + for id in right_expr.column_ids + ] + combined_expr = left_expr.try_align_as_projection( + right_expr, + join_type=how, + mappings=(*left_mappings, *right_mappings), + ) + if combined_expr is None: + raise bigframes.exceptions.NullIndexError( + "Cannot implicitly align objects. Set an explicit index using set_index." + ) + get_column_left = {m.source_id: m.destination_id for m in left_mappings} + get_column_right = {m.source_id: m.destination_id for m in right_mappings} + block = Block( + combined_expr, + column_labels=[*left.column_labels, *right.column_labels], + index_columns=(), + ) + return ( + block, + (get_column_left, get_column_right), + ) + + def join_mono_indexed( left: Block, right: Block, diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 46a9e30637..26297b27ca 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -100,6 +100,7 @@ def __new__( def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: + frame._block._null_index_guard() index = Index(frame._block) index._linked_frame = frame return index diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index e3a07c04b4..7e4a3c748b 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -98,12 +98,10 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): self.root, self.columns, self.predicate, new_ordering, self.reverse_root ) - def maybe_join( - self, right: SquashedSelect, join_def: join_defs.JoinDefinition - ) -> Optional[SquashedSelect]: + def can_join(self, right: SquashedSelect, join_def: join_defs.JoinDefinition): if join_def.type == "cross": # Cannot convert cross join to projection - return None + return False r_exprs_by_id = {id: expr for expr, id in right.columns} l_exprs_by_id = {id: expr for expr, id in self.columns} @@ -113,10 +111,17 @@ def maybe_join( if (self.root != right.root) or any( l_expr != r_expr for l_expr, r_expr in zip(l_join_exprs, r_join_exprs) ): + return False + return True + + def merge( + self, + right: SquashedSelect, + join_type: join_defs.JoinType, + mappings: Tuple[join_defs.JoinColumnMapping, ...], + ) -> Optional[SquashedSelect]: + if self.root != right.root: return None - - join_type = join_def.type - # Mask columns and remap names to expected schema lselection = self.columns rselection = right.columns @@ -136,7 +141,7 @@ def maybe_join( lselection = tuple((apply_mask(expr, lmask), id) for expr, id in lselection) if rmask is not None: rselection = tuple((apply_mask(expr, rmask), id) for expr, id in rselection) - new_columns = remap_names(join_def, lselection, rselection) + new_columns = remap_names(mappings, lselection, rselection) # Reconstruct ordering reverse_root = self.reverse_root @@ -201,20 +206,25 @@ def maybe_squash_projection(node: nodes.BigFrameNode) -> nodes.BigFrameNode: def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: left_side = SquashedSelect.from_node(join_node.left_child) right_side = SquashedSelect.from_node(join_node.right_child) - joined = left_side.maybe_join(right_side, join_node.join) - if joined is not None: - return joined.expand() + if left_side.can_join(right_side, join_node.join): + merged = left_side.merge( + right_side, join_node.join.type, join_node.join.mappings + ) + assert merged is not None + return merged.expand() else: return join_node def remap_names( - join: join_defs.JoinDefinition, lselection: Selection, rselection: Selection + mappings: Tuple[join_defs.JoinColumnMapping, ...], + lselection: Selection, + rselection: Selection, ) -> Selection: new_selection: Selection = tuple() l_exprs_by_id = {id: expr for expr, id in lselection} r_exprs_by_id = {id: expr for expr, id in rselection} - for mapping in join.mappings: + for mapping in mappings: if mapping.source_table == join_defs.JoinSide.LEFT: expr = l_exprs_by_id[mapping.source_id] else: # Right diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d694216ebe..fa621c2425 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -17,6 +17,7 @@ from __future__ import annotations import datetime +import functools import inspect import re import sys @@ -41,6 +42,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +import pandas.io.formats.format import tabulate import bigframes @@ -60,6 +62,7 @@ import bigframes.core.utils as utils import bigframes.core.window import bigframes.dtypes +import bigframes.exceptions import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -84,6 +87,15 @@ ) +def requires_index(meth): + @functools.wraps(meth) + def guarded_meth(df: DataFrame, *args, **kwargs): + df._null_index_guard() + return meth(df, *args, **kwargs) + + return guarded_meth + + # Inherits from pandas DataFrame so that we can use the same docstrings. @log_adapter.class_logger class DataFrame(vendored_pandas_frame.DataFrame): @@ -256,6 +268,7 @@ def index(self, value): self.index.name = value.name if hasattr(value, "name") else None @property + @requires_index def loc(self) -> indexers.LocDataFrameIndexer: return indexers.LocDataFrameIndexer(self) @@ -268,6 +281,7 @@ def iat(self) -> indexers.IatDataFrameIndexer: return indexers.IatDataFrameIndexer(self) @property + @requires_index def at(self) -> indexers.AtDataFrameIndexer: return indexers.AtDataFrameIndexer(self) @@ -314,10 +328,15 @@ def bqclient(self) -> bigframes.Session: def _session(self) -> bigframes.Session: return self._get_block().expr.session + @property + def _has_index(self) -> bool: + return len(self._block.index_columns) > 0 + @property def T(self) -> DataFrame: return DataFrame(self._get_block().transpose()) + @requires_index def transpose(self) -> DataFrame: return self.T @@ -610,7 +629,14 @@ def __repr__(self) -> str: column_count = len(pandas_df.columns) with display_options.pandas_repr(opts): - repr_string = repr(pandas_df) + import pandas.io.formats + + options = ( + pandas.io.formats.format.get_dataframe_repr_params() # type: ignore + ) + if not self._has_index: + options.update({"index": False}) + repr_string = pandas_df.to_string(**options) # Modify the end of the string to reflect count. lines = repr_string.split("\n") @@ -806,15 +832,18 @@ def _apply_dataframe_binop( ) # join columns schema # indexers will be none for exact match - columns, lcol_indexer, rcol_indexer = self.columns.join( - other.columns, how=how, return_indexers=True - ) + if self.columns.equals(other.columns): + columns, lcol_indexer, rcol_indexer = self.columns, None, None + else: + columns, lcol_indexer, rcol_indexer = self.columns.join( + other.columns, how=how, return_indexers=True + ) binop_result_ids = [] column_indices = zip( lcol_indexer if (lcol_indexer is not None) else range(len(columns)), - rcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (rcol_indexer is not None) else range(len(columns)), ) for left_index, right_index in column_indices: @@ -3518,3 +3547,9 @@ def __matmul__(self, other) -> DataFrame: return self.dot(other) __matmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__matmul__) + + def _null_index_guard(self): + if not self._has_index: + raise bigframes.exceptions.NullIndexError( + "DataFrame cannot perform this operation as it has no index. Set an index using set_index." + ) diff --git a/bigframes/enums.py b/bigframes/enums.py index 4bec75f5df..84f4b70406 100644 --- a/bigframes/enums.py +++ b/bigframes/enums.py @@ -27,3 +27,5 @@ class DefaultIndexKind(enum.Enum): #: ``n - 3``, ``n - 2``, ``n - 1``, where ``n`` is the number of items in #: the index. SEQUENTIAL_INT64 = enum.auto() + # A completely null index incapable of indexing or alignment. + EMPTY = enum.auto() diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 222df069f6..e653ee7288 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -27,3 +27,7 @@ class CleanupFailedWarning(Warning): class NoDefaultIndexError(ValueError): """Unable to create a default index.""" + + +class NullIndexError(ValueError): + """Object has no index.""" diff --git a/bigframes/series.py b/bigframes/series.py index 3986d38445..50a736be25 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -31,6 +31,7 @@ import pandas.core.dtypes.common import typing_extensions +import bigframes._config.display_options as display_options import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter @@ -288,7 +289,17 @@ def __repr__(self) -> str: pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) self._set_internal_query_job(query_job) - return repr(pandas_df.iloc[:, 0]) + pd_series = pandas_df.iloc[:, 0] + + with display_options.pandas_repr(opts): + import pandas.io.formats + + options = pandas.io.formats.format.get_series_repr_params() # type: ignore + if len(self._block.index_columns) == 0: + options.update({"index": False}) + repr_string = pd_series.to_string(**options) + + return repr_string def astype( self, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7c7d93541c..dbbdfde35c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -79,6 +79,7 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile +import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order @@ -813,6 +814,11 @@ def _read_gbq_table( # Create Block & default index if len(index_cols) == 0 # ---------------------------------------------------- + if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: + sequential_index_col = bigframes.core.guid.generate_guid("index_") + array_value = array_value.promote_offsets(sequential_index_col) + index_cols = [sequential_index_col] + value_columns = [col for col in array_value.column_ids if col not in index_cols] block = blocks.Block( array_value, diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 29d5a5567f..013181d8a5 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -250,10 +250,8 @@ def get_index_cols_and_uniqueness( if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: # User has explicity asked for a default, sequential index. # Use that, even if there are primary keys on the table. - # - # Note: This relies on the default behavior of the Block - # constructor to create a default sequential index. If that ever - # changes, this logic will need to be revisited. + return [], False + if index_col == bigframes.enums.DefaultIndexKind.EMPTY: return [], False else: # Note: It's actually quite difficult to mock this out to unit diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4ebb3cb93b..d12fc5cc13 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -391,6 +391,16 @@ def scalars_df_index( return session.read_gbq(scalars_table_id, index_col="rowindex") +@pytest.fixture(scope="session") +def scalars_df_empty_index( + scalars_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq( + scalars_table_id, index_col=bigframes.enums.DefaultIndexKind.EMPTY + ).sort_values("rowindex") + + @pytest.fixture(scope="session") def scalars_df_2_default_index( scalars_df_2_index: bigframes.dataframe.DataFrame, diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py new file mode 100644 index 0000000000..a6946aadf8 --- /dev/null +++ b/tests/system/small/test_empty_index.py @@ -0,0 +1,166 @@ +import pandas as pd +import pytest + +import bigframes.exceptions +import bigframes.pandas as bpd + + +def test_empty_index_materialize( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index.to_pandas() + pd.testing.assert_frame_equal( + bf_result, scalars_pandas_df_default_index, check_index_type=False + ) + + +def test_empty_index_series_repr( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index["int64_too"].head(5).__repr__() + pd_result = ( + scalars_pandas_df_default_index["int64_too"] + .head(5) + .to_string(dtype=True, index=False, name=True, length=True) + ) + assert bf_result == pd_result + + +def test_empty_index_dataframe_repr( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index[["int64_too", "int64_col"]].head(5).__repr__() + pd_result = ( + scalars_pandas_df_default_index[["int64_too", "int64_col"]] + .head(5) + .to_string(index=False) + ) + assert bf_result == pd_result + "\n\n[5 rows x 2 columns]" + + +def test_empty_index_reset_index( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index.reset_index().to_pandas() + pd_result = scalars_pandas_df_default_index.reset_index(drop=True) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_empty_index_set_index(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = scalars_df_empty_index.set_index("int64_col").to_pandas() + pd_result = scalars_pandas_df_default_index.set_index("int64_col") + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_empty_index_concat(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = bpd.concat( + [scalars_df_empty_index, scalars_df_empty_index], axis=0 + ).to_pandas() + pd_result = pd.concat( + [scalars_pandas_df_default_index, scalars_pandas_df_default_index], axis=0 + ) + pd.testing.assert_frame_equal(bf_result, pd_result.reset_index(drop=True)) + + +def test_empty_index_aggregate(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = scalars_df_empty_index.count().to_pandas() + pd_result = scalars_pandas_df_default_index.count() + + pd_result.index = pd_result.index.astype("string[pyarrow]") + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_empty_index_groupby_aggregate( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index.groupby("int64_col").count().to_pandas() + pd_result = scalars_pandas_df_default_index.groupby("int64_col").count() + + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_empty_index_analytic(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = scalars_df_empty_index["int64_col"].cumsum().to_pandas() + pd_result = scalars_pandas_df_default_index["int64_col"].cumsum() + pd.testing.assert_series_equal(bf_result, pd_result.reset_index(drop=True)) + + +def test_empty_index_groupby_analytic( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_empty_index.groupby("bool_col")["int64_col"].cummax().to_pandas() + ) + pd_result = scalars_pandas_df_default_index.groupby("bool_col")[ + "int64_col" + ].cummax() + pd.testing.assert_series_equal( + bf_result, pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_empty_index_series_self_aligns( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_empty_index["int64_col"] + scalars_df_empty_index["int64_too"] + ) + pd_result = ( + scalars_pandas_df_default_index["int64_col"] + + scalars_pandas_df_default_index["int64_too"] + ) + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_empty_index_df_self_aligns( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_empty_index[["int64_col", "float64_col"]] + + scalars_df_empty_index[["int64_col", "float64_col"]] + ) + pd_result = ( + scalars_pandas_df_default_index[["int64_col", "float64_col"]] + + scalars_pandas_df_default_index[["int64_col", "float64_col"]] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_empty_index_align_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = ( + scalars_df_empty_index["int64_col"] + + scalars_df_empty_index["int64_col"].cumsum() + ) + + +def test_empty_index_loc_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_empty_index["int64_col"].loc[1] + + +def test_empty_index_at_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_empty_index["int64_col"].at[1] + + +def test_empty_index_idxmin_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_empty_index[["int64_col", "int64_too"]].idxmin() + + +def test_empty_index_index_property(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = scalars_df_empty_index.index + + +def test_empty_index_transpose(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = scalars_df_empty_index.T From 9a5b2120c9c1ba8a3e65588ed8cce3157523d26b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 8 May 2024 01:26:23 +0000 Subject: [PATCH 02/17] fixes for kurt, skew, median --- bigframes/core/block_transforms.py | 16 ++++++++++------ bigframes/dataframe.py | 15 +++++++-------- bigframes/session/__init__.py | 5 ++++- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index d628daf9b8..e015b641f8 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -600,9 +600,11 @@ def skew( block = block.select_columns(skew_ids).with_column_labels(column_labels) if not grouping_column_ids: - # When ungrouped, stack everything into single column so can be returned as series - block = block.stack() - block = block.drop_levels([block.index_columns[0]]) + # When ungrouped, transpose result row into a series + # perform transpose last, so as to not invalidate cache + block, index_col = block.create_constant(None, None) + block = block.set_index([index_col]) + return block.transpose(original_row_index=pd.Index([None])) return block @@ -640,9 +642,11 @@ def kurt( block = block.select_columns(kurt_ids).with_column_labels(column_labels) if not grouping_column_ids: - # When ungrouped, stack everything into single column so can be returned as series - block = block.stack() - block = block.drop_levels([block.index_columns[0]]) + # When ungrouped, transpose result row into a series + # perform transpose last, so as to not invalidate cache + block, index_col = block.create_constant(None, None) + block = block.set_index([index_col]) + return block.transpose(original_row_index=pd.Index([None])) return block diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a543a3d649..e933132398 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2063,16 +2063,15 @@ def quantile( frame._block, frame._block.value_columns, qs=tuple(q) if multi_q else (q,) # type: ignore ) if multi_q: - return DataFrame(result.stack()).droplevel(0) + return DataFrame(result.stack()) # .droplevel(0) else: - result_df = ( - DataFrame(result) - .stack(list(range(0, frame.columns.nlevels))) - .droplevel(0) + # Drop the last level, which contains q, unnecessary since only one q + result = result.with_column_labels(result.column_labels.droplevel(-1)) + result, index_col = result.create_constant(q, None) + result = result.set_index([index_col]) + return bigframes.series.Series( + result.transpose(original_row_index=pandas.Index([q])) ) - result_series = bigframes.series.Series(result_df._block) - result_series.name = q - return result_series def std( self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index dbbdfde35c..0a10ff7d5a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -322,6 +322,9 @@ def read_gbq( elif col_order: columns = col_order + if index_col == (): + index_col = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + filters = list(filters) if len(filters) != 0 or _is_table_with_wildcard_suffix(query_or_table): # TODO(b/338111344): This appears to be missing index_cols, which @@ -824,7 +827,7 @@ def _read_gbq_table( array_value, index_columns=index_cols, column_labels=value_columns, - index_labels=index_cols, + index_labels=index_cols if index_cols else pandas.Index([None]), ) if max_results: block = block.slice(stop=max_results) From 0248150db9d86dc760d7d9d54dbb9a65a7bc5d25 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 8 May 2024 17:51:42 +0000 Subject: [PATCH 03/17] fix unit tests --- bigframes/session/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0a10ff7d5a..1f0845beff 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -27,6 +27,7 @@ Any, Callable, Dict, + Hashable, IO, Iterable, List, @@ -322,9 +323,6 @@ def read_gbq( elif col_order: columns = col_order - if index_col == (): - index_col = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 - filters = list(filters) if len(filters) != 0 or _is_table_with_wildcard_suffix(query_or_table): # TODO(b/338111344): This appears to be missing index_cols, which @@ -817,17 +815,19 @@ def _read_gbq_table( # Create Block & default index if len(index_cols) == 0 # ---------------------------------------------------- + index_names: Sequence[Hashable] = index_cols if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: sequential_index_col = bigframes.core.guid.generate_guid("index_") array_value = array_value.promote_offsets(sequential_index_col) index_cols = [sequential_index_col] + index_names = [None] value_columns = [col for col in array_value.column_ids if col not in index_cols] block = blocks.Block( array_value, index_columns=index_cols, column_labels=value_columns, - index_labels=index_cols if index_cols else pandas.Index([None]), + index_labels=index_names, ) if max_results: block = block.slice(stop=max_results) From 16e292ba7c07742818a310367b3184fb67cb31a6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 8 May 2024 23:39:02 +0000 Subject: [PATCH 04/17] fix more issues --- bigframes/series.py | 10 +++++++--- bigframes/session/__init__.py | 5 ++++- tests/system/small/test_empty_index.py | 2 +- tests/system/small/test_series.py | 4 ++-- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 86729283fd..329ba74a07 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -978,9 +978,13 @@ def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> Union[Series, floa qs = tuple(q) if utils.is_list_like(q) else (q,) result = block_ops.quantile(self._block, (self._value_column,), qs=qs) if utils.is_list_like(q): - result = result.stack() - result = result.drop_levels([result.index_columns[0]]) - return Series(result) + # Drop the first level, since only one column + result = result.with_column_labels(result.column_labels.droplevel(0)) + result, index_col = result.create_constant(self.name, None) + result = result.set_index([index_col]) + return Series( + result.transpose(original_row_index=pandas.Index([self.name])) + ) else: return cast(float, Series(result).to_pandas().squeeze()) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 1f0845beff..9453518f85 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -812,9 +812,12 @@ def _read_gbq_table( ) # ---------------------------------------------------- - # Create Block & default index if len(index_cols) == 0 + # Create Default Index if DefaultIndexKind provided, or no index provided # ---------------------------------------------------- + if not index_col: + index_col = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + index_names: Sequence[Hashable] = index_cols if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: sequential_index_col = bigframes.core.guid.generate_guid("index_") diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py index a6946aadf8..27ae501904 100644 --- a/tests/system/small/test_empty_index.py +++ b/tests/system/small/test_empty_index.py @@ -21,7 +21,7 @@ def test_empty_index_series_repr( pd_result = ( scalars_pandas_df_default_index["int64_too"] .head(5) - .to_string(dtype=True, index=False, name=True, length=True) + .to_string(dtype=True, index=False, name=True) ) assert bf_result == pd_result diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 38aed19f05..f32815f64d 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1429,13 +1429,13 @@ def test_numeric_literal(scalars_dfs): assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) -def test_repr(scalars_dfs): +def test_series_small_repr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" bf_series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name] - assert repr(bf_series) == repr(pd_series) + assert repr(bf_series) == pd_series.to_string(length=True, dtype=True, name=True) def test_sum(scalars_dfs): From 5611a86a9f781701107d80c93a851388833700f5 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 8 May 2024 23:49:10 +0000 Subject: [PATCH 05/17] fix defaulting to primary key logic --- bigframes/session/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9453518f85..e0070d0a4e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -812,10 +812,10 @@ def _read_gbq_table( ) # ---------------------------------------------------- - # Create Default Index if DefaultIndexKind provided, or no index provided + # Create Default Sequential Index if still have no index # ---------------------------------------------------- - if not index_col: + if not index_col and len(index_cols) == 0: index_col = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 index_names: Sequence[Hashable] = index_cols From ea9b120fc15ec83a79aa115f50394f9935d04899 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 9 May 2024 18:07:21 +0000 Subject: [PATCH 06/17] fix tests --- tests/system/small/test_empty_index.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py index 27ae501904..ce72020878 100644 --- a/tests/system/small/test_empty_index.py +++ b/tests/system/small/test_empty_index.py @@ -21,7 +21,7 @@ def test_empty_index_series_repr( pd_result = ( scalars_pandas_df_default_index["int64_too"] .head(5) - .to_string(dtype=True, index=False, name=True) + .to_string(dtype=True, index=False, length=True, name=True) ) assert bf_result == pd_result @@ -85,7 +85,9 @@ def test_empty_index_groupby_aggregate( def test_empty_index_analytic(scalars_df_empty_index, scalars_pandas_df_default_index): bf_result = scalars_df_empty_index["int64_col"].cumsum().to_pandas() pd_result = scalars_pandas_df_default_index["int64_col"].cumsum() - pd.testing.assert_series_equal(bf_result, pd_result.reset_index(drop=True)) + pd.testing.assert_series_equal( + bf_result, pd_result.reset_index(drop=True), check_dtype=False + ) def test_empty_index_groupby_analytic( From 27d6f47ec5e36d6dca1aa7b9ae7f326a4bd662dd Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 15 May 2024 20:45:13 +0000 Subject: [PATCH 07/17] many small changes --- bigframes/core/__init__.py | 2 +- bigframes/core/block_transforms.py | 2 +- bigframes/core/blocks.py | 17 ++++++++----- bigframes/core/indexes/base.py | 6 ++++- bigframes/core/rewrite.py | 4 +-- bigframes/dataframe.py | 25 +++++++++++++------ bigframes/enums.py | 2 +- bigframes/operations/base.py | 16 ++++++++++++ bigframes/series.py | 24 +++++++++++++++--- bigframes/session/__init__.py | 5 +++- .../session/_io/bigquery/read_gbq_table.py | 2 +- tests/system/conftest.py | 2 +- tests/system/small/test_empty_index.py | 15 +++++++++++ 13 files changed, 97 insertions(+), 25 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 33fb8d754d..0a2936419f 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -464,7 +464,7 @@ def try_align_as_projection( ) -> typing.Optional[ArrayValue]: left_side = bigframes.core.rewrite.SquashedSelect.from_node(self.node) right_side = bigframes.core.rewrite.SquashedSelect.from_node(other.node) - result = left_side.merge(right_side, join_type, mappings) + result = left_side.maybe_merge(right_side, join_type, mappings) if result is not None: return ArrayValue(result.expand()) return None diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 4b4a1c08b7..eaee2e2cc0 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -824,7 +824,7 @@ def idxmax(block: blocks.Block) -> blocks.Block: def _idx_extrema( block: blocks.Block, min_or_max: typing.Literal["min", "max"] ) -> blocks.Block: - block._null_index_guard() + block._throw_if_null_index("idx") if len(block.index_columns) > 1: # TODO: Need support for tuple dtype raise NotImplementedError( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f179c9a197..c8555a7b10 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -51,6 +51,7 @@ import bigframes.core.utils as utils import bigframes.core.window_spec as window_specs import bigframes.dtypes +import bigframes.exceptions import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -119,6 +120,10 @@ def __init__( raise ValueError( f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) + if len(index_columns) == 0: + raise bigframes.exceptions.PreviewWarning( + "Creating object with Null Index. This feature is in preview." + ) self._index_columns = tuple(index_columns) # Index labels don't need complicated hierarchical access so can store as tuple @@ -504,7 +509,7 @@ def _copy_index_to_pandas(self, df: pd.DataFrame): Warning: This method modifies ``df`` inplace. """ - # Note: If BigQuery DataFrame has empty index, a default one will be created for the local materialization. + # Note: If BigQuery DataFrame has null index, a default one will be created for the local materialization. if len(self.index_columns) > 0: df.set_index(list(self.index_columns), inplace=True) # Pandas names is annotated as list[str] rather than the more @@ -1920,8 +1925,8 @@ def join( ): return join_indexless(self, other, how=how) - self._null_index_guard() - other._null_index_guard() + self._throw_if_null_index("join") + other._throw_if_null_index("join") if self.index.nlevels == other.index.nlevels == 1: return join_mono_indexed( self, other, how=how, sort=sort, block_identity_join=block_identity_join @@ -2070,10 +2075,10 @@ def _is_monotonic( self._stats_cache[column_name].update({op_name: result}) return result - def _null_index_guard(self): + def _throw_if_null_index(self, opname: str): if len(self.index_columns) == 0: raise bigframes.exceptions.NullIndexError( - "Cannot perform this operation without an index. Set an index using set_index." + f"Cannot do {opname} without an index. Set an index using set_index." ) def _get_rows_as_json_values(self) -> Block: @@ -2218,7 +2223,7 @@ def to_pandas(self) -> pd.Index: """Executes deferred operations and downloads the results.""" if len(self.column_ids) == 0: raise bigframes.exceptions.NullIndexError( - "Cannot perform this operation without an index. Set an index using set_index." + "Cannot materialize index, as this object does not have an index. Set index column(s) using set_index." ) # Project down to only the index column. So the query can be cached to visualize other data. index_columns = list(self._block.index_columns) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index ca39abd755..4739d6082a 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -100,7 +100,11 @@ def __new__( def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - frame._block._null_index_guard() + if len(frame._block.index_columns) == 0: + raise bigframes.exceptions.NullIndexError( + "Cannot access index properties with Null Index. Set an index using set_index." + ) + frame._block._throw_if_null_index("from_frame") index = Index(frame._block) index._linked_frame = frame return index diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 7e4a3c748b..8ac18d3d9c 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -114,7 +114,7 @@ def can_join(self, right: SquashedSelect, join_def: join_defs.JoinDefinition): return False return True - def merge( + def maybe_merge( self, right: SquashedSelect, join_type: join_defs.JoinType, @@ -207,7 +207,7 @@ def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: left_side = SquashedSelect.from_node(join_node.left_child) right_side = SquashedSelect.from_node(join_node.right_child) if left_side.can_join(right_side, join_node.join): - merged = left_side.merge( + merged = left_side.maybe_merge( right_side, join_node.join.type, join_node.join.mappings ) assert merged is not None diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index bcd2fd75fe..6ec1aa726b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -92,7 +92,7 @@ def requires_index(meth): @functools.wraps(meth) def guarded_meth(df: DataFrame, *args, **kwargs): - df._null_index_guard() + df._throw_if_null_index(meth.__name__) return meth(df, *args, **kwargs) return guarded_meth @@ -255,6 +255,7 @@ def _sql_names( return results @property + @requires_index def index( self, ) -> indexes.Index: @@ -633,12 +634,13 @@ def __repr__(self) -> str: with display_options.pandas_repr(opts): import pandas.io.formats - options = ( + # safe to mutate this, this dict is owned by this code, and does not affect global config + to_string_kwargs = ( pandas.io.formats.format.get_dataframe_repr_params() # type: ignore ) if not self._has_index: - options.update({"index": False}) - repr_string = pandas_df.to_string(**options) + to_string_kwargs.update({"index": False}) + repr_string = pandas_df.to_string(**to_string_kwargs) # Modify the end of the string to reflect count. lines = repr_string.split("\n") @@ -1631,6 +1633,7 @@ def set_index( col_ids_strs: List[str] = [col_id for col_id in col_ids if col_id is not None] return DataFrame(self._block.set_index(col_ids_strs, append=append, drop=drop)) + @requires_index def sort_index( self, ascending: bool = True, na_position: Literal["first", "last"] = "last" ) -> DataFrame: @@ -1831,6 +1834,7 @@ def reindex( if columns is not None: return self._reindex_columns(columns) + @requires_index def _reindex_rows( self, index, @@ -1877,9 +1881,11 @@ def _reindex_columns(self, columns): result_df.columns = new_column_index return result_df + @requires_index def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None): return self.reindex(index=other.index, columns=other.columns, validate=validate) + @requires_index def interpolate(self, method: str = "linear") -> DataFrame: if method == "pad": return self.ffill() @@ -2069,7 +2075,7 @@ def quantile( frame._block, frame._block.value_columns, qs=tuple(q) if multi_q else (q,) # type: ignore ) if multi_q: - return DataFrame(result.stack()) # .droplevel(0) + return DataFrame(result.stack()) else: # Drop the last level, which contains q, unnecessary since only one q result = result.with_column_labels(result.column_labels.droplevel(-1)) @@ -2172,9 +2178,11 @@ def agg( aggregate = agg aggregate.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.agg) + @requires_index def idxmin(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmin(self._block)) + @requires_index def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) @@ -2281,6 +2289,7 @@ def _pivot( ) return DataFrame(pivot_block) + @requires_index def pivot( self, *, @@ -2294,6 +2303,7 @@ def pivot( ) -> DataFrame: return self._pivot(columns=columns, index=index, values=values) + @requires_index def pivot_table( self, values: typing.Optional[ @@ -2639,6 +2649,7 @@ def groupby( else: raise TypeError("You have to supply one of 'by' and 'level'") + @requires_index def _groupby_level( self, level: LevelsType, @@ -3605,8 +3616,8 @@ def __matmul__(self, other) -> DataFrame: __matmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__matmul__) - def _null_index_guard(self): + def _throw_if_null_index(self, opname: str): if not self._has_index: raise bigframes.exceptions.NullIndexError( - "DataFrame cannot perform this operation as it has no index. Set an index using set_index." + f"DataFrame cannot perform {opname} as it has no index. Set an index using set_index." ) diff --git a/bigframes/enums.py b/bigframes/enums.py index 84f4b70406..9501d3f13e 100644 --- a/bigframes/enums.py +++ b/bigframes/enums.py @@ -28,4 +28,4 @@ class DefaultIndexKind(enum.Enum): #: the index. SEQUENTIAL_INT64 = enum.auto() # A completely null index incapable of indexing or alignment. - EMPTY = enum.auto() + NULL = enum.auto() diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 75d14f3fbc..49ef7f76ee 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,6 +14,7 @@ from __future__ import annotations +import functools import typing from typing import List, Sequence @@ -34,6 +35,15 @@ import bigframes.session +def requires_index(meth): + @functools.wraps(meth) + def guarded_meth(df: SeriesMethods, *args, **kwargs): + df._throw_if_null_index(meth.__name__) + return meth(df, *args, **kwargs) + + return guarded_meth + + class SeriesMethods: def __init__( self, @@ -266,3 +276,9 @@ def _align_n( block, constant_col_id = block.create_constant(other, dtype=dtype) value_ids = [*value_ids, constant_col_id] return (value_ids, block) + + def _throw_if_null_index(self, opname: str): + if len(self._block.index_columns) == 0: + raise bigframes.exceptions.NullIndexError( + f"Series cannot perform {opname} as it has no index. Set an index using set_index." + ) diff --git a/bigframes/series.py b/bigframes/series.py index f4b5bf5e2c..360401c4a9 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -52,6 +52,7 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.base +from bigframes.operations.base import requires_index import bigframes.operations.datetimes as dt import bigframes.operations.plotting as plotting import bigframes.operations.strings as strings @@ -86,6 +87,7 @@ def dtypes(self): return self._dtype @property + @requires_index def loc(self) -> bigframes.core.indexers.LocSeriesIndexer: return bigframes.core.indexers.LocSeriesIndexer(self) @@ -98,6 +100,7 @@ def iat(self) -> bigframes.core.indexers.IatSeriesIndexer: return bigframes.core.indexers.IatSeriesIndexer(self) @property + @requires_index def at(self) -> bigframes.core.indexers.AtSeriesIndexer: return bigframes.core.indexers.AtSeriesIndexer(self) @@ -136,6 +139,7 @@ def values(self) -> numpy.ndarray: return self.to_numpy() @property + @requires_index def index(self) -> indexes.Index: return indexes.Index.from_frame(self) @@ -237,6 +241,7 @@ def rename( raise ValueError(f"Unsupported type of parameter index: {type(index)}") + @requires_index def rename_axis( self, mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], @@ -294,10 +299,11 @@ def __repr__(self) -> str: with display_options.pandas_repr(opts): import pandas.io.formats - options = pandas.io.formats.format.get_series_repr_params() # type: ignore + # safe to mutate this, this dict is owned by this code, and does not affect global config + to_string_kwargs = pandas.io.formats.format.get_series_repr_params() # type: ignore if len(self._block.index_columns) == 0: - options.update({"index": False}) - repr_string = pd_series.to_string(**options) + to_string_kwargs.update({"index": False}) + repr_string = pd_series.to_string(**to_string_kwargs) return repr_string @@ -390,10 +396,12 @@ def drop( block = block.drop_columns([condition_id]) return Series(block.select_column(self._value_column)) + @requires_index def droplevel(self, level: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(level) return Series(self._block.drop_levels(resolved_level_ids)) + @requires_index def swaplevel(self, i: int = -2, j: int = -1): level_i = self._block.index_columns[i] level_j = self._block.index_columns[j] @@ -403,6 +411,7 @@ def swaplevel(self, i: int = -2, j: int = -1): ] return Series(self._block.reorder_levels(reordering)) + @requires_index def reorder_levels(self, order: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(order) return Series(self._block.reorder_levels(resolved_level_ids)) @@ -581,6 +590,7 @@ def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]): ) return Series(block.select_column(result)) + @requires_index def interpolate(self, method: str = "linear") -> Series: if method == "pad": return self.ffill() @@ -1102,6 +1112,7 @@ def unstack(self, level: LevelsType = -1): ) return bigframes.dataframe.DataFrame(pivot_block) + @requires_index def idxmax(self) -> blocks.Label: block = self._block.order_by( [ @@ -1115,6 +1126,7 @@ def idxmax(self) -> blocks.Label: block = block.slice(0, 1) return indexes.Index(block).to_pandas()[0] + @requires_index def idxmin(self) -> blocks.Label: block = self._block.order_by( [ @@ -1224,6 +1236,7 @@ def sort_values( ) return Series(block) + @requires_index def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: # TODO(tbergeron): Support level parameter once multi-index introduced. if na_position not in ["first", "last"]: @@ -1284,6 +1297,7 @@ def groupby( else: raise TypeError("You have to supply one of 'by' and 'level'") + @requires_index def _groupby_level( self, level: int | str | typing.Sequence[int] | typing.Sequence[str], @@ -1421,9 +1435,11 @@ def combine( materialized_series = result_series._cached() return materialized_series + @requires_index def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) + @requires_index def add_suffix(self, suffix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_suffix(suffix)) @@ -1475,6 +1491,7 @@ def filter( else: raise ValueError("Need to provide 'items', 'like', or 'regex'") + @requires_index def reindex(self, index=None, *, validate: typing.Optional[bool] = None): if validate and not self.index.is_unique: raise ValueError("Original index must be unique to reindex") @@ -1503,6 +1520,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): )._block return Series(result_block) + @requires_index def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None): return self.reindex(other.index, validate=validate) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 18b91f3562..41912c3194 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -760,7 +760,10 @@ def _read_gbq_table( # Create Default Sequential Index if still have no index # ---------------------------------------------------- - if not index_col and len(index_cols) == 0: + # If no idnex columns provided or found, fall back to sequential index + if (index_col != bigframes.enums.DefaultIndexKind.NULL) and len( + index_cols + ) == 0: index_col = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 index_names: Sequence[Hashable] = index_cols diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 1962201281..00173f53cf 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -249,7 +249,7 @@ def get_index_cols_and_uniqueness( # User has explicity asked for a default, sequential index. # Use that, even if there are primary keys on the table. return [], False - if index_col == bigframes.enums.DefaultIndexKind.EMPTY: + if index_col == bigframes.enums.DefaultIndexKind.NULL: return [], False else: # Note: It's actually quite difficult to mock this out to unit diff --git a/tests/system/conftest.py b/tests/system/conftest.py index d12fc5cc13..6b91863205 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -397,7 +397,7 @@ def scalars_df_empty_index( ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" return session.read_gbq( - scalars_table_id, index_col=bigframes.enums.DefaultIndexKind.EMPTY + scalars_table_id, index_col=bigframes.enums.DefaultIndexKind.NULL ).sort_values("rowindex") diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py index ce72020878..86050fc4b4 100644 --- a/tests/system/small/test_empty_index.py +++ b/tests/system/small/test_empty_index.py @@ -1,3 +1,18 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import pandas as pd import pytest From 75b1fd1e5fc04a460a528f84b2570b9baef06092 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 16 May 2024 00:18:18 +0000 Subject: [PATCH 08/17] fix accidental null indexes and raising warning --- bigframes/core/blocks.py | 32 +++++++++++++++++--------- bigframes/dataframe.py | 1 + bigframes/series.py | 1 + tests/system/small/test_empty_index.py | 16 +++++++++++++ tests/system/small/test_series.py | 2 +- 5 files changed, 40 insertions(+), 12 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c8555a7b10..5b919f6b99 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -121,10 +121,10 @@ def __init__( f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) if len(index_columns) == 0: - raise bigframes.exceptions.PreviewWarning( - "Creating object with Null Index. This feature is in preview." + warnings.warn( + "Creating object with Null Index. Null Index is a preview feature.", + category=bigframes.exceptions.PreviewWarning, ) - self._index_columns = tuple(index_columns) # Index labels don't need complicated hierarchical access so can store as tuple self._index_labels = ( @@ -1086,16 +1086,25 @@ def aggregate( aggregate_labels = self._get_labels_for_columns( [agg[0] for agg in aggregations] ) + names: typing.List[Label] = [] - for by_col_id in by_column_ids: - if by_col_id in self.value_columns: - names.append(self.col_id_to_label[by_col_id]) - else: - names.append(self.col_id_to_index_name[by_col_id]) + if len(by_column_ids) == 0: + label_id = guid.generate_guid() + result_expr = result_expr.assign_constant(label_id, 0, pd.Int64Dtype()) + index_columns = (label_id,) + names = [None] + else: + index_columns = tuple(by_column_ids) # type: ignore + for by_col_id in by_column_ids: + if by_col_id in self.value_columns: + names.append(self.col_id_to_label[by_col_id]) + else: + names.append(self.col_id_to_index_name[by_col_id]) + return ( Block( result_expr, - index_columns=by_column_ids, + index_columns=index_columns, column_labels=aggregate_labels, index_labels=names, ), @@ -1248,11 +1257,12 @@ def explode( expr = self.expr.explode(column_ids) if ignore_index: + new_index_ids = guid.generate_guid() return Block( - expr.drop_columns(self.index_columns), + expr.drop_columns(self.index_columns).promote_offsets(new_index_ids), column_labels=self.column_labels, # Initiates default index creation using the block constructor. - index_columns=[], + index_columns=[new_index_ids], ) else: return Block( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6ec1aa726b..a6a8be7057 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2402,6 +2402,7 @@ def _stack_multi(self, level: LevelsType = -1): block = block.stack(levels=len(level)) return DataFrame(block) + @requires_index def unstack(self, level: LevelsType = -1): if not utils.is_list_like(level): level = [level] diff --git a/bigframes/series.py b/bigframes/series.py index 360401c4a9..0f92ff3f8c 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1089,6 +1089,7 @@ def argmin(self) -> int: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) + @requires_index def unstack(self, level: LevelsType = -1): if isinstance(level, int) or isinstance(level, str): level = [level] diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py index 86050fc4b4..c0bbb13a3e 100644 --- a/tests/system/small/test_empty_index.py +++ b/tests/system/small/test_empty_index.py @@ -119,6 +119,22 @@ def test_empty_index_groupby_analytic( ) +def test_empty_index_stack(scalars_df_empty_index, scalars_pandas_df_default_index): + stacking_cols = ["int64_col", "int64_too"] + bf_result = scalars_df_empty_index[stacking_cols].stack().to_pandas() + pd_result = ( + scalars_pandas_df_default_index[stacking_cols] + .stack(future_stack=True) + .droplevel(level=0, axis=0) + ) + pd_result.index = pd_result.index.astype(bf_result.index.dtype) + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + ) + + def test_empty_index_series_self_aligns( scalars_df_empty_index, scalars_pandas_df_default_index ): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index f816847823..09b0033cc0 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3683,10 +3683,10 @@ def test_series_explode_reserve_order(ignore_index, ordered): res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) + pd_res.index = pd_res.index.astype(pd.Int64Dtype()) pd.testing.assert_series_equal( res if ordered else res.sort_index(), pd_res, - check_index_type=False, ) From 7142078dc1afbef7ece16bd5f13ac8698753867b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 16 May 2024 17:29:02 +0000 Subject: [PATCH 09/17] fix df quantile index --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 57d45dd974..1ee4ad662e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2076,7 +2076,7 @@ def quantile( frame._block, frame._block.value_columns, qs=tuple(q) if multi_q else (q,) # type: ignore ) if multi_q: - return DataFrame(result.stack()) + return DataFrame(result.stack()).droplevel(0) else: # Drop the last level, which contains q, unnecessary since only one q result = result.with_column_labels(result.column_labels.droplevel(-1)) From bc28bd4053eef37d4a875472bd1bf6269d8770e5 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 17 May 2024 02:06:22 +0000 Subject: [PATCH 10/17] disable legacy pandas for some tests, add concat test --- tests/system/small/test_empty_index.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py index c0bbb13a3e..63d6f4ab81 100644 --- a/tests/system/small/test_empty_index.py +++ b/tests/system/small/test_empty_index.py @@ -18,6 +18,7 @@ import bigframes.exceptions import bigframes.pandas as bpd +from tests.system.utils import skip_legacy_pandas def test_empty_index_materialize( @@ -97,6 +98,7 @@ def test_empty_index_groupby_aggregate( pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) +@skip_legacy_pandas def test_empty_index_analytic(scalars_df_empty_index, scalars_pandas_df_default_index): bf_result = scalars_df_empty_index["int64_col"].cumsum().to_pandas() pd_result = scalars_pandas_df_default_index["int64_col"].cumsum() @@ -119,6 +121,7 @@ def test_empty_index_groupby_analytic( ) +@skip_legacy_pandas def test_empty_index_stack(scalars_df_empty_index, scalars_pandas_df_default_index): stacking_cols = ["int64_col", "int64_too"] bf_result = scalars_df_empty_index[stacking_cols].stack().to_pandas() @@ -166,6 +169,16 @@ def test_empty_index_df_self_aligns( ) +def test_empty_index_df_concat(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = bpd.concat([scalars_df_empty_index, scalars_df_empty_index]) + pd_result = pd.concat( + [scalars_pandas_df_default_index, scalars_pandas_df_default_index] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + def test_empty_index_align_error(scalars_df_empty_index): with pytest.raises(bigframes.exceptions.NullIndexError): _ = ( From bd0aa12935e0a33a668847b6257359f89869b850 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 17 May 2024 16:59:47 +0000 Subject: [PATCH 11/17] fix series repr --- bigframes/series.py | 14 ++++++-------- tests/system/small/test_empty_index.py | 2 +- tests/system/small/test_series.py | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 0f92ff3f8c..4595164e80 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -31,7 +31,6 @@ import pandas.core.dtypes.common import typing_extensions -import bigframes._config.display_options as display_options import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter @@ -296,14 +295,13 @@ def __repr__(self) -> str: pd_series = pandas_df.iloc[:, 0] - with display_options.pandas_repr(opts): - import pandas.io.formats + import pandas.io.formats - # safe to mutate this, this dict is owned by this code, and does not affect global config - to_string_kwargs = pandas.io.formats.format.get_series_repr_params() # type: ignore - if len(self._block.index_columns) == 0: - to_string_kwargs.update({"index": False}) - repr_string = pd_series.to_string(**to_string_kwargs) + # safe to mutate this, this dict is owned by this code, and does not affect global config + to_string_kwargs = pandas.io.formats.format.get_series_repr_params() # type: ignore + if len(self._block.index_columns) == 0: + to_string_kwargs.update({"index": False}) + repr_string = pd_series.to_string(**to_string_kwargs) return repr_string diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py index 63d6f4ab81..7a1715e3d1 100644 --- a/tests/system/small/test_empty_index.py +++ b/tests/system/small/test_empty_index.py @@ -37,7 +37,7 @@ def test_empty_index_series_repr( pd_result = ( scalars_pandas_df_default_index["int64_too"] .head(5) - .to_string(dtype=True, index=False, length=True, name=True) + .to_string(dtype=True, index=False, length=False, name=True) ) assert bf_result == pd_result diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 44a240d89a..dbc8ddec6f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1436,7 +1436,7 @@ def test_series_small_repr(scalars_dfs): col_name = "int64_col" bf_series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name] - assert repr(bf_series) == pd_series.to_string(length=True, dtype=True, name=True) + assert repr(bf_series) == pd_series.to_string(length=False, dtype=True, name=True) def test_sum(scalars_dfs): From 5efcc277ee6b466416ef30b8ba2cf664f38a236d Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 17 May 2024 11:15:27 -0700 Subject: [PATCH 12/17] Update bigframes/session/__init__.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 09f971dd69..834ebaae13 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -805,7 +805,7 @@ def _read_gbq_table( # Create Default Sequential Index if still have no index # ---------------------------------------------------- - # If no idnex columns provided or found, fall back to sequential index + # If no index columns provided or found, fall back to sequential index if (index_col != bigframes.enums.DefaultIndexKind.NULL) and len( index_cols ) == 0: From 4b487e7ae41927e44312d7b015a33d594b00a0cc Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 17 May 2024 11:15:57 -0700 Subject: [PATCH 13/17] Update bigframes/core/rewrite.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/core/rewrite.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 8ac18d3d9c..40adbd05af 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -210,7 +210,8 @@ def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: merged = left_side.maybe_merge( right_side, join_node.join.type, join_node.join.mappings ) - assert merged is not None + import bigframes.constants + assert (merged is not None), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." return merged.expand() else: return join_node From 38922413f23fb9e4fcb0e42096da466d552adbee Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 17 May 2024 11:16:10 -0700 Subject: [PATCH 14/17] Update bigframes/core/rewrite.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/core/rewrite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 40adbd05af..28af5f04fd 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -98,7 +98,7 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): self.root, self.columns, self.predicate, new_ordering, self.reverse_root ) - def can_join(self, right: SquashedSelect, join_def: join_defs.JoinDefinition): + def can_join(self, right: SquashedSelect, join_def: join_defs.JoinDefinition) -> bool: if join_def.type == "cross": # Cannot convert cross join to projection return False From 09af424a767609744ecb9f2c5d1a3a33f1aeae2f Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Fri, 17 May 2024 18:17:50 +0000 Subject: [PATCH 15/17] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/core/rewrite.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 28af5f04fd..848730a1bb 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -98,7 +98,9 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): self.root, self.columns, self.predicate, new_ordering, self.reverse_root ) - def can_join(self, right: SquashedSelect, join_def: join_defs.JoinDefinition) -> bool: + def can_join( + self, right: SquashedSelect, join_def: join_defs.JoinDefinition + ) -> bool: if join_def.type == "cross": # Cannot convert cross join to projection return False @@ -211,7 +213,10 @@ def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: right_side, join_node.join.type, join_node.join.mappings ) import bigframes.constants - assert (merged is not None), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." + + assert ( + merged is not None + ), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." return merged.expand() else: return join_node From 1164faffdcc17626d162984e706d983850a7fa67 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Fri, 17 May 2024 18:18:11 +0000 Subject: [PATCH 16/17] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/core/rewrite.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 28af5f04fd..848730a1bb 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -98,7 +98,9 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): self.root, self.columns, self.predicate, new_ordering, self.reverse_root ) - def can_join(self, right: SquashedSelect, join_def: join_defs.JoinDefinition) -> bool: + def can_join( + self, right: SquashedSelect, join_def: join_defs.JoinDefinition + ) -> bool: if join_def.type == "cross": # Cannot convert cross join to projection return False @@ -211,7 +213,10 @@ def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: right_side, join_node.join.type, join_node.join.mappings ) import bigframes.constants - assert (merged is not None), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." + + assert ( + merged is not None + ), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." return merged.expand() else: return join_node From 600d500a13ff4e0fd33a75c90cadc3dc6b04179e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 17 May 2024 19:50:02 +0000 Subject: [PATCH 17/17] pr comments addressed --- bigframes/core/blocks.py | 22 +++++++++++++++++++--- bigframes/core/rewrite.py | 2 -- bigframes/dataframe.py | 1 + 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index a08a0a2811..0bbb8a0b61 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1912,9 +1912,25 @@ def join( other: Block, *, how="left", - sort=False, + sort: bool = False, block_identity_join: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: + """ + Join two blocks objects together, and provide mappings between source columns and output columns. + + Args: + other (Block): + The right operand of the join operation + how (str): + Describes the join type. 'inner', 'outer', 'left', or 'right' + sort (bool): + if true will sort result by index + block_identity_join (bool): + If true, will not convert join to a projection (implicitly assuming unique indices) + + Returns: + Block, (left_mapping, right_mapping): Result block and mappers from input column ids to result column ids. + """ if not isinstance(other, Block): # TODO(swast): We need to improve this error message to be more @@ -1932,8 +1948,8 @@ def join( # Special case for null index, if ( (self.index.nlevels == other.index.nlevels == 0) - and (sort is False) - and (block_identity_join is False) + and not sort + and not block_identity_join ): return join_indexless(self, other, how=how) diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 848730a1bb..15999c0558 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -212,8 +212,6 @@ def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: merged = left_side.maybe_merge( right_side, join_node.join.type, join_node.join.mappings ) - import bigframes.constants - assert ( merged is not None ), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1ee4ad662e..105588de2f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1359,6 +1359,7 @@ def drop( block = self._block if index is not None: + self._throw_if_null_index("drop(axis=0)") level_id = self._resolve_levels(level or 0)[0] if utils.is_list_like(index):