From e95874fbcee11f4b694231967ac6d5812a625151 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 8 Apr 2024 23:06:16 +0000 Subject: [PATCH 1/5] feat: Add MultiIndex subclass. --- bigframes/core/indexes/__init__.py | 3 +- bigframes/core/indexes/base.py | 79 ++++++++++------- bigframes/pandas/__init__.py | 2 + tests/system/small/test_multiindex.py | 25 ++++++ .../pandas/core/indexes/multi.py | 88 +++++++++++++++++++ 5 files changed, 166 insertions(+), 31 deletions(-) create mode 100644 third_party/bigframes_vendored/pandas/core/indexes/multi.py diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index ae6011ffa5..6afb9ca118 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigframes.core.indexes.base import Index +from bigframes.core.indexes.base import Index, MultiIndex __all__ = [ "Index", + "MultiIndex", ] diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index daa52a02b9..0fada85497 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -17,9 +17,10 @@ from __future__ import annotations import typing -from typing import Hashable, Optional, Sequence, Union +from typing import Hashable, Iterable, Optional, Sequence, Union import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index +import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import google.cloud.bigquery as bigquery import numpy as np import pandas @@ -42,14 +43,13 @@ class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ + _query_job = None + _block: blocks.Block + _linked_frame: Union[bigframes.dataframe.DataFrame, bigframes.series.Series] - def __init__( - self, - data=None, - dtype=None, - *, - name=None, - session=None, + # Overrided on __new__ to create subclasses like 
python does + def __new__( + self, data=None, dtype=None, *, name=None, session=None, linked_frame=None ): import bigframes.dataframe as df import bigframes.series as series @@ -73,18 +73,30 @@ def __init__( if dtype is not None: index = index.astype(dtype) block = index._block + elif isinstance(data, pandas.Index): + pd_df = pandas.DataFrame(index=data) + block = df.DataFrame(pd_df, session=session)._block else: pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) block = df.DataFrame(pd_df, session=session)._block - self._query_job = None - self._block: blocks.Block = block + + # TODO: Support more index subtypes + if len(block._index_columns) > 1: + klass = MultiIndex + else: + klass = Index + result = typing.cast(Index, object.__new__(klass)) + result._query_job = None + result._block = block + result._linked_frame = linked_frame + return result @classmethod def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - return FrameIndex(frame) + return Index(frame, linked_frame=frame) @property def name(self) -> blocks.Label: @@ -107,6 +119,10 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): new_block = self._block.with_index_labels(values) + if self._linked_frame: + self._linked_frame._set_block( + self._linked_frame._block.with_index_labels(values) + ) self._block = new_block @property @@ -454,24 +470,27 @@ def __len__(self): return self.shape[0] -# Index that mutates the originating dataframe/series -class FrameIndex(Index): - def __init__( - self, - series_or_dataframe: typing.Union[ - bigframes.series.Series, bigframes.dataframe.DataFrame - ], - ): - super().__init__(series_or_dataframe._block) - self._whole_frame = series_or_dataframe +class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): + __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ - @property - def names(self) -> 
typing.Sequence[blocks.Label]: - """Returns the names of the Index.""" - return self._block._index_labels + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return Index(pd_index) - @names.setter - def names(self, values: typing.Sequence[blocks.Label]): - new_block = self._whole_frame._get_block().with_index_labels(values) - self._whole_frame._set_block(new_block) - self._block = new_block + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return Index(pd_index) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 4b0ac4310c..f5be4421e4 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -707,6 +707,7 @@ def to_datetime( # checking and docstrings. 
DataFrame = bigframes.dataframe.DataFrame Index = bigframes.core.indexes.Index +MultiIndex = bigframes.core.indexes.MultiIndex Series = bigframes.series.Series # Other public pandas attributes @@ -760,6 +761,7 @@ def to_datetime( # Class aliases "DataFrame", "Index", + "MultiIndex", "Series", # Other public pandas attributes "NamedAgg", diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 330fe44eb8..bb0af52976 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -20,6 +20,31 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_multi_index_from_arrays(): + bf_idx = bpd.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + pd_idx = pandas.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + assert bf_idx.names == pd_idx.names + pandas.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + + @skip_legacy_pandas def test_read_pandas_multi_index_axes(): index = pandas.MultiIndex.from_arrays( diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py new file mode 100644 index 0000000000..a882aa40e3 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -0,0 +1,88 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/multi.py +from __future__ import annotations + +from typing import Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.base + +from bigframes import constants + + +class 
MultiIndex(bigframes_vendored.pandas.core.indexes.base.Index): + """ + A multi-level, or hierarchical, index object for pandas objects. + """ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + """ + Convert list of tuples to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> tuples = [(1, 'red'), (1, 'blue'), + ... (2, 'red'), (2, 'blue')] + >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + tuples (list / sequence of tuple-likes): + Each tuple is the index of one row/column. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. + + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + """ + Convert arrays to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + arrays (list / sequence of array-likes): + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. 
+ + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 658510c42346cf5cc8f9bdc582658fd1707450f5 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 8 Apr 2024 23:38:36 +0000 Subject: [PATCH 2/5] fix test error --- bigframes/core/indexes/base.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 0fada85497..3e0d3ca776 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -45,11 +45,19 @@ class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ _query_job = None _block: blocks.Block - _linked_frame: Union[bigframes.dataframe.DataFrame, bigframes.series.Series] + _linked_frame: Union[bigframes.dataframe.DataFrame, bigframes.series.Series, None] # Overrided on __new__ to create subclasses like python does def __new__( - self, data=None, dtype=None, *, name=None, session=None, linked_frame=None + self, + data=None, + dtype=None, + *, + name=None, + session=None, + linked_frame: Union[ + bigframes.dataframe.DataFrame, bigframes.series.Series, None + ] = None, ): import bigframes.dataframe as df import bigframes.series as series @@ -96,7 +104,7 @@ def __new__( def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - return Index(frame, linked_frame=frame) + return Index(frame._block, linked_frame=frame) @property def name(self) -> blocks.Label: @@ -119,7 +127,7 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): new_block = self._block.with_index_labels(values) - if self._linked_frame: + if self._linked_frame is not None: self._linked_frame._set_block( self._linked_frame._block.with_index_labels(values) ) From 46b2b5862d39ddcf2f10e90c1ff11bd3c7142ac4 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 9 Apr 2024 00:16:36 +0000 
Subject: [PATCH 3/5] resolve mypy issues --- bigframes/core/indexes/base.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 3e0d3ca776..5ccd0f7b9b 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -90,10 +90,7 @@ def __new__( block = df.DataFrame(pd_df, session=session)._block # TODO: Support more index subtypes - if len(block._index_columns) > 1: - klass = MultiIndex - else: - klass = Index + klass = MultiIndex if len(block._index_columns) > 1 else Index result = typing.cast(Index, object.__new__(klass)) result._query_job = None result._block = block @@ -490,7 +487,7 @@ def from_tuples( ) -> MultiIndex: pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return Index(pd_index) + return typing.cast(MultiIndex, Index(pd_index)) @classmethod def from_arrays( @@ -501,4 +498,4 @@ def from_arrays( ) -> MultiIndex: pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return Index(pd_index) + return typing.cast(MultiIndex, Index(pd_index)) From 4e04a71cd97da331f34e301cb4626db02579319c Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 9 Apr 2024 22:41:49 +0000 Subject: [PATCH 4/5] pr comments --- bigframes/core/indexes/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 5ccd0f7b9b..973be11413 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -45,19 +45,18 @@ class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ _query_job = None _block: blocks.Block - _linked_frame: Union[bigframes.dataframe.DataFrame, bigframes.series.Series, None] + _linked_frame: Union[ + 
bigframes.dataframe.DataFrame, bigframes.series.Series, None + ] = None # Overrided on __new__ to create subclasses like python does def __new__( - self, + cls, data=None, dtype=None, *, name=None, session=None, - linked_frame: Union[ - bigframes.dataframe.DataFrame, bigframes.series.Series, None - ] = None, ): import bigframes.dataframe as df import bigframes.series as series @@ -90,18 +89,19 @@ def __new__( block = df.DataFrame(pd_df, session=session)._block # TODO: Support more index subtypes - klass = MultiIndex if len(block._index_columns) > 1 else Index + klass = MultiIndex if len(block._index_columns) > 1 else cls result = typing.cast(Index, object.__new__(klass)) result._query_job = None result._block = block - result._linked_frame = linked_frame return result @classmethod def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - return Index(frame._block, linked_frame=frame) + index = Index(frame._block) + index._linked_frame = frame + return index @property def name(self) -> blocks.Label: From bb59939f41fb213f3fd798b7fb0952f937f330ca Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 10 Apr 2024 20:12:42 +0000 Subject: [PATCH 5/5] moved MultiIndex to own file --- bigframes/core/indexes/__init__.py | 3 +- bigframes/core/indexes/base.py | 33 +++----------------- bigframes/core/indexes/multi.py | 48 ++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 30 deletions(-) create mode 100644 bigframes/core/indexes/multi.py diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index 6afb9ca118..0a95adcd83 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from bigframes.core.indexes.base import Index, MultiIndex +from bigframes.core.indexes.base import Index +from bigframes.core.indexes.multi import MultiIndex __all__ = [ "Index", diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 973be11413..46a9e30637 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -17,10 +17,9 @@ from __future__ import annotations import typing -from typing import Hashable, Iterable, Optional, Sequence, Union +from typing import Hashable, Optional, Sequence, Union import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index -import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import google.cloud.bigquery as bigquery import numpy as np import pandas @@ -49,7 +48,7 @@ class Index(vendored_pandas_index.Index): bigframes.dataframe.DataFrame, bigframes.series.Series, None ] = None - # Overrided on __new__ to create subclasses like python does + # Override __new__ to create subclasses like pandas does def __new__( cls, data=None, @@ -89,6 +88,8 @@ def __new__( block = df.DataFrame(pd_df, session=session)._block # TODO: Support more index subtypes + from bigframes.core.indexes.multi import MultiIndex + klass = MultiIndex if len(block._index_columns) > 1 else cls result = typing.cast(Index, object.__new__(klass)) result._query_job = None @@ -473,29 +474,3 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: def __len__(self): return self.shape[0] - - -class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): - __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ - - @classmethod - def from_tuples( - cls, - tuples: Iterable[tuple[Hashable, ...]], - sortorder: int | None = None, - names: Sequence[Hashable] | Hashable | None = None, - ) -> MultiIndex: - pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) - # Index.__new__ should detect multiple levels and properly create a multiindex - return 
typing.cast(MultiIndex, Index(pd_index)) - - @classmethod - def from_arrays( - cls, - arrays, - sortorder: int | None = None, - names=None, - ) -> MultiIndex: - pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) - # Index.__new__ should detect multiple levels and properly create a multiindex - return typing.cast(MultiIndex, Index(pd_index)) diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py new file mode 100644 index 0000000000..182d1f101c --- /dev/null +++ b/bigframes/core/indexes/multi.py @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import cast, Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex +import pandas + +from bigframes.core.indexes.base import Index + + +class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): + __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index)) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index))