This repository was archived by the owner on Apr 1, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 67
feat: Add Series.peek to preview data efficiently #727
Merged
Merged
Changes from all commits
Commits
Show all changes
23 commits
Select commit
Hold shift + click to select a range
b3771b8
feat: Add Series.peek to preview data efficiently
TrevorBergeron e865395
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron f227476
add another test
TrevorBergeron 68fc1e1
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron a17e027
cleanup comments
TrevorBergeron 1764106
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron 5ff4661
more comments, up to 4 cluster cols for session-based caching
TrevorBergeron 936e73d
add another session caching test
TrevorBergeron 41f6083
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron ffbc518
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron a9b16c4
add todo for geo predicate detection
TrevorBergeron 83fc8fb
Merge branch 'main' into series_cache
tswast ec1d973
add dtype clusterable and orderable property
TrevorBergeron c307625
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron b917c71
fix session aware caching unit tests
TrevorBergeron 848d0a4
mock session for planner test
TrevorBergeron 79d05b5
fix offsets column name collision
TrevorBergeron 06c9866
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron 2ed1520
Update bigframes/dtypes.py
TrevorBergeron 1ff4f68
Update bigframes/dtypes.py
TrevorBergeron 81e5a02
add another series peek test
TrevorBergeron e91dbb5
remove partial comment
TrevorBergeron 108e449
Merge remote-tracking branch 'github/main' into series_cache
TrevorBergeron File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| # Copyright 2024 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import bigframes.core.expression as ex | ||
| import bigframes.core.schema as schemata | ||
| import bigframes.dtypes | ||
| import bigframes.operations as ops | ||
|
|
||
# Dtypes with very few distinct values (only BOOL here).
# NOTE(review): not referenced in the visible code — presumably used elsewhere to
# deprioritize low-cardinality columns as cluster-key candidates; confirm usage.
LOW_CARDINALITY_TYPES = [bigframes.dtypes.BOOL_DTYPE]

# Operator classes for comparison predicates (eq/ne and the four orderings).
# Stored as *types* (via type(i)) so they can be used with isinstance() when
# inspecting an OpExpression's op in cluster_cols_for_predicate.
COMPARISON_OP_TYPES = tuple(
    type(i)
    for i in (
        ops.eq_op,
        ops.eq_null_match_op,
        ops.ne_op,
        ops.gt_op,
        ops.ge_op,
        ops.lt_op,
        ops.le_op,
    )
)
|
|
||
|
|
||
def cluster_cols_for_predicate(
    predicate: ex.Expression, schema: schemata.ArraySchema
) -> list[str]:
    """Return candidate clustering columns that could help prune the given predicate.

    Walks the predicate expression tree, collecting referenced column ids for the
    predicate shapes that clustering can prune, then filters out any column whose
    dtype BigQuery cannot cluster on.
    """
    # TODO: Prioritize based on predicted selectivity (eg. equality conditions are probably very selective)
    candidates: list[str] = []
    if isinstance(predicate, ex.UnboundVariableExpression):
        # A bare column reference used as a boolean predicate.
        candidates = [predicate.id]
    elif isinstance(predicate, ex.OpExpression):
        op = predicate.op
        # TODO: Support geo predicates, which support pruning if clustered (other than st_disjoint)
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions
        if isinstance(op, COMPARISON_OP_TYPES):
            candidates = cluster_cols_for_comparison(
                predicate.inputs[0], predicate.inputs[1]
            )
        elif isinstance(op, type(ops.invert_op)):
            # Negation prunes on the same columns as the wrapped predicate.
            candidates = cluster_cols_for_predicate(predicate.inputs[0], schema)
        elif isinstance(op, (type(ops.and_op), type(ops.or_op))):
            # Union of both sides' candidates: left side first, order-preserving dedup.
            lhs = cluster_cols_for_predicate(predicate.inputs[0], schema)
            rhs = cluster_cols_for_predicate(predicate.inputs[1], schema)
            candidates = list(dict.fromkeys([*lhs, *rhs]))
    # else: constant expression — nothing to cluster on.
    return [
        col
        for col in candidates
        if bigframes.dtypes.is_clusterable(schema.get_type(col))
    ]
|
|
||
|
|
||
def cluster_cols_for_comparison(
    left_ex: ex.Expression, right_ex: ex.Expression
) -> list[str]:
    """Return cluster col candidates for a comparison between two expressions.

    Only the simple shape ``<column> <op> <constant>`` (in either order) is
    recognized; anything else yields no candidates.
    """
    # TODO: Try to normalize expressions such that one side is a single variable.
    # eg. Convert -cola>=3 to cola<-3 and colb+3 < 4 to colb < 1
    if left_ex.is_const:
        # There are some invertible ops that would also be ok
        if isinstance(right_ex, ex.UnboundVariableExpression):
            return [right_ex.id]
        return []
    if right_ex.is_const and isinstance(left_ex, ex.UnboundVariableExpression):
        return [left_ex.id]
    return []
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -623,6 +623,40 @@ def head(self, n: int = 5) -> Series: | |
| def tail(self, n: int = 5) -> Series: | ||
| return typing.cast(Series, self.iloc[-n:]) | ||
|
|
||
| def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: | ||
| """ | ||
| Preview n arbitrary elements from the series without guarantees about row selection or ordering. | ||
|
|
||
| ``Series.peek(force=False)`` will always be very fast, but will not succeed if data requires | ||
| full data scanning. Using ``force=True`` will always succeed, but may be perform queries. | ||
| Query results will be cached so that future steps will benefit from these queries. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a caveat here that caching is session-aware and will attempt to cache the optimal subtree? (Not sure exactly how to phrase that in a friendlier way.)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, not sure how/if we should communicate this to users. I also don't want to lock in any specific execution strategy other than "we might cache if force=True, but we will make that cache as useful as possible using some unspecified approach". |
||
|
|
||
| Args: | ||
| n (int, default 5): | ||
| The number of rows to select from the series. Which N rows are returned is non-deterministic. | ||
| force (bool, default True): | ||
| If the data cannot be peeked efficiently, the series will instead be fully materialized as part | ||
| of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError. | ||
| Returns: | ||
| pandas.Series: A pandas Series with n rows. | ||
|
|
||
| Raises: | ||
| ValueError: If force=False and data cannot be efficiently peeked. | ||
| """ | ||
| maybe_result = self._block.try_peek(n) | ||
| if maybe_result is None: | ||
| if force: | ||
| self._cached() | ||
| maybe_result = self._block.try_peek(n, force=True) | ||
| assert maybe_result is not None | ||
| else: | ||
| raise ValueError( | ||
| "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute dataframe." | ||
| ) | ||
| as_series = maybe_result.squeeze(axis=1) | ||
| as_series.name = self.name | ||
tswast marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return as_series | ||
|
|
||
| def nlargest(self, n: int = 5, keep: str = "first") -> Series: | ||
| if keep not in ("first", "last", "all"): | ||
| raise ValueError("'keep must be one of 'first', 'last', or 'all'") | ||
|
|
@@ -1419,7 +1453,7 @@ def apply( | |
|
|
||
| # return Series with materialized result so that any error in the remote | ||
| # function is caught early | ||
| materialized_series = result_series._cached() | ||
| materialized_series = result_series._cached(session_aware=False) | ||
| return materialized_series | ||
|
|
||
| def combine( | ||
|
|
@@ -1794,10 +1828,11 @@ def cache(self): | |
| Returns: | ||
| Series: Self | ||
| """ | ||
| return self._cached(force=True) | ||
| # Do not use session-aware caching if user-requested | ||
| return self._cached(force=True, session_aware=False) | ||
|
|
||
| def _cached(self, *, force: bool = True) -> Series: | ||
| self._block.cached(force=force) | ||
    def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series:
        """Materialize this series' results so later operations reuse them.

        Args:
            force (bool, default True):
                Passed through to Block.cached; if False the block may skip
                caching — confirm exact semantics against Block.cached.
            session_aware (bool, default True):
                Passed through to Block.cached; presumably selects a
                session-aware choice of which subtree to cache — verify.

        Returns:
            Series: Self (caching is a side effect on the underlying block).
        """
        self._block.cached(force=force, session_aware=session_aware)
        return self
|
|
||
| def _optimize_query_complexity(self): | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.