From c8dc24518e40ac337e3b85205a4f5a807d50c2e9 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 19:18:51 -0400 Subject: [PATCH 1/3] Add missing conditional functions: greatest, least, nvl2, ifnull (#1449) Expose four conditional functions from upstream DataFusion that were not yet available in the Python bindings. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/core/src/functions.rs | 10 +++++ python/datafusion/functions.py | 79 ++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index c32134054..785352741 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -494,6 +494,8 @@ expr_fn!(length, string); expr_fn!(char_length, string); expr_fn!(chr, arg, "Returns the character with the given code."); expr_fn_vec!(coalesce); +expr_fn_vec!(greatest); +expr_fn_vec!(least); expr_fn!(cos, num); expr_fn!(cosh, num); expr_fn!(cot, num); @@ -543,6 +545,11 @@ expr_fn!( x y, "Returns x if x is not NULL otherwise returns y." ); +expr_fn!( + nvl2, + x y z, + "Returns y if x is not NULL; otherwise returns z." +); expr_fn!(nullif, arg_1 arg_2); expr_fn!( octet_length, @@ -981,6 +988,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(floor))?; m.add_wrapped(wrap_pyfunction!(from_unixtime))?; m.add_wrapped(wrap_pyfunction!(gcd))?; + m.add_wrapped(wrap_pyfunction!(greatest))?; // m.add_wrapped(wrap_pyfunction!(grouping))?; m.add_wrapped(wrap_pyfunction!(in_list))?; m.add_wrapped(wrap_pyfunction!(initcap))?; @@ -988,6 +996,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(iszero))?; m.add_wrapped(wrap_pyfunction!(levenshtein))?; m.add_wrapped(wrap_pyfunction!(lcm))?; + m.add_wrapped(wrap_pyfunction!(least))?; m.add_wrapped(wrap_pyfunction!(left))?; m.add_wrapped(wrap_pyfunction!(length))?; m.add_wrapped(wrap_pyfunction!(ln))?; @@ -1005,6 +1014,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(named_struct))?; m.add_wrapped(wrap_pyfunction!(nanvl))?; m.add_wrapped(wrap_pyfunction!(nvl))?; + m.add_wrapped(wrap_pyfunction!(nvl2))?; m.add_wrapped(wrap_pyfunction!(now))?; m.add_wrapped(wrap_pyfunction!(nullif))?; m.add_wrapped(wrap_pyfunction!(octet_length))?; diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index f062cbfce..d93e46292 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -149,6 +149,8 @@ "floor", "from_unixtime", "gcd", + "greatest", + "ifnull", "in_list", "initcap", "isnan", @@ -157,6 +159,7 @@ "last_value", "lcm", "lead", + "least", "left", "length", "levenshtein", @@ -212,6 +215,7 @@ "ntile", "nullif", "nvl", + "nvl2", "octet_length", "order_by", "overlay", @@ -1027,6 +1031,44 @@ def gcd(x: Expr, y: Expr) -> Expr: return Expr(f.gcd(x.expr, y.expr)) +def greatest(*args: Expr) -> Expr: + """Returns the greatest value from a list of expressions. + + Returns NULL if all expressions are NULL. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 3], "b": [2, 1]}) + >>> result = df.select( + ... dfn.functions.greatest(dfn.col("a"), dfn.col("b")).alias("greatest")) + >>> result.collect_column("greatest")[0].as_py() + 2 + >>> result.collect_column("greatest")[1].as_py() + 3 + """ + args = [arg.expr for arg in args] + return Expr(f.greatest(*args)) + + +def ifnull(x: Expr, y: Expr) -> Expr: + """Returns ``x`` if ``x`` is not NULL. Otherwise returns ``y``. + + This is an alias for :py:func:`nvl`. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) + >>> result = df.select( + ... dfn.functions.ifnull(dfn.col("a"), dfn.col("b")).alias("ifnull") + ... ) + >>> result.collect_column("ifnull")[0].as_py() + 0 + >>> result.collect_column("ifnull")[1].as_py() + 1 + """ + return nvl(x, y) + + def initcap(string: Expr) -> Expr: """Set the initial letter of each word to capital. @@ -1080,6 +1122,25 @@ def lcm(x: Expr, y: Expr) -> Expr: return Expr(f.lcm(x.expr, y.expr)) +def least(*args: Expr) -> Expr: + """Returns the least value from a list of expressions. + + Returns NULL if all expressions are NULL. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 3], "b": [2, 1]}) + >>> result = df.select( + ... dfn.functions.least(dfn.col("a"), dfn.col("b")).alias("least")) + >>> result.collect_column("least")[0].as_py() + 1 + >>> result.collect_column("least")[1].as_py() + 1 + """ + args = [arg.expr for arg in args] + return Expr(f.least(*args)) + + def left(string: Expr, n: Expr) -> Expr: """Returns the first ``n`` characters in the ``string``. @@ -1264,6 +1325,24 @@ def nvl(x: Expr, y: Expr) -> Expr: return Expr(f.nvl(x.expr, y.expr)) +def nvl2(x: Expr, y: Expr, z: Expr) -> Expr: + """Returns ``y`` if ``x`` is not NULL. Otherwise returns ``z``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [10, 20], "c": [30, 40]}) + >>> result = df.select( + ... dfn.functions.nvl2( + ... dfn.col("a"), dfn.col("b"), dfn.col("c")).alias("nvl2") + ... ) + >>> result.collect_column("nvl2")[0].as_py() + 30 + >>> result.collect_column("nvl2")[1].as_py() + 20 + """ + return Expr(f.nvl2(x.expr, y.expr, z.expr)) + + def octet_length(arg: Expr) -> Expr: """Returns the number of bytes of a string. From 0d148319f33c0948a0df2386c904a86d122c6bad Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 19:23:08 -0400 Subject: [PATCH 2/3] Add unit tests for greatest, least, nvl2, and ifnull functions Tests cover multiple data types (integers, strings), null handling (all-null, partial-null), multiple arguments, and ifnull/nvl equivalence. Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 162 +++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 37d349c58..e10a1e94f 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1435,3 +1435,165 @@ def test_coalesce(df): assert result.column(0) == pa.array( ["Hello", "fallback", "!"], type=pa.string_view() ) + + +def test_greatest(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 5, None]), + pa.array([3, 2, None]), + pa.array([2, 8, None]), + ], + names=["a", "b", "c"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # Test greatest with two columns + result = df_test.select( + f.greatest(column("a"), column("b")).alias("greatest_ab") + ).collect()[0] + assert result.column(0) == pa.array([3, 5, None], type=pa.int64()) + + # Test greatest with three columns + result = df_test.select( + f.greatest(column("a"), column("b"), column("c")).alias("greatest_abc") + ).collect()[0] + assert result.column(0) == pa.array([3, 8, None], type=pa.int64()) + + # Test greatest with nulls mixed in (partial nulls) + batch2 = pa.RecordBatch.from_arrays( + [ + pa.array([None, 10]), + pa.array([5, None]), + ], + names=["x", "y"], + ) + df_test2 = ctx.create_dataframe([[batch2]]) + result = df_test2.select(f.greatest(column("x"), column("y")).alias("g")).collect()[ + 0 + ] + assert result.column(0) == pa.array([5, 10], type=pa.int64()) + + # Test greatest with string columns + batch3 = pa.RecordBatch.from_arrays( + [ + pa.array(["apple", "cherry"]), + pa.array(["banana", "apricot"]), + ], + names=["s1", "s2"], + ) + df_test3 = ctx.create_dataframe([[batch3]]) + result = df_test3.select( + f.greatest(column("s1"), column("s2")).alias("g") + ).collect()[0] + assert result.column(0).to_pylist() == ["banana", "cherry"] + + +def test_least(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 5, None]), + pa.array([3, 2, None]), + pa.array([2, 8, None]), + ], + names=["a", "b", "c"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # Test least with two columns + result = df_test.select( + f.least(column("a"), column("b")).alias("least_ab") + ).collect()[0] + assert result.column(0) == pa.array([1, 2, None], type=pa.int64()) + + # Test least with three columns + result = df_test.select( + f.least(column("a"), column("b"), column("c")).alias("least_abc") + ).collect()[0] + assert result.column(0) == pa.array([1, 2, None], type=pa.int64()) + + # Test least with partial nulls + batch2 = pa.RecordBatch.from_arrays( + [ + pa.array([None, 10]), + pa.array([5, None]), + ], + names=["x", "y"], + ) + df_test2 = ctx.create_dataframe([[batch2]]) + result = df_test2.select(f.least(column("x"), column("y")).alias("l")).collect()[0] + assert result.column(0) == pa.array([5, 10], type=pa.int64()) + + # Test least with string columns + batch3 = pa.RecordBatch.from_arrays( + [ + pa.array(["apple", "cherry"]), + pa.array(["banana", "apricot"]), + ], + names=["s1", "s2"], + ) + df_test3 = ctx.create_dataframe([[batch3]]) + result = df_test3.select(f.least(column("s1"), column("s2")).alias("l")).collect()[ + 0 + ] + assert result.column(0).to_pylist() == ["apple", "apricot"] + + +def test_nvl2(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([None, 1, None, 4]), + pa.array([10, 20, 30, 40]), + pa.array([100, 200, 300, 400]), + ], + names=["a", "b", "c"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # nvl2 returns b when a is not null, c when a is null + result = df_test.select( + f.nvl2(column("a"), column("b"), column("c")).alias("result") + ).collect()[0] + assert result.column(0) == pa.array([100, 20, 300, 40], type=pa.int64()) + + # Test with string columns + batch2 = pa.RecordBatch.from_arrays( + [ + pa.array(["x", None]), + pa.array(["not_null", "not_null"]), + pa.array(["is_null", "is_null"]), + ], + names=["a", "b", "c"], + ) + df_test2 = ctx.create_dataframe([[batch2]]) + result = df_test2.select( + f.nvl2(column("a"), column("b"), column("c")).alias("result") + ).collect()[0] + assert result.column(0).to_pylist() == ["not_null", "is_null"] + + +def test_ifnull(df): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array([None, 1, None, 4]), + pa.array([10, 20, 30, 40]), + ], + names=["a", "b"], + ) + df_test = ctx.create_dataframe([[batch]]) + + # ifnull returns a when a is not null, b when a is null (same as nvl) + result = df_test.select( + f.ifnull(column("a"), column("b")).alias("result") + ).collect()[0] + assert result.column(0) == pa.array([10, 1, 30, 4], type=pa.int64()) + + # Verify ifnull matches nvl behavior + result_nvl = df_test.select( + f.nvl(column("a"), column("b")).alias("nvl_result") + ).collect()[0] + assert result.column(0) == result_nvl.column(0) From c7733492920b9dedb7d838cebbb1340b6be1ded3 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 19:30:06 -0400 Subject: [PATCH 3/3] Use standard alias docstring pattern for ifnull Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/functions.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index d93e46292..73612fb0c 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1053,18 +1053,8 @@ def greatest(*args: Expr) -> Expr: def ifnull(x: Expr, y: Expr) -> Expr: """Returns ``x`` if ``x`` is not NULL. Otherwise returns ``y``. - This is an alias for :py:func:`nvl`. - - Examples: - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) - >>> result = df.select( - ... dfn.functions.ifnull(dfn.col("a"), dfn.col("b")).alias("ifnull") - ... ) - >>> result.collect_column("ifnull")[0].as_py() - 0 - >>> result.collect_column("ifnull")[1].as_py() - 1 + See Also: + This is an alias for :py:func:`nvl`. """ return nvl(x, y)