Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions crates/core/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,8 @@ expr_fn!(length, string);
expr_fn!(char_length, string);
expr_fn!(chr, arg, "Returns the character with the given code.");
expr_fn_vec!(coalesce);
// Bindings for `greatest` and `least`; both are registered on the Python module in
// `init_module`, and the Python wrappers call them with an unpacked argument list.
// NOTE(review): assumes `expr_fn_vec!` generates a function accepting a variable
// number of expressions, as it does for `coalesce` — confirm against the macro definition.
expr_fn_vec!(greatest);
expr_fn_vec!(least);
expr_fn!(cos, num);
expr_fn!(cosh, num);
expr_fn!(cot, num);
Expand Down Expand Up @@ -543,6 +545,11 @@ expr_fn!(
x y,
"Returns x if x is not NULL otherwise returns y."
);
// Binding for `nvl2` with three expression arguments (`x`, `y`, `z`); it is registered
// on the Python module in `init_module`.
// NOTE(review): the string literal presumably becomes the generated function's
// documentation — confirm against the `expr_fn!` definition.
expr_fn!(
nvl2,
x y z,
"Returns y if x is not NULL; otherwise returns z."
);
expr_fn!(nullif, arg_1 arg_2);
expr_fn!(
octet_length,
Expand Down Expand Up @@ -981,13 +988,15 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(floor))?;
m.add_wrapped(wrap_pyfunction!(from_unixtime))?;
m.add_wrapped(wrap_pyfunction!(gcd))?;
m.add_wrapped(wrap_pyfunction!(greatest))?;
// m.add_wrapped(wrap_pyfunction!(grouping))?;
m.add_wrapped(wrap_pyfunction!(in_list))?;
m.add_wrapped(wrap_pyfunction!(initcap))?;
m.add_wrapped(wrap_pyfunction!(isnan))?;
m.add_wrapped(wrap_pyfunction!(iszero))?;
m.add_wrapped(wrap_pyfunction!(levenshtein))?;
m.add_wrapped(wrap_pyfunction!(lcm))?;
m.add_wrapped(wrap_pyfunction!(least))?;
m.add_wrapped(wrap_pyfunction!(left))?;
m.add_wrapped(wrap_pyfunction!(length))?;
m.add_wrapped(wrap_pyfunction!(ln))?;
Expand All @@ -1005,6 +1014,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(named_struct))?;
m.add_wrapped(wrap_pyfunction!(nanvl))?;
m.add_wrapped(wrap_pyfunction!(nvl))?;
m.add_wrapped(wrap_pyfunction!(nvl2))?;
m.add_wrapped(wrap_pyfunction!(now))?;
m.add_wrapped(wrap_pyfunction!(nullif))?;
m.add_wrapped(wrap_pyfunction!(octet_length))?;
Expand Down
69 changes: 69 additions & 0 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@
"floor",
"from_unixtime",
"gcd",
"greatest",
"ifnull",
"in_list",
"initcap",
"isnan",
Expand All @@ -157,6 +159,7 @@
"last_value",
"lcm",
"lead",
"least",
"left",
"length",
"levenshtein",
Expand Down Expand Up @@ -212,6 +215,7 @@
"ntile",
"nullif",
"nvl",
"nvl2",
"octet_length",
"order_by",
"overlay",
Expand Down Expand Up @@ -1027,6 +1031,34 @@ def gcd(x: Expr, y: Expr) -> Expr:
return Expr(f.gcd(x.expr, y.expr))


def greatest(*args: Expr) -> Expr:
    """Pick the largest value among the given expressions.

    The result is NULL only when every expression is NULL.

    Args:
        *args: Expressions to compare.

    Returns:
        An expression yielding the greatest of the supplied values.

    Examples:
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [1, 3], "b": [2, 1]})
        >>> result = df.select(
        ...     dfn.functions.greatest(dfn.col("a"), dfn.col("b")).alias("greatest"))
        >>> result.collect_column("greatest")[0].as_py()
        2
        >>> result.collect_column("greatest")[1].as_py()
        3
    """
    # Unwrap each public Expr into the internal expression the binding expects.
    inner_exprs = [candidate.expr for candidate in args]
    return Expr(f.greatest(*inner_exprs))


def ifnull(x: Expr, y: Expr) -> Expr:
    """Fall back to ``y`` whenever ``x`` is NULL; otherwise yield ``x``.

    Args:
        x: Expression whose value is preferred.
        y: Expression used when ``x`` is NULL.

    Examples:
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [None, 1], "b": [10, 20]})
        >>> result = df.select(
        ...     dfn.functions.ifnull(dfn.col("a"), dfn.col("b")).alias("ifnull"))
        >>> result.collect_column("ifnull")[0].as_py()
        10
        >>> result.collect_column("ifnull")[1].as_py()
        1

    See Also:
        This is an alias for :py:func:`nvl`.
    """
    # Pure alias: all of the work is delegated to nvl.
    aliased = nvl(x, y)
    return aliased


def initcap(string: Expr) -> Expr:
"""Set the initial letter of each word to capital.

Expand Down Expand Up @@ -1080,6 +1112,25 @@ def lcm(x: Expr, y: Expr) -> Expr:
return Expr(f.lcm(x.expr, y.expr))


def least(*args: Expr) -> Expr:
    """Pick the smallest value among the given expressions.

    The result is NULL only when every expression is NULL.

    Args:
        *args: Expressions to compare.

    Returns:
        An expression yielding the least of the supplied values.

    Examples:
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [1, 3], "b": [2, 1]})
        >>> result = df.select(
        ...     dfn.functions.least(dfn.col("a"), dfn.col("b")).alias("least"))
        >>> result.collect_column("least")[0].as_py()
        1
        >>> result.collect_column("least")[1].as_py()
        1
    """
    # Unwrap each public Expr into the internal expression the binding expects.
    inner_exprs = [candidate.expr for candidate in args]
    return Expr(f.least(*inner_exprs))


def left(string: Expr, n: Expr) -> Expr:
"""Returns the first ``n`` characters in the ``string``.

Expand Down Expand Up @@ -1264,6 +1315,24 @@ def nvl(x: Expr, y: Expr) -> Expr:
return Expr(f.nvl(x.expr, y.expr))


def nvl2(x: Expr, y: Expr, z: Expr) -> Expr:
    """Choose between ``y`` and ``z`` depending on whether ``x`` is NULL.

    Yields ``y`` when ``x`` is not NULL and ``z`` when it is.

    Args:
        x: Expression tested for NULL.
        y: Expression returned when ``x`` is not NULL.
        z: Expression returned when ``x`` is NULL.

    Examples:
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [None, 1], "b": [10, 20], "c": [30, 40]})
        >>> result = df.select(
        ...     dfn.functions.nvl2(
        ...         dfn.col("a"), dfn.col("b"), dfn.col("c")).alias("nvl2")
        ... )
        >>> result.collect_column("nvl2")[0].as_py()
        30
        >>> result.collect_column("nvl2")[1].as_py()
        20
    """
    tested, when_present, when_null = x.expr, y.expr, z.expr
    return Expr(f.nvl2(tested, when_present, when_null))


def octet_length(arg: Expr) -> Expr:
"""Returns the number of bytes of a string.

Expand Down
162 changes: 162 additions & 0 deletions python/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1435,3 +1435,165 @@ def test_coalesce(df):
assert result.column(0) == pa.array(
["Hello", "fallback", "!"], type=pa.string_view()
)


def test_greatest(df):
    """Exercise ``f.greatest`` on integer, partially-null and string columns.

    The ``df`` fixture is requested but not used; each case builds its own frame.
    """
    session = SessionContext()

    # Three integer columns whose last row is NULL in every column.
    int_columns = [
        pa.array([1, 5, None]),
        pa.array([3, 2, None]),
        pa.array([2, 8, None]),
    ]
    int_batch = pa.RecordBatch.from_arrays(int_columns, names=["a", "b", "c"])
    int_frame = session.create_dataframe([[int_batch]])

    # Two arguments.
    pair_expr = f.greatest(column("a"), column("b")).alias("greatest_ab")
    pair_out = int_frame.select(pair_expr).collect()[0]
    assert pair_out.column(0) == pa.array([3, 5, None], type=pa.int64())

    # Three arguments.
    triple_expr = f.greatest(column("a"), column("b"), column("c")).alias(
        "greatest_abc"
    )
    triple_out = int_frame.select(triple_expr).collect()[0]
    assert triple_out.column(0) == pa.array([3, 8, None], type=pa.int64())

    # Each row has exactly one NULL; the non-NULL side is expected to win.
    sparse_batch = pa.RecordBatch.from_arrays(
        [pa.array([None, 10]), pa.array([5, None])],
        names=["x", "y"],
    )
    sparse_frame = session.create_dataframe([[sparse_batch]])
    sparse_out = sparse_frame.select(
        f.greatest(column("x"), column("y")).alias("g")
    ).collect()[0]
    assert sparse_out.column(0) == pa.array([5, 10], type=pa.int64())

    # String inputs.
    text_batch = pa.RecordBatch.from_arrays(
        [pa.array(["apple", "cherry"]), pa.array(["banana", "apricot"])],
        names=["s1", "s2"],
    )
    text_frame = session.create_dataframe([[text_batch]])
    text_out = text_frame.select(
        f.greatest(column("s1"), column("s2")).alias("g")
    ).collect()[0]
    assert text_out.column(0).to_pylist() == ["banana", "cherry"]


def test_least(df):
    """Exercise ``f.least`` on integer, partially-null and string columns.

    The ``df`` fixture is requested but not used; each case builds its own frame.
    """
    session = SessionContext()

    # Three integer columns whose last row is NULL in every column.
    int_columns = [
        pa.array([1, 5, None]),
        pa.array([3, 2, None]),
        pa.array([2, 8, None]),
    ]
    int_batch = pa.RecordBatch.from_arrays(int_columns, names=["a", "b", "c"])
    int_frame = session.create_dataframe([[int_batch]])

    # Two arguments.
    pair_expr = f.least(column("a"), column("b")).alias("least_ab")
    pair_out = int_frame.select(pair_expr).collect()[0]
    assert pair_out.column(0) == pa.array([1, 2, None], type=pa.int64())

    # Three arguments.
    triple_expr = f.least(column("a"), column("b"), column("c")).alias("least_abc")
    triple_out = int_frame.select(triple_expr).collect()[0]
    assert triple_out.column(0) == pa.array([1, 2, None], type=pa.int64())

    # Each row has exactly one NULL; the non-NULL side is expected to win.
    sparse_batch = pa.RecordBatch.from_arrays(
        [pa.array([None, 10]), pa.array([5, None])],
        names=["x", "y"],
    )
    sparse_frame = session.create_dataframe([[sparse_batch]])
    sparse_out = sparse_frame.select(
        f.least(column("x"), column("y")).alias("l")
    ).collect()[0]
    assert sparse_out.column(0) == pa.array([5, 10], type=pa.int64())

    # String inputs.
    text_batch = pa.RecordBatch.from_arrays(
        [pa.array(["apple", "cherry"]), pa.array(["banana", "apricot"])],
        names=["s1", "s2"],
    )
    text_frame = session.create_dataframe([[text_batch]])
    text_out = text_frame.select(
        f.least(column("s1"), column("s2")).alias("l")
    ).collect()[0]
    assert text_out.column(0).to_pylist() == ["apple", "apricot"]


def test_nvl2(df):
    """Exercise ``f.nvl2`` on integer and string columns.

    The ``df`` fixture is requested but not used; each case builds its own frame.
    """
    session = SessionContext()
    column_names = ["a", "b", "c"]

    # Integer case: "a" drives the choice between "b" (a set) and "c" (a NULL).
    int_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([None, 1, None, 4]),
            pa.array([10, 20, 30, 40]),
            pa.array([100, 200, 300, 400]),
        ],
        names=column_names,
    )
    int_frame = session.create_dataframe([[int_batch]])
    int_expr = f.nvl2(column("a"), column("b"), column("c")).alias("result")
    int_out = int_frame.select(int_expr).collect()[0]
    assert int_out.column(0) == pa.array([100, 20, 300, 40], type=pa.int64())

    # String case.
    text_batch = pa.RecordBatch.from_arrays(
        [
            pa.array(["x", None]),
            pa.array(["not_null", "not_null"]),
            pa.array(["is_null", "is_null"]),
        ],
        names=column_names,
    )
    text_frame = session.create_dataframe([[text_batch]])
    text_expr = f.nvl2(column("a"), column("b"), column("c")).alias("result")
    text_out = text_frame.select(text_expr).collect()[0]
    assert text_out.column(0).to_pylist() == ["not_null", "is_null"]


def test_ifnull(df):
    """Check ``f.ifnull`` against expected values and against ``f.nvl``.

    The ``df`` fixture is requested but not used; the test builds its own frame.
    """
    session = SessionContext()
    source_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([None, 1, None, 4]),
            pa.array([10, 20, 30, 40]),
        ],
        names=["a", "b"],
    )
    frame = session.create_dataframe([[source_batch]])

    # "a" is kept where present; "b" fills the rows where "a" is NULL.
    ifnull_expr = f.ifnull(column("a"), column("b")).alias("result")
    ifnull_out = frame.select(ifnull_expr).collect()[0]
    assert ifnull_out.column(0) == pa.array([10, 1, 30, 4], type=pa.int64())

    # ifnull must produce the same column as nvl.
    nvl_expr = f.nvl(column("a"), column("b")).alias("nvl_result")
    nvl_out = frame.select(nvl_expr).collect()[0]
    assert ifnull_out.column(0) == nvl_out.column(0)
Loading